From 5916dbe039bec4e4398db5d0e5e38167b51fbfbe Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:40 +0800 Subject: [PATCH 0001/1640] ipv4: Namespaceify tcp_fastopen knob Applications in different namespaces might need to enable the TCP Fast Open feature independently of the host. This patch series continues making more of the TCP Fast Open related sysctl knobs per net-namespace. Reported-by: Luca BRUNO Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/af_inet.c | 7 ++++--- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_fastopen.c | 11 +++++------ net/ipv4/tcp_ipv4.c | 2 ++ 7 files changed, 21 insertions(+), 19 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index ed9c8aaa65f0..e1e42211679c 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -132,6 +132,7 @@ struct netns_ipv4 { int sysctl_tcp_default_init_rwnd; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; + int sysctl_tcp_fastopen; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/include/net/tcp.h b/include/net/tcp.h index 3d18bc1fd737..84a9d7b23223 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -249,7 +249,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ca240a689866..1a87291c057f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -196,7 +196,7 @@ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; - int err; + int err, tcp_fastopen; lock_sock(sk); @@ -218,8 +218,9 @@ int inet_listen(struct socket *sock, int backlog) * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). 
*/ - if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && - (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && + (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); tcp_fastopen_init_key_once(true); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 55ef264a8293..92559d0b3860 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -364,13 +364,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_fastopen", - .data = &sysctl_tcp_fastopen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_fastopen_key", .mode = 0600, @@ -1077,6 +1070,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_fastopen", + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 52c1bd2bd9b5..79c26c55a242 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1159,7 +1159,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; @@ -2815,7 +2815,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; - } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 0edd8d357e3d..50da3ac07b5d 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -10,8 +10,6 @@ #include #include -int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE; - struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); @@ -282,21 +280,22 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc) { - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); - if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } - if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ @@ -350,7 +349,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, return false; } - if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { 
cookie->len = -1; return true; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 98ff399740b4..955eebac3dc9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2554,6 +2554,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_default_init_rwnd = TCP_INIT_CWND * 2; + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + return 0; fail: tcp_sk_exit(net); From a4fe05c7cf25d7c1a26174a4437719550275337f Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:41 +0800 Subject: [PATCH 0002/1640] ipv4: Remove the 'publish' logic in tcp_fastopen_init_key_once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'publish' logic is not necessary after commit dfea2aa65424 ("tcp: Do not call tcp_fastopen_reset_cipher from interrupt context"), because tcp_fastopen_cookie_gen no longer calls tcp_fastopen_init_key_once. Change-Id: I6f58d1b7e689b4dacb8617093f61537c18383bd6 Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 5 ----- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_fastopen.c | 4 ++-- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 84a9d7b23223..eb8a3012e54d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1626,7 +1626,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc); -void tcp_fastopen_init_key_once(bool publish); +void tcp_fastopen_init_key_once(void); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1a87291c057f..df4b2c3a2b58 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -223,7 +223,7 @@ int inet_listen(struct socket *sock, int backlog) (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(); } err = inet_csk_listen_start(sk, backlog); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 92559d0b3860..bbd7643968bc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -297,11 +297,6 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - /* Generate a dummy secret but don't publish it. 
This - * is needed so we don't regenerate a new key on the - * first invocation of tcp_fastopen_cookie_gen - */ - tcp_fastopen_init_key_once(false); for (i = 0; i < ARRAY_SIZE(user_key); i++) key[i] = cpu_to_le32(user_key[i]); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 79c26c55a242..a0c583e5ce1c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2805,7 +2805,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(); fastopen_queue_tune(sk, val); } else { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 50da3ac07b5d..cfe50b7b84b3 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -14,7 +14,7 @@ struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); -void tcp_fastopen_init_key_once(bool publish) +void tcp_fastopen_init_key_once(void) { static u8 key[TCP_FASTOPEN_KEY_LENGTH]; @@ -24,7 +24,7 @@ void tcp_fastopen_init_key_once(bool publish) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. */ - if (net_get_random_once(key, sizeof(key)) && publish) + if (net_get_random_once(key, sizeof(key))) tcp_fastopen_reset_cipher(key, sizeof(key)); } From f2012acf4fcdbad3866dac75ff94bd6808fac6e2 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:42 +0800 Subject: [PATCH 0003/1640] ipv4: Namespaceify tcp_fastopen_key knob Applications in different namespaces might require different tcp_fastopen_key values, independent of the host. David Miller pointed out that there is a leak: the context of tcp_fastopen_key is not released during netns teardown. So add the release action in the exit_batch path. Tested: 1. Container namespace: 2817fff2-f803cf97-eadfd1f3-78c0992b cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: 1e5dd82a8c492ca9 2. Host: 107d7c5f-68eb2ac7-02fb06e6-ed341702 cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: e213c02bf0afbc8a Change-Id: I510de0e6add8bfdf9dd48ccea9796b620518c431 Signed-off-by: Haishuang Yan Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 4 +++ include/net/tcp.h | 6 ++-- net/ipv4/af_inet.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 21 +++++++------ net/ipv4/tcp.c | 2 +- net/ipv4/tcp_fastopen.c | 64 +++++++++++++++++++++++++------------- net/ipv4/tcp_ipv4.c | 6 ++++ 7 files changed, 70 insertions(+), 35 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e1e42211679c..e4c63cf52075 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -38,6 +38,8 @@ struct inet_timewait_death_row { int sysctl_max_tw_buckets; }; +struct tcp_fastopen_context; + struct netns_ipv4 { #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; @@ -133,6 +135,8 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; + spinlock_t tcp_fastopen_ctx_lock; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/include/net/tcp.h b/include/net/tcp.h index eb8a3012e54d..61b9387ee0f5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1620,13 +1620,13 @@ struct tcp_fastopen_request { }; void tcp_free_fastopen_req(struct tcp_sock *tp); -extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; -int tcp_fastopen_reset_cipher(void *key, unsigned int len); +void tcp_fastopen_ctx_destroy(struct net *net); +int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc); -void tcp_fastopen_init_key_once(void); +void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index df4b2c3a2b58..d2cc14fff8ae 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -223,7 +223,7 @@ int inet_listen(struct socket *sock, int backlog) (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(); + tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk, backlog); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index bbd7643968bc..d6d697417cd3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -262,10 +262,12 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } -static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, +static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ @@ -277,7 +279,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, return -ENOMEM; rcu_read_lock(); - ctxt = rcu_dereference(tcp_fastopen_ctx); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else @@ -301,7 +303,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, for (i = 0; i < ARRAY_SIZE(user_key); i++) key[i] = cpu_to_le32(user_key[i]); - 
tcp_fastopen_reset_cipher(key, TCP_FASTOPEN_KEY_LENGTH); + tcp_fastopen_reset_cipher(net, key, TCP_FASTOPEN_KEY_LENGTH); } bad_key: @@ -359,12 +361,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_fastopen_key", - .mode = 0600, - .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), - .proc_handler = proc_tcp_fastopen_key, - }, { .procname = "tcp_fastopen_blackhole_timeout_sec", .data = &sysctl_tcp_fastopen_blackhole_timeout, @@ -1072,6 +1068,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_fastopen_key", + .mode = 0600, + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + .proc_handler = proc_tcp_fastopen_key, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a0c583e5ce1c..0f358655f884 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2805,7 +2805,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(); + tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index cfe50b7b84b3..6f5ab32c5810 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -10,13 +10,18 @@ #include #include -struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; - -static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); - -void tcp_fastopen_init_key_once(void) +void tcp_fastopen_init_key_once(struct net *net) { - static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctxt; + + rcu_read_lock(); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctxt) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. @@ -24,8 +29,8 @@ void tcp_fastopen_init_key_once(void) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. 
*/ - if (net_get_random_once(key, sizeof(key))) - tcp_fastopen_reset_cipher(key, sizeof(key)); + get_random_bytes(key, sizeof(key)); + tcp_fastopen_reset_cipher(net, key, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) @@ -36,7 +41,22 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head) kfree(ctx); } -int tcp_fastopen_reset_cipher(void *key, unsigned int len) +void tcp_fastopen_ctx_destroy(struct net *net) +{ + struct tcp_fastopen_context *ctxt; + + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + + ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + + if (ctxt) + call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); +} + +int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len) { int err; struct tcp_fastopen_context *ctx, *octx; @@ -60,26 +80,27 @@ error: kfree(ctx); } memcpy(ctx->key, key, len); - spin_lock(&tcp_fastopen_ctx_lock); + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); - octx = rcu_dereference_protected(tcp_fastopen_ctx, - lockdep_is_held(&tcp_fastopen_ctx_lock)); - rcu_assign_pointer(tcp_fastopen_ctx, ctx); - spin_unlock(&tcp_fastopen_ctx_lock); + octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); return err; } -static bool __tcp_fastopen_cookie_gen(const void *path, +static bool __tcp_fastopen_cookie_gen(struct net *net, + const void *path, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; bool ok = false; rcu_read_lock(); - ctx = rcu_dereference(tcp_fastopen_ctx); + ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctx) { crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; @@ -95,7 +116,8 @@ static bool __tcp_fastopen_cookie_gen(const void *path, * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. 
*/ -static bool tcp_fastopen_cookie_gen(struct request_sock *req, +static bool tcp_fastopen_cookie_gen(struct net *net, + struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { @@ -103,7 +125,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct iphdr *iph = ip_hdr(syn); __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(path, foc); + return __tcp_fastopen_cookie_gen(net, path, foc); } #if IS_ENABLED(CONFIG_IPV6) @@ -111,13 +133,13 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct ipv6hdr *ip6h = ipv6_hdr(syn); struct tcp_fastopen_cookie tmp; - if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) { struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(buf, foc); + return __tcp_fastopen_cookie_gen(net, buf, foc); } } #endif @@ -299,7 +321,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) && foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 955eebac3dc9..a7b903de70cd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2555,6 +2555,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_default_init_rwnd = TCP_INIT_CWND * 2; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); return 0; fail: @@ -2565,7 +2566,12 @@ fail: static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { + struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + + list_for_each_entry(net, net_exit_list, exit_list) + tcp_fastopen_ctx_destroy(net); } static struct pernet_operations __net_initdata tcp_sk_ops = { From aad0a41eae96dc056d9bbba343b883af0415e395 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:43 +0800 Subject: [PATCH 0004/1640] ipv4: Namespaceify tcp_fastopen_blackhole_timeout knob Different namespace application might require different time period in second to disable Fastopen on active TCP sockets. Tested: Simulate following similar situation that the server's data gets dropped after 3WHS. C ---- syn-data ---> S C <--- syn/ack ----- S C ---- ack --------> S S (accept & write) C? X <- data ------ S [retry and timeout] And then print netstat of TCPFastOpenBlackhole, the counter increased as expected when the firewall blackhole issue is detected and active TFO is disabled. # cat /proc/net/netstat | awk '{print $91}' TCPFastOpenBlackhole 1 Signed-off-by: Haishuang Yan Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 3 +++ net/ipv4/sysctl_net_ipv4.c | 20 +++++++++++--------- net/ipv4/tcp_fastopen.c | 30 +++++++++++------------------- net/ipv4/tcp_ipv4.c | 2 ++ 4 files changed, 27 insertions(+), 28 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e4c63cf52075..1d02c06cfc8f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -137,6 +137,9 @@ struct netns_ipv4 { int sysctl_tcp_fastopen; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; + unsigned int sysctl_tcp_fastopen_blackhole_timeout; + atomic_t tfo_active_disable_times; + unsigned long tfo_active_disable_stamp; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d6d697417cd3..af466f334107 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -319,11 +319,13 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen_blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - tcp_fastopen_active_timeout_reset(); + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return ret; } @@ -361,14 +363,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_fastopen_blackhole_timeout_sec", - .data = &sysctl_tcp_fastopen_blackhole_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_tfo_blackhole_detect_timeout, - .extra1 = &zero, - }, { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, @@ -1075,6 +1069,14 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), .proc_handler = proc_tcp_fastopen_key, }, + { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 6f5ab32c5810..3649ec284d40 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -425,25 +425,16 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * TFO connection with data exchanges. 
*/ -/* Default to 1hr */ -unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; -static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); -static unsigned long tfo_active_disable_stamp __read_mostly; - /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { - atomic_inc(&tfo_active_disable_times); - tfo_active_disable_stamp = jiffies; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); -} + struct net *net = sock_net(sk); -/* Reset tfo_active_disable_times to 0 */ -void tcp_fastopen_active_timeout_reset(void) -{ - atomic_set(&tfo_active_disable_times, 0); + atomic_inc(&net->ipv4.tfo_active_disable_times); + net->ipv4.tfo_active_disable_stamp = jiffies; + NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable @@ -452,17 +443,18 @@ void tcp_fastopen_active_timeout_reset(void) */ bool tcp_fastopen_active_should_disable(struct sock *sk) { - int tfo_da_times = atomic_read(&tfo_active_disable_times); - int multiplier; + unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; + int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); unsigned long timeout; + int multiplier; if (!tfo_da_times) return false; /* Limit timout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); - timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; - if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + timeout = multiplier * tfo_bh_timeout * HZ; + if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) return true; /* Mark check bit so we can check for successful active TFO @@ -496,10 +488,10 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) } } } else if (tp->syn_fastopen_ch && - atomic_read(&tfo_active_disable_times)) { + atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) - tcp_fastopen_active_timeout_reset(); + atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a7b903de70cd..d4fcca200dff 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2556,6 +2556,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); + net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return 0; fail: From 730098eb04418307ea5b9aadb12472bb4655a855 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Jul 2021 02:20:28 -0700 Subject: [PATCH 0005/1640] net/tcp_fastopen: remove obsolete extern After cited commit, sysctl_tcp_fastopen_blackhole_timeout is no longer a global variable. 
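As an aside on the backoff arithmetic in the tcp_fastopen_active_should_disable() hunk above: each consecutive blackhole detection doubles the disable period, capped at 2^6 times the sysctl base value (60 * 60 seconds by default, per tcp_sk_init() above). Below is a minimal user-space sketch of that calculation; HZ scaling is dropped for clarity, and this is an illustration, not the kernel code.

/* Sketch of the exponential backoff in tcp_fastopen_active_should_disable():
 * each consecutive disable event doubles the period, capped at 2^6 (64x)
 * the base timeout. Plain user-space C, not the kernel implementation. */
#include <stdio.h>

static unsigned long tfo_disable_period(int tfo_da_times,
					unsigned long base_timeout_sec)
{
	int shift;

	if (!tfo_da_times)
		return 0;	/* active TFO is currently allowed */

	shift = tfo_da_times - 1;
	if (shift > 6)
		shift = 6;	/* cap: 2^6 * initial timeout */
	return (1UL << shift) * base_timeout_sec;
}

int main(void)
{
	for (int times = 1; times <= 8; times++)
		printf("disable event %d -> %lu s\n",
		       times, tfo_disable_period(times, 60 * 60));
	return 0;
}
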
Fixes: 3733be14a32b ("ipv4: Namespaceify tcp_fastopen_blackhole_timeout knob") Signed-off-by: Eric Dumazet Cc: Haishuang Yan Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Wei Wang Link: https://lore.kernel.org/r/20210719092028.3016745-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 61b9387ee0f5..516e8a8d5f00 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1639,7 +1639,6 @@ struct tcp_fastopen_context { struct rcu_head rcu; }; -extern unsigned int sysctl_tcp_fastopen_blackhole_timeout; void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); From 9398a9b7b5fea099a48302a3ba6977cef91f406d Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Nov 2017 08:25:49 -0800 Subject: [PATCH 0006/1640] tcp: Namespace-ify sysctl_tcp_default_congestion_control Make the default TCP congestion control a per-namespace value. This changes the default congestion control to a pointer to congestion ops (rather than implicitly the first element of the available list). The congestion control setting of new namespaces is inherited from the current setting of the root namespace. Change-Id: Ic8b93798df8df6f60d3642392f1fcf32c41a85c8 Signed-off-by: Stephen Hemminger Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 6 +-- net/ipv4/fib_semantics.c | 4 +- net/ipv4/sysctl_net_ipv4.c | 19 ++++++---- net/ipv4/tcp_cong.c | 76 ++++++++++++++++++-------------------- net/ipv4/tcp_ipv4.c | 9 +++++ net/ipv6/route.c | 3 +- 7 files changed, 64 insertions(+), 54 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1d02c06cfc8f..7885f3feeec9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -135,6 +135,7 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + const struct tcp_congestion_ops __rcu *tcp_congestion_control; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; unsigned int sysctl_tcp_fastopen_blackhole_timeout; diff --git a/include/net/tcp.h b/include/net/tcp.h index 516e8a8d5f00..4f539724d7c2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1085,8 +1085,8 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); -int tcp_set_default_congestion_control(const char *name); -void tcp_get_default_congestion_control(char *name); +int tcp_set_default_congestion_control(struct net *net, const char *name); +void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); @@ -1101,7 +1101,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else diff --git a/net/ipv4/fib_semantics.c 
b/net/ipv4/fib_semantics.c index bc233fdfae0f..b08ef1b24cb5 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -723,7 +723,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) bool ecn_ca = false; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { if (nla_len(nla) != sizeof(u32)) return false; @@ -1049,7 +1049,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) return -EINVAL; } else { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index af466f334107..885499ea3242 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -210,6 +210,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(ctl->data, struct net, + ipv4.tcp_congestion_control); char val[TCP_CA_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -217,11 +219,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, }; int ret; - tcp_get_default_congestion_control(val); + tcp_get_default_congestion_control(net, val); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = tcp_set_default_congestion_control(val); + ret = tcp_set_default_congestion_control(net, val); return ret; } @@ -516,12 +518,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "tcp_congestion_control", - .mode = 0644, - .maxlen = TCP_CA_NAME_MAX, - .proc_handler = proc_tcp_congestion_control, - }, { .procname = "tcp_workaround_signed_windows", .data = &sysctl_tcp_workaround_signed_windows, @@ -945,6 +941,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &one }, #endif + { + .procname = "tcp_congestion_control", + .data = &init_net.ipv4.tcp_congestion_control, + .mode = 0644, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = proc_tcp_congestion_control, + }, { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index b193dcebbf7e..00a7482b6fbd 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) } /* Must be called with rcu lock held */ -static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name) +static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, + const char *name) { - const struct tcp_congestion_ops *ca = tcp_ca_find(name); + struct tcp_congestion_ops *ca = tcp_ca_find(name); + #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); @@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; @@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) might_sleep(); rcu_read_lock(); - ca = __tcp_ca_find_autoload(name); + ca = 
tcp_ca_find_autoload(net, name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; @@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key); /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_congestion_ops *ca; + const struct tcp_congestion_ops *ca; rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (likely(try_module_get(ca->owner))) { - icsk->icsk_ca_ops = ca; - goto out; - } - /* Fallback to next available. The last really - * guaranteed fallback is Reno from this list. - */ - } -out: + ca = rcu_dereference(net->ipv4.tcp_congestion_control); + if (unlikely(!try_module_get(ca->owner))) + ca = &tcp_reno; + icsk->icsk_ca_ops = ca; rcu_read_unlock(); - memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else @@ -219,29 +216,27 @@ void tcp_cleanup_congestion_control(struct sock *sk) } /* Used by sysctl to change default congestion control */ -int tcp_set_default_congestion_control(const char *name) +int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; - int ret = -ENOENT; + const struct tcp_congestion_ops *prev; + int ret; - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); -#ifdef CONFIG_MODULES - if (!ca && capable(CAP_NET_ADMIN)) { - spin_unlock(&tcp_cong_list_lock); + rcu_read_lock(); + ca = tcp_ca_find_autoload(net, name); + if (!ca) { + ret = -ENOENT; + } else if (!try_module_get(ca->owner)) { + ret = -EBUSY; + } else { + prev = xchg(&net->ipv4.tcp_congestion_control, ca); + if (prev) + module_put(prev->owner); - request_module("tcp_%s", name); - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); - } -#endif - - if (ca) { - ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ - list_move(&ca->list, &tcp_cong_list); + ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } - spin_unlock(&tcp_cong_list_lock); + rcu_read_unlock(); return ret; } @@ -249,7 +244,8 @@ int tcp_set_default_congestion_control(const char *name) /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { - return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); + return tcp_set_default_congestion_control(&init_net, + CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); @@ -269,14 +265,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) } /* Get current default congestion control */ -void tcp_get_default_congestion_control(char *name) +void tcp_get_default_congestion_control(struct net *net, char *name) { - struct tcp_congestion_ops *ca; - /* We will always have reno... 
*/ - BUG_ON(list_empty(&tcp_cong_list)); + const struct tcp_congestion_ops *ca; rcu_read_lock(); - ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + ca = rcu_dereference(net->ipv4.tcp_congestion_control); strncpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } @@ -357,12 +351,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, if (!load) ca = tcp_ca_find(name); else - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(sock_net(sk), name); + /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } + if (!ca) { err = -ENOENT; } else if (!load) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d4fcca200dff..4bd8da8728bc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2490,6 +2490,8 @@ static void __net_exit tcp_sk_exit(struct net *net) { int cpu; + module_put(net->ipv4.tcp_congestion_control->owner); + for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); free_percpu(net->ipv4.tcp_sk); @@ -2559,6 +2561,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Reno is always built in */ + if (!net_eq(net, &init_net) && + try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; + else + net->ipv4.tcp_congestion_control = &tcp_reno; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 69288fc80a02..25a164b843f5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1821,6 +1821,7 @@ out: static int ip6_convert_metrics(struct mx6_config *mxc, const struct fib6_config *cfg) { + struct net *net = cfg->fc_nlinfo.nl_net; bool ecn_ca = false; struct nlattr *nla; int remaining; @@ -1846,7 +1847,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) goto err; } else { From dc63b2edc4e173a09a296611a3834e3ecde751b8 Mon Sep 17 00:00:00 2001 From: Jonathon Reinhart Date: Sat, 1 May 2021 04:28:22 -0400 Subject: [PATCH 0007/1640] net: Only allow init netns to set default tcp cong to a restricted algo tcp_set_default_congestion_control() is netns-safe in that it writes to &net->ipv4.tcp_congestion_control, but it also sets ca->flags |= TCP_CONG_NON_RESTRICTED which is not namespaced. This has the unintended side-effect of changing the global net.ipv4.tcp_allowed_congestion_control sysctl, despite the fact that it is read-only: 97684f0970f6 ("net: Make tcp_allowed_congestion_control readonly in non-init netns") Resolve this netns "leak" by only allowing the init netns to set the default algorithm to one that is restricted. This restriction could be removed if tcp_allowed_congestion_control were namespace-ified in the future. This bug was uncovered with https://github.com/JonathonReinhart/linux-netns-sysctl-verify Fixes: 6670e1524477 ("tcp: Namespace-ify sysctl_tcp_default_congestion_control") Signed-off-by: Jonathon Reinhart Signed-off-by: David S. 
Miller --- net/ipv4/tcp_cong.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 00a7482b6fbd..533f8d84d2f7 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -228,6 +228,10 @@ int tcp_set_default_congestion_control(struct net *net, const char *name) ret = -ENOENT; } else if (!try_module_get(ca->owner)) { ret = -EBUSY; + } else if (!net_eq(net, &init_net) && + !(ca->flags & TCP_CONG_NON_RESTRICTED)) { + /* Only init netns can set default to a restricted algorithm */ + ret = -EPERM; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) From 0856e9414757b0dc74348cacd1dee214be4b6e88 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 May 2024 11:43:53 +0000 Subject: [PATCH 0008/1640] BACKPORT: net: fix __dst_negative_advice() race __dst_negative_advice() does not enforce proper RCU rules when sk->dst_cache must be cleared, leading to possible UAF. RCU rules are that we must first clear sk->sk_dst_cache, then call dst_release(old_dst). Note that sk_dst_reset(sk) is implementing this protocol correctly, while __dst_negative_advice() uses the wrong order. Given that ip6_negative_advice() has special logic against RTF_CACHE, this means each of the three ->negative_advice() existing methods must perform the sk_dst_reset() themselves. Note the check against NULL dst is centralized in __dst_negative_advice(), there is no need to duplicate it in various callbacks. Many thanks to Clement Lecigne for tracking this issue. This old bug became visible after the blamed commit, using UDP sockets. Bug: 343727534 Fixes: a87cb3e48ee8 ("net: Facility to report route quality of connected sockets") Reported-by: Clement Lecigne Diagnosed-by: Clement Lecigne Signed-off-by: Eric Dumazet Cc: Tom Herbert Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240528114353.1794151-1-edumazet@google.com Signed-off-by: Jakub Kicinski (cherry picked from commit 92f1655aa2b2294d0b49925f3b875a634bd3b59e) [Lee: Trivial/unrelated conflict - no change to the patch] Signed-off-by: Lee Jones Change-Id: I293734dca1b81fcb712e1de294f51e96a405f7e4 [mkbestas: Non trivial backport to 4.14] Signed-off-by: Michael Bestas --- include/net/dst_ops.h | 2 +- include/net/sock.h | 13 +++---------- net/ipv4/route.c | 22 ++++++++-------------- net/ipv6/route.c | 25 +++++++++++++------------ net/xfrm/xfrm_policy.c | 11 +++-------- 5 files changed, 28 insertions(+), 45 deletions(-) diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 632086b2f644..3ae2fda29507 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -24,7 +24,7 @@ struct dst_ops { void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, struct net_device *dev, int how); - struct dst_entry * (*negative_advice)(struct dst_entry *); + void (*negative_advice)(struct sock *sk, struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, diff --git a/include/net/sock.h b/include/net/sock.h index f35411bab1a7..9f58e77fe19a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1841,19 +1841,12 @@ sk_dst_get(struct sock *sk) static inline void dst_negative_advice(struct sock *sk) { - struct dst_entry *ndst, *dst = __sk_dst_get(sk); + struct dst_entry *dst = __sk_dst_get(sk); sk_rethink_txhash(sk); - if (dst && dst->ops->negative_advice) { - ndst = dst->ops->negative_advice(dst); - - if (ndst != dst) { - rcu_assign_pointer(sk->sk_dst_cache, ndst); - sk_tx_queue_clear(sk); 
- WRITE_ONCE(sk->sk_dst_pending_confirm, 0); - } - } + if (dst && dst->ops->negative_advice) + dst->ops->negative_advice(sk, dst); } static inline void diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2ce7fbec55ea..c4d628d72b90 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -141,7 +141,8 @@ static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); static unsigned int ipv4_mtu(const struct dst_entry *dst); -static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); +static void ipv4_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, @@ -863,22 +864,15 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf __ip_do_redirect(rt, skb, &fl4, true); } -static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) +static void ipv4_negative_advice(struct sock *sk, + struct dst_entry *dst) { struct rtable *rt = (struct rtable *)dst; - struct dst_entry *ret = dst; - if (rt) { - if (dst->obsolete > 0) { - ip_rt_put(rt); - ret = NULL; - } else if ((rt->rt_flags & RTCF_REDIRECTED) || - rt->dst.expires) { - ip_rt_put(rt); - ret = NULL; - } - } - return ret; + if ((dst->obsolete > 0) || + (rt->rt_flags & RTCF_REDIRECTED) || + rt->dst.expires) + sk_dst_reset(sk); } /* diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 25a164b843f5..c919fe02f09d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -81,7 +81,8 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); static unsigned int ip6_mtu(const struct dst_entry *dst); -static struct dst_entry *ip6_negative_advice(struct dst_entry *); +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); @@ -1415,22 +1416,22 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) return rt6_check(rt, cookie); } -static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *) dst; - if (rt) { - if (rt->rt6i_flags & RTF_CACHE) { - if (rt6_check_expired(rt)) { - ip6_del_rt(rt); - dst = NULL; - } - } else { - dst_release(dst); - dst = NULL; + if (rt->rt6i_flags & RTF_CACHE) { + if (rt6_check_expired(rt)) { + /* counteract the dst_release() in sk_dst_reset() */ + dst_hold(dst); + sk_dst_reset(sk); + + ip6_del_rt(rt); } + return; } - return dst; + sk_dst_reset(sk); } static void ip6_link_failure(struct sk_buff *skb) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e61e32918f7b..4ab3d5484839 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2552,15 +2552,10 @@ static void xfrm_link_failure(struct sk_buff *skb) /* Impossible. Such dst must be popped before reaches point of failure. 
*/ } -static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) +static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst) { - if (dst) { - if (dst->obsolete) { - dst_release(dst); - dst = NULL; - } - } - return dst; + if (dst->obsolete) + sk_dst_reset(sk); } static void xfrm_init_pmtu(struct dst_entry *dst) From f619e50aad41ff55aaae94aea2887f8a7eb4ba05 Mon Sep 17 00:00:00 2001 From: Samuel Pascua Date: Wed, 17 Sep 2025 20:51:25 +0800 Subject: [PATCH 0009/1640] Revert "bpf: add skb->tstamp r/w access from tc clsact and cg skb progs" This reverts commit a1cee8c2b7791e91c1a55c1e28aceaaf8a2957d7. --- include/uapi/linux/bpf.h | 1 - net/core/filter.c | 24 ----------------- tools/include/uapi/linux/bpf.h | 1 - tools/testing/selftests/bpf/test_verifier.c | 29 --------------------- 4 files changed, 55 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 95c8350cf4fe..9e6816912da1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1200,7 +1200,6 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; __bpf_md_ptr(struct bpf_sock *, sk); - __u64 tstamp; }; struct bpf_tunnel_key { diff --git a/net/core/filter.c b/net/core/filter.c index 3f47792532be..2796a7ca032b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4166,9 +4166,6 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (type == BPF_WRITE || size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; - case bpf_ctx_range(struct __sk_buff, tstamp): - if (size != sizeof(__u64)) - return false; break; default: /* Only narrow read access allowed for now. */ @@ -4197,7 +4194,6 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): - case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -4223,7 +4219,6 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -4402,7 +4397,6 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): - case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, queue_mapping): break; default: @@ -4607,7 +4601,6 @@ static bool sk_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -4689,7 +4682,6 @@ static bool flow_dissector_is_valid_access(int off, int size, break; case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): - case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -5012,22 +5004,6 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); break; - - case offsetof(struct __sk_buff, tstamp): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); - - if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_DW, - si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - tstamp, 8, - target_size)); 
- else - *insn++ = BPF_LDX_MEM(BPF_DW, - si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - tstamp, 8, - target_size)); } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 63ed136b872e..a5030c61347f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -799,7 +799,6 @@ struct __sk_buff { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ - __u64 tstamp; }; struct bpf_tunnel_key { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 6fee1aad274d..f2e9b37a4463 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1805,10 +1805,6 @@ static struct bpf_test tests[] = { offsetof(struct __sk_buff, tc_index)), BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, cb[3])), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, tstamp)), - BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, tstamp)), BPF_EXIT_INSN(), }, .errstr_unpriv = "", @@ -3994,31 +3990,6 @@ static struct bpf_test tests[] = { .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, - { - "write tstamp from CGROUP_SKB", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, tstamp)), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }, - .result = ACCEPT, - .result_unpriv = REJECT, - .errstr_unpriv = "invalid bpf_context access off=152 size=8", - .prog_type = BPF_PROG_TYPE_CGROUP_SKB, - }, - { - "read tstamp from CGROUP_SKB", - .insns = { - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, tstamp)), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }, - .result = ACCEPT, - .prog_type = BPF_PROG_TYPE_CGROUP_SKB, - }, { "multiple registers share map_lookup_elem result", .insns = { From 26eec353a55729398d821eeba501a23dab0f1238 Mon Sep 17 00:00:00 2001 From: Samuel Pascua Date: Wed, 17 Sep 2025 22:05:09 +0800 Subject: [PATCH 0010/1640] Revert "bpf: add skb->queue_mapping write access from tc clsact" This reverts commit b1c03fa86be1c7fadedfa3bd4e035db481eae96a. 
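For context, a plain-C sketch of the guarded store this revert removes: the reverted rewrite translated writes to __sk_buff.queue_mapping into a bounds check plus a 16-bit store, so out-of-range values were silently dropped. NO_QUEUE_MAPPING is assumed here to be USHRT_MAX, matching upstream include/linux/skbuff.h; this models the behavior, it is not the kernel code.

/* Model of the BPF_JGE guard the rewrite emitted before the 16-bit store. */
#include <limits.h>
#include <stdio.h>

#define NO_QUEUE_MAPPING USHRT_MAX	/* assumed sentinel value */

static void set_queue_mapping_checked(unsigned short *queue_mapping,
				      unsigned int val)
{
	if (val >= NO_QUEUE_MAPPING)
		return;			/* mirrors the jump-over-store guard */
	*queue_mapping = (unsigned short)val;
}

int main(void)
{
	unsigned short qm = 0;

	set_queue_mapping_checked(&qm, 3);		/* stored */
	set_queue_mapping_checked(&qm, 70000);		/* skipped */
	printf("queue_mapping=%u\n", qm);		/* prints 3 */
	return 0;
}
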
--- net/core/filter.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 2796a7ca032b..133e8827eff6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4397,7 +4397,6 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): - case bpf_ctx_range(struct __sk_buff, queue_mapping): break; default: return false; @@ -4770,18 +4769,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, queue_mapping): - if (type == BPF_WRITE) { - *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); - *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - queue_mapping, - 2, target_size)); - } else { - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - queue_mapping, - 2, target_size)); - } + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, queue_mapping, 2, + target_size)); break; case offsetof(struct __sk_buff, vlan_present): From d76efd57ff12cf3498a190d996981efc33800489 Mon Sep 17 00:00:00 2001 From: Samuel Pascua Date: Thu, 2 Oct 2025 08:59:00 +0800 Subject: [PATCH 0011/1640] Revert "bpf: add __weak hook for allocating executable memory" This reverts commit d37413bbfd5d1a69a80355386c54603b5ae3e44f. --- kernel/bpf/core.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b312af3dcd35..ec9fb28e863e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -703,16 +703,6 @@ static void bpf_jit_uncharge_modmem(u32 pages) atomic_long_sub(pages, &bpf_jit_current); } -void *__weak bpf_jit_alloc_exec(unsigned long size) -{ - return module_alloc(size); -} - -void __weak bpf_jit_free_exec(void *addr) -{ - module_memfree(addr); -} - #if IS_ENABLED(CONFIG_BPF_JIT) && IS_ENABLED(CONFIG_CFI_CLANG) bool __weak arch_bpf_jit_check_func(const struct bpf_prog *prog) { @@ -738,7 +728,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, if (bpf_jit_charge_modmem(pages)) return NULL; - hdr = bpf_jit_alloc_exec(size); + hdr = module_alloc(size); if (!hdr) { bpf_jit_uncharge_modmem(pages); return NULL; @@ -767,7 +757,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) #ifdef CONFIG_RKP_MODULE_SUPPORT uh_call(UH_APP_RKP, RKP_BFP_LOAD, (u64)hdr, (u64)(hdr->pages * PAGE_SIZE), RKP_BPF_JIT_FREE, 0); #endif - bpf_jit_free_exec(hdr); + module_memfree(hdr); bpf_jit_uncharge_modmem(pages); } From 7b5f8b6f153e94a28a601fa15b23ce5732a3c7f9 Mon Sep 17 00:00:00 2001 From: Samuel Pascua Date: Thu, 18 Sep 2025 15:43:03 +0800 Subject: [PATCH 0012/1640] net: ipv6: remove rmnet logging Change-Id: I31049e42ebb5a6548c94d50b4cc02062d37a46dd Signed-off-by: Samuel Pascua --- include/net/ip6_fib.h | 3 --- net/ipv6/ip6_fib.c | 5 ----- 2 files changed, 8 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 63ffb47b0e52..ea4354cc767b 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -229,9 +229,6 @@ static inline void rt6_hold(struct rt6_info *rt) static inline void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { - if (strstr(rt->dst.dev->name, "rmnet_data")) - net_log("rt6_release(): %s : Prefix: %pI6/%u, GW: %pI6, prot: %u\n", - rt->dst.dev->name, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, 
&rt->rt6i_gateway, rt->rt6i_protocol); rt6_free_pcpu(rt); dst_dev_put(&rt->dst); dst_release(&rt->dst); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index aae8dab8b031..8400b64ec169 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -38,7 +38,6 @@ #include #include -#include #define RT6_DEBUG 2 #if RT6_DEBUG >= 3 @@ -1218,10 +1217,6 @@ out: #endif goto failure; } - if (!err && strstr(rt->dst.dev->name, "rmnet_data")) - net_log("fib6_add(): %s : Prefix: %pI6/%u, GW: %pI6, table: %u, proto: %u\n", - rt->dst.dev->name, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, &rt->rt6i_gateway, - rt->rt6i_table->tb6_id, rt->rt6i_protocol); return err; failure: From 393ad533950ed6eb15ba8c8a7fa66265f0149408 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Mon, 29 Jul 2019 09:59:14 -0700 Subject: [PATCH 0013/1640] tcp: add skb-less helpers to retrieve SYN cookie This patch allows generation of a SYN cookie before an SKB has been allocated, as is the case at XDP. Signed-off-by: Petar Penkov Reviewed-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 10 ++++++ net/ipv4/tcp_input.c | 73 ++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 15 +++++++++ net/ipv6/tcp_ipv6.c | 15 +++++++++ 4 files changed, 113 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index 4f539724d7c2..265d6c48be9f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -457,6 +457,16 @@ void tcp_parse_options(const struct net *net, const struct sk_buff *skb, int estab, struct tcp_fastopen_cookie *foc); const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); +/* + * BPF SKB-less helpers + */ +u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, + struct tcphdr *th, u32 *cookie); +u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, + struct tcphdr *th, u32 *cookie); +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct tcphdr *th); /* * TCP v4 functions exported for the inet6 API */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9b0878780430..cf171f89d4a2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3818,6 +3818,49 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, foc->exp = exp_opt; } +/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped + * value on success. + */ +static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) +{ + const unsigned char *ptr = (const unsigned char *)(th + 1); + int length = (th->doff * 4) - sizeof(struct tcphdr); + u16 mss = 0; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return mss; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + if (length < 2) + return mss; + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return mss; + if (opsize > length) + return mss; /* fail on partial options */ + if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { + u16 in_mss = get_unaligned_be16(ptr); + + if (in_mss) { + if (user_mss && user_mss < in_mss) + in_mss = user_mss; + mss = in_mss; + } + } + ptr += opsize - 2; + length -= opsize; + } + } + return mss; +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. 
@@ -6403,6 +6446,36 @@ static void tcp_reqsk_record_syn(const struct sock *sk, } } +/* If a SYN cookie is required and supported, returns a clamped MSS value to be + * used for SYN cookie generation. + */ +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct tcphdr *th) +{ + struct tcp_sock *tp = tcp_sk(sk); + u16 mss; + + if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 && + !inet_csk_reqsk_queue_is_full(sk)) + return 0; + + if (!tcp_syn_flood_action(sk, rsk_ops->slab_name)) + return 0; + + if (sk_acceptq_is_full(sk)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + return 0; + } + + mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); + if (!mss) + mss = af_ops->mss_clamp; + + return mss; +} +EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4bd8da8728bc..b4908f96aa84 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1477,6 +1477,21 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) return sk; } +u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, + struct tcphdr *th, u32 *cookie) +{ + u16 mss = 0; +#ifdef CONFIG_SYN_COOKIES + mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, + &tcp_request_sock_ipv4_ops, sk, th); + if (mss) { + *cookie = __cookie_v4_init_sequence(iph, th, &mss); + tcp_synq_overflow(sk); + } +#endif + return mss; +} + /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e2f90a5cbd5e..4b9d5e509075 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1041,6 +1041,21 @@ static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) return sk; } +u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, + struct tcphdr *th, u32 *cookie) +{ + u16 mss = 0; +#ifdef CONFIG_SYN_COOKIES + mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops, + &tcp_request_sock_ipv6_ops, sk, th); + if (mss) { + *cookie = __cookie_v6_init_sequence(iph, th, &mss); + tcp_synq_overflow(sk); + } +#endif + return mss; +} + static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) From f6522a4f9a85f378e8e5a6e6226c40b7b722faaf Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Mon, 29 Jul 2019 09:59:13 -0700 Subject: [PATCH 0014/1640] tcp: tcp_syn_flood_action read port from socket This allows us to call this function before an SKB has been allocated. 
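The port can be read from the socket because sk->sk_num holds the bound local port in host byte order; that is also why the ntohs() previously applied to tcp_hdr(skb)->dest disappears along with the skb argument. A trivial user-space sketch (illustrative only, not part of the patch) of that byte-order relationship:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t hdr_dest = htons(80); /* as tcp_hdr(skb)->dest carries it on the wire */
	uint16_t sk_num = 80;          /* as the listening socket stores its port */

	/* both print 80: the socket field needs no conversion */
	printf("%u %u\n", ntohs(hdr_dest), sk_num);
	return 0;
}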
Signed-off-by: Petar Penkov Reviewed-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- net/ipv4/tcp_input.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cf171f89d4a2..b0448d64529f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6402,9 +6402,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc); /* * Return true if a syncookie should be sent */ -static bool tcp_syn_flood_action(const struct sock *sk, - const struct sk_buff *skb, - const char *proto) +static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; @@ -6424,7 +6422,7 @@ static bool tcp_syn_flood_action(const struct sock *sk, net->ipv4.sysctl_tcp_syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", - proto, ntohs(tcp_hdr(skb)->dest), msg); + proto, sk->sk_num, msg); return want_cookie; } @@ -6497,7 +6495,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, */ if ((net->ipv4.sysctl_tcp_syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); + want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); if (!want_cookie) goto drop; } From 3996f04715ab780dca8d7ce5a074fdf0c0284970 Mon Sep 17 00:00:00 2001 From: Tim Zimmermann Date: Tue, 22 Jul 2025 10:45:19 +0200 Subject: [PATCH 0015/1640] Squashed revert of BPF backports Revert "Partially revert "fixup: add back code missed during BPF picking"" This reverts commit cc477455f73d317733850a9e4818dfd90be4d33d. Revert "bpf: lpm_trie: check left child of last leftmost node for NULL" This reverts commit e89007b7df49292c5ae52b3d165c0d815a61cd10. Revert "BACKPORT: bpf: Fix out-of-bounds write in trie_get_next_key()" This reverts commit a1c4f565bb00b05ab3734a64451c08b0b965ce42. Revert "bpf: Fix exact match conditions in trie_get_next_key()" This reverts commit 4356a64dad3d38372147457b3004930c6e2e9c51. Revert "bpf: fix kernel page fault in lpm map trie_get_next_key" This reverts commit df4649b5d6cb374edbb67e5a5ecbd102a2e6c897. Revert "bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map" This reverts commit fe6656a5d48df6144fe9929399c648957166edd0. Revert "bpf: allow helpers to return PTR_TO_SOCK_COMMON" This reverts commit b24d1ae9ccbf3ebe6f4baa50d2d48c03be02bc17. Revert "bpf: implement lookup-free direct value access for maps" This reverts commit de1959fcd3df0629380894d9c47ebb253c920ad1. Revert "bpf: Add bpf_verifier_vlog() and bpf_verifier_log_needed()" This reverts commit b777824607bd3eb8c9130f4639d97d15bcac9af5. Revert "bpf: Don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE" This reverts commit 4cfef728c1eac6cce34f4fff1fbab3e66dc430d9. Revert "bpf: always allocate at least 16 bytes for setsockopt hook" This reverts commit 59817f83c964c753e93a75128ecaad4eeaa769fc. Revert "bpf, sockmap: convert to generic sk_msg interface" This reverts commit fe4ef742e22924b21749de333211941d0205501e. Revert "bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb" This reverts commit d17c8c2c2f623e087d6c297de50c173a006e6e55. Revert "bpf: sockmap: fix typos" This reverts commit 07e31378d7795371cdbccce06b4125b27ffce536. Revert "sockmap: convert refcnt to an atomic refcnt" This reverts commit c1fa11ec9da5dc0e8cae4334c550264cff77eef9. 
Revert "bpf: sockmap, add hash map support" This reverts commit 3f43379c38e329e9a7d4b5a1640670de37ba317b. Revert "bpf: sockmap, refactor sockmap routines to work with hashmap" This reverts commit 41a2b6e925db031978eb2484835f60908de884d7. Revert "bpf: implement getsockopt and setsockopt hooks" This reverts commit 9526fe6ff3e06939c12bb781e0dda01a8f3017ec. Revert "bpf: Introduce bpf sk local storage" This reverts commit ffedc38a46ddaca40de672fafe78c45fbfae9839. Revert "bpf: introduce BPF_F_LOCK flag" This reverts commit e7f5758fbcb1674e17c645837f7bff3b1febbad5. Revert "bpf: Introduce ARG_PTR_TO_{INT,LONG} arg types" This reverts commit e29b4e3c2bdd3b5d0d34668836ae8e5115cb31af. Revert "bpf/verifier: add ARG_PTR_TO_UNINIT_MAP_VALUE" This reverts commit f25c66c27cd6a774fb73769d804f91e969dd5f7b. Revert "bpf: allow map helpers access to map values directly" This reverts commit 7af696635219d0c5cdf1a166bb7543cae9e50328. Revert "bpf: add writable context for raw tracepoints" This reverts commit a546d8f0433039cee0de6ce96d5d35c4033a7b98. Revert "bpf: Add struct bpf_tcp_sock and BPF_FUNC_tcp_sock" This reverts commit 03093478c52e79c94791a04f8138d5c019119087. Revert "bpf: Support socket lookup in CGROUP_SOCK_ADDR progs" This reverts commit 8047013945361fbff0e449c8a212cb6fc93a5245. Revert "bpf: Extend the sk_lookup() helper to XDP hookpoint." This reverts commit 8315368983086e70ccc6f103d710903c63cca7df. Revert "xdp: generic XDP handling of xdp_rxq_info" This reverts commit 11d9514e6e6801941abf1c0485fd4ef53082d970. Revert "xdp: move struct xdp_buff from filter.h to xdp.h" This reverts commit a1795f54e4d99e02d5cb84a46fac0240cf29e206. Revert "net: avoid including xdp.h in filter.h" This reverts commit a39c59398f3ab64de44e5953ee0bd23c5136bb48. Revert "xdp: base API for new XDP rx-queue info concept" This reverts commit 49fb5bae77ab2041a2ad9f9f87ad7e0a6e215fdf. Revert "net: Add asynchronous callbacks for xfrm on layer 2." This reverts commit d0656f64d7719993d5634a9fc6600026e9a805ee. Revert "xfrm: Separate ESP handling from segmentation for GRO packets." This reverts commit c8afadf7f5ed8786652d307558345ef90ea91726. Revert "net: move secpath_exist helper to sk_buff.h" This reverts commit 0e5483057121dad47567b01845c656955e51989e. Revert "sk_buff: add skb extension infrastructure" This reverts commit 3a9ae74b075757495c4becf4dd1eec056d364801. Revert "fixup: add back code missed during BPF picking" This reverts commit 74ec8cef7051b5af72f2a6d83ca8c51c3c61c444. Revert "bpf: undo prog rejection on read-only lock failure" This reverts commit af2dc6e4993c4221603dbe6e81a3d0c8269f3171. Revert "bpf: Add helper to retrieve socket in BPF" This reverts commit 53495e3bc33cb46d9961ea122f576faded058aa1. Revert "SQUASH! bpf: Add a bpf_sock pointer to __sk_buff and a bpf_sk_fullsock helpe" This reverts commit 3b25fbf81c041af954d9f5ac1c7867eb07c40b07. Revert "bpf: introduce bpf_spin_lock" This reverts commit 0095fb54160e4f8b326fa8df103e334f90c5ab56. Revert "bpf: enable cgroup local storage map pretty print with kind_flag" This reverts commit 3fe92cb79b5eae557b113c37b03e78efee2280db. Revert "bpf: btf: fix struct/union/fwd types with kind_flag" This reverts commit 2bd4856277f459974dd6234a849cbe20fd475b8f. Revert "bpf: add bpffs pretty print for cgroup local storage maps" This reverts commit e07d8c8279f37cee8471846a63acc51f1ab7ce03. Revert "bpf: pass struct btf pointer to the map_check_btf() callback" This reverts commit 78a8140faf32710799c19495db28d71693c98030. 
Revert "bpf: Define cgroup_bpf_enabled for CONFIG_CGROUP_BPF=n" This reverts commit aada945d89950c67099e490af1c4c25eef7f31e6. Revert "bpf: introduce per-cpu cgroup local storage" This reverts commit d37432968663559f06c7fd7df44197a807fb84ca. Revert "bpf: btf: Rename btf_key_id and btf_value_id in bpf_map_info" This reverts commit 063c5a25e5f47e8b82b6c43a44ed7be851884abb. Revert "bpf: fix a compilation error when CONFIG_BPF_SYSCALL is not defined" This reverts commit bcf5bfaf50bb6f1f981d5c538f87e6da7aab78f2. Revert "bpf: Create a new btf_name_by_offset() for non type name use case" This reverts commit 52b4739d0bdd763e1b00feb50bef8a821f5c7570. Revert "bpf: reject any prog that failed read-only lock" This reverts commit 30d1bfec06a3bcaa773213113904580e3046a57a. Revert "bpf: Add bpf_line_info support" This reverts commit 50b094eeeb1ced32c62b3a10045bbf43126de760. Revert "bpf: don't leave partial mangled prog in jit_subprogs error path" This reverts commit a466f85be89f5daab4bd748f92915ea713d63934. Revert "bpf: btf: support proper non-jit func info" This reverts commit 492a556de94c502376ec3b0d5a724ec9fe9f6996. Revert "bpf: Introduce bpf_func_info" This reverts commit 39cade88686b0d9b7befc1f14e9d2c2cad19a769. Revert "bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO" This reverts commit 2010b6bacc271a48e74942506f3cf45268b6c264. Revert "bpf: fix bpf_prog_get_info_by_fd to return 0 func_lens for unpriv" This reverts commit a0ea14ac88a0f5529a635fc6e20277942fc6bb99. Revert "bpf: Expose check_uarg_tail_zero()" This reverts commit 1190aaae686534c2854838b3d642dac45d26b1f4. Revert "bpf: Append prog->aux->name in bpf_get_prog_name()" This reverts commit 8b82528df4a11a8501393c854978662fc218014e. Revert "bpf: get JITed image lengths of functions via syscall" This reverts commit 0722dbc626915fcb9acb952ebc1fcb0c4554cb07. Revert "bpf: get kernel symbol addresses via syscall" This reverts commit 6736ec7558dd262fef6669eec02a9797c7c4ecb7. Revert "bpf: Add gpl_compatible flag to struct bpf_prog_info" This reverts commit b60c7a51fd3692259c93413f3e87150078be1dac. Revert "bpf: centre subprog information fields" This reverts commit b5186fdf6f3e1bb38d7e4abfed5bf7dd6f85a6c3. Revert "bpf: unify main prog and subprog" This reverts commit e8e2ad5d9ae98bc7b85b99c0712a5dfbfc151a41. Revert "bpf: fix maximum stack depth tracking logic" This reverts commit 10c7127615dc2c00b724069a1620b2232d905113. Revert "bpf, x64: fix memleak when not converging on calls" This reverts commit 6bc867f718ef2656266f984b605151971026cc98. Revert "bpf: decouple btf from seq bpf fs dump and enable more maps" This reverts commit 3036e2c4384d3f43c695b88c8a1cf97b8337e3bd. Revert "bpf: Add reference tracking to verifier" This reverts commit 3a4900a188ac4de817dc6f114f01159d7bdd2f3e. Revert "bpf: properly enforce index mask to prevent out-of-bounds speculation" This reverts commit ef85925d5c07b46f7447487605da601fc7be026e. Revert "bpf, verifier: detect misconfigured mem, size argument pair" This reverts commit c3853ee3cb96833e907f18bf90e78040fe4cf06f. Revert "bpf: introduce ARG_PTR_TO_MEM_OR_NULL" This reverts commit 58560e13f545f2a079bbce17ac1b731d8b94fec7. Revert "bpf: Macrofy stack state copy" This reverts commit 88d98d8c2ae320ab248150eb86e1c89427e5017c. Revert "bpf: Generalize ptr_or_null regs check" This reverts commit d2cbc2e57b8624699a1548e67b7b3ce992b396fc. Revert "bpf: Add iterator for spilled registers" This reverts commit d956e1ba51a7e5ce86bb35002e26d4c1e0a2497c. 
Revert "bpf/verifier: refine retval R0 state for bpf_get_stack helper" This reverts commit ceaf6d678ccb60da107b0455da64c7bf90c5102d. Revert "bpf: Remove struct bpf_verifier_env argument from print_bpf_insn" This reverts commit 058fd54c07a289f9b506f2d2326434e411fa65fe. Revert "bpf: annotate bpf_insn_print_t with __printf" This reverts commit 9b07d2ccf07855d62446e274d817672713f15be4. Revert "bpf: allow for correlation of maps and helpers in dump" This reverts commit af690c2e2d177352f7270f77d8a6bc9e9f60c98c. Revert "bpf: Add bpf_patch_call_args prototype to include/linux/bpf.h" This reverts commit 8a2c588b3ab98916147fe4a449312ce8db70c471. Revert "bpf: x64: add JIT support for multi-function programs" This reverts commit 752f261e545f80942272c6becf82def1729f84be. Revert "bpf: fix net.core.bpf_jit_enable race" This reverts commit 4720901114c20204aa3ffa2076265d2c8cc9e81b. Revert "bpf: add support for bpf_call to interpreter" This reverts commit c79b2e547adc8e50dabc72244370cfd37ac6a6bd. Revert "bpf: introduce function calls (verification)" This reverts commit f779fda96c7d9e921525f48d67fa2e9c68b4bd48. Revert "bpf: cleanup register_is_null()" This reverts commit 1c81f751670b4feb3102e4de136e25fa24e303fe. Revert "bpf: print liveness info to verifier log" This reverts commit fdc851301b33b9d646bd1d37124cbd45cedcd62b. Revert "bpf: also improve pattern matches for meta access" This reverts commit 9aa150d07927b911f26e0db2af0efd6aa07b8707. Revert "bpf: add meta pointer for direct access" This reverts commit 94f3f502ef9ef150ed687113cfbd38e91b5edc44. Revert "bpf: rename bpf_compute_data_end into bpf_compute_data_pointers" This reverts commit 9573c6feb301346cd1493eea4e363c6d8345e899. Revert "bpf: squash of log related commits" This reverts commit b08f2111e030a72a92eec4ebd6201165d03a20b8. Revert "bpf: move instruction printing into a separate file" This reverts commit 8fcbd39afb58847914f3f84d9c076000e09d2fb9. Revert "bpf: btf: Introduce BTF ID" This reverts commit 423c40d67dfc783c3b0cb227d9da53e725e0f35c. Revert "bpf: btf: Add pretty print support to the basic arraymap" This reverts commit 6cd4d5bba662ca0d8980e5806ef37e0341eab929. Revert "nsfs: clean-up ns_get_path() signature to return int" This reverts commit ec1ce41701f411c5dee396cec2931fb651f447cc. Revert "bpf_obj_do_pin(): switch to vfs_mkobj(), quit abusing ->mknod()" This reverts commit 8fbcb4ebf5a751f4685cdd2757cff2264032a5d9. Revert "bpf: offload: report device information about offloaded maps" This reverts commit 1105e63f25a9db675671288b583a5ce2c7d10b1f. Revert "bpf: offload: add map offload infrastructure" This reverts commit 20cdf9df3d5bd010d799ea3c80219f625c998307. Revert "bpf: add map_alloc_check callback" This reverts commit 6feb4121ea083053ac9587ac426195efe9fb143d. Revert "bpf: offload: factor out netdev checking at allocation time" This reverts commit 1425fb5676b8fe9d761f2f6545e4be8880ce0ac8. Revert "bpf: rename bpf_dev_offload -> bpf_prog_offload" This reverts commit a03ae0ec508200433fd6c35b87e342df4de0b320. Revert "bpf: offload: allow netdev to disappear while verifier is running" This reverts commit f6cf7214fd1ff3a018009ba90c33eac1d8de21de. Revert "bpf: offload: free program id when device disappears" This reverts commit b12b5e56b799cfe900ab8f0ee4177c6c08a904c6. Revert "bpf: offload: report device information for offloaded programs" This reverts commit c73c9a0ffa332eeb49927a48780f5537597e2d42. Revert "bpf: offload: don't require rtnl for dev list manipulation" This reverts commit 1993f08662f07581a370899a2da209ba0c996dbb. 
Revert "bpf: offload: ignore namespace moves" This reverts commit 9fefb21d8aa2691019f9c4f0b8025fb45ba60b49. Revert "bpf: Add PTR_TO_SOCKET verifier type" This reverts commit 55fdbc844801cd4007237fa6c5842b46985a5c9a. Revert "bpf: extend cgroup bpf core to allow multiple cgroup storage types" This reverts commit a6d82e371ef32fb24d493cff32765b4607581dd4. Revert "bpf: permit CGROUP_DEVICE programs accessing helper bpf_get_current_cgroup_id()" This reverts commit 1bfd0a07a8317004a89d6de736e24861db8281b5. Revert "bpf: implement bpf_get_current_cgroup_id() helper" This reverts commit 23603ed6d7df86392701a7ea7d9a1dba66f28d4b. Revert "bpf: introduce the bpf_get_local_storage() helper function" This reverts commit 3d777256b1c9f34975c5230d836023ea3e0d4cfd. Revert "bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE" This reverts commit 93c12733dc97984f7bf57a77160eacc480bfc3de. Revert "bpf: extend bpf_prog_array to store pointers to the cgroup storage" This reverts commit b26baff1fb34607938c9ac0e421e3f4b5fedad4d. Revert "BACKPORT: bpf: allocate cgroup storage entries on attaching bpf programs" This reverts commit 804605c21a3be3277c0031504dcd3fdd1be64290. Revert "bpf: include errno.h from bpf-cgroup.h" This reverts commit 6b4df332b357e9a5942ca4c6f985cd33dfc30e25. Revert "bpf: pass a pointer to a cgroup storage using pcpu variable" This reverts commit c8af92dc9fc00e49f06f6997969284ef5e5c5af5. Revert "bpf: introduce cgroup storage maps" This reverts commit c61c2271cb8a1e47678bddc8cdfae83035a07fec. Revert "bpf: add ability to charge bpf maps memory dynamically" This reverts commit 3a430745e9f675b450477fffead5568046432f29. Revert "bpf: add helper for copying attrs to struct bpf_map" This reverts commit 6d7be0ae93371692e564c00003ce184cbaefbb8d. Revert "bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP" This reverts commit 15f584d2d3d4814cfbd3059ab810db02af8773a0. Revert "bpf/tracing: fix a deadlock in perf_event_detach_bpf_prog" This reverts commit fc9bf5e48985f7c3a39bf34a27477a2607a5dc6d. Revert "bpf: set maximum number of attached progs to 64 for a single perf tp" This reverts commit 0d5fc9795d824fbca21b81c8d91748ba21313d4c. Revert "bpf: avoid rcu_dereference inside bpf_event_mutex lock region" This reverts commit 948e200e3173dd959de907e326f2a2c90eda4b28. Revert "bpf: fix bpf_prog_array_copy_to_user() issues" This reverts commit 66811698b8de9b3cf13c09730d287b6d1d5d3699. Revert "bpf: fix pointer offsets in context for 32 bit" This reverts commit 99661813c136c52e56b328a2a8ecd2bc0e187eba. Revert "BACKPORT: bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data" This reverts commit 36f0ea00dd121b13f80617e5b2eb93ba160df85a. Revert "BACKPORT: bpf: Sysctl hook" This reverts commit 4a543990e03b5de4a2c23777abd0f77afd61cc2d. Revert "BACKPORT: flow_dissector: implements flow dissector BPF hook" This reverts commit de610a8a4324170a0deaf12e2e64c2ff068785fb. Revert "BACKPORT: bpf: Add base proto function for cgroup-bpf programs" This reverts commit f3ac0a6cbec3472ff2e3808a436891881f3cbf87. Revert "FROMLIST: [net-next,v2,1/2] bpf: Allow CGROUP_SKB eBPF program to access sk_buff" This reverts commit 6d4dcc0e3de628003d91075e4b1ab1a128b8892e. Revert "BACKPORT: bpf: introduce BPF_RAW_TRACEPOINT" This reverts commit b2a5c6b4958c8250e58ddb6c334018a5f7ee5437. Revert "bpf/tracing: fix kernel/events/core.c compilation error" This reverts commit 70249d4eb7359e9dc59e044951beb99d0d8725cd. 
Revert "BACKPORT: bpf/tracing: allow user space to query prog array on the same tp" This reverts commit 08a6d8c01372940bfec78fdc6cb8a47e08c745b0. Revert "bpf: sockmap, add sock close() hook to remove socks" This reverts commit e6b363b8d09d9740dff309fb4dc88e7a1e90726b. Revert "BACKPORT: bpf: remove the verifier ops from program structure" This reverts commit 94c2f61efa741bf6a97415f42cfbfb9ec83dfd8e. Revert "bpf, cgroup: implement eBPF-based device controller for cgroup v2" This reverts commit 22faa9c56550a34488e607ca3aca59c68b1f7938. Revert "BACKPORT: bpf: split verifier and program ops" This reverts commit d2b1388504c1129d5756bb9b20af9bd64e75d015. Revert "bpf: btf: Break up btf_type_is_void()" This reverts commit 052989c47b68feaf381d371ec1e6a169edc26d30. Revert "bpf: btf: refactor btf_int_bits_seq_show()" This reverts commit 8cc3fb30656cfab91205194a8ee7661bdd95e005. Revert "BACKPORT: bpf: fix unconnected udp hooks" This reverts commit b108e725aa70e39cfd37296d1a1d31e8896fa7b7. Revert "BACKPORT: bpf: enforce return code for cgroup-bpf programs" This reverts commit 10215080915bfbdaa9f666a95ffda02cc1ef7a29. Revert "bpf: Hooks for sys_sendmsg" This reverts commit cd847db1be8a37e0e7e9c813b5d8f93697dc5af0. Revert "BACKPORT: devmap: Allow map lookups from eBPF" This reverts commit 37da95fde647e8967b362e0769136bfbebc03628. Revert "BACKPORT: xdp: Add devmap_hash map type for looking up devices by hashed index" This reverts commit ae6a87f44c4ef20ac290ce68c4d5b542cf46f3d7. Revert "kernel: bpf: devmap: Create __dev_map_alloc_node" This reverts commit 15928a97ed93cf9f606a21bf869ff421b997a2c5. Revert "BACKPORT: bpf: Post-hooks for sys_bind" This reverts commit c221d44e76c3ab69285c9986680e5eb726cf157b. Revert "BACKPORT: bpf: Hooks for sys_connect" This reverts commit 003311ea43163c77e4e0c1921b81438286925baa. Revert "BACKPORT: net: Introduce __inet_bind() and __inet6_bind" This reverts commit 74f1eb60012c13bd606e4dc718e63aec7f8cce8f. Revert "BACKPORT: bpf: Hooks for sys_bind" This reverts commit cef0bd97f2fec8363c3ef58b2cb508deaa9bc5b2. Revert "BACKPORT: bpf: introduce BPF_PROG_QUERY command" This reverts commit a4ef81ce48cb25843ddb4d4331dacf2742215909. Revert "BACKPORT: bpf: Check attach type at prog load time" This reverts commit 750a3f976c75797e572a6dfdd2e8865b8b49964a. Revert "bpf: offload: rename the ifindex field" This reverts commit 921e6becfb28fbe505603bf927f195d1d72a0eea. Revert "BACKPORT: bpf: offload: add infrastructure for loading programs for a specific netdev" This reverts commit cb1607a58d026a4ac1d9e71f6c3cd1dc23820e2f. Revert "BACKPORT: net: bpf: rename ndo_xdp to ndo_bpf" This reverts commit 932d47ebc5910bb1ec954002206b1ce8749a9cd6. Revert "bpf: btf: fix truncated last_member_type_id in btf_struct_resolve" This reverts commit e7af669fe00a8e2030913088836189a9f65a04d8. Revert "bpf/btf: Fix BTF verification of enum members in struct/union" This reverts commit a098516b98fe35e8f0e89709443fff8b37eb04b8. Revert "bpf: fix BTF limits" This reverts commit 794ad07fab9540989f96351c11b039e2229c2a8e. Revert "bpf, btf: fix a missing check bug in btf_parse" This reverts commit 27c4178ecc8edbb2306fa479f275ffd35f5b57c9. Revert "bpf: btf: Fix a missing check bug" This reverts commit 71f5a7d140aa5a37d164e217b2fefcb2d409b894. Revert "bpf: btf: Fix end boundary calculation for type section" This reverts commit 549615befd671b6877677acb009b66cd374408d3. Revert "bpf: fix bpf_skb_load_bytes_relative pkt length check" This reverts commit 5f3d68c4da18dfbcde4c02cb34c63599709fcf3c. 
Revert "bpf: btf: Ensure the member->offset is in the right order" This reverts commit 4f9d26cbc747a4728c4944b7dc9725fc2737f892. Revert "bpf: btf: Clean up BTF_INT_BITS() in uapi btf.h" This reverts commit 480c6f80a14431f6d680a687363dcb0d9cd1d7a8. Revert "bpf: btf: Fix bitfield extraction for big endian" This reverts commit 0463c259aa21e99d1bf798c8cf54da18b5906938. Revert "bpf: btf: Ensure t->type == 0 for BTF_KIND_FWD" This reverts commit ecc54be6970a3484eb163ac09996856c9ece5727. Revert "bpf: btf: Check array t->size" This reverts commit 3cda848b9be9fbb6dfa8912a425801c263bcbff7. Revert "bpf: btf: avoid -Wreturn-type warning" This reverts commit fd7fede5952004dcacb39f318249c4cf8e5c51e0. Revert "bpf: btf: Avoid variable length array" This reverts commit 2826641eb171c705d0b2db86d8834eff33945d0e. Revert "bpf: btf: Remove unused bits from uapi/linux/btf.h" This reverts commit 2d9e7a574f7e47a027974ec616ac812ad6a2d086. Revert "bpf: btf: Check array->index_type" This reverts commit f9ee68f7e8a471450536a70b43bd96d4bdfbfb81. Revert "bpf: btf: Change how section is supported in btf_header" This reverts commit 63a4474da4bf56c8a700d542bcf3a57a4b737ed6. Revert "bpf: Fix compiler warning on info.map_ids for 32bit platform" This reverts commit a4f706ea7d2b874ef739168a12a30ae5454487a6. Revert "BACKPORT: bpf: Use char in prog and map name" This reverts commit 8d4ad88eabb5d1500814c5f5b76a11f80346669c. Revert "bpf: Change bpf_obj_name_cpy() to better ensure map's name is init by 0" This reverts commit c4acfd3c9f5a97123c240676750f3e4ae2a2c24c. Revert "BACKPORT: bpf: Add map_name to bpf_map_info" This reverts commit 0e03a4e584eabe3f4c448f06f271753cdaae3aab. Revert "BACKPORT: bpf: Add name, load_time, uid and map_ids to bpf_prog_info" This reverts commit 16872f60e6c1fc6b10e905ff18c14d8aaeb4e09d. Revert "bpf: btf: Avoid WARN_ON when CONFIG_REFCOUNT_FULL=y" This reverts commit 0b618ec6e162e650aaa583a31f4de4c4558148bf. Revert "BACKPORT: bpf: btf: Clean up btf.h in uapi" This reverts commit ea0c0ad08c18ddf62dbb6c8edc814c75cbb3e8b9. Revert "bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd" This reverts commit f51fe1d1edb742176c622bc93301e98a1cbf2e63. Revert "BACKPORT: bpf: btf: Add BPF_BTF_LOAD command" This reverts commit 85db8f764069f15d1b181bea67336ce4d66a58c1. Revert "bpf: btf: Add pretty print capability for data with BTF type info" This reverts commit 0a8aae433c53b1f441cab70979517660fb6a6038. Revert "bpf: btf: Check members of struct/union" This reverts commit ce2e8103ac1a977ce32db51ec042faea6f100a3d. Revert "bpf: btf: Validate type reference" This reverts commit a1aa96e6dae2b4c8c0b0a4dedab3006d3f697460. Revert "bpf: Update logging functions to work with BTF" This reverts commit b9289460f0a6b5c261ec0b6dcafa6fcd09d4957e. Revert "BACKPORT: bpf: btf: Introduce BPF Type Format (BTF)" This reverts commit ceebd58f6470e8ec6d9d694ab382fe88f43b998b. Revert "BACKPORT: bpf: Rename bpf_verifer_log" This reverts commit 50bdc7513d966811fb418d24a0e5797ffd8c907c. Revert "BACKPORT: bpf: encapsulate verifier log state into a structure" This reverts commit 0bcb397bde4675fdeb977d9debed20ed213f9ecd. 
Change-Id: Iecaa276b078c6d2db773a8071e7da9e6195277d6 --- arch/arm/net/bpf_jit_32.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp_64.c | 2 +- arch/x86/net/bpf_jit_comp.c | 51 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 2 +- .../net/ethernet/cavium/thunder/nicvf_main.c | 4 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 6 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 +- .../net/ethernet/mellanox/mlx4/en_netdev.c | 6 +- .../net/ethernet/mellanox/mlx5/core/en_main.c | 4 +- .../ethernet/netronome/nfp/nfp_net_common.c | 4 +- drivers/net/ethernet/qlogic/qede/qede.h | 2 +- .../net/ethernet/qlogic/qede/qede_filter.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 4 +- drivers/net/tun.c | 5 +- drivers/net/virtio_net.c | 6 +- fs/nsfs.c | 28 +- fs/proc/namespaces.c | 6 +- fs/proc/proc_sysctl.c | 5 - include/asm-generic/vmlinux.lds.h | 10 - include/linux/bpf-cgroup.h | 255 +- include/linux/bpf.h | 390 +- include/linux/bpf_types.h | 47 +- include/linux/bpf_verifier.h | 125 +- include/linux/btf.h | 73 - include/linux/device_cgroup.h | 8 +- include/linux/filter.h | 132 +- include/linux/netdevice.h | 51 +- include/linux/proc_ns.h | 4 +- include/linux/skbuff.h | 190 +- include/linux/skmsg.h | 371 -- include/linux/trace_events.h | 48 - include/linux/tracepoint-defs.h | 7 - include/net/addrconf.h | 7 - include/net/bpf_sk_storage.h | 13 - include/net/inet_common.h | 2 - include/net/ipv6.h | 2 - include/net/net_namespace.h | 4 +- include/net/sch_generic.h | 12 +- include/net/sock.h | 9 - include/net/tcp.h | 39 +- include/net/udp.h | 1 - include/net/xdp.h | 70 - include/net/xfrm.h | 32 +- include/trace/bpf_probe.h | 25 +- include/trace/define_trace.h | 1 - include/uapi/linux/bpf.h | 538 +-- include/uapi/linux/btf.h | 140 - include/uapi/linux/perf_event.h | 23 - kernel/Kconfig.locks | 3 - kernel/bpf/Makefile | 8 +- kernel/bpf/arraymap.c | 103 +- kernel/bpf/btf.c | 2930 ------------- kernel/bpf/cgroup.c | 699 +--- kernel/bpf/core.c | 449 +- kernel/bpf/cpumap.c | 555 --- kernel/bpf/devmap.c | 262 +- kernel/bpf/disasm.c | 261 -- kernel/bpf/disasm.h | 48 - kernel/bpf/hashtab.c | 79 +- kernel/bpf/helpers.c | 127 - kernel/bpf/inode.c | 169 +- kernel/bpf/local_storage.c | 574 --- kernel/bpf/lpm_trie.c | 117 +- kernel/bpf/map_in_map.c | 5 - kernel/bpf/offload.c | 462 --- kernel/bpf/sockmap.c | 920 ++++ kernel/bpf/stackmap.c | 7 +- kernel/bpf/syscall.c | 980 +---- kernel/bpf/verifier.c | 3689 ++++------------- kernel/cgroup/cgroup.c | 10 - kernel/events/core.c | 4 +- kernel/trace/bpf_trace.c | 305 +- net/Kconfig | 14 - net/bpf/test_run.c | 4 +- net/core/Makefile | 5 +- net/core/bpf_sk_storage.c | 812 ---- net/core/dev.c | 151 +- net/core/filter.c | 1427 +------ net/core/flow_dissector.c | 135 - net/core/lwt_bpf.c | 2 +- net/core/rtnetlink.c | 4 +- net/core/skbuff.c | 150 - net/core/skmsg.c | 763 ---- net/core/sock_map.c | 1002 ----- net/core/xdp.c | 67 - net/ipv4/Makefile | 1 - net/ipv4/af_inet.c | 71 +- net/ipv4/esp4.c | 24 +- net/ipv4/esp4_offload.c | 74 +- net/ipv4/ip_output.c | 1 - net/ipv4/tcp.c | 10 +- net/ipv4/tcp_bpf.c | 655 --- net/ipv4/tcp_ipv4.c | 16 - net/ipv4/udp.c | 38 +- net/ipv4/xfrm4_mode_tunnel.c | 4 +- net/ipv6/af_inet6.c | 66 +- net/ipv6/esp6.c | 24 +- net/ipv6/esp6_offload.c | 84 +- net/ipv6/ip6_output.c | 1 - net/ipv6/tcp_ipv6.c | 16 - net/ipv6/udp.c | 48 - net/ipv6/xfrm6_mode_tunnel.c | 4 +- net/netfilter/nft_meta.c | 
2 +- net/packet/af_packet.c | 4 +- net/sched/act_bpf.c | 4 +- net/sched/cls_bpf.c | 4 +- net/sched/sch_generic.c | 15 +- net/socket.c | 32 - net/strparser/Kconfig | 4 +- net/xfrm/xfrm_device.c | 144 +- tools/include/uapi/linux/bpf.h | 19 - tools/testing/selftests/bpf/test_lpm_map.c | 359 ++ tools/testing/selftests/bpf/test_verifier.c | 72 - 117 files changed, 2938 insertions(+), 18940 deletions(-) delete mode 100644 include/linux/btf.h delete mode 100644 include/linux/skmsg.h delete mode 100644 include/net/bpf_sk_storage.h delete mode 100644 include/net/xdp.h delete mode 100644 include/uapi/linux/btf.h delete mode 100644 kernel/bpf/btf.c delete mode 100644 kernel/bpf/cpumap.c delete mode 100644 kernel/bpf/disasm.c delete mode 100644 kernel/bpf/disasm.h delete mode 100644 kernel/bpf/local_storage.c delete mode 100644 kernel/bpf/offload.c create mode 100644 kernel/bpf/sockmap.c delete mode 100644 net/core/bpf_sk_storage.c delete mode 100644 net/core/skmsg.c delete mode 100644 net/core/sock_map.c delete mode 100644 net/core/xdp.c delete mode 100644 net/ipv4/tcp_bpf.c create mode 100644 tools/testing/selftests/bpf/test_lpm_map.c diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 68aa2f6d9f83..e13aca6e6d4b 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1827,7 +1827,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* If BPF JIT was not enabled then we must fall back to * the interpreter. */ - if (!prog->jit_requested) + if (!bpf_jit_enable) return orig_prog; /* If constant blinding was enabled and we failed during blinding diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 7b1b649da4b9..0b8ab4b12538 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -856,7 +856,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) int image_size; u8 *image_ptr; - if (!prog->jit_requested) + if (!bpf_jit_enable) return orig_prog; tmp = bpf_jit_blind_constants(prog); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 6b1003fdd05d..60029baaa72a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1363,7 +1363,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_jit jit; int pass; - if (!fp->jit_requested) + if (!bpf_jit_enable) return orig_fp; tmp = bpf_jit_blind_constants(fp); diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 85ae4b0d5fbc..dfb1a62abe93 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1529,7 +1529,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) u8 *image_ptr; int pass; - if (!prog->jit_requested) + if (!bpf_jit_enable) return orig_prog; tmp = bpf_jit_blind_constants(prog); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 94b4d9d869d5..a114c319cac2 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1129,29 +1129,19 @@ common_load: return proglen; } -struct x64_jit_data { - struct bpf_binary_header *header; - int *addrs; - u8 *image; - int proglen; - struct jit_context ctx; -}; - struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; struct bpf_prog *tmp, *orig_prog = prog; - struct x64_jit_data *jit_data; int proglen, oldproglen = 0; struct jit_context ctx = {}; bool tmp_blinded = false; - bool extra_pass = false; u8 *image = NULL; int *addrs; int pass; int i; - if (!prog->jit_requested) + if 
(!bpf_jit_enable) return orig_prog; tmp = bpf_jit_blind_constants(prog); @@ -1165,28 +1155,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog = tmp; } - jit_data = prog->aux->jit_data; - if (!jit_data) { - jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); - if (!jit_data) { - prog = orig_prog; - goto out; - } - prog->aux->jit_data = jit_data; - } - addrs = jit_data->addrs; - if (addrs) { - ctx = jit_data->ctx; - oldproglen = jit_data->proglen; - image = jit_data->image; - header = jit_data->header; - extra_pass = true; - goto skip_init_addrs; - } addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; - goto out_addrs; + goto out; } /* Before first pass, make a rough estimation of addrs[] @@ -1198,7 +1170,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } ctx.cleanup_addr = proglen; -skip_init_addrs: /* JITed image shrinks with every pass and the loop iterates * until the image stops shrinking. Very large bpf programs * may converge on the last pass. In such case do one more @@ -1239,15 +1210,7 @@ out_image: if (image) { bpf_flush_icache(header, image + proglen); - if (!prog->is_func || extra_pass) { - bpf_jit_binary_lock_ro(header); - } else { - jit_data->addrs = addrs; - jit_data->ctx = ctx; - jit_data->proglen = proglen; - jit_data->image = image; - jit_data->header = header; - } + bpf_jit_binary_lock_ro(header); prog->bpf_func = (void *)image; prog->jited = 1; prog->jited_len = proglen; @@ -1255,14 +1218,8 @@ out_image: prog = orig_prog; } - if (!image || !prog->is_func || extra_pass) { - if (image) - bpf_prog_fill_jited_linfo(prog, addrs); out_addrs: - kfree(addrs); - kfree(jit_data); - prog->aux->jit_data = NULL; - } + kfree(addrs); out: if (tmp_blinded) bpf_jit_prog_release_other(prog, prog == orig_prog ? 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index df6e76e5d414..180a7ef588cf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7800,7 +7800,7 @@ static const struct net_device_ops bnxt_netdev_ops = { #endif .ndo_udp_tunnel_add = bnxt_udp_tunnel_add, .ndo_udp_tunnel_del = bnxt_udp_tunnel_del, - .ndo_bpf = bnxt_xdp, + .ndo_xdp = bnxt_xdp, .ndo_bridge_getlink = bnxt_bridge_getlink, .ndo_bridge_setlink = bnxt_bridge_setlink, .ndo_get_phys_port_name = bnxt_get_phys_port_name diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index ba0869bd60e0..d8f0c837b72c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -207,7 +207,7 @@ static int bnxt_xdp_set(struct bnxt *bp, struct bpf_prog *prog) return 0; } -int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) +int bnxt_xdp(struct net_device *dev, struct netdev_xdp *xdp) { struct bnxt *bp = netdev_priv(dev); int rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h index 414b748038ca..12a5ad66b564 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h @@ -16,6 +16,6 @@ void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts); bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, struct page *page, u8 **data_ptr, unsigned int *len, u8 *event); -int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp); +int bnxt_xdp(struct net_device *dev, struct netdev_xdp *xdp); #endif diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 79e476cdd97d..df1c4ba7e0c9 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1772,7 +1772,7 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog) return ret; } -static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) +static int nicvf_xdp(struct net_device *netdev, struct netdev_xdp *xdp) { struct nicvf *nic = netdev_priv(netdev); @@ -1805,7 +1805,7 @@ static const struct net_device_ops nicvf_netdev_ops = { .ndo_tx_timeout = nicvf_tx_timeout, .ndo_fix_features = nicvf_fix_features, .ndo_set_features = nicvf_set_features, - .ndo_bpf = nicvf_xdp, + .ndo_xdp = nicvf_xdp, }; static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 751c931fe184..0e0bc67a28bf 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -9645,12 +9645,12 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, } /** - * i40e_xdp - implements ndo_bpf for i40e + * i40e_xdp - implements ndo_xdp for i40e * @dev: netdevice * @xdp: XDP command **/ static int i40e_xdp(struct net_device *dev, - struct netdev_bpf *xdp) + struct netdev_xdp *xdp) { struct i40e_netdev_priv *np = netdev_priv(dev); struct i40e_vsi *vsi = np->vsi; @@ -9702,7 +9702,7 @@ static const struct net_device_ops i40e_netdev_ops = { .ndo_features_check = i40e_features_check, .ndo_bridge_getlink = i40e_ndo_bridge_getlink, .ndo_bridge_setlink = i40e_ndo_bridge_setlink, - .ndo_bpf = i40e_xdp, + .ndo_xdp = i40e_xdp, }; /** diff --git 
a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 64b8cc5c6283..ca9bdaed31c5 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9885,7 +9885,7 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) return 0; } -static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) +static int ixgbe_xdp(struct net_device *dev, struct netdev_xdp *xdp) { struct ixgbe_adapter *adapter = netdev_priv(dev); @@ -9994,7 +9994,7 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_udp_tunnel_add = ixgbe_add_udp_tunnel_port, .ndo_udp_tunnel_del = ixgbe_del_udp_tunnel_port, .ndo_features_check = ixgbe_features_check, - .ndo_bpf = ixgbe_xdp, + .ndo_xdp = ixgbe_xdp, .ndo_xdp_xmit = ixgbe_xdp_xmit, .ndo_xdp_flush = ixgbe_xdp_flush, }; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 1bc7e3497a1a..70a80e43d833 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2930,7 +2930,7 @@ static u32 mlx4_xdp_query(struct net_device *dev) return prog_id; } -static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp) +static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -2972,7 +2972,7 @@ static const struct net_device_ops mlx4_netdev_ops = { .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, - .ndo_bpf = mlx4_xdp, + .ndo_xdp = mlx4_xdp, }; static const struct net_device_ops mlx4_netdev_ops_master = { @@ -3009,7 +3009,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = { .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, - .ndo_bpf = mlx4_xdp, + .ndo_xdp = mlx4_xdp, }; struct mlx4_en_bond { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 707c87f9987c..75c491ab6127 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3735,7 +3735,7 @@ static u32 mlx5e_xdp_query(struct net_device *dev) return prog_id; } -static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) +static int mlx5e_xdp(struct net_device *dev, struct netdev_xdp *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -3787,7 +3787,7 @@ static const struct net_device_ops mlx5e_netdev_ops = { .ndo_rx_flow_steer = mlx5e_rx_flow_steer, #endif .ndo_tx_timeout = mlx5e_tx_timeout, - .ndo_bpf = mlx5e_xdp, + .ndo_xdp = mlx5e_xdp, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = mlx5e_netpoll, #endif diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index f6050b13ff1b..bffa25d6dc29 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3422,7 +3422,7 @@ nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags, return 0; } -static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) +static int nfp_net_xdp(struct net_device *netdev, struct netdev_xdp *xdp) { struct nfp_net *nn = netdev_priv(netdev); @@ -3485,7 +3485,7 @@ const struct net_device_ops nfp_net_netdev_ops = { .ndo_get_phys_port_name = 
nfp_port_get_phys_port_name, .ndo_udp_tunnel_add = nfp_net_add_vxlan_port, .ndo_udp_tunnel_del = nfp_net_del_vxlan_port, - .ndo_bpf = nfp_net_xdp, + .ndo_xdp = nfp_net_xdp, }; /** diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index 4cc9af175a76..c132b08cefde 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -505,7 +505,7 @@ void qede_fill_rss_params(struct qede_dev *edev, void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti); void qede_udp_tunnel_del(struct net_device *dev, struct udp_tunnel_info *ti); -int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp); +int qede_xdp(struct net_device *dev, struct netdev_xdp *xdp); #ifdef CONFIG_DCB void qede_set_dcbnl_ops(struct net_device *ndev); diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 924cb2ea664d..e7ad95de3da8 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1065,7 +1065,7 @@ static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog) return 0; } -int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp) +int qede_xdp(struct net_device *dev, struct netdev_xdp *xdp) { struct qede_dev *edev = netdev_priv(dev); diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index a2da52362d09..99de923728ec 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -557,7 +557,7 @@ static const struct net_device_ops qede_netdev_ops = { .ndo_udp_tunnel_add = qede_udp_tunnel_add, .ndo_udp_tunnel_del = qede_udp_tunnel_del, .ndo_features_check = qede_features_check, - .ndo_bpf = qede_xdp, + .ndo_xdp = qede_xdp, #ifdef CONFIG_RFS_ACCEL .ndo_rx_flow_steer = qede_rx_flow_steer, #endif @@ -595,7 +595,7 @@ static const struct net_device_ops qede_netdev_vf_xdp_ops = { .ndo_udp_tunnel_add = qede_udp_tunnel_add, .ndo_udp_tunnel_del = qede_udp_tunnel_del, .ndo_features_check = qede_features_check, - .ndo_bpf = qede_xdp, + .ndo_xdp = qede_xdp, }; /* ------------------------------------------------------------------------- diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 757dff1c7216..c125b06f4298 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1077,7 +1077,7 @@ static u32 tun_xdp_query(struct net_device *dev) return 0; } -static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) +static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -1121,7 +1121,7 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, - .ndo_bpf = tun_xdp, + .ndo_xdp = tun_xdp, }; static void tun_flow_init(struct tun_struct *tun) @@ -1369,7 +1369,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp.data_hard_start = buf; xdp.data = buf + pad; - xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 339d6c0b162a..f90b95b0c2e8 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -551,7 +551,6 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; xdp.data = 
xdp.data_hard_start + xdp_headroom; - xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); @@ -674,7 +673,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, data = page_address(xdp_page) + offset; xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; xdp.data = data + vi->hdr_len; - xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + (len - vi->hdr_len); act = bpf_prog_run_xdp(xdp_prog, &xdp); @@ -2097,7 +2095,7 @@ static u32 virtnet_xdp_query(struct net_device *dev) return 0; } -static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) +static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -2124,7 +2122,7 @@ static const struct net_device_ops virtnet_netdev = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = virtnet_netpoll, #endif - .ndo_bpf = virtnet_xdp, + .ndo_xdp = virtnet_xdp, .ndo_features_check = passthru_features_check, }; diff --git a/fs/nsfs.c b/fs/nsfs.c index 2aff289a7160..35fa13910e43 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -51,7 +51,7 @@ static void nsfs_evict(struct inode *inode) ns->ops->put(ns); } -static int __ns_get_path(struct path *path, struct ns_common *ns) +static void *__ns_get_path(struct path *path, struct ns_common *ns) { struct vfsmount *mnt = nsfs_mnt; struct dentry *dentry; @@ -70,13 +70,13 @@ static int __ns_get_path(struct path *path, struct ns_common *ns) got_it: path->mnt = mntget(mnt); path->dentry = dentry; - return 0; + return NULL; slow: rcu_read_unlock(); inode = new_inode_pseudo(mnt->mnt_sb); if (!inode) { ns->ops->put(ns); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } inode->i_ino = ns->inum; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -88,7 +88,7 @@ slow: dentry = d_alloc_pseudo(mnt->mnt_sb, &empty_name); if (!dentry) { iput(inode); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } d_instantiate(dentry, inode); dentry->d_flags |= DCACHE_RCUACCESS; @@ -98,22 +98,22 @@ slow: d_delete(dentry); /* make sure ->d_prune() does nothing */ dput(dentry); cpu_relax(); - return -EAGAIN; + return ERR_PTR(-EAGAIN); } goto got_it; } -int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, +void *ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, void *private_data) { - int ret; + void *ret; do { struct ns_common *ns = ns_get_cb(private_data); if (!ns) - return -ENOENT; + return ERR_PTR(-ENOENT); ret = __ns_get_path(path, ns); - } while (ret == -EAGAIN); + } while (ret == ERR_PTR(-EAGAIN)); return ret; } @@ -126,7 +126,7 @@ static struct ns_common *ns_get_path_task(void *private_data) struct ns_get_path_task_args *args = private_data; return args->ns_ops->get(args->task); } -int ns_get_path(struct path *path, struct task_struct *task, +void *ns_get_path(struct path *path, struct task_struct *task, const struct proc_ns_operations *ns_ops) { struct ns_get_path_task_args args = { @@ -141,7 +141,7 @@ int open_related_ns(struct ns_common *ns, { struct path path = {}; struct file *f; - int err; + void *err; int fd; fd = get_unused_fd_flags(O_CLOEXEC); @@ -158,11 +158,11 @@ int open_related_ns(struct ns_common *ns, } err = __ns_get_path(&path, relative); - } while (err == -EAGAIN); + } while (err == ERR_PTR(-EAGAIN)); - if (err) { + if (IS_ERR(err)) { put_unused_fd(fd); - return err; + return PTR_ERR(err); } f = dentry_open(&path, O_RDONLY, current_cred()); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c 
index 6f1e43762338..59b17e509f46 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -42,14 +42,14 @@ static const char *proc_ns_get_link(struct dentry *dentry, const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; - int error = -EACCES; + void *error = ERR_PTR(-EACCES); if (!dentry) return ERR_PTR(-ECHILD); task = get_proc_task(inode); if (!task) - return ERR_PTR(-EACCES); + return error; if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { error = ns_get_path(&ns_path, task, ns_ops); @@ -57,7 +57,7 @@ static const char *proc_ns_get_link(struct dentry *dentry, nd_jump_link(&ns_path); } put_task_struct(task); - return ERR_PTR(error); + return error; } static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index e384b4a96191..f4b46f796901 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include "internal.h" @@ -595,10 +594,6 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; - error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write); - if (error) - goto out; - /* careful: calling conventions are nasty here */ res = count; error = table->proc_handler(table, write, buf, &res, ppos); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index b89ddad8e182..dfe27f9dc2f6 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -172,15 +172,6 @@ #define TRACE_SYSCALLS() #endif -#ifdef CONFIG_BPF_EVENTS -#define BPF_RAW_TP() STRUCT_ALIGN(); \ - VMLINUX_SYMBOL(__start__bpf_raw_tp) = .; \ - KEEP(*(__bpf_raw_tp_map)) \ - VMLINUX_SYMBOL(__stop__bpf_raw_tp) = .; -#else -#define BPF_RAW_TP() -#endif - #ifdef CONFIG_SERIAL_EARLYCON #define EARLYCON_TABLE() . 
= ALIGN(8); \ VMLINUX_SYMBOL(__earlycon_table) = .; \ @@ -249,7 +240,6 @@ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ - BPF_RAW_TP() \ TRACEPOINT_STR() /* diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 4b6cd41e2675..540c44fab023 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -2,54 +2,22 @@ #ifndef _BPF_CGROUP_H #define _BPF_CGROUP_H -#include -#include #include -#include -#include #include struct sock; -struct sockaddr; struct cgroup; struct sk_buff; -struct bpf_map; -struct bpf_prog; struct bpf_sock_ops_kern; -struct bpf_cgroup_storage; -struct ctl_table; -struct ctl_table_header; #ifdef CONFIG_CGROUP_BPF extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) -DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); -#define for_each_cgroup_storage_type(stype) \ - for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) - -struct bpf_cgroup_storage_map; -struct bpf_storage_buffer { - struct rcu_head rcu; - char data[0]; -}; -struct bpf_cgroup_storage { - union { - struct bpf_storage_buffer *buf; - void __percpu *percpu_buf; - }; - struct bpf_cgroup_storage_map *map; - struct bpf_cgroup_storage_key key; - struct list_head list; - struct rb_node node; - struct rcu_head rcu; -}; - struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; }; struct bpf_prog_array; @@ -77,16 +45,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); -int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr); /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); -int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -95,65 +59,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type); -int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, - struct sockaddr *uaddr, - enum bpf_attach_type type, - void *t_ctx); - int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum bpf_attach_type type); -int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type); - -int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, - struct ctl_table *table, int write, - enum bpf_attach_type type); - -int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, - int *optname, char __user *optval, - int *optlen, char **kernel_optval); -int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, - int optname, char __user *optval, - int __user *optlen, int max_optlen, - int retval); - -static inline enum bpf_cgroup_storage_type cgroup_storage_type( - struct bpf_map *map) -{ - if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) - return BPF_CGROUP_STORAGE_PERCPU; - - return BPF_CGROUP_STORAGE_SHARED; -} -static inline void bpf_cgroup_storage_set(struct 
bpf_cgroup_storage - *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) -{ - enum bpf_cgroup_storage_type stype; - struct bpf_storage_buffer *buf; - for_each_cgroup_storage_type(stype) { - if (!storage[stype]) - continue; - buf = READ_ONCE(storage[stype]->buf); - this_cpu_write(bpf_cgroup_storage[stype], &buf->data[0]); - } -} - -struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, - enum bpf_cgroup_storage_type stype); -void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); -void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, - struct cgroup *cgroup, - enum bpf_attach_type type); -void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); -int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); -void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); - -int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); -int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, - void *value, u64 flags); - /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -177,78 +86,16 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, __ret; \ }) -#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled && sk) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, type); \ + __ret = __cgroup_bpf_run_filter_sk(sk, \ + BPF_CGROUP_INET_SOCK_CREATE); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) - -#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) - -#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) - -#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ -({ \ - int __ret = 0; \ - if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ - NULL); \ - __ret; \ -}) - -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ -({ \ - int __ret = 0; \ - if (cgroup_bpf_enabled) { \ - lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ - t_ctx); \ - release_sock(sk); \ - } \ - __ret; \ -}) - -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) - -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) - -#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ - sk->sk_prot->pre_connect) - -#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) - -#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) - -#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) - -#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) - -#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) - -#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) - -#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, 
uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL) - -#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) - #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -261,112 +108,16 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, } \ __ret; \ }) - -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ -({ \ - int __ret = 0; \ - if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ - access, \ - BPF_CGROUP_DEVICE); \ - \ - __ret; \ -}) - -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) \ -({ \ - int __ret = 0; \ - if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ - BPF_CGROUP_SYSCTL); \ - __ret; \ -}) - -#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ - kernel_optval) \ -({ \ - int __ret = 0; \ - if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ - optname, optval, \ - optlen, \ - kernel_optval); \ - __ret; \ -}) - -#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ -({ \ - int __ret = 0; \ - if (cgroup_bpf_enabled) \ - get_user(__ret, optlen); \ - __ret; \ -}) - -#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \ - max_optlen, retval) \ -({ \ - int __ret = retval; \ - if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ - optname, optval, \ - optlen, max_optlen, \ - retval); \ - __ret; \ -}) - #else -static inline void bpf_cgroup_storage_set( - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {} -static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, - struct bpf_map *map) { return 0; } -static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, - struct bpf_map *map) {} -static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( - struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; } -static inline void bpf_cgroup_storage_free( - struct bpf_cgroup_storage *storage) {} - struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } -static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, - void *value) { - return 0; -} -static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, - void *key, void *value, u64 flags) { - return 0; -} - -#define cgroup_bpf_enabled (0) -#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define 
BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) ({ 0; }) -#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ - optlen, max_optlen, retval) ({ retval; }) -#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ - kernel_optval) ({ 0; }) - -#define for_each_cgroup_storage_type(stype) for (; false; ) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7074ebe4c2d0..b9712fb3a320 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -15,21 +15,14 @@ #include #include #include -#include -struct bpf_verifier_env; struct perf_event; struct bpf_prog; struct bpf_map; -struct sock; -struct seq_file; -struct btf; -struct btf_type; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ - int (*map_alloc_check)(union bpf_attr *attr); struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_release)(struct bpf_map *map, struct file *map_file); void (*map_free)(struct bpf_map *map); @@ -52,22 +45,10 @@ struct bpf_map_ops { void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); - void (*map_seq_show_elem)(struct bpf_map *map, void *key, - struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, - const struct btf *btf, - const struct btf_type *key_type, - const struct btf_type *value_type); - - /* Direct value access helpers. */ - int (*map_direct_value_addr)(const struct bpf_map *map, - u64 *imm, u32 off); - int (*map_direct_value_meta)(const struct bpf_map *map, - u64 imm, u32 *off); }; struct bpf_map { - /* The first two cachelines with read-mostly members of which some + /* 1st cacheline with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). */ const struct bpf_map_ops *ops ____cacheline_aligned; @@ -80,90 +61,21 @@ struct bpf_map { u32 value_size; u32 max_entries; u32 map_flags; - int spin_lock_off; /* >=0 valid offset, <0 error */ + u32 pages; u32 id; int numa_node; - u32 btf_key_type_id; - u32 btf_value_type_id; - struct btf *btf; - u32 pages; bool unpriv_array; - /* 51 bytes hole */ + /* 7 bytes hole */ - /* The 3rd and 4th cacheline with misc members to avoid false sharing + /* 2nd cacheline with misc members to avoid false sharing * particularly with refcounting. 
*/ struct user_struct *user ____cacheline_aligned; atomic_t refcnt; atomic_t usercnt; struct work_struct work; - char name[BPF_OBJ_NAME_LEN]; }; -static inline bool map_value_has_spin_lock(const struct bpf_map *map) -{ - return map->spin_lock_off >= 0; -} - -static inline void check_and_init_map_lock(struct bpf_map *map, void *dst) -{ - if (likely(!map_value_has_spin_lock(map))) - return; - *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = - (struct bpf_spin_lock){}; -} - -/* copy everything but bpf_spin_lock */ -static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) -{ - if (unlikely(map_value_has_spin_lock(map))) { - u32 off = map->spin_lock_off; - memcpy(dst, src, off); - memcpy(dst + off + sizeof(struct bpf_spin_lock), - src + off + sizeof(struct bpf_spin_lock), - map->value_size - off - sizeof(struct bpf_spin_lock)); - } else { - memcpy(dst, src, map->value_size); - } -} - -void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, - bool lock_src); - -struct bpf_offloaded_map; -struct bpf_map_dev_ops { - int (*map_get_next_key)(struct bpf_offloaded_map *map, - void *key, void *next_key); - int (*map_lookup_elem)(struct bpf_offloaded_map *map, - void *key, void *value); - int (*map_update_elem)(struct bpf_offloaded_map *map, - void *key, void *value, u64 flags); - int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key); -}; -struct bpf_offloaded_map { - struct bpf_map map; - struct net_device *netdev; - const struct bpf_map_dev_ops *dev_ops; - void *dev_priv; - struct list_head offloads; -}; -static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) -{ - return container_of(map, struct bpf_offloaded_map, map); -} - -static inline bool bpf_map_support_seq_show(const struct bpf_map *map) -{ - return map->btf && map->ops->map_seq_show_elem; -} - -int map_check_no_btf(const struct bpf_map *map, - const struct btf *btf, - const struct btf_type *key_type, - const struct btf_type *value_type); - -extern const struct bpf_map_ops bpf_map_offload_ops; - /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ @@ -174,14 +86,11 @@ enum bpf_arg_type { ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ - ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ - ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ /* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ - ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, * helper function must fill all bytes or clear * them in error case. 
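[Editor's sketch, not part of the patch: the argument/return constraints above are consumed through struct bpf_func_proto, which each helper exports and the verifier checks call sites against. The following mirrors the in-tree bpf_map_lookup_elem helper; the "demo_" names are placeholders for illustration only.]

#include <linux/bpf.h>
#include <linux/filter.h>

/* A helper body: BPF_CALL_2() unpacks the two u64 BPF registers into
 * typed C arguments.
 */
BPF_CALL_2(demo_map_lookup_elem, struct bpf_map *, map, void *, key)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return (unsigned long) map->ops->map_lookup_elem(map, key);
}

/* The prototype the verifier enforces: arg1 must be one of the
 * program's maps, arg2 a stack pointer sized to the map's key, and the
 * return value is tracked as PTR_TO_MAP_VALUE_OR_NULL until the
 * program NULL-checks it.
 */
static const struct bpf_func_proto demo_map_lookup_elem_proto = {
	.func		= demo_map_lookup_elem,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_KEY,
};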
@@ -192,22 +101,13 @@ enum bpf_arg_type { ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ - ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ - ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ - ARG_PTR_TO_INT, /* pointer to int */ - ARG_PTR_TO_LONG, /* pointer to long */ - ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ }; /* type of values returned from helper functions */ enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ - RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ - RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ - RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ - RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -255,17 +155,8 @@ enum bpf_reg_type { PTR_TO_MAP_VALUE, /* reg points to map element value */ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ PTR_TO_STACK, /* reg == frame_pointer + offset */ - PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ - PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ - PTR_TO_SOCKET, /* reg points to struct bpf_sock */ - PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ - PTR_TO_SOCK_COMMON, /* reg points to sock_common */ - PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ - PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ - PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ - PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ }; /* The information passed from prog-specific *_is_valid_access @@ -282,22 +173,14 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) aux->ctx_field_size = size; } -struct bpf_prog_ops { - int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); -}; - struct bpf_verifier_ops { /* return eBPF function prototype for verification */ - const struct bpf_func_proto * - (*get_func_proto)(enum bpf_func_id func_id, - const struct bpf_prog *prog); + const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); /* return true if 'size' wide access at offset 'off' within bpf_context * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); @@ -305,77 +188,25 @@ struct bpf_verifier_ops { const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); + int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); }; -struct bpf_prog_offload_ops { - int (*insn_hook)(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx); -}; - -struct bpf_prog_offload { - struct bpf_prog *prog; - struct net_device *netdev; - void *dev_priv; - struct list_head offloads; - bool dev_state; - const struct bpf_prog_offload_ops *dev_ops; -}; - -enum bpf_cgroup_storage_type { - BPF_CGROUP_STORAGE_SHARED, - BPF_CGROUP_STORAGE_PERCPU, - __BPF_CGROUP_STORAGE_MAX -}; -#define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX - struct 
bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; - u32 max_tp_access; u32 stack_depth; u32 id; - u32 func_cnt; /* used by non-func prog as the number of func progs */ - u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ - struct bpf_prog **func; - void *jit_data; /* JIT specific data. arch dependent */ struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; - const struct bpf_prog_ops *ops; + const struct bpf_verifier_ops *ops; struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; - u64 load_time; /* ns since boottime */ - struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; - char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY void *security; #endif - struct bpf_prog_offload *offload; - struct btf *btf; - struct bpf_func_info *func_info; - /* bpf_line_info loaded from userspace. linfo->insn_off - * has the xlated insn offset. - * Both the main and sub prog share the same linfo. - * The subprog can access its first linfo by - * using the linfo_idx. - */ - struct bpf_line_info *linfo; - /* jited_linfo is the jited addr of the linfo. It has a - * one to one mapping to linfo: - * jited_linfo[i] is the jited addr for the linfo[i]->insn_off. - * Both the main and sub prog share the same jited_linfo. - * The subprog can access its first jited_linfo by - * using the linfo_idx. - */ - void **jited_linfo; - u32 func_info_cnt; - u32 nr_linfo; - /* subprog can use linfo_idx to access its first linfo and - * jited_linfo. - * main prog always has linfo_idx == 0 - */ - u32 linfo_idx; union { struct work_struct work; struct rcu_head rcu; @@ -419,11 +250,6 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void); typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, unsigned long off, unsigned long len); -typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type, - const struct bpf_insn *src, - struct bpf_insn *dst, - struct bpf_prog *prog, - u32 *target_size); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); @@ -445,28 +271,16 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, * The 'struct bpf_prog_array *' should only be replaced with xchg() * since other cpus are walking the array of pointers in parallel. 
*/ -struct bpf_prog_array_item { - struct bpf_prog *prog; - struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; -}; - struct bpf_prog_array { struct rcu_head rcu; - struct bpf_prog_array_item items[0]; + struct bpf_prog *progs[0]; }; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); -int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); -bool bpf_prog_array_is_empty(struct bpf_prog_array *array); -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, - __u32 __user *prog_ids, u32 cnt); void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, struct bpf_prog *old_prog); -int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, - u32 *prog_ids, u32 request_cnt, - u32 *prog_cnt); int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, @@ -474,19 +288,17 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ ({ \ - struct bpf_prog_array_item *_item; \ - struct bpf_prog *_prog; \ + struct bpf_prog **_prog, *__prog; \ struct bpf_prog_array *_array; \ u32 _ret = 1; \ rcu_read_lock(); \ _array = rcu_dereference(array); \ if (unlikely(check_non_null && !_array))\ goto _out; \ - _item = &_array->items[0]; \ - while ((_prog = READ_ONCE(_item->prog))) { \ - bpf_cgroup_storage_set(_item->cgroup_storage); \ - _ret &= func(_prog, ctx); \ - _item++; \ + _prog = _array->progs; \ + while ((__prog = READ_ONCE(*_prog))) { \ + _ret &= func(__prog, ctx); \ + _prog++; \ } \ _out: \ rcu_read_unlock(); \ @@ -505,18 +317,14 @@ DECLARE_PER_CPU(int, bpf_prog_active); extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; -#define BPF_PROG_TYPE(_id, _name) \ - extern const struct bpf_prog_ops _name ## _prog_ops; \ - extern const struct bpf_verifier_ops _name ## _verifier_ops; +#define BPF_PROG_TYPE(_id, _ops) \ + extern const struct bpf_verifier_ops _ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; #include #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE -extern const struct bpf_verifier_ops bpf_offload_verifier_ops; -extern const struct bpf_prog_ops bpf_offload_prog_ops; - struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); @@ -527,20 +335,14 @@ void bpf_prog_put(struct bpf_prog *prog); int __bpf_prog_charge(struct user_struct *user, u32 pages); void __bpf_prog_uncharge(struct user_struct *user, u32 pages); -void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); -void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); - struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); -int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); -void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); -void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); extern int sysctl_unprivileged_bpf_disabled; @@ -567,8 +369,6 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct 
file *map_file, int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_get_file_flag(int flags); -int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size, - size_t actual_size); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. @@ -587,17 +387,12 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) } /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, - union bpf_attr __user *uattr); -#ifndef CONFIG_BPF_JIT_ALWAYS_ON -void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); -#endif +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); /* Map specifics */ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); -struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); @@ -675,12 +470,6 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, return NULL; } -static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} - static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index) { } @@ -696,74 +485,21 @@ static inline bool unprivileged_ebpf_enabled(void) #endif /* CONFIG_BPF_SYSCALL */ -int bpf_prog_offload_compile(struct bpf_prog *prog); -void bpf_prog_offload_destroy(struct bpf_prog *prog); -int bpf_prog_offload_info_fill(struct bpf_prog_info *info, - struct bpf_prog *prog); - -int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map); - -int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); -int bpf_map_offload_update_elem(struct bpf_map *map, - void *key, void *value, u64 flags); -int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); -int bpf_map_offload_get_next_key(struct bpf_map *map, - void *key, void *next_key); -bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map); - -#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) -int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); - -static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) -{ - return aux->offload; -} -static inline bool bpf_map_is_dev_bound(struct bpf_map *map) -{ - return unlikely(map->ops == &bpf_map_offload_ops); -} -struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); -void bpf_map_offload_map_free(struct bpf_map *map); +#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) +struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); +int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else -static inline int bpf_prog_offload_init(struct bpf_prog *prog, - union bpf_attr *attr) +static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) +{ + return NULL; +} + +static inline int sock_map_prog(struct bpf_map *map, + struct bpf_prog *prog, + u32 type) { return -EOPNOTSUPP; } - -static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) -{ - return false; -} - -static inline bool bpf_map_is_dev_bound(struct bpf_map *map) -{ - return false; -} -static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) -{ - return ERR_PTR(-EOPNOTSUPP); -} -static inline void bpf_map_offload_map_free(struct bpf_map 
*map) -{ -} -#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ - -#if defined(CONFIG_BPF_STREAM_PARSER) -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which); -int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); -#else -static inline int sock_map_prog_update(struct bpf_map *map, - struct bpf_prog *prog, u32 which) -{ - return -EOPNOTSUPP; -} - -static inline int sock_map_get_from_fd(const union bpf_attr *attr, - struct bpf_prog *prog) -{ - return -EINVAL; -} #endif /* verifier prototypes for helper functions called from eBPF programs */ @@ -784,79 +520,9 @@ extern const struct bpf_func_proto bpf_skb_vlan_push_proto; extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; -extern const struct bpf_func_proto bpf_sock_hash_update_proto; -extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; -extern const struct bpf_func_proto bpf_spin_lock_proto; -extern const struct bpf_func_proto bpf_spin_unlock_proto; -extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; -extern const struct bpf_func_proto bpf_msg_redirect_map_proto; -extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; -extern const struct bpf_func_proto bpf_sk_redirect_map_proto; -extern const struct bpf_func_proto bpf_get_local_storage_proto; -extern const struct bpf_func_proto bpf_tcp_sock_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -#if defined(CONFIG_NET) -bool bpf_sock_common_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info); -bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, - struct bpf_insn_access_aux *info); -u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, - u32 *target_size); -#else -static inline bool bpf_sock_common_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - return false; -} -static inline bool bpf_sock_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - return false; -} -static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, - u32 *target_size) -{ - return 0; -} -#endif - -#ifdef CONFIG_INET -bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, - struct bpf_insn_access_aux *info); -u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, - u32 *target_size); -#else -static inline bool bpf_tcp_sock_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - return false; -} - -static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, - u32 *target_size) -{ - return 0; -} -#endif /* CONFIG_INET */ - #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 80715af1e553..e1149327a0c0 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -2,34 +2,23 @@ /* internal file - do not include directly */ #ifdef CONFIG_NET 
-BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) -BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb_prog_ops) #endif #ifdef CONFIG_BPF_EVENTS -BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) -BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) -BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) -BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) -BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) +BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops) #endif -#ifdef CONFIG_CGROUP_BPF -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt) -#endif - -BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector) BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) @@ -38,10 +27,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) #ifdef CONFIG_CGROUPS BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) #endif -#ifdef CONFIG_CGROUP_BPF -BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops) -#endif BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops) @@ -54,11 +39,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) #ifdef CONFIG_STREAM_PARSER BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif -BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #endif diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0e2dda88234b..8509484cada4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -60,8 +60,6 @@ struct bpf_reg_state { * offset, so they can share range knowledge. 
* For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * came from, when one is tested for != NULL. - * For PTR_TO_SOCKET this is used to share which pointers retain the - * same reference to the socket, to determine proper reference freeing. */ u32 id; /* Ordering of fields matters. See states_equal() */ @@ -81,14 +79,6 @@ struct bpf_reg_state { s64 smax_value; /* maximum possible (s64)value */ u64 umin_value; /* minimum possible (u64)value */ u64 umax_value; /* maximum possible (u64)value */ - /* Inside the callee two registers can be both PTR_TO_STACK like - * R1=fp-8 and R2=fp-8, but one of them points to this function stack - * while another to the caller's stack. To differentiate them 'frameno' - * is used which is an index in bpf_verifier_state->frame[] array - * pointing to bpf_func_state. - * This field must be second to last, for states_equal() reasons. - */ - u32 frameno; /* This field must be last, for states_equal() reasons. */ enum bpf_reg_liveness live; }; @@ -105,62 +95,18 @@ struct bpf_stack_state { struct bpf_reg_state spilled_ptr; u8 slot_type[BPF_REG_SIZE]; }; -struct bpf_reference_state { - /* Track each reference created with a unique id, even if the same - * instruction creates the reference multiple times (eg, via CALL). - */ - int id; - /* Instruction where the allocation of this reference occurred. This - * is used purely to inform the user of a reference leak. - */ - int insn_idx; -}; /* state of the program: * type of all registers and stack info */ -struct bpf_func_state { +struct bpf_verifier_state { struct bpf_reg_state regs[MAX_BPF_REG]; struct bpf_verifier_state *parent; - /* index of call instruction that called into this func */ - int callsite; - /* stack frame number of this function state from pov of - * enclosing bpf_verifier_state. - * 0 = main function, 1 = first callee. - */ - u32 frameno; - /* subprog number == index within subprog_stack_depth - * zero == main subprog - */ - u32 subprogno; - /* The following fields should be last. See copy_func_state() */ - int acquired_refs; - struct bpf_reference_state *refs; int allocated_stack; struct bpf_stack_state *stack; -}; - -#define MAX_CALL_FRAMES 8 -struct bpf_verifier_state { - /* call stack tracking */ - struct bpf_func_state *frame[MAX_CALL_FRAMES]; - struct bpf_verifier_state *parent; - u32 curframe; - u32 active_spin_lock; bool speculative; }; -#define bpf_get_spilled_reg(slot, frame) \ - (((slot < frame->allocated_stack / BPF_REG_SIZE) && \ - (frame->stack[slot].slot_type[0] == STACK_SPILL)) \ - ? &frame->stack[slot].spilled_ptr : NULL) - -/* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. 
*/ -#define bpf_for_each_spilled_reg(iter, frame, reg) \ - for (iter = 0, reg = bpf_get_spilled_reg(iter, frame); \ - iter < frame->allocated_stack / BPF_REG_SIZE; \ - iter++, reg = bpf_get_spilled_reg(iter, frame)) - /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; @@ -179,13 +125,8 @@ struct bpf_verifier_state_list { struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ - unsigned long map_state; /* pointer/poison value for maps */ - s32 call_imm; /* saved imm field of call insn */ + struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ u32 alu_limit; /* limit for add/sub register with pointer */ - struct { - u32 map_index; /* index into used_maps[] */ - u32 map_off; /* offset from value base address */ - }; }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ @@ -195,38 +136,10 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ -#define BPF_VERIFIER_TMP_LOG_SIZE 1024 - -struct bpf_verifier_log { - u32 level; - char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; - char __user *ubuf; - u32 len_used; - u32 len_total; -}; - -static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) -{ - return log->len_used >= log->len_total - 1; -} - -static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) -{ - return log->level && log->ubuf && !bpf_verifier_log_full(log); -} - -__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, - const char *fmt, ...); - -void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, - va_list args); - -#define BPF_MAX_SUBPROGS 256 - -struct bpf_subprog_info { - u32 start; /* insn idx of function entry point */ - u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ - u16 stack_depth; /* max. 
stack depth used by this function */ +struct bpf_verifier_env; +struct bpf_ext_analyzer_ops { + int (*insn_hook)(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx); }; /* single container for all structs @@ -236,7 +149,6 @@ struct bpf_verifier_env { u32 insn_idx; u32 prev_insn_idx; struct bpf_prog *prog; /* eBPF program being verified */ - const struct bpf_verifier_ops *ops; struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ @@ -250,37 +162,14 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ - - struct bpf_verifier_log log; - struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; - u32 subprog_cnt; }; -static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) -{ - struct bpf_verifier_state *cur = env->cur_state; - - return cur->frame[cur->curframe]; -} - static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) { - return cur_func(env)->regs; + return env->cur_state->regs; } -int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); -int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx); - int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv); -#include -#include -static inline void *__compat_kvcalloc(size_t n, size_t size, gfp_t flags) -{ - return kvmalloc_array(n, size, flags | __GFP_ZERO); -} -#define kvcalloc __compat_kvcalloc - #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/btf.h b/include/linux/btf.h deleted file mode 100644 index 634216e1f258..000000000000 --- a/include/linux/btf.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2018 Facebook */ - -#ifndef _LINUX_BTF_H -#define _LINUX_BTF_H 1 - -#include - -struct btf; -struct btf_member; -struct btf_type; -union bpf_attr; - -extern const struct file_operations btf_fops; - -void btf_put(struct btf *btf); -int btf_new_fd(const union bpf_attr *attr); -struct btf *btf_get_by_fd(int fd); -int btf_get_info_by_fd(const struct btf *btf, - const union bpf_attr *attr, - union bpf_attr __user *uattr); -/* Figure out the size of a type_id. If type_id is a modifier - * (e.g. const), it will be resolved to find out the type with size. - * - * For example: - * In describing "const void *", type_id is "const" and "const" - * refers to "void *". The return type will be "void *". - * - * If type_id is a simple "int", then return type will be "int". - * - * @btf: struct btf object - * @type_id: Find out the size of type_id. The type_id of the return - * type is set to *type_id. - * @ret_size: It can be NULL. If not NULL, the size of the return - * type is set to *ret_size. - * Return: The btf_type (resolved to another type with size info if needed). - * NULL is returned if type_id itself does not have size info - * (e.g. void) or it cannot be resolved to another type that - * has size info. - * *type_id and *ret_size will not be changed in the - * NULL return case. 
- */ -const struct btf_type *btf_type_id_size(const struct btf *btf, - u32 *type_id, - u32 *ret_size); -void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, - struct seq_file *m); -int btf_get_fd_by_id(u32 id); -u32 btf_id(const struct btf *btf); -bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, - const struct btf_member *m, - u32 expected_offset, u32 expected_size); - -int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); - -#ifdef CONFIG_BPF_SYSCALL -const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); -const char *btf_name_by_offset(const struct btf *btf, u32 offset); -#else -static inline const struct btf_type *btf_type_by_id(const struct btf *btf, - u32 type_id) -{ - return NULL; -} - -static inline const char *btf_name_by_offset(const struct btf *btf, - u32 offset) -{ - return NULL; -} -#endif - -#endif diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 8557efe096dc..2d93d7ecd479 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include -#include #define DEVCG_ACC_MKNOD 1 #define DEVCG_ACC_READ 2 @@ -20,15 +19,10 @@ static inline int __devcgroup_check_permission(short type, u32 major, u32 minor, { return 0; } #endif -#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) +#ifdef CONFIG_CGROUP_DEVICE static inline int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { - int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); - - if (rc) - return -EPERM; - return __devcgroup_check_permission(type, major, minor, access); } diff --git a/include/linux/filter.h b/include/linux/filter.h index 1f50979749e5..dada9e36521e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -18,7 +18,6 @@ #include #include #include -#include #include @@ -29,10 +28,6 @@ struct sk_buff; struct sock; struct seccomp_data; struct bpf_prog_aux; -struct xdp_rxq_info; -struct xdp_buff; -struct ctl_table; -struct ctl_table_header; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -59,9 +54,6 @@ struct ctl_table_header; /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 -/* unused opcode to mark call to interpreter with arguments */ -#define BPF_CALL_ARGS 0xe0 - /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. @@ -455,14 +447,6 @@ struct ctl_table_header; #define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 -#if BITS_PER_LONG == 64 -# define bpf_ctx_range_ptr(TYPE, MEMBER) \ - offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 -#else -# define bpf_ctx_range_ptr(TYPE, MEMBER) \ - offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1 -#endif /* BITS_PER_LONG == 64 */ - #define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ ({ \ BUILD_BUG_ON(FIELD_SIZEOF(TYPE, MEMBER) != (SIZE)); \ @@ -489,22 +473,18 @@ struct bpf_binary_header { #ifdef CONFIG_CFI_CLANG u32 magic; #endif - u32 pages; + unsigned int pages; u8 image[]; }; struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ - jit_requested:1,/* archs need to JIT the prog */ - undo_set_mem:1, /* Passed set_memory_ro() checkpoint */ + locked:1, /* Program image locked? 
*/ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1, /* Do we need dst entry? */ - blinded:1, /* Was blinded */ - is_func:1; /* program is a bpf function */ + dst_needed:1; /* Do we need dst entry? */ enum bpf_prog_type type; /* Type of BPF program */ - enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; @@ -586,23 +566,24 @@ static inline void bpf_jit_set_header_magic(struct bpf_binary_header *hdr) struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; - void *data_meta; void *data_end; }; -/* Compute the linear packet data range [data, data_end) which - * will be accessed by various program types (cls_bpf, act_bpf, - * lwt, ...). Subsystems allowing direct data access must (!) - * ensure that cb[] area can be written to when BPF program is - * invoked (otherwise cb[] save/restore is necessary). +struct xdp_buff { + void *data; + void *data_end; + void *data_hard_start; +}; + +/* compute the linear packet data range [data, data_end) which + * will be accessed by cls_bpf, act_bpf and lwt programs */ -static inline void bpf_compute_data_pointers(struct sk_buff *skb) +static inline void bpf_compute_data_end(struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb)); - cb->data_meta = skb->data - skb_metadata_len(skb); - cb->data_end = skb->data + skb_headlen(skb); + cb->data_end = skb->data + skb_headlen(skb); } static inline u8 *bpf_skb_cb(struct sk_buff *skb) @@ -708,27 +689,50 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) +#ifdef CONFIG_ARCH_HAS_SET_MEMORY static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { - fp->undo_set_mem = 1; - set_memory_ro((unsigned long)fp, fp->pages); + fp->locked = 1; + WARN_ON_ONCE(set_memory_ro((unsigned long)fp, fp->pages)); } static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) { - if (fp->undo_set_mem) - set_memory_rw((unsigned long)fp, fp->pages); + if (fp->locked) { + WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages)); + /* In case set_memory_rw() fails, we want to be the first + * to crash here instead of some random place later on. 
+ */ + fp->locked = 0; + } } static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { - set_memory_ro((unsigned long)hdr, hdr->pages); + WARN_ON_ONCE(set_memory_ro((unsigned long)hdr, hdr->pages)); } static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) { - set_memory_rw((unsigned long)hdr, hdr->pages); + WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); } +#else +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ +} + +static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) +{ +} + +static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) +{ +} +#endif /* CONFIG_ARCH_HAS_SET_MEMORY */ static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) @@ -748,13 +752,6 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb) struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); -void bpf_prog_free_linfo(struct bpf_prog *prog); -void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, - const u32 *insn_to_jit_off); -int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog); -void bpf_prog_free_jited_linfo(struct bpf_prog *prog); -void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog); - struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); @@ -786,22 +783,11 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -#define __bpf_call_base_args \ - ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ - __bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_helper_changes_pkt_data(void *func); -static inline bool bpf_dump_raw_ok(void) -{ - /* Reconstruction of call-sites is dependent on kallsyms, - * thus make dump the same restriction. - */ - return kallsyms_show_value() == 1; -} - struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); @@ -819,6 +805,9 @@ int xdp_do_redirect(struct net_device *dev, void xdp_do_flush_map(void); void bpf_warn_invalid_xdp_action(u32 act); +void bpf_warn_invalid_xdp_redirect(u32 ifindex); + +struct sock *do_sk_redirect_map(struct sk_buff *skb); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; @@ -872,7 +861,7 @@ static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) return fp->jited && bpf_jit_is_ebpf(); } -static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) +static inline bool bpf_jit_blinding_enabled(void) { /* These are the prerequisites, should someone ever have the * idea to call blinding outside of them, we make sure to @@ -880,7 +869,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) */ if (!bpf_jit_is_ebpf()) return false; - if (!prog->jit_requested) + if (!bpf_jit_enable) return false; if (!bpf_jit_harden) return false; @@ -1053,17 +1042,6 @@ static inline int bpf_tell_extensions(void) return SKF_AD_MAX; } -struct bpf_sock_addr_kern { - struct sock *sk; - struct sockaddr *uaddr; - /* Temporary "register" to make indirect stores to nested structures - * defined above. 
We need three registers to make such a store, but - * only two (src and dst) are available at convert_ctx_access time - */ - u64 tmp_reg; - void *t_ctx; /* Attach type specific context. */ -}; - struct bpf_sock_ops_kern { struct sock *sk; u32 op; @@ -1073,20 +1051,4 @@ struct bpf_sock_ops_kern { }; }; -struct bpf_sysctl_kern { - struct ctl_table_header *head; - struct ctl_table *table; - int write; -}; - -struct bpf_sockopt_kern { - struct sock *sk; - u8 *optval; - u8 *optval_end; - s32 level; - s32 optname; - s32 optlen; - s32 retval; -}; - #endif /* __LINUX_FILTER_H__ */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b1542e8a68a0..9b5ace9ce053 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -44,7 +44,6 @@ #include #endif #include -#include #include #include @@ -703,7 +702,6 @@ struct netdev_rx_queue { #endif struct kobject kobj; struct net_device *dev; - struct xdp_rxq_info xdp_rxq; } ____cacheline_aligned_in_smp; /* @@ -795,10 +793,10 @@ enum tc_setup_type { TC_SETUP_CLSBPF, }; -/* These structures hold the attributes of bpf state that are being passed - * to the netdevice through the bpf op. +/* These structures hold the attributes of xdp state that are being passed + * to the netdevice through the xdp op. */ -enum bpf_netdev_command { +enum xdp_netdev_command { /* Set or clear a bpf program used in the earliest stages of packet * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee * is responsible for calling bpf_prog_put on any old progs that are @@ -813,19 +811,12 @@ enum bpf_netdev_command { * is equivalent to XDP_ATTACHED_DRV. */ XDP_QUERY_PROG, - /* BPF program for offload callbacks, invoked at program load time. */ - BPF_OFFLOAD_VERIFIER_PREP, - BPF_OFFLOAD_TRANSLATE, - BPF_OFFLOAD_DESTROY, - BPF_OFFLOAD_MAP_ALLOC, - BPF_OFFLOAD_MAP_FREE, }; -struct bpf_prog_offload_ops; struct netlink_ext_ack; -struct netdev_bpf { - enum bpf_netdev_command command; +struct netdev_xdp { + enum xdp_netdev_command command; union { /* XDP_SETUP_PROG */ struct { @@ -838,19 +829,6 @@ struct netdev_bpf { u8 prog_attached; u32 prog_id; }; - /* BPF_OFFLOAD_VERIFIER_PREP */ - struct { - struct bpf_prog *prog; - const struct bpf_prog_offload_ops *ops; /* callee set */ - } verifier; - /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */ - struct { - struct bpf_prog *prog; - } offload; - /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */ - struct { - struct bpf_offloaded_map *offmap; - }; }; }; @@ -1184,10 +1162,9 @@ struct macsec_ops { * appropriate rx headroom value allows avoiding skb head copy on * forward. Setting a negative value resets the rx headroom to the * default value. - * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); + * int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp); * This function is used to set or query state related to XDP on the - * netdevice and manage BPF offload. See definition of - * enum bpf_netdev_command for details. + * netdevice. See definition of enum xdp_netdev_command for details. * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. 
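[Editor's sketch, not part of the patch: a minimal driver-side shape of the ndo_xdp callback this hunk restores. The "demo_" names and the private xdp_prog field are hypothetical, not taken from any in-tree driver.]

#include <linux/bpf.h>
#include <linux/netdevice.h>

struct demo_priv {
	struct bpf_prog *xdp_prog;
};

/* XDP_SETUP_PROG: publish the new program and drop the reference to
 * the old one, as the enum xdp_netdev_command comment requires.
 */
static int demo_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
{
	struct demo_priv *priv = netdev_priv(dev);
	struct bpf_prog *old = xchg(&priv->xdp_prog, prog);

	if (old)
		bpf_prog_put(old);
	return 0;
}

static int demo_ndo_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
	struct demo_priv *priv = netdev_priv(dev);

	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return demo_xdp_setup(dev, xdp->prog);
	case XDP_QUERY_PROG:
		xdp->prog_attached = !!priv->xdp_prog;
		xdp->prog_id = priv->xdp_prog ? priv->xdp_prog->aux->id : 0;
		return 0;
	default:
		return -EINVAL;
	}
}

A driver wires this up as .ndo_xdp in its struct net_device_ops, the member that the hunk below renames back from .ndo_bpf.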
@@ -1375,8 +1352,8 @@ struct net_device_ops { struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); - int (*ndo_bpf)(struct net_device *dev, - struct netdev_bpf *bpf); + int (*ndo_xdp)(struct net_device *dev, + struct netdev_xdp *xdp); int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); void (*ndo_xdp_flush)(struct net_device *dev); @@ -2880,9 +2857,7 @@ struct softnet_data { struct Qdisc *output_queue; struct Qdisc **output_queue_tailp; struct sk_buff *completion_queue; -#ifdef CONFIG_XFRM_OFFLOAD - struct sk_buff_head xfrm_backlog; -#endif + #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, * and only read by other cpus. Worth using a cache line. @@ -3414,14 +3389,14 @@ int dev_get_phys_port_id(struct net_device *dev, int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); int dev_change_proto_down(struct net_device *dev, bool proto_down); -struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); -typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); +typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); -u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t xdp_op, u32 *prog_id); +u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 480946b3b449..912a91bb41c5 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -76,10 +76,10 @@ static inline int ns_alloc_inum(struct ns_common *ns) extern struct file *proc_ns_fget(int fd); #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) -extern int ns_get_path(struct path *path, struct task_struct *task, +extern void *ns_get_path(struct path *path, struct task_struct *task, const struct proc_ns_operations *ns_ops); typedef struct ns_common *ns_get_path_helper_t(void *); -extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, +extern void *ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, void *private_data); extern int ns_get_name(char *buf, size_t size, struct task_struct *task, const struct proc_ns_operations *ns_ops); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index efbfae01adfb..3abb6361a173 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -243,9 +243,6 @@ struct scatterlist; struct pipe_inode_info; struct iov_iter; struct napi_struct; -struct bpf_prog; -union bpf_attr; -struct skb_ext; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -491,9 +488,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, * the end of the header data, ie. at skb->end. */ struct skb_shared_info { - __u8 __unused; - __u8 meta_len; - __u8 nr_frags; + unsigned short _unused; + unsigned char nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! 
*/ @@ -637,7 +633,6 @@ typedef unsigned char *sk_buff_data_t; * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves - * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -666,7 +661,6 @@ typedef unsigned char *sk_buff_data_t; * @data: Data head pointer * @truesize: Buffer size * @users: User count - see {datagram,tcp}.c - * @extensions: allocated extensions, valid if active_extensions is nonzero */ struct sk_buff { @@ -743,9 +737,7 @@ struct sk_buff { head_frag:1, xmit_more:1, pfmemalloc:1; -#ifdef CONFIG_SKB_EXTENSIONS - __u8 active_extensions; -#endif + /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() */ @@ -859,10 +851,6 @@ struct sk_buff { *data; unsigned int truesize; refcount_t users; -#ifdef CONFIG_SKB_EXTENSIONS - /* only useable after checking ->active_extensions != 0 */ - struct skb_ext *extensions; -#endif }; #ifdef __KERNEL__ @@ -1200,11 +1188,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog); - -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); - bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, @@ -3549,66 +3532,6 @@ static inline ktime_t net_invalid_timestamp(void) return 0; } -static inline u8 skb_metadata_len(const struct sk_buff *skb) -{ - return skb_shinfo(skb)->meta_len; -} - -static inline void *skb_metadata_end(const struct sk_buff *skb) -{ - return skb_mac_header(skb); -} - -static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, - const struct sk_buff *skb_b, - u8 meta_len) -{ - const void *a = skb_metadata_end(skb_a); - const void *b = skb_metadata_end(skb_b); - /* Using more efficient varaiant than plain call to memcmp(). */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 - u64 diffs = 0; - switch (meta_len) { -#define __it(x, op) (x -= sizeof(u##op)) -#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) - case 32: diffs |= __it_diff(a, b, 64); - case 24: diffs |= __it_diff(a, b, 64); - case 16: diffs |= __it_diff(a, b, 64); - case 8: diffs |= __it_diff(a, b, 64); - break; - case 28: diffs |= __it_diff(a, b, 64); - case 20: diffs |= __it_diff(a, b, 64); - case 12: diffs |= __it_diff(a, b, 64); - case 4: diffs |= __it_diff(a, b, 32); - break; - } - return diffs; -#else - return memcmp(a - meta_len, b - meta_len, meta_len); -#endif -} - -static inline bool skb_metadata_differs(const struct sk_buff *skb_a, - const struct sk_buff *skb_b) -{ - u8 len_a = skb_metadata_len(skb_a); - u8 len_b = skb_metadata_len(skb_b); - if (!(len_a | len_b)) - return false; - return len_a != len_b ? 
- true : __skb_metadata_differs(skb_a, skb_b, len_a); -} - -static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) -{ - skb_shinfo(skb)->meta_len = meta_len; -} - -static inline void skb_metadata_clear(struct sk_buff *skb) -{ - skb_metadata_set(skb, 0); -} - struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING @@ -3929,100 +3852,6 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) atomic_inc(&nfct->use); } #endif - -#ifdef CONFIG_SKB_EXTENSIONS -enum skb_ext_id { -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - SKB_EXT_BRIDGE_NF, -#endif - SKB_EXT_NUM, /* must be last */ -}; - -/** - * struct skb_ext - sk_buff extensions - * @refcnt: 1 on allocation, deallocated on 0 - * @offset: offset to add to @data to obtain extension address - * @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units - * @data: start of extension data, variable sized - * - * Note: offsets/lengths are stored in chunks of 8 bytes, this allows - * to use 'u8' types while allowing up to 2kb worth of extension data. - */ -struct skb_ext { - refcount_t refcnt; - u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */ - u8 chunks; /* same */ - char data[0] __aligned(8); -}; - -void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); -void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); -void __skb_ext_put(struct skb_ext *ext); -static inline void skb_ext_put(struct sk_buff *skb) -{ - if (skb->active_extensions) - __skb_ext_put(skb->extensions); -} - -static inline void skb_ext_get(struct sk_buff *skb) -{ - if (skb->active_extensions) { - struct skb_ext *ext = skb->extensions; - if (ext) - refcount_inc(&ext->refcnt); - } -} - -static inline void __skb_ext_copy(struct sk_buff *dst, - const struct sk_buff *src) -{ - dst->active_extensions = src->active_extensions; - if (src->active_extensions) { - struct skb_ext *ext = src->extensions; - refcount_inc(&ext->refcnt); - dst->extensions = ext; - } -} - -static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) -{ - skb_ext_put(dst); - __skb_ext_copy(dst, src); -} - -static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i) -{ - return !!ext->offset[i]; -} - -static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id) -{ - return skb->active_extensions & (1 << id); -} - -static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) -{ - if (skb_ext_exist(skb, id)) - __skb_ext_del(skb, id); -} - -static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) -{ - if (skb_ext_exist(skb, id)) { - struct skb_ext *ext = skb->extensions; - return (void *)ext + (ext->offset[id] << 3); - } - return NULL; -} - -#else -static inline void skb_ext_put(struct sk_buff *skb) {} -static inline void skb_ext_get(struct sk_buff *skb) {} -static inline void skb_ext_del(struct sk_buff *skb, int unused) {} -static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} -static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} -#endif /* CONFIG_SKB_EXTENSIONS */ - #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) { @@ -4108,19 +3937,12 @@ static inline void skb_init_secmark(struct sk_buff *skb) { } #endif -static inline int secpath_exists(const struct sk_buff *skb) -{ -#ifdef CONFIG_XFRM - return skb->sp != NULL; -#else - return 0; -#endif -} - static inline bool skb_irq_freeable(const struct sk_buff *skb) { return !skb->destructor && - 
!secpath_exists(skb) && +#if IS_ENABLED(CONFIG_XFRM) + !skb->sp && +#endif !skb_nfct(skb) && !skb->_skb_refdst && !skb_has_frag_list(skb); diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h deleted file mode 100644 index 95678103c4a0..000000000000 --- a/include/linux/skmsg.h +++ /dev/null @@ -1,371 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ - -#ifndef _LINUX_SKMSG_H -#define _LINUX_SKMSG_H - -#include -#include -#include -#include - -#include -#include -#include - -#define MAX_MSG_FRAGS MAX_SKB_FRAGS - -enum __sk_action { - __SK_DROP = 0, - __SK_PASS, - __SK_REDIRECT, - __SK_NONE, -}; - -struct sk_msg_sg { - u32 start; - u32 curr; - u32 end; - u32 size; - u32 copybreak; - bool copy[MAX_MSG_FRAGS]; - struct scatterlist data[MAX_MSG_FRAGS]; -}; - -struct sk_msg { - struct sk_msg_sg sg; - void *data; - void *data_end; - u32 apply_bytes; - u32 cork_bytes; - u32 flags; - struct sk_buff *skb; - struct sock *sk_redir; - struct sock *sk; - struct list_head list; -}; - -struct sk_psock_progs { - struct bpf_prog *msg_parser; - struct bpf_prog *skb_parser; - struct bpf_prog *skb_verdict; -}; - -enum sk_psock_state_bits { - SK_PSOCK_TX_ENABLED, -}; - -struct sk_psock_link { - struct list_head list; - struct bpf_map *map; - void *link_raw; -}; - -struct sk_psock_parser { - struct strparser strp; - bool enabled; - void (*saved_data_ready)(struct sock *sk); -}; - -struct sk_psock_work_state { - struct sk_buff *skb; - u32 len; - u32 off; -}; - -struct sk_psock { - struct sock *sk; - struct sock *sk_redir; - u32 apply_bytes; - u32 cork_bytes; - u32 eval; - struct sk_msg *cork; - struct sk_psock_progs progs; - struct sk_psock_parser parser; - struct sk_buff_head ingress_skb; - struct list_head ingress_msg; - unsigned long state; - struct list_head link; - spinlock_t link_lock; - refcount_t refcnt; - void (*saved_unhash)(struct sock *sk); - void (*saved_close)(struct sock *sk, long timeout); - void (*saved_write_space)(struct sock *sk); - struct proto *sk_proto; - struct sk_psock_work_state work_state; - struct work_struct work; - union { - struct rcu_head rcu; - struct work_struct gc; - }; -}; - -int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, - int elem_first_coalesce); -void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); -int sk_msg_free(struct sock *sk, struct sk_msg *msg); -int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); -void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes); -void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, - u32 bytes); - -void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); - -int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, - struct sk_msg *msg, u32 bytes); -int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, - struct sk_msg *msg, u32 bytes); - -static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) -{ - WARN_ON(i == msg->sg.end && bytes); -} - -static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) -{ - if (psock->apply_bytes) { - if (psock->apply_bytes < bytes) - psock->apply_bytes = 0; - else - psock->apply_bytes -= bytes; - } -} - -#define sk_msg_iter_var_prev(var) \ - do { \ - if (var == 0) \ - var = MAX_MSG_FRAGS - 1; \ - else \ - var--; \ - } while (0) - -#define sk_msg_iter_var_next(var) \ - do { \ - var++; \ - if (var == MAX_MSG_FRAGS) \ - var = 0; \ - } while (0) - -#define sk_msg_iter_prev(msg, which) \ - 
sk_msg_iter_var_prev(msg->sg.which) - -#define sk_msg_iter_next(msg, which) \ - sk_msg_iter_var_next(msg->sg.which) - -static inline void sk_msg_clear_meta(struct sk_msg *msg) -{ - memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy)); -} - -static inline void sk_msg_init(struct sk_msg *msg) -{ - memset(msg, 0, sizeof(*msg)); - sg_init_marker(msg->sg.data, ARRAY_SIZE(msg->sg.data)); -} - -static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, - int which, u32 size) -{ - dst->sg.data[which] = src->sg.data[which]; - dst->sg.data[which].length = size; - src->sg.data[which].length -= size; - src->sg.data[which].offset += size; -} - -static inline u32 sk_msg_elem_used(const struct sk_msg *msg) -{ - return msg->sg.end >= msg->sg.start ? - msg->sg.end - msg->sg.start : - msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start); -} - -static inline bool sk_msg_full(const struct sk_msg *msg) -{ - return (msg->sg.end == msg->sg.start) && msg->sg.size; -} - -static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) -{ - return &msg->sg.data[which]; -} - -static inline struct page *sk_msg_page(struct sk_msg *msg, int which) -{ - return sg_page(sk_msg_elem(msg, which)); -} - -static inline bool sk_msg_to_ingress(const struct sk_msg *msg) -{ - return msg->flags & BPF_F_INGRESS; -} - -static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) -{ - struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); - - if (msg->sg.copy[msg->sg.start]) { - msg->data = NULL; - msg->data_end = NULL; - } else { - msg->data = sg_virt(sge); - msg->data_end = msg->data + sge->length; - } -} - -static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, - u32 len, u32 offset) -{ - struct scatterlist *sge; - - get_page(page); - sge = sk_msg_elem(msg, msg->sg.end); - sg_set_page(sge, page, len, offset); - sg_unmark_end(sge); - - msg->sg.copy[msg->sg.end] = true; - msg->sg.size += len; - sk_msg_iter_next(msg, end); -} - -static inline struct sk_psock *sk_psock(const struct sock *sk) -{ - return rcu_dereference_sk_user_data(sk); -} - -static inline bool sk_has_psock(struct sock *sk) -{ - return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg; -} - -static inline void sk_psock_queue_msg(struct sk_psock *psock, - struct sk_msg *msg) -{ - list_add_tail(&msg->list, &psock->ingress_msg); -} - -static inline void sk_psock_report_error(struct sk_psock *psock, int err) -{ - struct sock *sk = psock->sk; - - sk->sk_err = err; - sk->sk_error_report(sk); -} - -struct sk_psock *sk_psock_init(struct sock *sk, int node); - -int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); -void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); -void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); - -int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, - struct sk_msg *msg); - -static inline struct sk_psock_link *sk_psock_init_link(void) -{ - return kzalloc(sizeof(struct sk_psock_link), - GFP_ATOMIC | __GFP_NOWARN); -} - -static inline void sk_psock_free_link(struct sk_psock_link *link) -{ - kfree(link); -} - -struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); -#if defined(CONFIG_BPF_STREAM_PARSER) -void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link); -#else -static inline void sk_psock_unlink(struct sock *sk, - struct sk_psock_link *link) -{ -} -#endif - -void __sk_psock_purge_ingress_msg(struct sk_psock *psock); - -static inline void sk_psock_cork_free(struct sk_psock *psock) -{ - if (psock->cork) { - 
sk_msg_free(psock->sk, psock->cork); - kfree(psock->cork); - psock->cork = NULL; - } -} - -static inline void sk_psock_update_proto(struct sock *sk, - struct sk_psock *psock, - struct proto *ops) -{ - psock->saved_unhash = sk->sk_prot->unhash; - psock->saved_close = sk->sk_prot->close; - psock->saved_write_space = sk->sk_write_space; - - psock->sk_proto = sk->sk_prot; - sk->sk_prot = ops; -} - -static inline void sk_psock_restore_proto(struct sock *sk, - struct sk_psock *psock) -{ - if (psock->sk_proto) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; - } -} - -static inline void sk_psock_set_state(struct sk_psock *psock, - enum sk_psock_state_bits bit) -{ - set_bit(bit, &psock->state); -} - -static inline void sk_psock_clear_state(struct sk_psock *psock, - enum sk_psock_state_bits bit) -{ - clear_bit(bit, &psock->state); -} - -static inline bool sk_psock_test_state(const struct sk_psock *psock, - enum sk_psock_state_bits bit) -{ - return test_bit(bit, &psock->state); -} - -static inline struct sk_psock *sk_psock_get(struct sock *sk) -{ - struct sk_psock *psock; - - rcu_read_lock(); - psock = sk_psock(sk); - if (psock && !refcount_inc_not_zero(&psock->refcnt)) - psock = NULL; - rcu_read_unlock(); - return psock; -} - -void sk_psock_stop(struct sock *sk, struct sk_psock *psock); -void sk_psock_destroy(struct rcu_head *rcu); -void sk_psock_drop(struct sock *sk, struct sk_psock *psock); - -static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) -{ - if (refcount_dec_and_test(&psock->refcnt)) - sk_psock_drop(sk, psock); -} - -static inline void psock_set_prog(struct bpf_prog **pprog, - struct bpf_prog *prog) -{ - prog = xchg(pprog, prog); - if (prog) - bpf_prog_put(prog); -} - -static inline void psock_progs_drop(struct sk_psock_progs *progs) -{ - psock_set_prog(&progs->msg_parser, NULL); - psock_set_prog(&progs->skb_parser, NULL); - psock_set_prog(&progs->skb_verdict, NULL); -} - -#endif /* _LINUX_SKMSG_H */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 6bbdef44d811..8c2ba9c2794a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -464,10 +464,6 @@ trace_trigger_soft_disabled(struct trace_event_file *file) unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); void perf_event_detach_bpf_prog(struct perf_event *event); -int perf_event_query_prog_array(struct perf_event *event, void __user *info); -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); -int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -482,23 +478,6 @@ perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) static inline void perf_event_detach_bpf_prog(struct perf_event *event) { } -static inline int -perf_event_query_prog_array(struct perf_event *event, void __user *info) -{ - return -EOPNOTSUPP; -} -static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p) -{ - return -EOPNOTSUPP; -} -static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p) -{ - return -EOPNOTSUPP; -} -static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) -{ - return NULL; -} #endif enum { @@ -557,33 +536,6 @@ extern void 
ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); -void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); -void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); -void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, - u64 arg3); -void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2, - u64 arg3, u64 arg4); -void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2, - u64 arg3, u64 arg4, u64 arg5); -void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2, - u64 arg3, u64 arg4, u64 arg5, u64 arg6); -void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2, - u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); -void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2, - u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, - u64 arg8); -void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2, - u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, - u64 arg8, u64 arg9); -void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2, - u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, - u64 arg8, u64 arg9, u64 arg10); -void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2, - u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, - u64 arg8, u64 arg9, u64 arg10, u64 arg11); -void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2, - u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, - u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12); void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 1c752459aac4..64ed7064f1fa 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -35,11 +35,4 @@ struct tracepoint { struct tracepoint_func __rcu *funcs; }; -struct bpf_raw_event_map { - struct tracepoint *tp; - void *bpf_func; - u32 num_args; - u32 writable_size; -} __aligned(32); - #endif diff --git a/include/net/addrconf.h b/include/net/addrconf.h index d06ff0183766..9e10ad495233 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -235,13 +235,6 @@ struct ipv6_stub { }; extern const struct ipv6_stub *ipv6_stub __read_mostly; -/* A stub used by bpf helpers. 
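
The ipv6_stub indirection above decouples callers from an optional provider: core code holds only a const pointer to an ops table, and whichever module actually implements the functionality fills it in at init time, so every call site must tolerate the provider being absent. A minimal userspace sketch of the same pattern (all names hypothetical, not the kernel's):

    #include <stdio.h>

    /* Hypothetical ops table, modeled on the ipv6_stub indirection:
     * core code only holds a pointer; the module that implements the
     * functionality installs its table at init time.
     */
    struct ipv6_ops {
            int (*bind)(int sock, const char *addr);
    };

    static const struct ipv6_ops *ipv6_ops; /* NULL until provider loads */

    static int real_bind(int sock, const char *addr)
    {
            printf("bound %d to %s\n", sock, addr);
            return 0;
    }

    static const struct ipv6_ops real_ops = { .bind = real_bind };

    int main(void)
    {
            /* callers must tolerate the provider being absent */
            if (!ipv6_ops || !ipv6_ops->bind)
                    printf("provider not loaded, skipping\n");

            ipv6_ops = &real_ops;   /* what module init would do */
            return ipv6_ops->bind(3, "::1");
    }
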
Similarly ugly as ipv6_stub */ -struct ipv6_bpf_stub { - int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); -}; -extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; - /* * identify MLD packets for MLD filter exceptions */ diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h deleted file mode 100644 index b9dcb02e756b..000000000000 --- a/include/net/bpf_sk_storage.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2019 Facebook */ -#ifndef _BPF_SK_STORAGE_H -#define _BPF_SK_STORAGE_H - -struct sock; - -void bpf_sk_storage_free(struct sock *sk); - -extern const struct bpf_func_proto bpf_sk_storage_get_proto; -extern const struct bpf_func_proto bpf_sk_storage_delete_proto; - -#endif /* _BPF_SK_STORAGE_H */ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 0c6b3274925e..5a54c9570977 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -32,8 +32,6 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); -int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 384e71baa1b0..b3ad33761cce 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -945,8 +945,6 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); -int __inet6_bind(struct sock *sock, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 672e7ec3d89b..a1fc638aee47 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -42,7 +42,7 @@ struct ctl_table_header; struct net_generic; struct sock; struct netns_ipvs; -struct bpf_prog; + #define NETDEV_HASHBITS 8 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) @@ -137,8 +137,6 @@ struct net { #endif struct net_generic __rcu *gen; - struct bpf_prog __rcu *flow_dissector_prog; - /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM struct netns_xfrm xfrm; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 2e5fe6a6eb60..c4ab9934b41d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -19,7 +19,6 @@ struct Qdisc_ops; struct qdisc_walker; struct tcf_walker; struct module; -struct bpf_flow_keys; struct qdisc_rate_table { struct tc_ratespec rate; @@ -256,14 +255,9 @@ struct tcf_proto { }; struct qdisc_skb_cb { - union { - struct { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; - }; - struct bpf_flow_keys *flow_keys; - }; + unsigned int pkt_len; + u16 slave_dev_queue_mapping; + u16 tc_classid; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; }; diff --git a/include/net/sock.h b/include/net/sock.h index 
9f58e77fe19a..58749b227a79 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -236,8 +236,6 @@ struct sock_common { /* public: */ }; -struct bpf_sk_storage; - /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock @@ -504,9 +502,6 @@ struct sock { struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; -#ifdef CONFIG_BPF_SYSCALL - struct bpf_sk_storage __rcu *sk_bpf_storage; -#endif struct rcu_head sk_rcu; }; @@ -1075,9 +1070,6 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size) struct proto { void (*close)(struct sock *sk, long timeout); - int (*pre_connect)(struct sock *sk, - struct sockaddr *uaddr, - int addr_len); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); @@ -1137,7 +1129,6 @@ struct proto { #endif bool (*stream_memory_free)(const struct sock *sk); - bool (*stream_memory_read)(const struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 265d6c48be9f..be3f56ab10ad 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -882,8 +882,9 @@ struct tcp_skb_cb { #endif } header; /* For incoming skbs */ struct { + __u32 key; __u32 flags; - struct sock *sk_redir; + struct bpf_map *map; void *data_end; } bpf; }; @@ -891,25 +892,6 @@ struct tcp_skb_cb { #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); -} - -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; -} - -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.sk_redir; -} - -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; -} #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, @@ -2137,7 +2119,6 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); enum { TCP_ULP_TLS, - TCP_ULP_BPF, }; struct tcp_ulp_ops { @@ -2156,7 +2137,6 @@ struct tcp_ulp_ops { int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); -int tcp_set_ulp_id(struct sock *sk, const int ulp); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); @@ -2164,21 +2144,6 @@ void tcp_cleanup_ulp(struct sock *sk); __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) -struct sk_msg; -struct sk_psock; - -int tcp_bpf_init(struct sock *sk); -void tcp_bpf_reinit(struct sock *sk); - -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, - int flags); - -int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len); - -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len); - /* Call BPF_SOCK_OPS program that returns an int. 
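
The TCP_SKB_CB() cast above is the kernel's standard trick for overlaying per-layer scratch data on the fixed skb->cb[] area; the union members being swapped in this hunk (sk_redir vs. map/key) live in exactly that overlay. A compilable model of the pattern, with illustrative sizes rather than the kernel's:

    #include <stdio.h>

    /* Per-packet scratch space lives in a fixed cb[] array and each
     * layer overlays its own struct on it via a cast macro.
     */
    struct sk_buff {
            char cb[48];
    };

    struct my_cb {
            unsigned int flags;
            void *data_end;
    };

    #define MY_CB(skb) ((struct my_cb *)&((skb)->cb[0]))

    /* the overlay must never outgrow the scratch area */
    _Static_assert(sizeof(struct my_cb) <= sizeof(((struct sk_buff *)0)->cb),
                   "cb overlay too large");

    int main(void)
    {
            struct sk_buff skb = { { 0 } };

            MY_CB(&skb)->flags = 1;
            printf("flags=%u\n", MY_CB(&skb)->flags);
            return 0;
    }
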
If the return value
 * is < 0, then the BPF op failed (for example if the loaded BPF
 * program does not support the chosen operation or there is no BPF
diff --git a/include/net/udp.h b/include/net/udp.h
index 8f95d2234443..c3d9ad972763 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -283,7 +283,6 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
 int udp_rcv(struct sk_buff *skb);
 int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 int udp_init_sock(struct sock *sk);
-int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 int __udp_disconnect(struct sock *sk, int flags);
 int udp_disconnect(struct sock *sk, int flags);
 unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait);
diff --git a/include/net/xdp.h b/include/net/xdp.h
deleted file mode 100644
index a5e755868a82..000000000000
--- a/include/net/xdp.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* include/net/xdp.h
- *
- * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
- * Released under terms in GPL version 2. See COPYING.
- */
-#ifndef __LINUX_NET_XDP_H__
-#define __LINUX_NET_XDP_H__
-
-/**
- * DOC: XDP RX-queue information
- *
- * The XDP RX-queue info (xdp_rxq_info) is associated with the driver
- * level RX-ring queues. It is information that is specific to how
- * the driver has configured a given RX-ring queue.
- *
- * Each xdp_buff frame received in the driver carries a (pointer)
- * reference to this xdp_rxq_info structure. This provides the XDP
- * data-path read-access to RX-info for both kernel and bpf-side
- * (limited subset).
- *
- * For now, direct access is only safe while running in NAPI/softirq
- * context. Contents are read-mostly and must not be updated during
- * driver NAPI/softirq poll.
- *
- * The driver usage API is a register and unregister API.
- *
- * The struct is not directly tied to the XDP prog. A new XDP prog
- * can be attached as long as it doesn't change the underlying
- * RX-ring. If the RX-ring does change significantly, the NIC driver
- * naturally needs to stop the RX-ring before purging and reallocating
- * memory. In that process the driver MUST call unregister (which
- * also applies for driver shutdown and unload). The register API is
- * also mandatory during RX-ring setup.
- */
-
-struct xdp_rxq_info {
- struct net_device *dev;
- u32 queue_index;
- u32 reg_state;
-} ____cacheline_aligned; /* perf critical, avoid false-sharing */
-
-struct xdp_buff {
- void *data;
- void *data_end;
- void *data_meta;
- void *data_hard_start;
- struct xdp_rxq_info *rxq;
-};
-
-int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
- struct net_device *dev, u32 queue_index);
-void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
-void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
-
-/* Drivers not supporting XDP metadata can use this helper, which
- * rejects any room expansion for metadata as a result.
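
The data_meta sentinel used here is worth spelling out: setting data_meta one byte past data makes the implied metadata length (data - data_meta) negative, so "metadata unsupported" reduces to a single pointer comparison. A userspace rendering of xdp_set_data_meta_invalid()/xdp_data_meta_unsupported(), assuming nothing beyond the two pointers:

    #include <stdio.h>
    #include <stdbool.h>

    /* Userspace model of the xdp_buff metadata sentinel. */
    struct xdp_buff {
            void *data;
            void *data_meta;
    };

    static void set_meta_invalid(struct xdp_buff *xdp)
    {
            xdp->data_meta = (char *)xdp->data + 1;
    }

    static bool meta_unsupported(const struct xdp_buff *xdp)
    {
            /* valid meta sits at or before data; data + 1 can't */
            return xdp->data_meta > xdp->data;
    }

    int main(void)
    {
            char pkt[64];
            struct xdp_buff xdp = { .data = pkt + 32, .data_meta = pkt + 32 };

            printf("meta supported: %d\n", !meta_unsupported(&xdp));
            set_meta_invalid(&xdp);
            printf("meta supported: %d\n", !meta_unsupported(&xdp));
            return 0;
    }
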
- */ -static __always_inline void -xdp_set_data_meta_invalid(struct xdp_buff *xdp) -{ - xdp->data_meta = xdp->data + 1; -} - -static __always_inline bool -xdp_data_meta_unsupported(const struct xdp_buff *xdp) -{ - return unlikely(xdp->data_meta > xdp->data); -} - -#endif /* __LINUX_NET_XDP_H__ */ diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 5096f8774ee2..3137313b58fd 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1048,7 +1048,6 @@ struct xfrm_offload { #define XFRM_GSO_SEGMENT 16 #define XFRM_GRO 32 #define XFRM_ESP_NO_TRAILER 64 -#define XFRM_DEV_RESUME 128 __u32 status; #define CRYPTO_SUCCESS 1 @@ -1072,6 +1071,15 @@ struct sec_path { struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; }; +static inline int secpath_exists(struct sk_buff *skb) +{ +#ifdef CONFIG_XFRM + return skb->sp != NULL; +#else + return 0; +#endif +} + static inline struct sec_path * secpath_get(struct sec_path *sp) { @@ -1884,27 +1892,21 @@ static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) { return skb->sp->xvec[skb->sp->len - 1]; } -#endif static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) { -#ifdef CONFIG_XFRM struct sec_path *sp = skb->sp; if (!sp || !sp->olen || sp->len != sp->olen) return NULL; return &sp->ovec[sp->olen - 1]; -#else - return NULL; -#endif } +#endif void __net_init xfrm_dev_init(void); #ifdef CONFIG_XFRM_OFFLOAD -void xfrm_dev_resume(struct sk_buff *skb); -void xfrm_dev_backlog(struct softnet_data *sd); -struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again); +int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); @@ -1943,17 +1945,9 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) } } #else -static inline void xfrm_dev_resume(struct sk_buff *skb) +static inline int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features) { -} - -static inline void xfrm_dev_backlog(struct softnet_data *sd) -{ -} - -static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) -{ - return skb; + return 0; } static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo) diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 0a6e052e8da1..505dae0bed80 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -69,7 +69,8 @@ __bpf_trace_##call(void *__data, proto) \ * to make sure that if the tracepoint handling changes, the * bpf probe will fail to compile unless it too is updated. */ -#define __DEFINE_EVENT(template, call, proto, args, size) \ +#undef DEFINE_EVENT +#define DEFINE_EVENT(template, call, proto, args) \ static inline void bpf_test_probe_##call(void) \ { \ check_trace_callback_type_##call(__bpf_trace_##template); \ @@ -80,34 +81,12 @@ __bpf_trace_tp_map_##call = { \ .tp = &__tracepoint_##call, \ .bpf_func = (void *)__bpf_trace_##template, \ .num_args = COUNT_ARGS(args), \ - .writable_size = size, \ }; -#define FIRST(x, ...) x -#undef DEFINE_EVENT_WRITABLE -#define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \ -static inline void bpf_test_buffer_##call(void) \ -{ \ - /* BUILD_BUG_ON() is ignored if the code is completely eliminated, but \ - * BUILD_BUG_ON_ZERO() uses a different mechanism that is not \ - * dead-code-eliminated. 
\ - */ \ - FIRST(proto); \ - (void)BUILD_BUG_ON_ZERO(size != sizeof(*FIRST(args))); \ -} \ -__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) - -#undef DEFINE_EVENT -#define DEFINE_EVENT(template, call, proto, args) \ - __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0) #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) - -#undef DEFINE_EVENT_WRITABLE -#undef __DEFINE_EVENT -#undef FIRST #endif /* CONFIG_BPF_EVENTS */ diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index cb30c5532144..d9e3d4aa3f6e 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -95,7 +95,6 @@ #ifdef TRACEPOINTS_ENABLED #include #include -#include #endif #undef TRACE_EVENT diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9e6816912da1..3064575d843d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -75,11 +75,6 @@ struct bpf_lpm_trie_key { __u8 data[0]; /* Arbitrary size */ }; -struct bpf_cgroup_storage_key { - __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ -}; - /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -98,10 +93,6 @@ enum bpf_cmd { BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, - BPF_PROG_QUERY, - BPF_RAW_TRACEPOINT_OPEN, - BPF_BTF_LOAD = 18, - BPF_BTF_GET_FD_BY_ID = 19, }; enum bpf_map_type { @@ -121,12 +112,6 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, - BPF_MAP_TYPE_CPUMAP, - BPF_MAP_TYPE_SOCKHASH = 18, - BPF_MAP_TYPE_CGROUP_STORAGE = 19, - BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE = 21, - BPF_MAP_TYPE_SK_STORAGE = 24, - BPF_MAP_TYPE_DEVMAP_HASH = 25, }; enum bpf_prog_type { @@ -145,14 +130,6 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, - BPF_PROG_TYPE_CGROUP_DEVICE, - BPF_PROG_TYPE_SK_MSG = 16, - BPF_PROG_TYPE_RAW_TRACEPOINT = 17, - BPF_PROG_TYPE_CGROUP_SOCK_ADDR = 18, - BPF_PROG_TYPE_FLOW_DISSECTOR = 22, - BPF_PROG_TYPE_CGROUP_SYSCTL = 23, - BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE = 24, - BPF_PROG_TYPE_CGROUP_SOCKOPT = 25, }; enum bpf_attach_type { @@ -162,22 +139,6 @@ enum bpf_attach_type { BPF_CGROUP_SOCK_OPS, BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, - BPF_CGROUP_DEVICE, - BPF_SK_MSG_VERDICT = 7, - BPF_CGROUP_INET4_BIND = 8, - BPF_CGROUP_INET6_BIND = 9, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - BPF_FLOW_DISSECTOR = 17, - BPF_CGROUP_SYSCTL = 18, - BPF_CGROUP_UDP4_RECVMSG = 19, - BPF_CGROUP_UDP6_RECVMSG = 20, - BPF_CGROUP_GETSOCKOPT = 21, - BPF_CGROUP_SETSOCKOPT = 22, __MAX_BPF_ATTACH_TYPE }; @@ -232,30 +193,12 @@ enum bpf_attach_type { */ #define BPF_F_STRICT_ALIGNMENT (1U << 0) -/* When BPF ldimm64's insn[0].src_reg != 0 then this can have - * two extensions: - * - * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE - * insn[0].imm: map fd map fd - * insn[1].imm: 0 offset into value - * insn[0].off: 0 0 - * insn[1].off: 0 0 - * ldimm64 rewrite: address of map address of map[0]+offset - * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE - */ #define BPF_PSEUDO_MAP_FD 1 -#define BPF_PSEUDO_MAP_VALUE 2 - -/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative - * offset to another bpf 
function - */ -#define BPF_PSEUDO_CALL 1 /* flags for BPF_MAP_UPDATE_ELEM command */ #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ -#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ /* flags for BPF_MAP_CREATE command */ #define BPF_F_NO_PREALLOC (1U << 0) @@ -269,18 +212,10 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) -/* flags for BPF_PROG_QUERY */ -#define BPF_F_QUERY_EFFECTIVE (1U << 0) - -#define BPF_OBJ_NAME_LEN 16U - /* Flags for accessing BPF object */ #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) -/* Flags for accessing BPF object from program side. */ -#define BPF_F_RDONLY_PROG (1U << 7) - union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -294,11 +229,6 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ - char map_name[BPF_OBJ_NAME_LEN]; - __u32 map_ifindex; /* ifindex of netdev to create on */ - __u32 btf_fd; /* fd pointing to a BTF type data */ - __u32 btf_key_type_id; /* BTF type_id of the key */ - __u32 btf_value_type_id; /* BTF type_id of the value */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -321,20 +251,6 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; - char prog_name[BPF_OBJ_NAME_LEN]; - __u32 prog_ifindex; /* ifindex of netdev to prep for */ - /* For some prog types expected attach type must be known at - * load time to verify attach type specific parts of prog - * (context accesses, allowed helpers, etc). - */ - __u32 expected_attach_type; - __u32 prog_btf_fd; /* fd pointing to BTF type data */ - __u32 func_info_rec_size; /* userspace bpf_func_info size */ - __aligned_u64 func_info; /* func info */ - __u32 func_info_cnt; /* number of bpf_func_info records */ - __u32 line_info_rec_size; /* userspace bpf_line_info size */ - __aligned_u64 line_info; /* line info */ - __u32 line_info_cnt; /* number of bpf_line_info records */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -366,7 +282,6 @@ union bpf_attr { __u32 start_id; __u32 prog_id; __u32 map_id; - __u32 btf_id; }; __u32 next_id; __u32 open_flags; @@ -377,28 +292,6 @@ union bpf_attr { __u32 info_len; __aligned_u64 info; } info; - - struct { /* anonymous struct used by BPF_PROG_QUERY command */ - __u32 target_fd; /* container object to query */ - __u32 attach_type; - __u32 query_flags; - __u32 attach_flags; - __aligned_u64 prog_ids; - __u32 prog_cnt; - } query; - - struct { - __u64 name; - __u32 prog_fd; - } raw_tracepoint; - - struct { /* anonymous struct for BPF_BTF_LOAD */ - __aligned_u64 btf; - __aligned_u64 btf_log_buf; - __u32 btf_size; - __u32 btf_log_size; - __u32 btf_log_level; - }; } __attribute__((aligned(8))); /* BPF helper function descriptions: @@ -756,217 +649,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(ctx, addr, addr_len) - * Bind socket to address. Only binding to IP is supported, no port can be - * set in addr. 
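
The anonymous per-command structs in union bpf_attr above are how all of this surfaces to userspace: zero the union, fill the members for one command, and hand it to the bpf(2) syscall. A minimal sketch, assuming a Linux system with BPF available (map type and sizes chosen arbitrarily for illustration):

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    int main(void)
    {
            union bpf_attr attr;
            int fd;

            memset(&attr, 0, sizeof(attr)); /* unused fields must be zero */
            attr.map_type = BPF_MAP_TYPE_ARRAY;
            attr.key_size = sizeof(__u32);
            attr.value_size = sizeof(__u64);
            attr.max_entries = 16;

            fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
            if (fd < 0) {
                    perror("BPF_MAP_CREATE");
                    return 1;
            }
            printf("map fd %d\n", fd);
            return 0;
    }
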
- * @ctx: pointer to context of type bpf_sock_addr
- * @addr: pointer to struct sockaddr to bind socket to
- * @addr_len: length of sockaddr structure
- * Return: 0 on success or negative error code
- *
- * void* get_local_storage(void *map, u64 flags)
- * Description
- * Get the pointer to the local storage area.
- * The type and the size of the local storage are defined
- * by the *map* argument.
- * The *flags* meaning is specific to each map type,
- * and has to be 0 for cgroup local storage.
- *
- * Depending on the bpf program type, a local storage area
- * can be shared between multiple instances of the bpf program,
- * running simultaneously.
- *
- * Users must take care of synchronization themselves, for
- * example by using the BPF_STX_XADD instruction to alter
- * the shared data.
- * Return
- * Pointer to the local storage area.
- *
- * u64 bpf_get_current_cgroup_id(void)
- * Return
- * A 64-bit integer containing the current cgroup id based
- * on the cgroup within which the current task is running.
- *
- * int bpf_xdp_adjust_meta(xdp_md, delta)
- * Adjust the xdp_md.data_meta by delta
- * @xdp_md: pointer to xdp_md
- * @delta: A positive/negative integer to be added to xdp_md.data_meta
- * Return: 0 on success or negative on error
- *
- * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)
- * Description
- * This helper gets a **struct bpf_sock** pointer such
- * that all the fields in bpf_sock can be accessed.
- * Return
- * A **struct bpf_sock** pointer on success, or NULL in
- * case of failure.
- *
- * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
- * Description
- * Look for TCP socket matching *tuple*, optionally in a child
- * network namespace *netns*. The return value must be checked,
- * and if non-NULL, released via **bpf_sk_release**\ ().
- *
- * The *ctx* should point to the context of the program, such as
- * the skb or socket (depending on the hook in use). This is used
- * to determine the base network namespace for the lookup.
- *
- * *tuple_size* must be one of:
- *
- * **sizeof**\ (*tuple*\ **->ipv4**)
- * Look for an IPv4 socket.
- * **sizeof**\ (*tuple*\ **->ipv6**)
- * Look for an IPv6 socket.
- *
- * If the *netns* is zero, then the socket lookup table in the
- * netns associated with the *ctx* will be used. For the TC hooks,
- * this is in the netns of the device in the skb. For socket hooks,
- * this is in the netns of the socket. If *netns* is non-zero, then
- * it specifies the ID of the netns relative to the netns
- * associated with the *ctx*.
- *
- * All values for *flags* are reserved for future usage, and must
- * be left at zero.
- *
- * This helper is available only if the kernel was compiled with
- * the **CONFIG_NET** configuration option.
- * Return
- * Pointer to *struct bpf_sock*, or NULL in case of failure.
- * For sockets with reuseport option, *struct bpf_sock*
- * return is from reuse->socks[] using hash of the packet.
- *
- * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
- * Description
- * Look for UDP socket matching *tuple*, optionally in a child
- * network namespace *netns*. The return value must be checked,
- * and if non-NULL, released via **bpf_sk_release**\ ().
- *
- * The *ctx* should point to the context of the program, such as
- * the skb or socket (depending on the hook in use). This is used
- * to determine the base network namespace for the lookup.
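
A hedged sketch of the lookup/release contract just described, written as a TC classifier. It assumes libbpf's bpf_helpers.h/bpf_endian.h and a recent libbpf section naming, and hard-codes the tuple so the example stays self-contained (real code would parse it out of the packet):

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    SEC("tc")
    int probe_sk(struct __sk_buff *skb)
    {
            struct bpf_sock_tuple tuple = {};
            struct bpf_sock *sk;

            tuple.ipv4.daddr = bpf_htonl(0x7f000001);       /* 127.0.0.1 */
            tuple.ipv4.dport = bpf_htons(80);

            /* netns == 0: search the netns associated with the skb */
            sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4), 0, 0);
            if (sk)
                    bpf_sk_release(sk);     /* mandatory when non-NULL */

            return TC_ACT_OK;
    }

    char _license[] SEC("license") = "GPL";
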
- * - * *tuple_size* must be one of: - * - * **sizeof**\ (*tuple*\ **->ipv4**) - * Look for an IPv4 socket. - * **sizeof**\ (*tuple*\ **->ipv6**) - * Look for an IPv6 socket. - * - * If the *netns* is zero, then the socket lookup table in the - * netns associated with the *ctx* will be used. For the TC hooks, - * this in the netns of the device in the skb. For socket hooks, - * this in the netns of the socket. If *netns* is non-zero, then - * it specifies the ID of the netns relative to the netns - * associated with the *ctx*. - * - * All values for *flags* are reserved for future usage, and must - * be left at zero. - * - * This helper is available only if the kernel was compiled with - * **CONFIG_NET** configuration option. - * Return - * Pointer to *struct bpf_sock*, or NULL in case of failure. - * For sockets with reuseport option, *struct bpf_sock* - * return is from reuse->socks[] using hash of the packet. - * - * int bpf_sk_release(struct bpf_sock *sk) - * Description - * Release the reference held by *sock*. *sock* must be a non-NULL - * pointer that was returned from bpf_sk_lookup_xxx\ (). - * Return - * 0 on success, or a negative error in case of failure. - * - * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) - * Description - * This helper gets a **struct bpf_tcp_sock** pointer from a - * **struct bpf_sock** pointer. - * - * Return - * A **struct bpf_tcp_sock** pointer on success, or NULL in - * case of failure. - * - * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) - * Description - * Get a bpf-local-storage from a sk. - * - * Logically, it could be thought of getting the value from - * a *map* with *sk* as the **key**. From this - * perspective, the usage is not much different from - * **bpf_map_lookup_elem(map, &sk)** except this - * helper enforces the key must be a **bpf_fullsock()** - * and the map must be a BPF_MAP_TYPE_SK_STORAGE also. - * - * Underneath, the value is stored locally at *sk* instead of - * the map. The *map* is used as the bpf-local-storage **type**. - * The bpf-local-storage **type** (i.e. the *map*) is searched - * against all bpf-local-storages residing at sk. - * - * An optional *flags* (BPF_SK_STORAGE_GET_F_CREATE) can be - * used such that a new bpf-local-storage will be - * created if one does not exist. *value* can be used - * together with BPF_SK_STORAGE_GET_F_CREATE to specify - * the initial value of a bpf-local-storage. If *value* is - * NULL, the new bpf-local-storage will be zero initialized. - * Return - * A bpf-local-storage pointer is returned on success. - * - * **NULL** if not found or there was an error in adding - * a new bpf-local-storage. - * - * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) - * Description - * Delete a bpf-local-storage from a sk. - * Return - * 0 on success. - * - * **-ENOENT** if the bpf-local-storage cannot be found. - * - * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) - * Description - * Add an entry to, or update a sockhash *map* referencing sockets. - * The *skops* is used as a new value for the entry associated to - * *key*. *flags* is one of: - * - * **BPF_NOEXIST** - * The entry for *key* must not exist in the map. - * **BPF_EXIST** - * The entry for *key* must already exist in the map. - * **BPF_ANY** - * No condition on the existence of the entry for *key*. - * - * If the *map* has eBPF programs (parser and verdict), those will - * be inherited by the socket being added. 
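
For the bpf_sk_storage_get() helper documented above, a sketch of typical usage under the stated semantics: the map supplies the value type, and BPF_SK_STORAGE_GET_F_CREATE (with a NULL value) makes zero-initialized per-socket storage appear on first touch. Map syntax and the section name assume a reasonably recent libbpf:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct pkt_stats {
            __u64 created;
    };

    struct {
            __uint(type, BPF_MAP_TYPE_SK_STORAGE);
            __uint(map_flags, BPF_F_NO_PREALLOC);
            __type(key, int);
            __type(value, struct pkt_stats);
    } sk_stats SEC(".maps");

    SEC("cgroup/sock_create")
    int count_socks(struct bpf_sock *sk)
    {
            struct pkt_stats *st;

            /* create-on-first-use; NULL value means zero-initialized */
            st = bpf_sk_storage_get(&sk_stats, sk, NULL,
                                    BPF_SK_STORAGE_GET_F_CREATE);
            if (st)
                    st->created = 1;

            return 1;       /* allow the socket */
    }

    char _license[] SEC("license") = "GPL";
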
If the socket is - * already attached to eBPF programs, this results in an error. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) - * Description - * This helper is used in programs implementing policies at the - * socket level. If the message *msg* is allowed to pass (i.e. if - * the verdict eBPF program returns **SK_PASS**), redirect it to - * the socket referenced by *map* (of type - * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and - * egress interfaces can be used for redirection. The - * **BPF_F_INGRESS** value in *flags* is used to make the - * distinction (ingress path is selected if the flag is present, - * egress path otherwise). This is the only flag supported for now. - * Return - * **SK_PASS** on success, or **SK_DROP** on error. - * - * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) - * Description - * This helper is used in programs implementing policies at the - * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. - * if the verdeict eBPF program returns **SK_PASS**), redirect it - * to the socket referenced by *map* (of type - * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and - * egress interfaces can be used for redirection. The - * **BPF_F_INGRESS** value in *flags* is used to make the - * distinction (ingress path is selected if the flag is present, - * egress otherwise). This is the only flag supported for now. - * Return - * **SK_PASS** on success, or **SK_DROP** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1144,9 +826,6 @@ enum bpf_func_id { /* BPF_FUNC_perf_event_output for sk_buff input context. */ #define BPF_F_CTXLEN_MASK (0xfffffULL << 32) -/* BPF_FUNC_sk_storage_get flags */ -#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) - /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, @@ -1158,12 +837,6 @@ enum bpf_hdr_start_off { BPF_HDR_START_NET, }; -#define __bpf_md_ptr(type, name) \ -union { \ - type name; \ - __u64 :64; \ -} __attribute__((aligned(8))) - /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -1187,7 +860,7 @@ struct __sk_buff { __u32 data_end; __u32 napi_id; - /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ + /* accessed by BPF_PROG_TYPE_sk_skb types */ __u32 family; __u32 remote_ip4; /* Stored in network byte order */ __u32 local_ip4; /* Stored in network byte order */ @@ -1195,11 +868,6 @@ struct __sk_buff { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ - - __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); - /* ... here. */ - __u32 data_meta; - __bpf_md_ptr(struct bpf_sock *, sk); }; struct bpf_tunnel_key { @@ -1237,71 +905,6 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; - __u32 src_ip4; /* Allows 1,2,4-byte read. - * Stored in network byte order. - */ - __u32 src_ip6[4]; /* Allows 1,2,4-byte read. - * Stored in network byte order. - */ - __u32 src_port; /* Allows 4-byte read. 
- * Stored in host byte order - */ -}; - -struct bpf_tcp_sock { - __u32 snd_cwnd; /* Sending congestion window */ - __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ - __u32 rtt_min; - __u32 snd_ssthresh; /* Slow start size threshold */ - __u32 rcv_nxt; /* What we want to receive next */ - __u32 snd_nxt; /* Next sequence we send */ - __u32 snd_una; /* First byte we want an ack for */ - __u32 mss_cache; /* Cached effective mss, not including SACKS */ - __u32 ecn_flags; /* ECN status bits. */ - __u32 rate_delivered; /* saved rate sample: packets delivered */ - __u32 rate_interval_us; /* saved rate sample: time elapsed */ - __u32 packets_out; /* Packets which are "in flight" */ - __u32 retrans_out; /* Retransmitted packets out */ - __u32 total_retrans; /* Total retransmits for entire connection */ - __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn - * total number of segments in. - */ - __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn - * total number of data segments in. - */ - __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut - * The total number of segments sent. - */ - __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut - * total number of data segments sent. - */ - __u32 lost_out; /* Lost packets */ - __u32 sacked_out; /* SACK'd packets */ - __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived - * sum(delta(rcv_nxt)), or how many bytes - * were acked. - */ - __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked - * sum(delta(snd_una)), or how many bytes - * were acked. - */ -}; - -struct bpf_sock_tuple { - union { - struct { - __be32 saddr; - __be32 daddr; - __be16 sport; - __be16 dport; - } ipv4; - struct { - __be32 saddr[4]; - __be32 daddr[4]; - __be16 sport; - __be16 dport; - } ipv6; - }; }; #define XDP_PACKET_HEADROOM 256 @@ -1325,7 +928,6 @@ enum xdp_action { struct xdp_md { __u32 data; __u32 data_end; - __u32 data_meta; }; enum sk_action { @@ -1333,14 +935,6 @@ enum sk_action { SK_PASS, }; -/* user accessible metadata for SK_MSG packet hook, new fields must - * be added to the end of this structure - */ -struct sk_msg_md { - __bpf_md_ptr(void *, data); - __bpf_md_ptr(void *, data_end); -}; - #define BPF_TAG_SIZE 8 struct bpf_prog_info { @@ -1351,29 +945,6 @@ struct bpf_prog_info { __u32 xlated_prog_len; __aligned_u64 jited_prog_insns; __aligned_u64 xlated_prog_insns; - __u64 load_time; /* ns since boottime */ - __u32 created_by_uid; - __u32 nr_map_ids; - __aligned_u64 map_ids; - char name[BPF_OBJ_NAME_LEN]; - __u32 ifindex; - __u32 gpl_compatible:1; - __u64 netns_dev; - __u64 netns_ino; - __u32 nr_jited_ksyms; - __u32 nr_jited_func_lens; - __aligned_u64 jited_ksyms; - __aligned_u64 jited_func_lens; - __u32 btf_id; - __u32 func_info_rec_size; - __aligned_u64 func_info; - __u32 func_info_cnt; - __u32 line_info_cnt; - __aligned_u64 line_info; - __aligned_u64 jited_line_info; - __u32 jited_line_info_cnt; - __u32 line_info_rec_size; - __u32 jited_line_info_rec_size; } __attribute__((aligned(8))); struct bpf_map_info { @@ -1383,41 +954,8 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; - char name[BPF_OBJ_NAME_LEN]; - __u32 ifindex; - __u64 netns_dev; - __u64 netns_ino; - __u32 btf_id; - __u32 btf_key_type_id; - __u32 btf_value_type_id; } __attribute__((aligned(8))); -/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed - * by user and intended to be used by socket (e.g. to bind to, depends on - * attach attach type). 
- */ -struct bpf_sock_addr { - __u32 user_family; /* Allows 4-byte read, but no write. */ - __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. - * Stored in network byte order. - */ - __u32 user_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. - * Stored in network byte order. - */ - __u32 user_port; /* Allows 4-byte read and write. - * Stored in network byte order - */ - __u32 family; /* Allows 4-byte read, but no write */ - __u32 type; /* Allows 4-byte read, but no write */ - __u32 protocol; /* Allows 4-byte read, but no write */ - __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write. - * Stored in network byte order. - */ - __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. - * Stored in network byte order. - */ -}; - /* User bpf_sock_ops struct to access socket values and specify request ops * and their replies. * Some of this fields are in network (bigendian) byte order and may need @@ -1470,78 +1008,4 @@ enum { #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ #define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ -#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) -#define BPF_DEVCG_ACC_READ (1ULL << 1) -#define BPF_DEVCG_ACC_WRITE (1ULL << 2) - -#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) -#define BPF_DEVCG_DEV_CHAR (1ULL << 1) - -struct bpf_cgroup_dev_ctx { - __u32 access_type; /* (access << 16) | type */ - __u32 major; - __u32 minor; -}; - -struct bpf_raw_tracepoint_args { - __u64 args[0]; -}; - -struct bpf_flow_keys { - __u16 nhoff; - __u16 thoff; - __u16 addr_proto; /* ETH_P_* of valid addrs */ - __u8 is_frag; - __u8 is_first_frag; - __u8 is_encap; - __u8 ip_proto; - __be16 n_proto; - __be16 sport; - __be16 dport; - union { - struct { - __be32 ipv4_src; - __be32 ipv4_dst; - }; - struct { - __u32 ipv6_src[4]; /* in6_addr; network order */ - __u32 ipv6_dst[4]; /* in6_addr; network order */ - }; - }; -}; - -struct bpf_sysctl { - __u32 write; /* Sysctl is being read (= 0) or written (= 1). - * Allows 1,2,4-byte read, but no write. 
- */ -}; - -struct bpf_func_info { - __u32 insn_offset; - __u32 type_id; -}; - -#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) -#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) -struct bpf_line_info { - __u32 insn_off; - __u32 file_name_off; - __u32 line_off; - __u32 line_col; -}; - -struct bpf_spin_lock { - __u32 val; -}; - -struct bpf_sockopt { - __bpf_md_ptr(struct bpf_sock *, sk); - __bpf_md_ptr(void *, optval); - __bpf_md_ptr(void *, optval_end); - __s32 level; - __s32 optname; - __s32 optlen; - __s32 retval; -}; - #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h deleted file mode 100644 index 11b0cd6bbc42..000000000000 --- a/include/uapi/linux/btf.h +++ /dev/null @@ -1,140 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* Copyright (c) 2018 Facebook */ -#ifndef _UAPI__LINUX_BTF_H__ -#define _UAPI__LINUX_BTF_H__ - -#include - -#define BTF_MAGIC 0xeB9F -#define BTF_VERSION 1 - -struct btf_header { - __u16 magic; - __u8 version; - __u8 flags; - __u32 hdr_len; - - /* All offsets are in bytes relative to the end of this header */ - __u32 type_off; /* offset of type section */ - __u32 type_len; /* length of type section */ - __u32 str_off; /* offset of string section */ - __u32 str_len; /* length of string section */ -}; - -/* Max # of type identifier */ -#define BTF_MAX_TYPE 0x000fffff -/* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x00ffffff -/* Max # of struct/union/enum members or func args */ -#define BTF_MAX_VLEN 0xffff - -struct btf_type { - __u32 name_off; - /* "info" bits arrangement - * bits 0-15: vlen (e.g. # of struct's members) - * bits 16-23: unused - * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-30: unused - * bit 31: kind_flag, currently used by - * struct, union and fwd - */ - __u32 info; - /* "size" is used by INT, ENUM, STRUCT and UNION. - * "size" tells the size of the type it is describing. - * - * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC and FUNC_PROTO. - * "type" is a type_id referring to another type. - */ - union { - __u32 size; - __u32 type; - }; -}; - -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) -#define BTF_INFO_VLEN(info) ((info) & 0xffff) -#define BTF_INFO_KFLAG(info) ((info) >> 31) - -#define BTF_KIND_UNKN 0 /* Unknown */ -#define BTF_KIND_INT 1 /* Integer */ -#define BTF_KIND_PTR 2 /* Pointer */ -#define BTF_KIND_ARRAY 3 /* Array */ -#define BTF_KIND_STRUCT 4 /* Struct */ -#define BTF_KIND_UNION 5 /* Union */ -#define BTF_KIND_ENUM 6 /* Enumeration */ -#define BTF_KIND_FWD 7 /* Forward */ -#define BTF_KIND_TYPEDEF 8 /* Typedef */ -#define BTF_KIND_VOLATILE 9 /* Volatile */ -#define BTF_KIND_CONST 10 /* Const */ -#define BTF_KIND_RESTRICT 11 /* Restrict */ -#define BTF_KIND_FUNC 12 /* Function */ -#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ -#define BTF_KIND_MAX 13 -#define NR_BTF_KINDS 14 - -/* For some specific BTF_KIND, "struct btf_type" is immediately - * followed by extra data. - */ - -/* BTF_KIND_INT is followed by a u32 and the following - * is the 32 bits arrangement: - */ -#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) -#define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) -#define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) - -/* Attributes stored in the BTF_INT_ENCODING */ -#define BTF_INT_SIGNED (1 << 0) -#define BTF_INT_CHAR (1 << 1) -#define BTF_INT_BOOL (1 << 2) - -/* BTF_KIND_ENUM is followed by multiple "struct btf_enum". 
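
The BTF_INFO_*()/BTF_INT_*() accessors above pack several fields into single u32 words; decoding is plain shifting and masking. A self-contained demonstration using the same bit layout (the sample values are made up):

    #include <stdio.h>

    #define BTF_INFO_KIND(info)     (((info) >> 24) & 0x0f)
    #define BTF_INFO_VLEN(info)     ((info) & 0xffff)
    #define BTF_INFO_KFLAG(info)    ((info) >> 31)

    #define BTF_INT_ENCODING(VAL)   (((VAL) & 0x0f000000) >> 24)
    #define BTF_INT_OFFSET(VAL)     (((VAL) & 0x00ff0000) >> 16)
    #define BTF_INT_BITS(VAL)       ((VAL) & 0x000000ff)

    #define BTF_KIND_INT 1
    #define BTF_INT_SIGNED (1 << 0)

    int main(void)
    {
            /* kind=INT, kind_flag=0, vlen=0 */
            unsigned int info = (unsigned int)BTF_KIND_INT << 24;
            /* signed, bit offset 0, 32 bits wide */
            unsigned int int_data = (BTF_INT_SIGNED << 24) | 32;

            printf("kind=%u vlen=%u kflag=%u\n", BTF_INFO_KIND(info),
                   BTF_INFO_VLEN(info), BTF_INFO_KFLAG(info));
            printf("signed=%u offset=%u bits=%u\n",
                   BTF_INT_ENCODING(int_data) & BTF_INT_SIGNED,
                   BTF_INT_OFFSET(int_data), BTF_INT_BITS(int_data));
            return 0;
    }
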
- * The exact number of btf_enum is stored in the vlen (of the - * info in "struct btf_type"). - */ -struct btf_enum { - __u32 name_off; - __s32 val; -}; - -/* BTF_KIND_ARRAY is followed by one "struct btf_array" */ -struct btf_array { - __u32 type; - __u32 index_type; - __u32 nelems; -}; - -/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed - * by multiple "struct btf_member". The exact number - * of btf_member is stored in the vlen (of the info in - * "struct btf_type"). - */ -struct btf_member { - __u32 name_off; - __u32 type; - /* If the type info kind_flag is set, the btf_member offset - * contains both member bitfield size and bit offset. The - * bitfield size is set for bitfield members. If the type - * info kind_flag is not set, the offset contains only bit - * offset. - */ - __u32 offset; -}; - -/* If the struct/union type info kind_flag is set, the - * following two macros are used to access bitfield_size - * and bit_offset from btf_member.offset. - */ -#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) -#define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) - -/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". - * The exact number of btf_param is stored in the vlen (of the - * info in "struct btf_type"). - */ -struct btf_param { - __u32 name_off; - __u32 type; -}; -#endif /* _UAPI__LINUX_BTF_H__ */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 64a797500d4e..fc72a3839c9d 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -418,28 +418,6 @@ struct perf_event_attr { __u16 __reserved_2; /* align to __u64 */ }; -/* - * Structure used by below PERF_EVENT_IOC_QUERY_BPF command - * to query bpf programs attached to the same perf tracepoint - * as the given perf event. 
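
The btf_member offset packing shown above deserves a concrete decode: when the enclosing type's kind_flag is set, the top byte of the member's offset word is the bitfield size and the low 24 bits are the bit offset. A small demonstration (the values are invented):

    #include <stdio.h>

    #define BTF_MEMBER_BITFIELD_SIZE(val)   ((val) >> 24)
    #define BTF_MEMBER_BIT_OFFSET(val)      ((val) & 0xffffff)

    int main(void)
    {
            /* a 3-bit bitfield starting at bit offset 64 */
            unsigned int off = (3u << 24) | 64;

            printf("size=%u bits, offset=%u bits\n",
                   BTF_MEMBER_BITFIELD_SIZE(off),
                   BTF_MEMBER_BIT_OFFSET(off));
            return 0;
    }
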
- */ -struct perf_event_query_bpf { - /* - * The below ids array length - */ - __u32 ids_len; - /* - * Set by the kernel to indicate the number of - * available programs - */ - __u32 prog_cnt; - /* - * User provided buffer to store program ids - */ - __u32 ids[0]; -}; - - #define perf_flags(attr) (*(&(attr)->read_format + 1)) /* @@ -455,7 +433,6 @@ struct perf_event_query_bpf { #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) #define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) -#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index c4c8062429a3..621c296fe8f8 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -242,9 +242,6 @@ config QUEUED_SPINLOCKS def_bool y if ARCH_USE_QUEUED_SPINLOCKS depends on SMP -config BPF_ARCH_SPINLOCK - bool - config ARCH_USE_QUEUED_RWLOCKS bool diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index c06336e079e4..be282c135a66 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,13 +4,11 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o -obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o -obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -obj-$(CONFIG_BPF_SYSCALL) += offload.o +ifeq ($(CONFIG_STREAM_PARSER),y) +obj-$(CONFIG_BPF_SYSCALL) += sockmap.o +endif endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b294a5510529..aede91834b7c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -11,13 +11,11 @@ * General Public License for more details. 
*/ #include -#include #include #include #include #include #include -#include #include "map_in_map.h" @@ -157,39 +155,6 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * (index & array->index_mask); } -static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, - u32 off) -{ - struct bpf_array *array = container_of(map, struct bpf_array, map); - - if (map->max_entries != 1) - return -ENOTSUPP; - - if (off >= map->value_size) - return -EINVAL; - - *imm = (unsigned long)array->value; - - return 0; -} -static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, - u32 *off) -{ - struct bpf_array *array = container_of(map, struct bpf_array, map); - u64 base = (unsigned long)array->value; - u64 range = array->elem_size; - - if (map->max_entries != 1) - return -ENOTSUPP; - - if (imm < base || imm >= base + range) - return -ENOENT; - - *off = imm - base; - - return 0; -} - /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -283,9 +248,8 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - char *val; - if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; @@ -293,26 +257,17 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (unlikely(map_flags & BPF_NOEXIST)) + if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - if (unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) - return -EINVAL; - - if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); - } else { - val = array->value + - array->elem_size * (index & array->index_mask); - if (map_flags & BPF_F_LOCK) - copy_map_value_locked(map, val, value, false); - else - copy_map_value(map, val, value); - } - + else + memcpy(array->value + + array->elem_size * (index & array->index_mask), + value, map->value_size); return 0; } @@ -378,41 +333,6 @@ static void array_map_free(struct bpf_map *map) bpf_map_area_free(array); } -static void array_map_seq_show_elem(struct bpf_map *map, void *key, - struct seq_file *m) -{ - void *value; - rcu_read_lock(); - value = array_map_lookup_elem(map, key); - if (!value) { - rcu_read_unlock(); - return; - } - seq_printf(m, "%u: ", *(u32 *)key); - btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); - seq_puts(m, "\n"); - rcu_read_unlock(); -} - -static int array_map_check_btf(const struct bpf_map *map, - const struct btf *btf, - const struct btf_type *key_type, - const struct btf_type *value_type) -{ - u32 int_data; - - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes sure - * that the btf matches the attr used during map_create. 
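
The flag handling in array_map_update_elem() above follows directly from the map's preallocated layout: every index already "exists", so BPF_NOEXIST can never succeed, and an out-of-range index is rejected rather than grown. A userspace model of that exact logic:

    #include <stdio.h>
    #include <errno.h>

    #define BPF_ANY     0
    #define BPF_NOEXIST 1
    #define BPF_EXIST   2

    #define MAX_ENTRIES 16

    static long values[MAX_ENTRIES];

    static int array_update(unsigned int index, long value, unsigned int flags)
    {
            if (flags > BPF_EXIST)
                    return -EINVAL;         /* unknown flags */
            if (index >= MAX_ENTRIES)
                    return -E2BIG;          /* cannot insert a new element */
            if (flags == BPF_NOEXIST)
                    return -EEXIST;         /* all elements already exist */

            values[index] = value;
            return 0;
    }

    int main(void)
    {
            printf("%d\n", array_update(3, 42, BPF_ANY));     /* 0 */
            printf("%d\n", array_update(3, 42, BPF_NOEXIST)); /* -EEXIST */
            printf("%d\n", array_update(99, 1, BPF_ANY));     /* -E2BIG */
            return 0;
    }
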
- */
-
-	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
-		return -EINVAL;
-
-	return 0;
-}
-
 const struct bpf_map_ops array_map_ops = {
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
@@ -421,10 +341,6 @@ const struct bpf_map_ops array_map_ops = {
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
 	.map_gen_lookup = array_map_gen_lookup,
-	.map_direct_value_addr = array_map_direct_value_addr,
-	.map_direct_value_meta = array_map_direct_value_meta,
-	.map_seq_show_elem = array_map_seq_show_elem,
-	.map_check_btf = array_map_check_btf,
 };
 
 const struct bpf_map_ops percpu_array_map_ops = {
@@ -434,7 +350,6 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_lookup_elem = percpu_array_map_lookup_elem,
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
-	.map_check_btf = array_map_check_btf,
 };
 
 static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
@@ -576,7 +491,6 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
 	.map_release_uref = bpf_fd_array_map_clear,
-	.map_check_btf = map_check_no_btf,
 };
 
 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -665,7 +579,6 @@ const struct bpf_map_ops perf_event_array_map_ops = {
 	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
 	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
 	.map_release = perf_event_fd_array_release,
-	.map_check_btf = map_check_no_btf,
 };
 
 #ifdef CONFIG_CGROUPS
@@ -696,7 +609,6 @@ const struct bpf_map_ops cgroup_array_map_ops = {
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
 	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
-	.map_check_btf = map_check_no_btf,
 };
 #endif
 
@@ -780,5 +692,4 @@ const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 	.map_gen_lookup = array_of_map_gen_lookup,
-	.map_check_btf = map_check_no_btf,
 };
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
deleted file mode 100644
index cb8b05bdaeb0..000000000000
--- a/kernel/bpf/btf.c
+++ /dev/null
@@ -1,2930 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (c) 2018 Facebook */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-/* BTF (BPF Type Format) is the metadata format which describes
- * the data types of BPF program/map. Hence, it basically focuses
- * on the C programming language, which modern BPF is primarily
- * using.
- *
- * ELF Section:
- * ~~~~~~~~~~~
- * The BTF data is stored under the ".BTF" ELF section
- *
- * struct btf_type:
- * ~~~~~~~~~~~~~~~
- * Each 'struct btf_type' object describes a C data type.
- * Depending on the type it is describing, a 'struct btf_type'
- * object may be followed by more data. E.g.,
- * to describe an array, 'struct btf_type' is followed by
- * 'struct btf_array'.
- *
- * 'struct btf_type' and any extra data following it are
- * 4-byte aligned.
- *
- * Type section:
- * ~~~~~~~~~~~~~
- * The BTF type section contains a list of 'struct btf_type' objects.
- * Each one describes a C type. Recall from the above section
- * that a 'struct btf_type' object could be immediately followed by extra
- * data in order to describe some particular C types.
- *
- * type_id:
- * ~~~~~~~
- * Each btf_type object is identified by a type_id.
- * The type_id is implied by the location of the btf_type object in
- * the BTF type section. The first one has type_id 1, the second
- * one has type_id 2, etc. Hence, an earlier btf_type has
- * a smaller type_id.
- *
- * A btf_type object may refer to another btf_type object by using
- * type_id (i.e. the "type" in the "struct btf_type").
- *
- * NOTE that we cannot assume any reference-order.
- * A btf_type object can refer to an earlier btf_type object,
- * but it can also refer to a later btf_type object.
- *
- * For example, to describe "const void *", a btf_type
- * object describing "const" may refer to another btf_type
- * object describing "void *". This type-reference is done
- * by specifying type_id:
- *
- * [1] CONST (anon) type_id=2
- * [2] PTR (anon) type_id=0
- *
- * The above is the btf_verifier debug log:
- *   - Each line starting with "[?]" is a btf_type object
- *   - [?] is the type_id of the btf_type object.
- *   - CONST/PTR is the BTF_KIND_XXX
- *   - "(anon)" is the name of the type. It just
- *     happens that CONST and PTR have no name.
- *   - type_id=XXX is the 'u32 type' in btf_type
- *
- * NOTE: "void" has type_id 0
- *
- * String section:
- * ~~~~~~~~~~~~~~
- * The BTF string section contains the names used by the type section.
- * Each string is referred to by an "offset" from the beginning of the
- * string section.
- *
- * Each string is '\0' terminated.
- *
- * The first character in the string section must be '\0',
- * which is used to mean 'anonymous'. Some btf_type may not
- * have a name.
- */
-
-/* BTF verification:
- *
- * To verify BTF data, two passes are needed.
- *
- * Pass #1
- * ~~~~~~~
- * The first pass is to collect all btf_type objects to
- * an array: "btf->types".
- *
- * Depending on the C type that a btf_type is describing,
- * a btf_type may be followed by extra data. We don't know
- * how many btf_types there are, and more importantly we don't
- * know where each btf_type is located in the type section.
- *
- * Without knowing the location of each type_id, most verifications
- * cannot be done. e.g. an earlier btf_type may refer to a later
- * btf_type (recall the "const void *" above), so we cannot
- * check this type-reference in the first pass.
- *
- * The first pass still does some verifications (e.g.
- * checking that the name is a valid offset into the string section).
- *
- * Pass #2
- * ~~~~~~~
- * The main focus is to resolve a btf_type that is referring
- * to another type.
- *
- * We have to ensure the referring type:
- * 1) does exist in the BTF (i.e. in btf->types[])
- * 2) does not cause a loop:
- *	struct A {
- *		struct B b;
- *	};
- *
- *	struct B {
- *		struct A a;
- *	};
- *
- * btf_type_needs_resolve() decides if a btf_type needs
- * to be resolved.
- *
- * The needs_resolve type implements the "resolve()" ops which
- * essentially does a DFS and detects back edges.
- *
- * During resolve (or DFS), different C types have different
- * "RESOLVED" conditions.
- *
- * When resolving a BTF_KIND_STRUCT, we need to resolve all its
- * members because a member is always referring to another
- * type. A struct's member can be treated as "RESOLVED" if
- * it is referring to a BTF_KIND_PTR. Otherwise, the
- * following valid C struct would be rejected:
- *
- *	struct A {
- *		int m;
- *		struct A *a;
- *	};
- *
- * When resolving a BTF_KIND_PTR, it needs to keep resolving if
- * it is referring to another BTF_KIND_PTR.
Otherwise, we cannot - * detect a pointer loop, e.g.: - * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR + - * ^ | - * +-----------------------------------------+ - * - */ - -#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) -#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) -#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) -#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) -#define BITS_ROUNDUP_BYTES(bits) \ - (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) - -#define BTF_INFO_MASK 0x8f00ffff -#define BTF_INT_MASK 0x0fffffff -#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) -#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) - -/* 16MB for 64k structs and each has 16 members and - * a few MB spaces for the string section. - * The hard limit is S32_MAX. - */ -#define BTF_MAX_SIZE (16 * 1024 * 1024) - -#define for_each_member(i, struct_type, member) \ - for (i = 0, member = btf_type_member(struct_type); \ - i < btf_type_vlen(struct_type); \ - i++, member++) - -#define for_each_member_from(i, from, struct_type, member) \ - for (i = from, member = btf_type_member(struct_type) + from; \ - i < btf_type_vlen(struct_type); \ - i++, member++) - -static DEFINE_IDR(btf_idr); -static DEFINE_SPINLOCK(btf_idr_lock); - -struct btf { - void *data; - struct btf_type **types; - u32 *resolved_ids; - u32 *resolved_sizes; - const char *strings; - void *nohdr_data; - struct btf_header hdr; - u32 nr_types; - u32 types_size; - u32 data_size; - refcount_t refcnt; - u32 id; - struct rcu_head rcu; -}; - -enum verifier_phase { - CHECK_META, - CHECK_TYPE, -}; - -struct resolve_vertex { - const struct btf_type *t; - u32 type_id; - u16 next_member; -}; - -enum visit_state { - NOT_VISITED, - VISITED, - RESOLVED, -}; - -enum resolve_mode { - RESOLVE_TBD, /* To Be Determined */ - RESOLVE_PTR, /* Resolving for Pointer */ - RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union - * or array - */ -}; - -#define MAX_RESOLVE_DEPTH 32 - -struct btf_sec_info { - u32 off; - u32 len; -}; - -struct btf_verifier_env { - struct btf *btf; - u8 *visit_states; - struct resolve_vertex stack[MAX_RESOLVE_DEPTH]; - struct bpf_verifier_log log; - u32 log_type_id; - u32 top_stack; - enum verifier_phase phase; - enum resolve_mode resolve_mode; -}; - -static const char * const btf_kind_str[NR_BTF_KINDS] = { - [BTF_KIND_UNKN] = "UNKNOWN", - [BTF_KIND_INT] = "INT", - [BTF_KIND_PTR] = "PTR", - [BTF_KIND_ARRAY] = "ARRAY", - [BTF_KIND_STRUCT] = "STRUCT", - [BTF_KIND_UNION] = "UNION", - [BTF_KIND_ENUM] = "ENUM", - [BTF_KIND_FWD] = "FWD", - [BTF_KIND_TYPEDEF] = "TYPEDEF", - [BTF_KIND_VOLATILE] = "VOLATILE", - [BTF_KIND_CONST] = "CONST", - [BTF_KIND_RESTRICT] = "RESTRICT", - [BTF_KIND_FUNC] = "FUNC", - [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", -}; - -struct btf_kind_operations { - s32 (*check_meta)(struct btf_verifier_env *env, - const struct btf_type *t, - u32 meta_left); - int (*resolve)(struct btf_verifier_env *env, - const struct resolve_vertex *v); - int (*check_member)(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct btf_member *member, - const struct btf_type *member_type); - int (*check_kflag_member)(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct btf_member *member, - const struct btf_type *member_type); - void (*log_details)(struct btf_verifier_env *env, - const struct btf_type *t); - void (*seq_show)(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offsets, - 
struct seq_file *m);
-};
-
-static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
-static struct btf_type btf_void;
-
-static int btf_resolve(struct btf_verifier_env *env,
-		       const struct btf_type *t, u32 type_id);
-
-static bool btf_type_is_modifier(const struct btf_type *t)
-{
-	/* Some of these are not strictly C modifiers,
-	 * but they are grouped into the same bucket
-	 * for BTF's purposes:
-	 *   a type (t) that refers to another
-	 *   type through t->type AND whose size cannot
-	 *   be determined without following that t->type.
-	 *
-	 * ptr does not fall into this bucket
-	 * because its size is always sizeof(void *).
-	 */
-	switch (BTF_INFO_KIND(t->info)) {
-	case BTF_KIND_TYPEDEF:
-	case BTF_KIND_VOLATILE:
-	case BTF_KIND_CONST:
-	case BTF_KIND_RESTRICT:
-		return true;
-	}
-
-	return false;
-}
-
-static bool btf_type_is_void(const struct btf_type *t)
-{
-	return t == &btf_void;
-}
-
-static bool btf_type_is_fwd(const struct btf_type *t)
-{
-	return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
-}
-
-static bool btf_type_is_func(const struct btf_type *t)
-{
-	return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC;
-}
-
-static bool btf_type_is_func_proto(const struct btf_type *t)
-{
-	return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
-}
-
-static bool btf_type_nosize(const struct btf_type *t)
-{
-	return btf_type_is_void(t) || btf_type_is_fwd(t) ||
-	       btf_type_is_func(t) || btf_type_is_func_proto(t);
-}
-
-static bool btf_type_nosize_or_null(const struct btf_type *t)
-{
-	return !t || btf_type_nosize(t);
-}
-
-/* union is only a special case of struct:
- * all its offsetof(member) == 0
- */
-static bool btf_type_is_struct(const struct btf_type *t)
-{
-	u8 kind = BTF_INFO_KIND(t->info);
-
-	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
-}
-
-static bool __btf_type_is_struct(const struct btf_type *t)
-{
-	return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
-}
-
-static bool btf_type_is_array(const struct btf_type *t)
-{
-	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
-}
-
-static bool btf_type_is_ptr(const struct btf_type *t)
-{
-	return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
-}
-
-static bool btf_type_is_int(const struct btf_type *t)
-{
-	return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
-}
-
-/* What types need to be resolved?
- *
- * btf_type_is_modifier() is an obvious one.
- *
- * btf_type_is_struct() because its member refers to
- * another type (through member->type).
- *
- * btf_type_is_array() because its element (array->type)
- * refers to another type. An array can be thought of as a
- * special case of a struct, where the same member type is
- * simply repeated array->nelems times.
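- *
- * (Editor's note -- an illustrative example, not part of the original
- * file: "int a[8]" is encoded as a single ARRAY btf_type whose
- * array->type refers to the INT type for "int" and whose
- * array->nelems is 8.)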
- */
-static bool btf_type_needs_resolve(const struct btf_type *t)
-{
-	return btf_type_is_modifier(t) ||
-	       btf_type_is_ptr(t) ||
-	       btf_type_is_struct(t) ||
-	       btf_type_is_array(t);
-}
-
-/* t->size can be used */
-static bool btf_type_has_size(const struct btf_type *t)
-{
-	switch (BTF_INFO_KIND(t->info)) {
-	case BTF_KIND_INT:
-	case BTF_KIND_STRUCT:
-	case BTF_KIND_UNION:
-	case BTF_KIND_ENUM:
-		return true;
-	}
-
-	return false;
-}
-
-static const char *btf_int_encoding_str(u8 encoding)
-{
-	if (encoding == 0)
-		return "(none)";
-	else if (encoding == BTF_INT_SIGNED)
-		return "SIGNED";
-	else if (encoding == BTF_INT_CHAR)
-		return "CHAR";
-	else if (encoding == BTF_INT_BOOL)
-		return "BOOL";
-	else
-		return "UNKN";
-}
-
-static u16 btf_type_vlen(const struct btf_type *t)
-{
-	return BTF_INFO_VLEN(t->info);
-}
-
-static bool btf_type_kflag(const struct btf_type *t)
-{
-	return BTF_INFO_KFLAG(t->info);
-}
-
-static u32 btf_member_bit_offset(const struct btf_type *struct_type,
-				 const struct btf_member *member)
-{
-	return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
-					   : member->offset;
-}
-
-static u32 btf_member_bitfield_size(const struct btf_type *struct_type,
-				    const struct btf_member *member)
-{
-	return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
-					   : 0;
-}
-
-static u32 btf_type_int(const struct btf_type *t)
-{
-	return *(u32 *)(t + 1);
-}
-
-static const struct btf_array *btf_type_array(const struct btf_type *t)
-{
-	return (const struct btf_array *)(t + 1);
-}
-
-static const struct btf_member *btf_type_member(const struct btf_type *t)
-{
-	return (const struct btf_member *)(t + 1);
-}
-
-static const struct btf_enum *btf_type_enum(const struct btf_type *t)
-{
-	return (const struct btf_enum *)(t + 1);
-}
-
-static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
-{
-	return kind_ops[BTF_INFO_KIND(t->info)];
-}
-
-bool btf_name_offset_valid(const struct btf *btf, u32 offset)
-{
-	return BTF_STR_OFFSET_VALID(offset) &&
-		offset < btf->hdr.str_len;
-}
-
-/* Only C-style identifiers are permitted. This can be relaxed if
- * necessary.
- */
-static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
-{
-	/* offset must be valid */
-	const char *src = &btf->strings[offset];
-	const char *src_limit;
-
-	if (!isalpha(*src) && *src != '_')
-		return false;
-
-	/* set a limit on identifier length */
-	src_limit = src + KSYM_NAME_LEN;
-	src++;
-	while (*src && src < src_limit) {
-		if (!isalnum(*src) && *src != '_')
-			return false;
-		src++;
-	}
-
-	return !*src;
-}
-
-static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
-{
-	if (!offset)
-		return "(anon)";
-	else if (offset < btf->hdr.str_len)
-		return &btf->strings[offset];
-	else
-		return "(invalid-name-offset)";
-}
-
-const char *btf_name_by_offset(const struct btf *btf, u32 offset)
-{
-	if (offset < btf->hdr.str_len)
-		return &btf->strings[offset];
-
-	return NULL;
-}
-
-const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
-{
-	if (type_id > btf->nr_types)
-		return NULL;
-
-	return btf->types[type_id];
-}
-
-/*
- * A regular int is not a bitfield and must be either
- * u8/u16/u32/u64.
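- *
- * (Editor's note -- an illustrative example, not part of the original
- * file: a plain 32-bit "int" with BTF_INT_OFFSET() == 0 and
- * BTF_INT_BITS() == 32 is regular; a 12-bit bitfield, or an int
- * whose BTF_INT_OFFSET() is non-zero, is not.)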
- */ -static bool btf_type_int_is_regular(const struct btf_type *t) -{ - u8 nr_bits, nr_bytes; - u32 int_data; - - int_data = btf_type_int(t); - nr_bits = BTF_INT_BITS(int_data); - nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); - if (BITS_PER_BYTE_MASKED(nr_bits) || - BTF_INT_OFFSET(int_data) || - (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { - return false; - } - - return true; -} - -/* -* Check that given struct member is a regular int with expected - * offset and size. - */ -bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, - const struct btf_member *m, - u32 expected_offset, u32 expected_size) -{ - const struct btf_type *t; - u32 id, int_data; - u8 nr_bits; - - id = m->type; - t = btf_type_id_size(btf, &id, NULL); - if (!t || !btf_type_is_int(t)) - return false; - - int_data = btf_type_int(t); - nr_bits = BTF_INT_BITS(int_data); - if (btf_type_kflag(s)) { - u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset); - u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset); - /* if kflag set, int should be a regular int and - * bit offset should be at byte boundary. - */ - return !bitfield_size && - BITS_ROUNDUP_BYTES(bit_offset) == expected_offset && - BITS_ROUNDUP_BYTES(nr_bits) == expected_size; - } - if (BTF_INT_OFFSET(int_data) || - BITS_PER_BYTE_MASKED(m->offset) || - BITS_ROUNDUP_BYTES(m->offset) != expected_offset || - BITS_PER_BYTE_MASKED(nr_bits) || - BITS_ROUNDUP_BYTES(nr_bits) != expected_size) - return false; - - return true; -} - -__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, - const char *fmt, ...) -{ - va_list args; - - va_start(args, fmt); - bpf_verifier_vlog(log, fmt, args); - va_end(args); -} - -__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env, - const char *fmt, ...) -{ - struct bpf_verifier_log *log = &env->log; - va_list args; - - if (!bpf_verifier_log_needed(log)) - return; - - va_start(args, fmt); - bpf_verifier_vlog(log, fmt, args); - va_end(args); -} - -__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, - const struct btf_type *t, - bool log_details, - const char *fmt, ...) -{ - struct bpf_verifier_log *log = &env->log; - u8 kind = BTF_INFO_KIND(t->info); - struct btf *btf = env->btf; - va_list args; - - if (!bpf_verifier_log_needed(log)) - return; - - __btf_verifier_log(log, "[%u] %s %s%s", - env->log_type_id, - btf_kind_str[kind], - __btf_name_by_offset(btf, t->name_off), - log_details ? " " : ""); - - if (log_details) - btf_type_ops(t)->log_details(env, t); - - if (fmt && *fmt) { - __btf_verifier_log(log, " "); - va_start(args, fmt); - bpf_verifier_vlog(log, fmt, args); - va_end(args); - } - - __btf_verifier_log(log, "\n"); -} - -#define btf_verifier_log_type(env, t, ...) \ - __btf_verifier_log_type((env), (t), true, __VA_ARGS__) -#define btf_verifier_log_basic(env, t, ...) \ - __btf_verifier_log_type((env), (t), false, __VA_ARGS__) - -__printf(4, 5) -static void btf_verifier_log_member(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct btf_member *member, - const char *fmt, ...) -{ - struct bpf_verifier_log *log = &env->log; - struct btf *btf = env->btf; - va_list args; - - if (!bpf_verifier_log_needed(log)) - return; - - /* The CHECK_META phase already did a btf dump. - * - * If member is logged again, it must hit an error in - * parsing this member. It is useful to print out which - * struct this member belongs to. 
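- *
- * (Editor's note -- an illustrative example, not part of the original
- * file: a member line printed by this helper looks roughly like
- * "\tfoo type_id=3 bits_offset=64".)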
- */
-	if (env->phase != CHECK_META)
-		btf_verifier_log_type(env, struct_type, NULL);
-
-	if (btf_type_kflag(struct_type))
-		__btf_verifier_log(log,
-				   "\t%s type_id=%u bitfield_size=%u bits_offset=%u",
-				   __btf_name_by_offset(btf, member->name_off),
-				   member->type,
-				   BTF_MEMBER_BITFIELD_SIZE(member->offset),
-				   BTF_MEMBER_BIT_OFFSET(member->offset));
-	else
-		__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
-				   __btf_name_by_offset(btf, member->name_off),
-				   member->type, member->offset);
-
-	if (fmt && *fmt) {
-		__btf_verifier_log(log, " ");
-		va_start(args, fmt);
-		bpf_verifier_vlog(log, fmt, args);
-		va_end(args);
-	}
-
-	__btf_verifier_log(log, "\n");
-}
-
-static void btf_verifier_log_hdr(struct btf_verifier_env *env,
-				 u32 btf_data_size)
-{
-	struct bpf_verifier_log *log = &env->log;
-	const struct btf *btf = env->btf;
-	const struct btf_header *hdr;
-
-	if (!bpf_verifier_log_needed(log))
-		return;
-
-	hdr = &btf->hdr;
-	__btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
-	__btf_verifier_log(log, "version: %u\n", hdr->version);
-	__btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
-	__btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len);
-	__btf_verifier_log(log, "type_off: %u\n", hdr->type_off);
-	__btf_verifier_log(log, "type_len: %u\n", hdr->type_len);
-	__btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
-	__btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
-	__btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size);
-}
-
-static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
-{
-	struct btf *btf = env->btf;
-
-	/* < 2 because +1 for btf_void which is always in btf->types[0].
-	 * btf_void is not accounted for in btf->nr_types because btf_void
-	 * does not come from the BTF file.
-	 */
-	if (btf->types_size - btf->nr_types < 2) {
-		/* Expand 'types' array */
-
-		struct btf_type **new_types;
-		u32 expand_by, new_size;
-
-		if (btf->types_size == BTF_MAX_TYPE) {
-			btf_verifier_log(env, "Exceeded max num of types");
-			return -E2BIG;
-		}
-
-		expand_by = max_t(u32, btf->types_size >> 2, 16);
-		new_size = min_t(u32, BTF_MAX_TYPE,
-				 btf->types_size + expand_by);
-
-		new_types = kvzalloc(new_size * sizeof(*new_types),
-				     GFP_KERNEL | __GFP_NOWARN);
-		if (!new_types)
-			return -ENOMEM;
-
-		if (btf->nr_types == 0)
-			new_types[0] = &btf_void;
-		else
-			memcpy(new_types, btf->types,
-			       sizeof(*btf->types) * (btf->nr_types + 1));
-
-		kvfree(btf->types);
-		btf->types = new_types;
-		btf->types_size = new_size;
-	}
-
-	btf->types[++(btf->nr_types)] = t;
-
-	return 0;
-}
-
-static int btf_alloc_id(struct btf *btf)
-{
-	int id;
-
-	idr_preload(GFP_KERNEL);
-	spin_lock_bh(&btf_idr_lock);
-	id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC);
-	if (id > 0)
-		btf->id = id;
-	spin_unlock_bh(&btf_idr_lock);
-	idr_preload_end();
-
-	if (WARN_ON_ONCE(!id))
-		return -ENOSPC;
-
-	return id > 0 ? 0 : id;
-}
-
-static void btf_free_id(struct btf *btf)
-{
-	unsigned long flags;
-
-	/*
-	 * In map-in-map, calling map_delete_elem() on outer
-	 * map will call bpf_map_put on the inner map.
-	 * It will then eventually call btf_free_id()
-	 * on the inner map. Some map_delete_elem()
-	 * implementations may run with irqs disabled, so
-	 * we need to use the _irqsave() version instead
-	 * of the _bh() version.
- */ - spin_lock_irqsave(&btf_idr_lock, flags); - idr_remove(&btf_idr, btf->id); - spin_unlock_irqrestore(&btf_idr_lock, flags); -} - -static void btf_free(struct btf *btf) -{ - kvfree(btf->types); - kvfree(btf->resolved_sizes); - kvfree(btf->resolved_ids); - kvfree(btf->data); - kfree(btf); -} - -static void btf_free_rcu(struct rcu_head *rcu) -{ - struct btf *btf = container_of(rcu, struct btf, rcu); - btf_free(btf); -} - -void btf_put(struct btf *btf) -{ - if (btf && refcount_dec_and_test(&btf->refcnt)) { - btf_free_id(btf); - call_rcu(&btf->rcu, btf_free_rcu); - } -} - -static int env_resolve_init(struct btf_verifier_env *env) -{ - struct btf *btf = env->btf; - u32 nr_types = btf->nr_types; - u32 *resolved_sizes = NULL; - u32 *resolved_ids = NULL; - u8 *visit_states = NULL; - - /* +1 for btf_void */ - resolved_sizes = kvzalloc((nr_types + 1) * sizeof(*resolved_sizes), - GFP_KERNEL | __GFP_NOWARN); - if (!resolved_sizes) - goto nomem; - - resolved_ids = kvzalloc((nr_types + 1) * sizeof(*resolved_ids), - GFP_KERNEL | __GFP_NOWARN); - if (!resolved_ids) - goto nomem; - - visit_states = kvzalloc((nr_types + 1) * sizeof(*visit_states), - GFP_KERNEL | __GFP_NOWARN); - if (!visit_states) - goto nomem; - - btf->resolved_sizes = resolved_sizes; - btf->resolved_ids = resolved_ids; - env->visit_states = visit_states; - - return 0; - -nomem: - kvfree(resolved_sizes); - kvfree(resolved_ids); - kvfree(visit_states); - return -ENOMEM; -} - -static void btf_verifier_env_free(struct btf_verifier_env *env) -{ - kvfree(env->visit_states); - kfree(env); -} - -static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, - const struct btf_type *next_type) -{ - switch (env->resolve_mode) { - case RESOLVE_TBD: - /* int, enum or void is a sink */ - return !btf_type_needs_resolve(next_type); - case RESOLVE_PTR: - /* int, enum, void, struct, array, func or func_proto is a sink - * for ptr - */ - return !btf_type_is_modifier(next_type) && - !btf_type_is_ptr(next_type); - case RESOLVE_STRUCT_OR_ARRAY: - /* int, enum, void, ptr, func or func_proto is a sink - * for struct and array - */ - return !btf_type_is_modifier(next_type) && - !btf_type_is_array(next_type) && - !btf_type_is_struct(next_type); - default: - BUG(); - } -} - -static bool env_type_is_resolved(const struct btf_verifier_env *env, - u32 type_id) -{ - return env->visit_states[type_id] == RESOLVED; -} - -static int env_stack_push(struct btf_verifier_env *env, - const struct btf_type *t, u32 type_id) -{ - struct resolve_vertex *v; - - if (env->top_stack == MAX_RESOLVE_DEPTH) - return -E2BIG; - - if (env->visit_states[type_id] != NOT_VISITED) - return -EEXIST; - - env->visit_states[type_id] = VISITED; - - v = &env->stack[env->top_stack++]; - v->t = t; - v->type_id = type_id; - v->next_member = 0; - - if (env->resolve_mode == RESOLVE_TBD) { - if (btf_type_is_ptr(t)) - env->resolve_mode = RESOLVE_PTR; - else if (btf_type_is_struct(t) || btf_type_is_array(t)) - env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY; - } - - return 0; -} - -static void env_stack_set_next_member(struct btf_verifier_env *env, - u16 next_member) -{ - env->stack[env->top_stack - 1].next_member = next_member; -} - -static void env_stack_pop_resolved(struct btf_verifier_env *env, - u32 resolved_type_id, - u32 resolved_size) -{ - u32 type_id = env->stack[--(env->top_stack)].type_id; - struct btf *btf = env->btf; - - btf->resolved_sizes[type_id] = resolved_size; - btf->resolved_ids[type_id] = resolved_type_id; - env->visit_states[type_id] = RESOLVED; -} - -static const struct 
resolve_vertex *env_stack_peak(struct btf_verifier_env *env) -{ - return env->top_stack ? &env->stack[env->top_stack - 1] : NULL; -} - -/* The input param "type_id" must point to a needs_resolve type */ -static const struct btf_type *btf_type_id_resolve(const struct btf *btf, - u32 *type_id) -{ - *type_id = btf->resolved_ids[*type_id]; - return btf_type_by_id(btf, *type_id); -} - -const struct btf_type *btf_type_id_size(const struct btf *btf, - u32 *type_id, u32 *ret_size) -{ - const struct btf_type *size_type; - u32 size_type_id = *type_id; - u32 size = 0; - - size_type = btf_type_by_id(btf, size_type_id); - if (btf_type_nosize_or_null(size_type)) - return NULL; - - if (btf_type_has_size(size_type)) { - size = size_type->size; - } else if (btf_type_is_array(size_type)) { - size = btf->resolved_sizes[size_type_id]; - } else if (btf_type_is_ptr(size_type)) { - size = sizeof(void *); - } else { - if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) - return NULL; - - size = btf->resolved_sizes[size_type_id]; - size_type_id = btf->resolved_ids[size_type_id]; - size_type = btf_type_by_id(btf, size_type_id); - if (btf_type_nosize_or_null(size_type)) - return NULL; - } - - *type_id = size_type_id; - if (ret_size) - *ret_size = size; - - return size_type; -} - -static int btf_df_check_member(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct btf_member *member, - const struct btf_type *member_type) -{ - btf_verifier_log_basic(env, struct_type, - "Unsupported check_member"); - return -EINVAL; -} - -static int btf_df_check_kflag_member(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct btf_member *member, - const struct btf_type *member_type) -{ - btf_verifier_log_basic(env, struct_type, - "Unsupported check_kflag_member"); - return -EINVAL; -} - -/* Used for ptr, array and struct/union type members. - * int, enum and modifier types have their specific callback functions. - */ -static int btf_generic_check_kflag_member(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct btf_member *member, - const struct btf_type *member_type) -{ - if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) { - btf_verifier_log_member(env, struct_type, member, - "Invalid member bitfield_size"); - return -EINVAL; - } - /* bitfield size is 0, so member->offset represents bit offset only. - * It is safe to call non kflag check_member variants. 
- */
-	return btf_type_ops(member_type)->check_member(env, struct_type,
-						       member,
-						       member_type);
-}
-
-static int btf_df_resolve(struct btf_verifier_env *env,
-			  const struct resolve_vertex *v)
-{
-	btf_verifier_log_basic(env, v->t, "Unsupported resolve");
-	return -EINVAL;
-}
-
-static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t,
-			    u32 type_id, void *data, u8 bits_offsets,
-			    struct seq_file *m)
-{
-	seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
-}
-
-static int btf_int_check_member(struct btf_verifier_env *env,
-				const struct btf_type *struct_type,
-				const struct btf_member *member,
-				const struct btf_type *member_type)
-{
-	u32 int_data = btf_type_int(member_type);
-	u32 struct_bits_off = member->offset;
-	u32 struct_size = struct_type->size;
-	u32 nr_copy_bits;
-	u32 bytes_offset;
-
-	if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) {
-		btf_verifier_log_member(env, struct_type, member,
-					"bits_offset exceeds U32_MAX");
-		return -EINVAL;
-	}
-
-	struct_bits_off += BTF_INT_OFFSET(int_data);
-	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
-	nr_copy_bits = BTF_INT_BITS(int_data) +
-		BITS_PER_BYTE_MASKED(struct_bits_off);
-
-	if (nr_copy_bits > BITS_PER_U64) {
-		btf_verifier_log_member(env, struct_type, member,
-					"nr_copy_bits exceeds 64");
-		return -EINVAL;
-	}
-
-	if (struct_size < bytes_offset ||
-	    struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
-		btf_verifier_log_member(env, struct_type, member,
-					"Member exceeds struct_size");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int btf_int_check_kflag_member(struct btf_verifier_env *env,
-				      const struct btf_type *struct_type,
-				      const struct btf_member *member,
-				      const struct btf_type *member_type)
-{
-	u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset;
-	u32 int_data = btf_type_int(member_type);
-	u32 struct_size = struct_type->size;
-	u32 nr_copy_bits;
-
-	/* a regular int type is required for the kflag int member */
-	if (!btf_type_int_is_regular(member_type)) {
-		btf_verifier_log_member(env, struct_type, member,
-					"Invalid member base type");
-		return -EINVAL;
-	}
-
-	/* check sanity of bitfield size */
-	nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
-	struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
-	nr_int_data_bits = BTF_INT_BITS(int_data);
-	if (!nr_bits) {
-		/* Not a bitfield member, member offset must be at byte
-		 * boundary.
- */ - if (BITS_PER_BYTE_MASKED(struct_bits_off)) { - btf_verifier_log_member(env, struct_type, member, - "Invalid member offset"); - return -EINVAL; - } - nr_bits = nr_int_data_bits; - } else if (nr_bits > nr_int_data_bits) { - btf_verifier_log_member(env, struct_type, member, - "Invalid member bitfield_size"); - return -EINVAL; - } - bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); - nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); - if (nr_copy_bits > BITS_PER_U64) { - btf_verifier_log_member(env, struct_type, member, - "nr_copy_bits exceeds 64"); - return -EINVAL; - } - if (struct_size < bytes_offset || - struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { - btf_verifier_log_member(env, struct_type, member, - "Member exceeds struct_size"); - return -EINVAL; - } - return 0; -} - -static s32 btf_int_check_meta(struct btf_verifier_env *env, - const struct btf_type *t, - u32 meta_left) -{ - u32 int_data, nr_bits, meta_needed = sizeof(int_data); - u16 encoding; - - if (meta_left < meta_needed) { - btf_verifier_log_basic(env, t, - "meta_left:%u meta_needed:%u", - meta_left, meta_needed); - return -EINVAL; - } - - if (btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen != 0"); - return -EINVAL; - } - - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - - int_data = btf_type_int(t); - if (int_data & ~BTF_INT_MASK) { - btf_verifier_log_basic(env, t, "Invalid int_data:%x", - int_data); - return -EINVAL; - } - - nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); - - if (nr_bits > BITS_PER_U64) { - btf_verifier_log_type(env, t, "nr_bits exceeds %zu", - BITS_PER_U64); - return -EINVAL; - } - - if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) { - btf_verifier_log_type(env, t, "nr_bits exceeds type_size"); - return -EINVAL; - } - - /* - * Only one of the encoding bits is allowed and it - * should be sufficient for the pretty print purpose (i.e. decoding). - * Multiple bits can be allowed later if it is found - * to be insufficient. 
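- *
- * (Editor's note -- an illustrative example, not part of the original
- * file: an int_data whose encoding is BTF_INT_SIGNED | BTF_INT_CHAR
- * sets two bits and is rejected below; plain SIGNED, CHAR or BOOL
- * is accepted.)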
- */ - encoding = BTF_INT_ENCODING(int_data); - if (encoding && - encoding != BTF_INT_SIGNED && - encoding != BTF_INT_CHAR && - encoding != BTF_INT_BOOL) { - btf_verifier_log_type(env, t, "Unsupported encoding"); - return -ENOTSUPP; - } - - btf_verifier_log_type(env, t, NULL); - - return meta_needed; -} - -static void btf_int_log(struct btf_verifier_env *env, - const struct btf_type *t) -{ - int int_data = btf_type_int(t); - - btf_verifier_log(env, - "size=%u bits_offset=%u nr_bits=%u encoding=%s", - t->size, BTF_INT_OFFSET(int_data), - BTF_INT_BITS(int_data), - btf_int_encoding_str(BTF_INT_ENCODING(int_data))); -} - -static void btf_bitfield_seq_show(void *data, u8 bits_offset, - u8 nr_bits, struct seq_file *m) -{ - u16 left_shift_bits, right_shift_bits; - u8 nr_copy_bytes; - u8 nr_copy_bits; - u64 print_num; - - data += BITS_ROUNDDOWN_BYTES(bits_offset); - bits_offset = BITS_PER_BYTE_MASKED(bits_offset); - nr_copy_bits = nr_bits + bits_offset; - nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); - - print_num = 0; - memcpy(&print_num, data, nr_copy_bytes); - -#ifdef __BIG_ENDIAN_BITFIELD - left_shift_bits = bits_offset; -#else - left_shift_bits = BITS_PER_U64 - nr_copy_bits; -#endif - right_shift_bits = BITS_PER_U64 - nr_bits; - - print_num <<= left_shift_bits; - print_num >>= right_shift_bits; - - seq_printf(m, "0x%llx", print_num); -} - -static void btf_int_bits_seq_show(const struct btf *btf, - const struct btf_type *t, - void *data, u8 bits_offset, - struct seq_file *m) -{ - u32 int_data = btf_type_int(t); - u8 nr_bits = BTF_INT_BITS(int_data); - u8 total_bits_offset; - - /* - * bits_offset is at most 7. - * BTF_INT_OFFSET() cannot exceed 64 bits. - */ - total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); - btf_bitfield_seq_show(data, total_bits_offset, nr_bits, m); -} - -static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) -{ - u32 int_data = btf_type_int(t); - u8 encoding = BTF_INT_ENCODING(int_data); - bool sign = encoding & BTF_INT_SIGNED; - u8 nr_bits = BTF_INT_BITS(int_data); - - if (bits_offset || BTF_INT_OFFSET(int_data) || - BITS_PER_BYTE_MASKED(nr_bits)) { - btf_int_bits_seq_show(btf, t, data, bits_offset, m); - return; - } - - switch (nr_bits) { - case 64: - if (sign) - seq_printf(m, "%lld", *(s64 *)data); - else - seq_printf(m, "%llu", *(u64 *)data); - break; - case 32: - if (sign) - seq_printf(m, "%d", *(s32 *)data); - else - seq_printf(m, "%u", *(u32 *)data); - break; - case 16: - if (sign) - seq_printf(m, "%d", *(s16 *)data); - else - seq_printf(m, "%u", *(u16 *)data); - break; - case 8: - if (sign) - seq_printf(m, "%d", *(s8 *)data); - else - seq_printf(m, "%u", *(u8 *)data); - break; - default: - btf_int_bits_seq_show(btf, t, data, bits_offset, m); - } -} - -static const struct btf_kind_operations int_ops = { - .check_meta = btf_int_check_meta, - .resolve = btf_df_resolve, - .check_member = btf_int_check_member, - .check_kflag_member = btf_int_check_kflag_member, - .log_details = btf_int_log, - .seq_show = btf_int_seq_show, -}; - -static int btf_modifier_check_member(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct btf_member *member, - const struct btf_type *member_type) -{ - const struct btf_type *resolved_type; - u32 resolved_type_id = member->type; - struct btf_member resolved_member; - struct btf *btf = env->btf; - - resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); - if (!resolved_type) { - btf_verifier_log_member(env, 
struct_type, member, - "Invalid member"); - return -EINVAL; - } - - resolved_member = *member; - resolved_member.type = resolved_type_id; - - return btf_type_ops(resolved_type)->check_member(env, struct_type, - &resolved_member, - resolved_type); -} - -static int btf_modifier_check_kflag_member(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct btf_member *member, - const struct btf_type *member_type) -{ - const struct btf_type *resolved_type; - u32 resolved_type_id = member->type; - struct btf_member resolved_member; - struct btf *btf = env->btf; - resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); - if (!resolved_type) { - btf_verifier_log_member(env, struct_type, member, - "Invalid member"); - return -EINVAL; - } - resolved_member = *member; - resolved_member.type = resolved_type_id; - return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type, - &resolved_member, - resolved_type); -} - -static int btf_ptr_check_member(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct btf_member *member, - const struct btf_type *member_type) -{ - u32 struct_size, struct_bits_off, bytes_offset; - - struct_size = struct_type->size; - struct_bits_off = member->offset; - bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); - - if (BITS_PER_BYTE_MASKED(struct_bits_off)) { - btf_verifier_log_member(env, struct_type, member, - "Member is not byte aligned"); - return -EINVAL; - } - - if (struct_size - bytes_offset < sizeof(void *)) { - btf_verifier_log_member(env, struct_type, member, - "Member exceeds struct_size"); - return -EINVAL; - } - - return 0; -} - -static int btf_ref_type_check_meta(struct btf_verifier_env *env, - const struct btf_type *t, - u32 meta_left) -{ - if (btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen != 0"); - return -EINVAL; - } - - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - - if (!BTF_TYPE_ID_VALID(t->type)) { - btf_verifier_log_type(env, t, "Invalid type_id"); - return -EINVAL; - } - - btf_verifier_log_type(env, t, NULL); - - return 0; -} - -static int btf_modifier_resolve(struct btf_verifier_env *env, - const struct resolve_vertex *v) -{ - const struct btf_type *t = v->t; - const struct btf_type *next_type; - u32 next_type_id = t->type; - struct btf *btf = env->btf; - u32 next_type_size = 0; - - next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { - btf_verifier_log_type(env, v->t, "Invalid type_id"); - return -EINVAL; - } - - if (!env_type_is_resolve_sink(env, next_type) && - !env_type_is_resolved(env, next_type_id)) - return env_stack_push(env, next_type, next_type_id); - - /* Figure out the resolved next_type_id with size. - * They will be stored in the current modifier's - * resolved_ids and resolved_sizes such that it can - * save us a few type-following when we use it later (e.g. in - * pretty print). 
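- *
- * (Editor's note -- an illustrative sketch, not part of the original
- * file: for "typedef const int cint;", both the TYPEDEF and the
- * CONST entry end up with resolved_ids pointing at the INT type and
- * resolved_sizes == 4, so later size lookups need not walk the
- * modifier chain again.)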
- */
-	if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
-		if (env_type_is_resolved(env, next_type_id))
-			next_type = btf_type_id_resolve(btf, &next_type_id);
-
-		/* "typedef void new_void", "const void", etc. */
-		if (!btf_type_is_void(next_type) &&
-		    !btf_type_is_fwd(next_type)) {
-			btf_verifier_log_type(env, v->t, "Invalid type_id");
-			return -EINVAL;
-		}
-	}
-
-	env_stack_pop_resolved(env, next_type_id, next_type_size);
-
-	return 0;
-}
-
-static int btf_ptr_resolve(struct btf_verifier_env *env,
-			   const struct resolve_vertex *v)
-{
-	const struct btf_type *next_type;
-	const struct btf_type *t = v->t;
-	u32 next_type_id = t->type;
-	struct btf *btf = env->btf;
-
-	next_type = btf_type_by_id(btf, next_type_id);
-	if (!next_type) {
-		btf_verifier_log_type(env, v->t, "Invalid type_id");
-		return -EINVAL;
-	}
-
-	if (!env_type_is_resolve_sink(env, next_type) &&
-	    !env_type_is_resolved(env, next_type_id))
-		return env_stack_push(env, next_type, next_type_id);
-
-	/* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY,
-	 * the modifier may have stopped resolving when it was resolved
-	 * to a ptr (last-resolved-ptr).
-	 *
-	 * We now need to continue from the last-resolved-ptr to
-	 * ensure the last-resolved-ptr does not refer back to
-	 * the current ptr (t).
-	 */
-	if (btf_type_is_modifier(next_type)) {
-		const struct btf_type *resolved_type;
-		u32 resolved_type_id;
-
-		resolved_type_id = next_type_id;
-		resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
-
-		if (btf_type_is_ptr(resolved_type) &&
-		    !env_type_is_resolve_sink(env, resolved_type) &&
-		    !env_type_is_resolved(env, resolved_type_id))
-			return env_stack_push(env, resolved_type,
-					      resolved_type_id);
-	}
-
-	if (!btf_type_id_size(btf, &next_type_id, NULL)) {
-		if (env_type_is_resolved(env, next_type_id))
-			next_type = btf_type_id_resolve(btf, &next_type_id);
-
-		if (!btf_type_is_void(next_type) &&
-		    !btf_type_is_fwd(next_type) &&
-		    !btf_type_is_func_proto(next_type)) {
-			btf_verifier_log_type(env, v->t, "Invalid type_id");
-			return -EINVAL;
-		}
-	}
-
-	env_stack_pop_resolved(env, next_type_id, 0);
-
-	return 0;
-}
-
-static void btf_modifier_seq_show(const struct btf *btf,
-				  const struct btf_type *t,
-				  u32 type_id, void *data,
-				  u8 bits_offset, struct seq_file *m)
-{
-	t = btf_type_id_resolve(btf, &type_id);
-
-	btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
-}
-
-static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,
-			     u32 type_id, void *data, u8 bits_offset,
-			     struct seq_file *m)
-{
-	/* It is a hashed value */
-	seq_printf(m, "%p", *(void **)data);
-}
-
-static void btf_ref_type_log(struct btf_verifier_env *env,
-			     const struct btf_type *t)
-{
-	btf_verifier_log(env, "type_id=%u", t->type);
-}
-
-static struct btf_kind_operations modifier_ops = {
-	.check_meta = btf_ref_type_check_meta,
-	.resolve = btf_modifier_resolve,
-	.check_member = btf_modifier_check_member,
-	.check_kflag_member = btf_modifier_check_kflag_member,
-	.log_details = btf_ref_type_log,
-	.seq_show = btf_modifier_seq_show,
-};
-
-static struct btf_kind_operations ptr_ops = {
-	.check_meta = btf_ref_type_check_meta,
-	.resolve = btf_ptr_resolve,
-	.check_member = btf_ptr_check_member,
-	.check_kflag_member = btf_generic_check_kflag_member,
-	.log_details = btf_ref_type_log,
-	.seq_show = btf_ptr_seq_show,
-};
-
-static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
-			      const struct btf_type *t,
-			      u32 meta_left)
-{
-	if (btf_type_vlen(t)) {
-		btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL; - } - - if (t->type) { - btf_verifier_log_type(env, t, "type != 0"); - return -EINVAL; - } - - btf_verifier_log_type(env, t, NULL); - - return 0; -} - -static struct btf_kind_operations fwd_ops = { - .check_meta = btf_fwd_check_meta, - .resolve = btf_df_resolve, - .check_member = btf_df_check_member, - .check_kflag_member = btf_df_check_kflag_member, - .log_details = btf_ref_type_log, - .seq_show = btf_df_seq_show, -}; - -static int btf_array_check_member(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct btf_member *member, - const struct btf_type *member_type) -{ - u32 struct_bits_off = member->offset; - u32 struct_size, bytes_offset; - u32 array_type_id, array_size; - struct btf *btf = env->btf; - - if (BITS_PER_BYTE_MASKED(struct_bits_off)) { - btf_verifier_log_member(env, struct_type, member, - "Member is not byte aligned"); - return -EINVAL; - } - - array_type_id = member->type; - btf_type_id_size(btf, &array_type_id, &array_size); - struct_size = struct_type->size; - bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); - if (struct_size - bytes_offset < array_size) { - btf_verifier_log_member(env, struct_type, member, - "Member exceeds struct_size"); - return -EINVAL; - } - - return 0; -} - -static s32 btf_array_check_meta(struct btf_verifier_env *env, - const struct btf_type *t, - u32 meta_left) -{ - const struct btf_array *array = btf_type_array(t); - u32 meta_needed = sizeof(*array); - - if (meta_left < meta_needed) { - btf_verifier_log_basic(env, t, - "meta_left:%u meta_needed:%u", - meta_left, meta_needed); - return -EINVAL; - } - - if (btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen != 0"); - return -EINVAL; - } - - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - - if (t->size) { - btf_verifier_log_type(env, t, "size != 0"); - return -EINVAL; - } - - /* Array elem type and index type cannot be in type void, - * so !array->type and !array->index_type are not allowed. 
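- *
- * (Editor's note -- an illustrative example, not part of the original
- * file: an ARRAY entry for "int a[8]" must carry a valid non-zero
- * elem type_id for "int" and a valid non-zero index type_id; a 0 in
- * either field would mean "void" and is rejected below.)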
- */ - if (!array->type || !BTF_TYPE_ID_VALID(array->type)) { - btf_verifier_log_type(env, t, "Invalid elem"); - return -EINVAL; - } - - if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) { - btf_verifier_log_type(env, t, "Invalid index"); - return -EINVAL; - } - - btf_verifier_log_type(env, t, NULL); - - return meta_needed; -} - -static int btf_array_resolve(struct btf_verifier_env *env, - const struct resolve_vertex *v) -{ - const struct btf_array *array = btf_type_array(v->t); - const struct btf_type *elem_type, *index_type; - u32 elem_type_id, index_type_id; - struct btf *btf = env->btf; - u32 elem_size; - - /* Check array->index_type */ - index_type_id = array->index_type; - index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_nosize_or_null(index_type)) { - btf_verifier_log_type(env, v->t, "Invalid index"); - return -EINVAL; - } - - if (!env_type_is_resolve_sink(env, index_type) && - !env_type_is_resolved(env, index_type_id)) - return env_stack_push(env, index_type, index_type_id); - - index_type = btf_type_id_size(btf, &index_type_id, NULL); - if (!index_type || !btf_type_is_int(index_type) || - !btf_type_int_is_regular(index_type)) { - btf_verifier_log_type(env, v->t, "Invalid index"); - return -EINVAL; - } - - /* Check array->type */ - elem_type_id = array->type; - elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_nosize_or_null(elem_type)) { - btf_verifier_log_type(env, v->t, - "Invalid elem"); - return -EINVAL; - } - - if (!env_type_is_resolve_sink(env, elem_type) && - !env_type_is_resolved(env, elem_type_id)) - return env_stack_push(env, elem_type, elem_type_id); - - elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); - if (!elem_type) { - btf_verifier_log_type(env, v->t, "Invalid elem"); - return -EINVAL; - } - - if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) { - btf_verifier_log_type(env, v->t, "Invalid array of int"); - return -EINVAL; - } - - if (array->nelems && elem_size > U32_MAX / array->nelems) { - btf_verifier_log_type(env, v->t, - "Array size overflows U32_MAX"); - return -EINVAL; - } - - env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems); - - return 0; -} - -static void btf_array_log(struct btf_verifier_env *env, - const struct btf_type *t) -{ - const struct btf_array *array = btf_type_array(t); - - btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u", - array->type, array->index_type, array->nelems); -} - -static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) -{ - const struct btf_array *array = btf_type_array(t); - const struct btf_kind_operations *elem_ops; - const struct btf_type *elem_type; - u32 i, elem_size, elem_type_id; - - elem_type_id = array->type; - elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); - elem_ops = btf_type_ops(elem_type); - seq_puts(m, "["); - for (i = 0; i < array->nelems; i++) { - if (i) - seq_puts(m, ","); - - elem_ops->seq_show(btf, elem_type, elem_type_id, data, - bits_offset, m); - data += elem_size; - } - seq_puts(m, "]"); -} - -static struct btf_kind_operations array_ops = { - .check_meta = btf_array_check_meta, - .resolve = btf_array_resolve, - .check_member = btf_array_check_member, - .check_kflag_member = btf_generic_check_kflag_member, - .log_details = btf_array_log, - .seq_show = btf_array_seq_show, -}; - -static int btf_struct_check_member(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct 
btf_member *member,
-				   const struct btf_type *member_type)
-{
-	u32 struct_bits_off = member->offset;
-	u32 struct_size, bytes_offset;
-
-	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
-		btf_verifier_log_member(env, struct_type, member,
-					"Member is not byte aligned");
-		return -EINVAL;
-	}
-
-	struct_size = struct_type->size;
-	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
-	if (struct_size - bytes_offset < member_type->size) {
-		btf_verifier_log_member(env, struct_type, member,
-					"Member exceeds struct_size");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static s32 btf_struct_check_meta(struct btf_verifier_env *env,
-				 const struct btf_type *t,
-				 u32 meta_left)
-{
-	bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION;
-	const struct btf_member *member;
-	u32 meta_needed, last_offset;
-	struct btf *btf = env->btf;
-	u32 struct_size = t->size;
-	u32 offset;
-	u16 i;
-
-	meta_needed = btf_type_vlen(t) * sizeof(*member);
-	if (meta_left < meta_needed) {
-		btf_verifier_log_basic(env, t,
-				       "meta_left:%u meta_needed:%u",
-				       meta_left, meta_needed);
-		return -EINVAL;
-	}
-
-	btf_verifier_log_type(env, t, NULL);
-
-	last_offset = 0;
-	for_each_member(i, t, member) {
-		if (!btf_name_offset_valid(btf, member->name_off)) {
-			btf_verifier_log_member(env, t, member,
-						"Invalid member name_offset:%u",
-						member->name_off);
-			return -EINVAL;
-		}
-
-		/* A member cannot be of type void */
-		if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {
-			btf_verifier_log_member(env, t, member,
-						"Invalid type_id");
-			return -EINVAL;
-		}
-
-		offset = btf_member_bit_offset(t, member);
-		if (is_union && offset) {
-			btf_verifier_log_member(env, t, member,
-						"Invalid member bits_offset");
-			return -EINVAL;
-		}
-
-		/*
-		 * ">" instead of ">=" because the last member could be
-		 * "char a[0];"
-		 */
-		if (last_offset > offset) {
-			btf_verifier_log_member(env, t, member,
-						"Invalid member bits_offset");
-			return -EINVAL;
-		}
-
-		if (BITS_ROUNDUP_BYTES(offset) > struct_size) {
-			btf_verifier_log_member(env, t, member,
-						"Member bits_offset exceeds its struct size");
-			return -EINVAL;
-		}
-
-		btf_verifier_log_member(env, t, member, NULL);
-		last_offset = offset;
-	}
-
-	return meta_needed;
-}
-
-static int btf_struct_resolve(struct btf_verifier_env *env,
-			      const struct resolve_vertex *v)
-{
-	const struct btf_member *member;
-	int err;
-	u16 i;
-
-	/* Before continuing to resolve the next_member,
-	 * ensure the last member is indeed resolved to a
-	 * type with size info.
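- *
- * (Editor's note -- an illustrative sketch, not part of the original
- * file: for "struct A { struct B b; int i; };", resolving member
- * "b" may push struct B onto the stack with next_member set to 1;
- * when struct A is revisited, the code below re-checks "b" before
- * moving on to "i".)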
- */ - if (v->next_member) { - const struct btf_type *last_member_type; - const struct btf_member *last_member; - u32 last_member_type_id; - - last_member = btf_type_member(v->t) + v->next_member - 1; - last_member_type_id = last_member->type; - if (WARN_ON_ONCE(!env_type_is_resolved(env, - last_member_type_id))) - return -EINVAL; - - last_member_type = btf_type_by_id(env->btf, - last_member_type_id); - if (btf_type_kflag(v->t)) - err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t, - last_member, - last_member_type); - else - err = btf_type_ops(last_member_type)->check_member(env, v->t, - last_member, - last_member_type); - if (err) - return err; - } - - for_each_member_from(i, v->next_member, v->t, member) { - u32 member_type_id = member->type; - const struct btf_type *member_type = btf_type_by_id(env->btf, - member_type_id); - - if (btf_type_nosize_or_null(member_type)) { - btf_verifier_log_member(env, v->t, member, - "Invalid member"); - return -EINVAL; - } - - if (!env_type_is_resolve_sink(env, member_type) && - !env_type_is_resolved(env, member_type_id)) { - env_stack_set_next_member(env, i + 1); - return env_stack_push(env, member_type, member_type_id); - } - - if (btf_type_kflag(v->t)) - err = btf_type_ops(member_type)->check_kflag_member(env, v->t, - member, - member_type); - else - err = btf_type_ops(member_type)->check_member(env, v->t, - member, - member_type); - if (err) - return err; - } - - env_stack_pop_resolved(env, 0, 0); - - return 0; -} - -static void btf_struct_log(struct btf_verifier_env *env, - const struct btf_type *t) -{ - btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); -} - -/* find 'struct bpf_spin_lock' in map value. - * return >= 0 offset if found - * and < 0 in case of error - */ -int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) -{ - const struct btf_member *member; - u32 i, off = -ENOENT; - if (!__btf_type_is_struct(t)) - return -EINVAL; - for_each_member(i, t, member) { - const struct btf_type *member_type = btf_type_by_id(btf, - member->type); - if (!__btf_type_is_struct(member_type)) - continue; - if (member_type->size != sizeof(struct bpf_spin_lock)) - continue; - if (strcmp(__btf_name_by_offset(btf, member_type->name_off), - "bpf_spin_lock")) - continue; - if (off != -ENOENT) - /* only one 'struct bpf_spin_lock' is allowed */ - return -E2BIG; - off = btf_member_bit_offset(t, member); - if (off % 8) - /* valid C code cannot generate such BTF */ - return -EINVAL; - off /= 8; - if (off % __alignof__(struct bpf_spin_lock)) - /* valid struct bpf_spin_lock will be 4 byte aligned */ - return -EINVAL; - } - return off; -} - -static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) -{ - const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? 
"|" : ","; - const struct btf_member *member; - u32 i; - - seq_puts(m, "{"); - for_each_member(i, t, member) { - const struct btf_type *member_type = btf_type_by_id(btf, - member->type); - const struct btf_kind_operations *ops; - u32 member_offset, bitfield_size; - u32 bytes_offset; - u8 bits8_offset; - - if (i) - seq_puts(m, seq); - - member_offset = btf_member_bit_offset(t, member); - bitfield_size = btf_member_bitfield_size(t, member); - if (bitfield_size) { - btf_bitfield_seq_show(data, member_offset, - bitfield_size, m); - } else { - bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); - bits8_offset = BITS_PER_BYTE_MASKED(member_offset); - ops = btf_type_ops(member_type); - ops->seq_show(btf, member_type, member->type, - data + bytes_offset, bits8_offset, m); - } - } - seq_puts(m, "}"); -} - -static struct btf_kind_operations struct_ops = { - .check_meta = btf_struct_check_meta, - .resolve = btf_struct_resolve, - .check_member = btf_struct_check_member, - .check_kflag_member = btf_generic_check_kflag_member, - .log_details = btf_struct_log, - .seq_show = btf_struct_seq_show, -}; - -static int btf_enum_check_member(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct btf_member *member, - const struct btf_type *member_type) -{ - u32 struct_bits_off = member->offset; - u32 struct_size, bytes_offset; - - if (BITS_PER_BYTE_MASKED(struct_bits_off)) { - btf_verifier_log_member(env, struct_type, member, - "Member is not byte aligned"); - return -EINVAL; - } - - struct_size = struct_type->size; - bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); - if (struct_size - bytes_offset < member_type->size) { - btf_verifier_log_member(env, struct_type, member, - "Member exceeds struct_size"); - return -EINVAL; - } - - return 0; -} - -static int btf_enum_check_kflag_member(struct btf_verifier_env *env, - const struct btf_type *struct_type, - const struct btf_member *member, - const struct btf_type *member_type) -{ - u32 struct_bits_off, nr_bits, bytes_end, struct_size; - u32 int_bitsize = sizeof(int) * BITS_PER_BYTE; - struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); - nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); - if (!nr_bits) { - if (BITS_PER_BYTE_MASKED(struct_bits_off)) { - btf_verifier_log_member(env, struct_type, member, - "Member is not byte aligned"); - return -EINVAL; - } - nr_bits = int_bitsize; - } else if (nr_bits > int_bitsize) { - btf_verifier_log_member(env, struct_type, member, - "Invalid member bitfield_size"); - return -EINVAL; - } - struct_size = struct_type->size; - bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits); - if (struct_size < bytes_end) { - btf_verifier_log_member(env, struct_type, member, - "Member exceeds struct_size"); - return -EINVAL; - } - return 0; -} - -static s32 btf_enum_check_meta(struct btf_verifier_env *env, - const struct btf_type *t, - u32 meta_left) -{ - const struct btf_enum *enums = btf_type_enum(t); - struct btf *btf = env->btf; - u16 i, nr_enums; - u32 meta_needed; - - nr_enums = btf_type_vlen(t); - meta_needed = nr_enums * sizeof(*enums); - - if (meta_left < meta_needed) { - btf_verifier_log_basic(env, t, - "meta_left:%u meta_needed:%u", - meta_left, meta_needed); - return -EINVAL; - } - - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - - if (t->size != sizeof(int)) { - btf_verifier_log_type(env, t, "Expected size:%zu", - sizeof(int)); - return -EINVAL; - } - - btf_verifier_log_type(env, t, NULL); - - for (i = 0; i < nr_enums; i++) 
{ - if (!btf_name_offset_valid(btf, enums[i].name_off)) { - btf_verifier_log(env, "\tInvalid name_offset:%u", - enums[i].name_off); - return -EINVAL; - } - - btf_verifier_log(env, "\t%s val=%d\n", - __btf_name_by_offset(btf, enums[i].name_off), - enums[i].val); - } - - return meta_needed; -} - -static void btf_enum_log(struct btf_verifier_env *env, - const struct btf_type *t) -{ - btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); -} - -static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) -{ - const struct btf_enum *enums = btf_type_enum(t); - u32 i, nr_enums = btf_type_vlen(t); - int v = *(int *)data; - - for (i = 0; i < nr_enums; i++) { - if (v == enums[i].val) { - seq_printf(m, "%s", - __btf_name_by_offset(btf, enums[i].name_off)); - return; - } - } - - seq_printf(m, "%d", v); -} - -static struct btf_kind_operations enum_ops = { - .check_meta = btf_enum_check_meta, - .resolve = btf_df_resolve, - .check_member = btf_enum_check_member, - .check_kflag_member = btf_enum_check_kflag_member, - .log_details = btf_enum_log, - .seq_show = btf_enum_seq_show, -}; - -static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, - const struct btf_type *t, - u32 meta_left) -{ - u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param); - - if (meta_left < meta_needed) { - btf_verifier_log_basic(env, t, - "meta_left:%u meta_needed:%u", - meta_left, meta_needed); - return -EINVAL; - } - - if (t->name_off) { - btf_verifier_log_type(env, t, "Invalid name"); - return -EINVAL; - } - - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - - btf_verifier_log_type(env, t, NULL); - return meta_needed; -} - -static void btf_func_proto_log(struct btf_verifier_env *env, - const struct btf_type *t) -{ - const struct btf_param *args = (const struct btf_param *)(t + 1); - u16 nr_args = btf_type_vlen(t), i; - btf_verifier_log(env, "return=%u args=(", t->type); - - if (!nr_args) { - btf_verifier_log(env, "void"); - goto done; - } - - if (nr_args == 1 && !args[0].type) { - /* Only one vararg */ - btf_verifier_log(env, "vararg"); - goto done; - } - - btf_verifier_log(env, "%u %s", args[0].type, - __btf_name_by_offset(env->btf, - args[0].name_off)); - for (i = 1; i < nr_args - 1; i++) - btf_verifier_log(env, ", %u %s", args[i].type, - __btf_name_by_offset(env->btf, - args[i].name_off)); - if (nr_args > 1) { - const struct btf_param *last_arg = &args[nr_args - 1]; - if (last_arg->type) - btf_verifier_log(env, ", %u %s", last_arg->type, - __btf_name_by_offset(env->btf, - last_arg->name_off)); - else - btf_verifier_log(env, ", vararg"); - } -done: - btf_verifier_log(env, ")"); -} - -static struct btf_kind_operations func_proto_ops = { - .check_meta = btf_func_proto_check_meta, - .resolve = btf_df_resolve, - /* - * BTF_KIND_FUNC_PROTO cannot be directly referred by - * a struct's member. - * - * It should be a funciton pointer instead. - * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO) - * - * Hence, there is no btf_func_check_member(). 
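btf_func_proto_log() above and the func proto checks later in this patch share one convention: in the btf_param array that follows a FUNC_PROTO, a trailing "..." is a final entry whose type and name_off are both zero. A small sketch of decoding that tail; the struct layout follows the BTF UAPI and the sample values are invented:

#include <stdio.h>

struct btf_param {
        unsigned int name_off;
        unsigned int type;
};

/* number of fixed parameters, treating a {0, 0} tail as "..." */
static unsigned int nr_fixed_args(const struct btf_param *args,
                                  unsigned int vlen)
{
        if (vlen && !args[vlen - 1].type)
                return vlen - 1;        /* drop the vararg marker */
        return vlen;
}

int main(void)
{
        /* something like foo(a, b, ...): three entries, last is the tail */
        const struct btf_param args[] = { { 1, 4 }, { 3, 6 }, { 0, 0 } };

        printf("fixed args: %u\n", nr_fixed_args(args, 3));
        return 0;
}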
- */ - .check_member = btf_df_check_member, - .check_kflag_member = btf_df_check_kflag_member, - .log_details = btf_func_proto_log, - .seq_show = btf_df_seq_show, -}; - -static s32 btf_func_check_meta(struct btf_verifier_env *env, - const struct btf_type *t, - u32 meta_left) -{ - if (!t->name_off || - !btf_name_valid_identifier(env->btf, t->name_off)) { - btf_verifier_log_type(env, t, "Invalid name"); - return -EINVAL; - } - - if (btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen != 0"); - return -EINVAL; - } - - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - - btf_verifier_log_type(env, t, NULL); - return 0; -} - -static struct btf_kind_operations func_ops = { - .check_meta = btf_func_check_meta, - .resolve = btf_df_resolve, - .check_member = btf_df_check_member, - .check_kflag_member = btf_df_check_kflag_member, - .log_details = btf_ref_type_log, - .seq_show = btf_df_seq_show, -}; - -static int btf_func_proto_check(struct btf_verifier_env *env, - const struct btf_type *t) -{ - const struct btf_type *ret_type; - const struct btf_param *args; - const struct btf *btf; - u16 nr_args, i; - int err; - btf = env->btf; - args = (const struct btf_param *)(t + 1); - nr_args = btf_type_vlen(t); - - /* Check func return type which could be "void" (t->type == 0) */ - if (t->type) { - u32 ret_type_id = t->type; - ret_type = btf_type_by_id(btf, ret_type_id); - if (!ret_type) { - btf_verifier_log_type(env, t, "Invalid return type"); - return -EINVAL; - } - if (btf_type_needs_resolve(ret_type) && - !env_type_is_resolved(env, ret_type_id)) { - err = btf_resolve(env, ret_type, ret_type_id); - if (err) - return err; - } - /* Ensure the return type is a type that has a size */ - if (!btf_type_id_size(btf, &ret_type_id, NULL)) { - btf_verifier_log_type(env, t, "Invalid return type"); - return -EINVAL; - } - } - - if (!nr_args) - return 0; - - /* Last func arg type_id could be 0 if it is a vararg */ - if (!args[nr_args - 1].type) { - if (args[nr_args - 1].name_off) { - btf_verifier_log_type(env, t, "Invalid arg#%u", - nr_args); - return -EINVAL; - } - nr_args--; - } - - err = 0; - for (i = 0; i < nr_args; i++) { - const struct btf_type *arg_type; - u32 arg_type_id; - arg_type_id = args[i].type; - arg_type = btf_type_by_id(btf, arg_type_id); - if (!arg_type) { - btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); - err = -EINVAL; - break; - } - - if (args[i].name_off && - (!btf_name_offset_valid(btf, args[i].name_off) || - !btf_name_valid_identifier(btf, args[i].name_off))) { - btf_verifier_log_type(env, t, - "Invalid arg#%u", i + 1); - err = -EINVAL; - break; - } - - if (btf_type_needs_resolve(arg_type) && - !env_type_is_resolved(env, arg_type_id)) { - err = btf_resolve(env, arg_type, arg_type_id); - if (err) - break; - } - - if (!btf_type_id_size(btf, &arg_type_id, NULL)) { - btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); - err = -EINVAL; - break; - } - } - return err; -} -static int btf_func_check(struct btf_verifier_env *env, - const struct btf_type *t) -{ - const struct btf_type *proto_type; - const struct btf_param *args; - const struct btf *btf; - u16 nr_args, i; - btf = env->btf; - proto_type = btf_type_by_id(btf, t->type); - - if (!proto_type || !btf_type_is_func_proto(proto_type)) { - btf_verifier_log_type(env, t, "Invalid type_id"); - return -EINVAL; - } - - args = (const struct btf_param *)(proto_type + 1); - nr_args = btf_type_vlen(proto_type); - for (i = 0; i < nr_args; i++) { - if (!args[i].name_off && 
args[i].type) { - btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); - return -EINVAL; - } - } - return 0; -} - -static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { - [BTF_KIND_INT] = &int_ops, - [BTF_KIND_PTR] = &ptr_ops, - [BTF_KIND_ARRAY] = &array_ops, - [BTF_KIND_STRUCT] = &struct_ops, - [BTF_KIND_UNION] = &struct_ops, - [BTF_KIND_ENUM] = &enum_ops, - [BTF_KIND_FWD] = &fwd_ops, - [BTF_KIND_TYPEDEF] = &modifier_ops, - [BTF_KIND_VOLATILE] = &modifier_ops, - [BTF_KIND_CONST] = &modifier_ops, - [BTF_KIND_RESTRICT] = &modifier_ops, - [BTF_KIND_FUNC] = &func_ops, - [BTF_KIND_FUNC_PROTO] = &func_proto_ops, -}; - -static s32 btf_check_meta(struct btf_verifier_env *env, - const struct btf_type *t, - u32 meta_left) -{ - u32 saved_meta_left = meta_left; - s32 var_meta_size; - - if (meta_left < sizeof(*t)) { - btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu", - env->log_type_id, meta_left, sizeof(*t)); - return -EINVAL; - } - meta_left -= sizeof(*t); - - if (t->info & ~BTF_INFO_MASK) { - btf_verifier_log(env, "[%u] Invalid btf_info:%x", - env->log_type_id, t->info); - return -EINVAL; - } - - if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || - BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { - btf_verifier_log(env, "[%u] Invalid kind:%u", - env->log_type_id, BTF_INFO_KIND(t->info)); - return -EINVAL; - } - - if (!btf_name_offset_valid(env->btf, t->name_off)) { - btf_verifier_log(env, "[%u] Invalid name_offset:%u", - env->log_type_id, t->name_off); - return -EINVAL; - } - - var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left); - if (var_meta_size < 0) - return var_meta_size; - - meta_left -= var_meta_size; - - return saved_meta_left - meta_left; -} - -static int btf_check_all_metas(struct btf_verifier_env *env) -{ - struct btf *btf = env->btf; - struct btf_header *hdr; - void *cur, *end; - - hdr = &btf->hdr; - cur = btf->nohdr_data + hdr->type_off; - end = cur + hdr->type_len; - - env->log_type_id = 1; - while (cur < end) { - struct btf_type *t = cur; - s32 meta_size; - - meta_size = btf_check_meta(env, t, end - cur); - if (meta_size < 0) - return meta_size; - - btf_add_type(env, t); - cur += meta_size; - env->log_type_id++; - } - - return 0; -} - -static bool btf_resolve_valid(struct btf_verifier_env *env, - const struct btf_type *t, - u32 type_id) -{ - struct btf *btf = env->btf; - - if (!env_type_is_resolved(env, type_id)) - return false; - - if (btf_type_is_struct(t)) - return !btf->resolved_ids[type_id] && - !btf->resolved_sizes[type_id]; - - if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) { - t = btf_type_id_resolve(btf, &type_id); - return t && !btf_type_is_modifier(t); - } - - if (btf_type_is_array(t)) { - const struct btf_array *array = btf_type_array(t); - const struct btf_type *elem_type; - u32 elem_type_id = array->type; - u32 elem_size; - - elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); - return elem_type && !btf_type_is_modifier(elem_type) && - (array->nelems * elem_size == - btf->resolved_sizes[type_id]); - } - - return false; -} - -static int btf_resolve(struct btf_verifier_env *env, - const struct btf_type *t, u32 type_id) -{ - u32 save_log_type_id = env->log_type_id; - const struct resolve_vertex *v; - int err = 0; - - env->resolve_mode = RESOLVE_TBD; - env_stack_push(env, t, type_id); - - while (!err && (v = env_stack_peak(env))) { - env->log_type_id = v->type_id; - err = btf_type_ops(v->t)->resolve(env, v); - } - - env->log_type_id = type_id; - if (err == -E2BIG) { - btf_verifier_log_type(env, t, - "Exceeded max resolving 
depth:%u", - MAX_RESOLVE_DEPTH); - } else if (err == -EEXIST) { - btf_verifier_log_type(env, t, "Loop detected"); - } - - /* Final sanity check */ - if (!err && !btf_resolve_valid(env, t, type_id)) { - btf_verifier_log_type(env, t, "Invalid resolve state"); - err = -EINVAL; - } - - env->log_type_id = save_log_type_id; - return err; -} - -static int btf_check_all_types(struct btf_verifier_env *env) -{ - struct btf *btf = env->btf; - u32 type_id; - int err; - - err = env_resolve_init(env); - if (err) - return err; - - env->phase++; - for (type_id = 1; type_id <= btf->nr_types; type_id++) { - const struct btf_type *t = btf_type_by_id(btf, type_id); - - env->log_type_id = type_id; - if (btf_type_needs_resolve(t) && - !env_type_is_resolved(env, type_id)) { - err = btf_resolve(env, t, type_id); - if (err) - return err; - } - - if (btf_type_is_func_proto(t)) { - err = btf_func_proto_check(env, t); - if (err) - return err; - } - - if (btf_type_is_func(t)) { - err = btf_func_check(env, t); - if (err) - return err; - } - } - - return 0; -} - -static int btf_parse_type_sec(struct btf_verifier_env *env) -{ - const struct btf_header *hdr = &env->btf->hdr; - int err; - - /* Type section must align to 4 bytes */ - if (hdr->type_off & (sizeof(u32) - 1)) { - btf_verifier_log(env, "Unaligned type_off"); - return -EINVAL; - } - - if (!hdr->type_len) { - btf_verifier_log(env, "No type found"); - return -EINVAL; - } - - err = btf_check_all_metas(env); - if (err) - return err; - - return btf_check_all_types(env); -} - -static int btf_parse_str_sec(struct btf_verifier_env *env) -{ - const struct btf_header *hdr; - struct btf *btf = env->btf; - const char *start, *end; - - hdr = &btf->hdr; - start = btf->nohdr_data + hdr->str_off; - end = start + hdr->str_len; - - if (end != btf->data + btf->data_size) { - btf_verifier_log(env, "String section is not at the end"); - return -EINVAL; - } - - if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || - start[0] || end[-1]) { - btf_verifier_log(env, "Invalid string section"); - return -EINVAL; - } - - btf->strings = start; - - return 0; -} - -static const size_t btf_sec_info_offset[] = { - offsetof(struct btf_header, type_off), - offsetof(struct btf_header, str_off), -}; - -static int btf_sec_info_cmp(const void *a, const void *b) -{ - const struct btf_sec_info *x = a; - const struct btf_sec_info *y = b; - - return (int)(x->off - y->off) ? 
: (int)(x->len - y->len); -} - -static int btf_check_sec_info(struct btf_verifier_env *env, - u32 btf_data_size) -{ - struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)]; - u32 total, expected_total, i; - const struct btf_header *hdr; - const struct btf *btf; - - btf = env->btf; - hdr = &btf->hdr; - - /* Populate the secs from hdr */ - for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) - secs[i] = *(struct btf_sec_info *)((void *)hdr + - btf_sec_info_offset[i]); - - sort(secs, ARRAY_SIZE(btf_sec_info_offset), - sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL); - - /* Check for gaps and overlap among sections */ - total = 0; - expected_total = btf_data_size - hdr->hdr_len; - for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) { - if (expected_total < secs[i].off) { - btf_verifier_log(env, "Invalid section offset"); - return -EINVAL; - } - if (total < secs[i].off) { - /* gap */ - btf_verifier_log(env, "Unsupported section found"); - return -EINVAL; - } - if (total > secs[i].off) { - btf_verifier_log(env, "Section overlap found"); - return -EINVAL; - } - if (expected_total - total < secs[i].len) { - btf_verifier_log(env, - "Total section length too long"); - return -EINVAL; - } - total += secs[i].len; - } - - /* There is data other than hdr and known sections */ - if (expected_total != total) { - btf_verifier_log(env, "Unsupported section found"); - return -EINVAL; - } - - return 0; -} - -static int btf_parse_hdr(struct btf_verifier_env *env) -{ - u32 hdr_len, hdr_copy, btf_data_size; - const struct btf_header *hdr; - struct btf *btf; - int err; - - btf = env->btf; - btf_data_size = btf->data_size; - - if (btf_data_size < - offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { - btf_verifier_log(env, "hdr_len not found"); - return -EINVAL; - } - - hdr = btf->data; - hdr_len = hdr->hdr_len; - if (btf_data_size < hdr_len) { - btf_verifier_log(env, "btf_header not found"); - return -EINVAL; - } - - /* Ensure the unsupported header fields are zero */ - if (hdr_len > sizeof(btf->hdr)) { - u8 *expected_zero = btf->data + sizeof(btf->hdr); - u8 *end = btf->data + hdr_len; - - for (; expected_zero < end; expected_zero++) { - if (*expected_zero) { - btf_verifier_log(env, "Unsupported btf_header"); - return -E2BIG; - } - } - } - - hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); - memcpy(&btf->hdr, btf->data, hdr_copy); - - hdr = &btf->hdr; - - if (hdr->hdr_len != hdr_len) - return -EINVAL; - - btf_verifier_log_hdr(env, btf_data_size); - - if (hdr->magic != BTF_MAGIC) { - btf_verifier_log(env, "Invalid magic"); - return -EINVAL; - } - - if (hdr->version != BTF_VERSION) { - btf_verifier_log(env, "Unsupported version"); - return -ENOTSUPP; - } - - if (hdr->flags) { - btf_verifier_log(env, "Unsupported flags"); - return -ENOTSUPP; - } - - if (btf_data_size == hdr->hdr_len) { - btf_verifier_log(env, "No data"); - return -EINVAL; - } - - err = btf_check_sec_info(env, btf_data_size); - if (err) - return err; - - return 0; -} - -static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, - u32 log_level, char __user *log_ubuf, u32 log_size) -{ - struct btf_verifier_env *env = NULL; - struct bpf_verifier_log *log; - struct btf *btf = NULL; - u8 *data; - int err; - - if (btf_data_size > BTF_MAX_SIZE) - return ERR_PTR(-E2BIG); - - env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); - if (!env) - return ERR_PTR(-ENOMEM); - - log = &env->log; - if (log_level || log_ubuf || log_size) { - /* user requested verbose verifier output - * and supplied buffer to store the 
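btf_check_sec_info() above turns gap and overlap detection into a single linear pass: sort the (off, len) pairs by offset, then insist that each section starts exactly where the running total left off and that the totals cover the whole data area. A standalone sketch of the invariant with two hard-coded sections (sizes invented; the kernel version reports gaps and overlaps separately):

#include <stdio.h>
#include <stdlib.h>

struct sec_info { unsigned int off, len; };

static int sec_cmp(const void *a, const void *b)
{
        const struct sec_info *x = a, *y = b;

        return x->off != y->off ? (int)(x->off - y->off)
                                : (int)(x->len - y->len);
}

int main(void)
{
        struct sec_info secs[] = { { 40, 24 }, { 0, 40 } }; /* str, type */
        unsigned int total = 0, expected_total = 64, i;

        qsort(secs, 2, sizeof(secs[0]), sec_cmp);
        for (i = 0; i < 2; i++) {
                if (secs[i].off != total) {     /* gap or overlap */
                        printf("bad layout\n");
                        return 1;
                }
                total += secs[i].len;
        }
        printf("%s\n", total == expected_total ? "ok" : "trailing data");
        return 0;
}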
verification trace - */ - log->level = log_level; - log->ubuf = log_ubuf; - log->len_total = log_size; - - /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || - !log->level || !log->ubuf) { - err = -EINVAL; - goto errout; - } - } - - btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); - if (!btf) { - err = -ENOMEM; - goto errout; - } - env->btf = btf; - - data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); - if (!data) { - err = -ENOMEM; - goto errout; - } - - btf->data = data; - btf->data_size = btf_data_size; - - if (copy_from_user(data, btf_data, btf_data_size)) { - err = -EFAULT; - goto errout; - } - - err = btf_parse_hdr(env); - if (err) - goto errout; - - btf->nohdr_data = btf->data + btf->hdr.hdr_len; - - err = btf_parse_str_sec(env); - if (err) - goto errout; - - err = btf_parse_type_sec(env); - if (err) - goto errout; - - if (log->level && bpf_verifier_log_full(log)) { - err = -ENOSPC; - goto errout; - } - - btf_verifier_env_free(env); - refcount_set(&btf->refcnt, 1); - return btf; - -errout: - btf_verifier_env_free(env); - if (btf) - btf_free(btf); - return ERR_PTR(err); -} - -void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, - struct seq_file *m) -{ - const struct btf_type *t = btf_type_by_id(btf, type_id); - - btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); -} - -static int btf_release(struct inode *inode, struct file *filp) -{ - btf_put(filp->private_data); - return 0; -} - -const struct file_operations btf_fops = { - .release = btf_release, -}; - -static int __btf_new_fd(struct btf *btf) -{ - return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); -} - -int btf_new_fd(const union bpf_attr *attr) -{ - struct btf *btf; - int ret; - - btf = btf_parse(u64_to_user_ptr(attr->btf), - attr->btf_size, attr->btf_log_level, - u64_to_user_ptr(attr->btf_log_buf), - attr->btf_log_size); - if (IS_ERR(btf)) - return PTR_ERR(btf); - - ret = btf_alloc_id(btf); - if (ret) { - btf_free(btf); - return ret; - } - /* - * The BTF ID is published to the userspace. - * All BTF free must go through call_rcu() from - * now on (i.e. free by calling btf_put()). 
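The ordering in btf_new_fd() above is the point of the comment this hunk ends on: once btf_alloc_id() has published the ID, a later failure must drop the reference with btf_put() rather than free directly, since another task may already have looked the ID up. A toy version of that publish-then-put discipline, with a C11 atomic standing in for refcount_t and the fd failure simulated:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        atomic_int refcnt;
};

static void obj_put(struct obj *o)
{
        /* last reference frees; the kernel defers this via call_rcu() */
        if (atomic_fetch_sub(&o->refcnt, 1) == 1) {
                printf("last ref gone, freeing\n");
                free(o);
        }
}

int main(void)
{
        struct obj *o = malloc(sizeof(*o));
        int fd = -1;                    /* pretend anon-fd creation failed */

        if (!o)
                return 1;
        atomic_init(&o->refcnt, 1);
        /* ... object "published" to an ID table here ... */
        if (fd < 0)
                obj_put(o);             /* not free(): lookups may hold refs */
        return 0;
}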
- */ - ret = __btf_new_fd(btf); - if (ret < 0) - btf_put(btf); - - return ret; -} - -struct btf *btf_get_by_fd(int fd) -{ - struct btf *btf; - struct fd f; - - f = fdget(fd); - - if (!f.file) - return ERR_PTR(-EBADF); - - if (f.file->f_op != &btf_fops) { - fdput(f); - return ERR_PTR(-EINVAL); - } - - btf = f.file->private_data; - refcount_inc(&btf->refcnt); - fdput(f); - - return btf; -} - -int btf_get_info_by_fd(const struct btf *btf, - const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - void __user *udata = u64_to_user_ptr(attr->info.info); - u32 copy_len = min_t(u32, btf->data_size, - attr->info.info_len); - - if (copy_to_user(udata, btf->data, copy_len) || - put_user(btf->data_size, &uattr->info.info_len)) - return -EFAULT; - - return 0; -} - -int btf_get_fd_by_id(u32 id) -{ - struct btf *btf; - int fd; - rcu_read_lock(); - btf = idr_find(&btf_idr, id); - if (!btf || !refcount_inc_not_zero(&btf->refcnt)) - btf = ERR_PTR(-ENOENT); - rcu_read_unlock(); - if (IS_ERR(btf)) - return PTR_ERR(btf); - fd = __btf_new_fd(btf); - if (fd < 0) - btf_put(btf); - return fd; -} -u32 btf_id(const struct btf *btf) -{ - return btf->id; -} diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6e084cfa03b2..3a7964a703aa 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -11,13 +11,10 @@ #include #include #include -#include #include -#include #include #include #include -#include DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); EXPORT_SYMBOL(cgroup_bpf_enabled_key); @@ -28,7 +25,6 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key); */ void cgroup_bpf_put(struct cgroup *cgrp) { - enum bpf_cgroup_storage_type stype; unsigned int type; for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { @@ -38,10 +34,6 @@ void cgroup_bpf_put(struct cgroup *cgrp) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); bpf_prog_put(pl->prog); - for_each_cgroup_storage_type(stype) { - bpf_cgroup_storage_unlink(pl->storage[stype]); - bpf_cgroup_storage_free(pl->storage[stype]); - } kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -103,7 +95,6 @@ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, struct bpf_prog_array __rcu **array) { - enum bpf_cgroup_storage_type stype; struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; @@ -124,20 +115,15 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - continue; - - list_for_each_entry(pl, - &p->bpf.progs[type], node) { - if (!pl->prog) - continue; - progs->items[cnt].prog = pl->prog; - for_each_cgroup_storage_type(stype) - progs->items[cnt].cgroup_storage[stype] = - pl->storage[stype]; - cnt++; - } - } while ((p = cgroup_parent(p))); + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + list_for_each_entry(pl, + &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + progs->progs[cnt++] = pl->prog; + } + p = cgroup_parent(p); + } while (p); rcu_assign_pointer(*array, progs); return 0; @@ -202,9 +188,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], - *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; - enum bpf_cgroup_storage_type stype; struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; @@ -228,59 +211,31 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog 
*prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; - for_each_cgroup_storage_type(stype) { - storage[stype] = bpf_cgroup_storage_alloc(prog, stype); - if (IS_ERR(storage[stype])) { - storage[stype] = NULL; - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -ENOMEM; - } - } - if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) { - if (pl->prog == prog) { + list_for_each_entry(pl, progs, node) + if (pl->prog == prog) /* disallow attaching the same prog twice */ - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); return -EINVAL; - } - } pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) { - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + if (!pl) return -ENOMEM; - } pl_was_allocated = true; pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; list_add_tail(&pl->node, progs); } else { if (list_empty(progs)) { pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) { - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + if (!pl) return -ENOMEM; - } pl_was_allocated = true; list_add_tail(&pl->node, progs); } else { pl = list_first_entry(progs, typeof(*pl), node); old_prog = pl->prog; - for_each_cgroup_storage_type(stype) { - old_storage[stype] = pl->storage[stype]; - bpf_cgroup_storage_unlink(old_storage[stype]); - } pl_was_allocated = false; } pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; } old_flags = cgrp->bpf.flags[type]; @@ -304,17 +259,10 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } static_branch_inc(&cgroup_bpf_enabled_key); - for_each_cgroup_storage_type(stype) { - if (!old_storage[stype]) - continue; - bpf_cgroup_storage_free(old_storage[stype]); - } if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_link(storage[stype], cgrp, type); return 0; cleanup: @@ -328,18 +276,13 @@ cleanup: desc->bpf.inactive = NULL; } - /* and cleanup the prog list */ - pl->prog = old_prog; - for_each_cgroup_storage_type(stype) { - bpf_cgroup_storage_free(pl->storage[stype]); - pl->storage[stype] = old_storage[stype]; - bpf_cgroup_storage_link(old_storage[stype], cgrp, type); + /* and cleanup the prog list */ + pl->prog = old_prog; + if (pl_was_allocated) { + list_del(&pl->node); + kfree(pl); } - if (pl_was_allocated) { - list_del(&pl->node); - kfree(pl); - } - return err; + return err; } /** @@ -355,7 +298,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 unused_flags) { struct list_head *progs = &cgrp->bpf.progs[type]; - enum bpf_cgroup_storage_type stype; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog *old_prog = NULL; struct cgroup_subsys_state *css; @@ -416,10 +358,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); - for_each_cgroup_storage_type(stype) { - bpf_cgroup_storage_unlink(pl->storage[stype]); - bpf_cgroup_storage_free(pl->storage[stype]); - } kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ @@ -445,52 +383,6 @@ cleanup: return err; } -/* Must be called with cgroup_mutex held to avoid races. 
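compute_effective_progs() earlier in this patch walks from the cgroup toward the root and keeps an ancestor's programs only when that level has BPF_F_ALLOW_MULTI set; the first level that contributes anything always counts. A minimal model of the walk below; the cgroup shape and program counts are invented, the flag value is the UAPI one:

#include <stdio.h>

#define BPF_F_ALLOW_MULTI       (1U << 1)

struct cg {
        struct cg *parent;
        unsigned int flags;
        unsigned int nr_progs;
};

static unsigned int effective_cnt(const struct cg *c)
{
        unsigned int cnt = 0;

        for (; c; c = c->parent) {
                /* levels past the first count only with ALLOW_MULTI */
                if (cnt == 0 || (c->flags & BPF_F_ALLOW_MULTI))
                        cnt += c->nr_progs;
        }
        return cnt;
}

int main(void)
{
        struct cg root = { NULL,  BPF_F_ALLOW_MULTI, 1 };
        struct cg mid  = { &root, 0,                 1 };
        struct cg leaf = { &mid,  0,                 1 };

        printf("effective: %u\n", effective_cnt(&leaf));
        return 0;
}

Running the model prints "effective: 2": the leaf and the multi-enabled root contribute, the middle level is skipped.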
*/ -int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - enum bpf_attach_type type = attr->query.attach_type; - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = cgrp->bpf.flags[type]; - int cnt, ret = 0, i; - - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) - cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); - else - cnt = prog_list_length(progs); - - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) - return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) - return -EFAULT; - if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) - /* return early if user requested only program count + flags */ - return 0; - if (attr->query.prog_cnt < cnt) { - cnt = attr->query.prog_cnt; - ret = -ENOSPC; - } - - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { - return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], - prog_ids, cnt); - } else { - struct bpf_prog_list *pl; - u32 id; - - i = 0; - list_for_each_entry(pl, progs, node) { - id = pl->prog->aux->id; - if (copy_to_user(prog_ids + i, &id, sizeof(id))) - return -EFAULT; - if (++i == cnt) - break; - } - } - return ret; -} - /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic @@ -557,51 +449,6 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); -/** - * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and - * provided by user sockaddr - * @sk: sock struct that will use sockaddr - * @uaddr: sockaddr struct provided by user - * @type: The type of program to be exectuted - * @t_ctx: Pointer to attach type specific context - * - * socket is expected to be of type INET or INET6. - * - * This function will return %-EPERM if an attached program is found and - * returned value != 1 during execution. In all other cases, 0 is returned. - */ -int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, - struct sockaddr *uaddr, - enum bpf_attach_type type, - void *t_ctx) -{ - struct bpf_sock_addr_kern ctx = { - .sk = sk, - .uaddr = uaddr, - .t_ctx = t_ctx, - }; - struct sockaddr_storage unspec; - struct cgroup *cgrp; - int ret; - - /* Check socket family since not all sockets represent network - * endpoint (e.g. AF_UNIX). - */ - if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) - return 0; - - if (!ctx.uaddr) { - memset(&unspec, 0, sizeof(unspec)); - ctx.uaddr = (struct sockaddr *)&unspec; - } - - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); - - return ret == 1 ? 0 : -EPERM; -} -EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); - /** * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock * @sk: socket to get cgroup from @@ -630,511 +477,3 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); - -int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type) -{ - struct cgroup *cgrp; - struct bpf_cgroup_dev_ctx ctx = { - .access_type = (access << 16) | dev_type, - .major = major, - .minor = minor, - }; - int allow = 1; - - rcu_read_lock(); - cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, - BPF_PROG_RUN); - rcu_read_unlock(); - - return !allow; -} -EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); - -static const struct bpf_func_proto * -cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - default: - return NULL; - } -} - -static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - return cgroup_base_func_proto(func_id, prog); -} - -static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - if (type == BPF_WRITE) - return false; - - if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) - return false; - /* The verifier guarantees that size > 0. */ - if (off % size != 0) - return false; - if (size != sizeof(__u32)) - return false; - - return true; -} - -const struct bpf_prog_ops cg_dev_prog_ops = { -}; - -const struct bpf_verifier_ops cg_dev_verifier_ops = { - .get_func_proto = cgroup_dev_func_proto, - .is_valid_access = cgroup_dev_is_valid_access, -}; - - -/** - * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl - * - * @head: sysctl table header - * @table: sysctl table - * @write: sysctl is being read (= 0) or written (= 1) - * @type: type of program to be executed - * - * Program is run when sysctl is being accessed, either read or written, and - * can allow or deny such access. - * - * This function will return %-EPERM if an attached program is found and - * returned value != 1 during execution. In all other cases 0 is returned. - */ -int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, - struct ctl_table *table, int write, - enum bpf_attach_type type) -{ - struct bpf_sysctl_kern ctx = { - .head = head, - .table = table, - .write = write, - }; - struct cgroup *cgrp; - int ret; - - rcu_read_lock(); - cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); - rcu_read_unlock(); - - return ret == 1 ? 
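The docstrings in this hunk all state the same contract: every program in the effective array returns a verdict, and anything other than 1 from any of them becomes -EPERM for the caller. A reduction to that control flow, with toy verdict functions and -1 standing in for -EPERM:

#include <stdio.h>

/* toy verdict programs: 1 = allow, anything else = deny */
static int allow_all(void *ctx) { (void)ctx; return 1; }
static int deny_all(void *ctx)  { (void)ctx; return 0; }

/* NULL-terminated "effective" array; every program must allow */
static int run_array(int (* const *progs)(void *), void *ctx)
{
        int ret = 1;

        for (; *progs; progs++)
                ret &= (*progs)(ctx) == 1;
        return ret == 1 ? 0 : -1;       /* 0 or -EPERM in the kernel */
}

int main(void)
{
        int (* const progs[])(void *) = { allow_all, deny_all, NULL };

        printf("verdict: %d\n", run_array(progs, NULL));        /* -1 */
        return 0;
}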
0 : -EPERM; -} - -EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); - -static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, - enum bpf_attach_type attach_type) -{ - struct bpf_prog_array *prog_array; - bool empty; - - rcu_read_lock(); - prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); - empty = bpf_prog_array_is_empty(prog_array); - rcu_read_unlock(); - return empty; -} - -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) -{ - if (unlikely(max_optlen < 0)) - return -EINVAL; - - if (unlikely(max_optlen > PAGE_SIZE)) { - /* We don't expose optvals that are greater than PAGE_SIZE - * to the BPF program. - */ - max_optlen = PAGE_SIZE; - } - - ctx->optval = kzalloc(max_optlen, GFP_USER); - if (!ctx->optval) - return -ENOMEM; - - ctx->optval_end = ctx->optval + max_optlen; - return max_optlen; -} - -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) -{ - kfree(ctx->optval); -} - -int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, - int *optname, char __user *optval, - int *optlen, char **kernel_optval) -{ - struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_sockopt_kern ctx = { - .sk = sk, - .level = *level, - .optname = *optname, - }; - int ret, max_optlen; - - /* Opportunistic check to see whether we have any BPF program - * attached to the hook so we don't waste time allocating - * memory and locking the socket. - */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) - return 0; - - /* Allocate a bit more than the initial user buffer for - * BPF program. The canonical use case is overriding - * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). - */ - max_optlen = max_t(int, 16, *optlen); - - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); - if (max_optlen < 0) - return max_optlen; - - ctx.optlen = *optlen; - - if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) { - ret = -EFAULT; - goto out; - } - - lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], - &ctx, BPF_PROG_RUN); - release_sock(sk); - if (!ret) { - ret = -EPERM; - goto out; - } - if (ctx.optlen == -1) { - /* optlen set to -1, bypass kernel */ - ret = 1; - } else if (ctx.optlen > max_optlen || ctx.optlen < -1) { - /* optlen is out of bounds */ - ret = -EFAULT; - } else { - /* optlen within bounds, run kernel handler */ - ret = 0; - /* export any potential modifications */ - *level = ctx.level; - *optname = ctx.optname; - - /* optlen == 0 from BPF indicates that we should - * use original userspace data. - */ - if (ctx.optlen != 0) { - *optlen = ctx.optlen; - *kernel_optval = ctx.optval; - } - } -out: - if (ret) - sockopt_free_buf(&ctx); - return ret; -} - -EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); -int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, - int optname, char __user *optval, - int __user *optlen, int max_optlen, - int retval) -{ - struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_sockopt_kern ctx = { - .sk = sk, - .level = level, - .optname = optname, - .retval = retval, - }; - int ret; - - /* Opportunistic check to see whether we have any BPF program - * attached to the hook so we don't waste time allocating - * memory and locking the socket. 
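sockopt_alloc_buf() above caps the temporary buffer at PAGE_SIZE, and the setsockopt path first grows it to at least 16 bytes so a program can replace a short option value with a longer one (the TCP_CONGESTION case the comment mentions). The clamping on its own, with PAGE_SIZE assumed to be 4096:

#include <stdio.h>

#define PAGE_SIZE 4096

/* mirror of the bounds applied before allocating the kernel-side copy;
 * the real code also rejects negative lengths with -EINVAL
 */
static int clamp_optlen(int optlen)
{
        int max_optlen = optlen > 16 ? optlen : 16;     /* room to grow */

        if (max_optlen > PAGE_SIZE)
                max_optlen = PAGE_SIZE; /* never expose more than a page */
        return max_optlen;
}

int main(void)
{
        printf("%d %d %d\n", clamp_optlen(4), clamp_optlen(400),
               clamp_optlen(100000));   /* 16 400 4096 */
        return 0;
}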
- */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) - return retval; - - ctx.optlen = max_optlen; - - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); - if (max_optlen < 0) - return max_optlen; - - if (!retval) { - /* If kernel getsockopt finished successfully, - * copy whatever was returned to the user back - * into our temporary buffer. Set optlen to the - * one that kernel returned as well to let - * BPF programs inspect the value. - */ - if (get_user(ctx.optlen, optlen)) { - ret = -EFAULT; - goto out; - } - - if (copy_from_user(ctx.optval, optval, - min(ctx.optlen, max_optlen)) != 0) { - ret = -EFAULT; - goto out; - } - } - - lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, BPF_PROG_RUN); - release_sock(sk); - if (!ret) { - ret = -EPERM; - goto out; - } - - if (ctx.optlen > max_optlen) { - ret = -EFAULT; - goto out; - } - - /* BPF programs only allowed to set retval to 0, not some - * arbitrary value. - */ - if (ctx.retval != 0 && ctx.retval != retval) { - ret = -EFAULT; - goto out; - } - - if (ctx.optlen != 0) { - if (copy_to_user(optval, ctx.optval, ctx.optlen) || - put_user(ctx.optlen, optlen)) { - ret = -EFAULT; - goto out; - } - } - - ret = ctx.retval; -out: - sockopt_free_buf(&ctx); - return ret; -} -EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); - -static const struct bpf_func_proto * -sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - return cgroup_dev_func_proto(func_id, prog); -} - -static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - const int size_default = sizeof(__u32); - - if (off < 0 || off + size > sizeof(struct bpf_sysctl) || - off % size || type != BPF_READ) - return false; - - switch (off) { - case offsetof(struct bpf_sysctl, write): - bpf_ctx_record_field_size(info, size_default); - return bpf_ctx_narrow_access_ok(off, size, size_default); - default: - return false; - } -} - -static u32 sysctl_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, u32 *target_size) -{ - struct bpf_insn *insn = insn_buf; - - switch (si->off) { - case offsetof(struct bpf_sysctl, write): - *insn++ = BPF_LDX_MEM( - BPF_SIZE(si->code), si->dst_reg, si->src_reg, - bpf_target_off(struct bpf_sysctl_kern, write, - FIELD_SIZEOF(struct bpf_sysctl_kern, - write), - target_size)); - break; - } - - return insn - insn_buf; -} - -const struct bpf_verifier_ops cg_sysctl_verifier_ops = { - .get_func_proto = sysctl_func_proto, - .is_valid_access = sysctl_is_valid_access, - .convert_ctx_access = sysctl_convert_ctx_access, -}; - -const struct bpf_prog_ops cg_sysctl_prog_ops = { -}; - -static const struct bpf_func_proto * -cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_sk_storage_get: - return &bpf_sk_storage_get_proto; - case BPF_FUNC_sk_storage_delete: - return &bpf_sk_storage_delete_proto; -#ifdef CONFIG_INET - case BPF_FUNC_tcp_sock: - return &bpf_tcp_sock_proto; -#endif - default: - return cgroup_base_func_proto(func_id, prog); - } -} - -static bool cg_sockopt_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - const int size_default = sizeof(__u32); - if (off < 0 || off >= sizeof(struct bpf_sockopt)) - return false; - if (off % size != 0) - return false; 
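Each is_valid_access() hook in this file applies the same shape checks before any per-field rule: the access must fall inside the user-visible context struct, be aligned to its own size, and usually be exactly 4 bytes wide. A reduced sketch against the sysctl context; the struct is trimmed to two fields and the narrow-access handling is left out:

#include <stdio.h>
#include <stddef.h>

struct bpf_sysctl {
        unsigned int write;
        unsigned int file_pos;
};

static int is_valid_access(int off, int size, int is_write)
{
        const int size_default = sizeof(unsigned int);

        if (is_write)                           /* read-only context */
                return 0;
        if (off < 0 || off + size > (int)sizeof(struct bpf_sysctl))
                return 0;
        if (off % size)                         /* naturally aligned */
                return 0;
        return size == size_default &&
               off == offsetof(struct bpf_sysctl, write);
}

int main(void)
{
        printf("%d %d\n", is_valid_access(0, 4, 0),
               is_valid_access(2, 4, 0));       /* 1 0 */
        return 0;
}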
- if (type == BPF_WRITE) { - switch (off) { - case offsetof(struct bpf_sockopt, retval): - if (size != size_default) - return false; - return prog->expected_attach_type == - BPF_CGROUP_GETSOCKOPT; - case offsetof(struct bpf_sockopt, optname): - /* fallthrough */ - case offsetof(struct bpf_sockopt, level): - if (size != size_default) - return false; - return prog->expected_attach_type == - BPF_CGROUP_SETSOCKOPT; - case offsetof(struct bpf_sockopt, optlen): - return size == size_default; - default: - return false; - } - } - switch (off) { - case offsetof(struct bpf_sockopt, sk): - if (size != sizeof(__u64)) - return false; - info->reg_type = PTR_TO_SOCKET; - break; - case offsetof(struct bpf_sockopt, optval): - if (size != sizeof(__u64)) - return false; - info->reg_type = PTR_TO_PACKET; - break; - case offsetof(struct bpf_sockopt, optval_end): - if (size != sizeof(__u64)) - return false; - info->reg_type = PTR_TO_PACKET_END; - break; - case offsetof(struct bpf_sockopt, retval): - if (size != size_default) - return false; - return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; - default: - if (size != size_default) - return false; - break; - } - - return true; -} - -#define CG_SOCKOPT_ACCESS_FIELD(T, F) \ - T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ - si->dst_reg, si->src_reg, \ - offsetof(struct bpf_sockopt_kern, F)) -static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, - u32 *target_size) -{ - struct bpf_insn *insn = insn_buf; - switch (si->off) { - case offsetof(struct bpf_sockopt, sk): - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); - break; - case offsetof(struct bpf_sockopt, level): - if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); - else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); - break; - case offsetof(struct bpf_sockopt, optname): - if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); - else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); - break; - case offsetof(struct bpf_sockopt, optlen): - if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); - else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); - break; - case offsetof(struct bpf_sockopt, retval): - if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); - else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); - break; - case offsetof(struct bpf_sockopt, optval): - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); - break; - case offsetof(struct bpf_sockopt, optval_end): - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); - break; - } - return insn - insn_buf; -} - -static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, - bool direct_write, - const struct bpf_prog *prog) -{ - /* Nothing to do for sockopt argument. The data is kzalloc'ated. 
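cg_sockopt_convert_ctx_access() above never lets a program dereference bpf_sockopt_kern directly: every access to the user-visible struct bpf_sockopt is rewritten into a load or store at the matching kernel-struct offset, which CG_SOCKOPT_ACCESS_FIELD computes via offsetof(). The translation in miniature; the kernel-side layout below is illustrative, not the real bpf_sockopt_kern:

#include <stdio.h>
#include <stddef.h>

/* user-visible layout (stable view), trimmed to two fields */
struct bpf_sockopt { int level; int optname; };

/* kernel-side layout the rewritten access really targets; the field
 * order here is illustrative, not the real bpf_sockopt_kern
 */
struct bpf_sockopt_kern { void *sk; int level; int optname; };

int main(void)
{
        /* the rewrite keeps the field, swaps the offset */
        printf("level:   ctx off %zu -> kern off %zu\n",
               offsetof(struct bpf_sockopt, level),
               offsetof(struct bpf_sockopt_kern, level));
        printf("optname: ctx off %zu -> kern off %zu\n",
               offsetof(struct bpf_sockopt, optname),
               offsetof(struct bpf_sockopt_kern, optname));
        return 0;
}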
- */ - return 0; -} - -const struct bpf_verifier_ops cg_sockopt_verifier_ops = { - .get_func_proto = cg_sockopt_func_proto, - .is_valid_access = cg_sockopt_is_valid_access, - .convert_ctx_access = cg_sockopt_convert_ctx_access, - .gen_prologue = cg_sockopt_get_prologue, -}; - -const struct bpf_prog_ops cg_sockopt_prog_ops = { -}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ec9fb28e863e..4d34ab12d0df 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -21,14 +21,12 @@ * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ -#include #include #include #include #include #include #include -#include #include #include #include @@ -102,7 +100,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->pages = size / PAGE_SIZE; fp->aux = aux; fp->aux->prog = fp; - fp->jit_requested = ebpf_jit_enabled(); INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); @@ -110,84 +107,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) } EXPORT_SYMBOL_GPL(bpf_prog_alloc); -int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) -{ - if (!prog->aux->nr_linfo || !prog->jit_requested) - return 0; - prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo, - sizeof(*prog->aux->jited_linfo), - GFP_KERNEL | __GFP_NOWARN); - if (!prog->aux->jited_linfo) - return -ENOMEM; - return 0; -} - -void bpf_prog_free_jited_linfo(struct bpf_prog *prog) -{ - kfree(prog->aux->jited_linfo); - prog->aux->jited_linfo = NULL; -} - -void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog) -{ - if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0]) - bpf_prog_free_jited_linfo(prog); -} - -/* The jit engine is responsible to provide an array - * for insn_off to the jited_off mapping (insn_to_jit_off). - * - * The idx to this array is the insn_off. Hence, the insn_off - * here is relative to the prog itself instead of the main prog. - * This array has one entry for each xlated bpf insn. - * - * jited_off is the byte off to the last byte of the jited insn. - * - * Hence, with - * insn_start: - * The first bpf insn off of the prog. The insn off - * here is relative to the main prog. - * e.g. 
if prog is a subprog, insn_start > 0 - * linfo_idx: - * The prog's idx to prog->aux->linfo and jited_linfo - * - * jited_linfo[linfo_idx] = prog->bpf_func - * - * For i > linfo_idx, - * - * jited_linfo[i] = prog->bpf_func + - * insn_to_jit_off[linfo[i].insn_off - insn_start - 1] - */ -void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, - const u32 *insn_to_jit_off) -{ - u32 linfo_idx, insn_start, insn_end, nr_linfo, i; - const struct bpf_line_info *linfo; - void **jited_linfo; - if (!prog->aux->jited_linfo) - /* Userspace did not provide linfo */ - return; - linfo_idx = prog->aux->linfo_idx; - linfo = &prog->aux->linfo[linfo_idx]; - insn_start = linfo[0].insn_off; - insn_end = insn_start + prog->len; - jited_linfo = &prog->aux->jited_linfo[linfo_idx]; - jited_linfo[0] = prog->bpf_func; - nr_linfo = prog->aux->nr_linfo - linfo_idx; - for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++) - /* The verifier ensures that linfo[i].insn_off is - * strictly increasing - */ - jited_linfo[i] = prog->bpf_func + - insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; -} - -void bpf_prog_free_linfo(struct bpf_prog *prog) -{ - bpf_prog_free_jited_linfo(prog); - kvfree(prog->aux->linfo); -} - struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { @@ -260,8 +179,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && - (dst[i].src_reg == BPF_PSEUDO_MAP_FD || - dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { + dst[i].src_reg == BPF_PSEUDO_MAP_FD) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && @@ -305,15 +223,39 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } +static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + /* Call and Exit are both special jumps with no + * target inside the BPF instruction image. + */ + BPF_OP(insn->code) != BPF_CALL && + BPF_OP(insn->code) != BPF_EXIT; +} + +static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, + u32 curr, const bool probe_pass) +{ + const s32 off_min = S16_MIN, off_max = S16_MAX; + s32 off = insn->off; + + if (curr < pos && curr + off + 1 > pos) + off += delta; + else if (curr > pos + delta && curr + off + 1 <= pos + delta) + off -= delta; + if (off < off_min || off > off_max) + return -ERANGE; + if (!probe_pass) + insn->off = off; + return 0; +} + static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, const bool probe_pass) { u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0); struct bpf_insn *insn = prog->insnsi; int ret = 0; - bool pseudo_call; - u8 code; - int off; for (i = 0; i < insn_cnt; i++, insn++) { /* In the probing pass we still operate on the original, @@ -325,31 +267,11 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, insn++; } - code = insn->code; - if (BPF_CLASS(code) != BPF_JMP) + if (!bpf_is_jmp_and_has_target(insn)) continue; - if (BPF_OP(code) == BPF_EXIT) - continue; - if (BPF_OP(code) == BPF_CALL) { - if (insn->src_reg == BPF_PSEUDO_CALL) - pseudo_call = true; - else - continue; - } else { - pseudo_call = false; - } - off = pseudo_call ? insn->imm : insn->off; /* Adjust offset of jmps if we cross patch boundaries. 
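bpf_adj_delta_to_off() in this hunk covers the two cases that matter once delta instructions have been inserted at position pos: a branch from before the patch that lands beyond it grows by delta, a branch from after the shifted patch that lands before it shrinks by delta, and anything that no longer fits the 16-bit off field is -ERANGE. The arithmetic on its own, with a made-up branch:

#include <stdio.h>
#include <limits.h>

/* curr: index of the branch insn; off: its relative target;
 * pos/delta: where the patch went in and how much it added
 */
static int adj_off(unsigned int curr, int off, unsigned int pos,
                   unsigned int delta)
{
        if (curr < pos && curr + off + 1 > pos)
                off += delta;           /* forward jump across the patch */
        else if (curr > pos + delta && curr + off + 1 <= pos + delta)
                off -= delta;           /* backward jump across the patch */
        if (off < SHRT_MIN || off > SHRT_MAX)
                return -1;              /* -ERANGE: no longer encodable */
        return off;
}

int main(void)
{
        /* branch at insn 2 targeting insn 10; 3 insns inserted at 5 */
        printf("new off: %d\n", adj_off(2, 7, 5, 3));   /* 10 */
        return 0;
}

For the example, the target moves from insn 10 to insn 13, so the stored offset becomes 10.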
*/ - if (i < pos && i + off + 1 > pos) - off += delta; - else if (i > pos + delta && i + off + 1 <= pos + delta) - off -= delta; - if (pseudo_call) - insn->imm = off; - else - insn->off = off; - + ret = bpf_adj_delta_to_off(insn, pos, delta, i, probe_pass); if (ret) break; } @@ -357,22 +279,6 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, return ret; } -static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta) -{ - struct bpf_line_info *linfo; - u32 i, nr_linfo; - nr_linfo = prog->aux->nr_linfo; - if (!nr_linfo || !delta) - return; - linfo = prog->aux->linfo; - for (i = 0; i < nr_linfo; i++) - if (off < linfo[i].insn_off) - break; - /* Push all off < linfo[i].insn_off by delta */ - for (; i < nr_linfo; i++) - linfo[i].insn_off += delta; -} - struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len) { @@ -428,8 +334,6 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, */ BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false)); - bpf_adj_linfo(prog_adj, off, insn_delta); - return prog_adj; } @@ -457,37 +361,12 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { - const char *end = sym + KSYM_NAME_LEN; - const struct btf_type *type; - const char *func_name; - BUILD_BUG_ON(sizeof("bpf_prog_") + - sizeof(prog->tag) * 2 + - /* name has been null terminated. - * We should need +1 for the '_' preceding - * the name. However, the null character - * is double counted between the name and the - * sizeof("bpf_prog_") above, so we omit - * the +1 here. - */ - sizeof(prog->aux->name) > KSYM_NAME_LEN); + sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); - - /* prog->aux->name will be ignored if full btf name is available */ - if (prog->aux->btf) { - type = btf_type_by_id(prog->aux->btf, - prog->aux->func_info[prog->aux->func_idx].type_id); - func_name = btf_name_by_offset(prog->aux->btf, type->name_off); - snprintf(sym, (size_t)(end - sym), "_%s", func_name); - return; - } - - if (prog->aux->name[0]) - snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); - else - *sym = 0; + *sym = 0; } static __always_inline unsigned long @@ -739,7 +618,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_jit_set_header_magic(hdr); hdr->pages = pages; - hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -953,7 +831,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled(prog) || prog->blinded) + if (!bpf_jit_blinding_enabled()) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); @@ -995,16 +873,13 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) i += insn_delta; } - clone->blinded = 1; return clone; } #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs - * anyway later on, so do not let the compiler omit it. This also needs - * to go into kallsyms for correlation from e.g. bpftool, so naming - * must not change. + * anyway later on, so do not let the compiler omit it. 
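After the bpf_get_prog_name() hunk above, a program's kallsyms entry is simply "bpf_prog_" followed by the 8-byte tag in hex (the removed lines were what appended "_<name>"). A userspace rendering of the same format, with a stand-in bin2hex() and an invented tag:

#include <stdio.h>

#define KSYM_NAME_LEN 128

/* stand-in for the kernel's bin2hex(): high nibble first */
static char *bin2hex(char *dst, const unsigned char *src, int n)
{
        static const char hex[] = "0123456789abcdef";
        int i;

        for (i = 0; i < n; i++) {
                *dst++ = hex[src[i] >> 4];
                *dst++ = hex[src[i] & 0xf];
        }
        return dst;
}

int main(void)
{
        unsigned char tag[8] = { 0xde, 0xad, 0xbe, 0xef, 0x00, 0x11, 0x22, 0x33 };
        char sym[KSYM_NAME_LEN], *p = sym;

        p += snprintf(p, sizeof(sym), "bpf_prog_");
        p = bin2hex(p, tag, sizeof(tag));
        *p = '\0';
        printf("%s\n", sym);    /* bpf_prog_deadbeef00112233 */
        return 0;
}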
*/ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { @@ -1020,7 +895,8 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); * * Decode and execute eBPF instructions. */ -static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, + u64 *stack) { u64 tmp; static const void *jumptable[256] = { @@ -1080,7 +956,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, /* Call instruction */ [BPF_JMP | BPF_CALL] = &&JMP_CALL, - [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, /* Jumps */ [BPF_JMP | BPF_JA] = &&JMP_JA, @@ -1263,13 +1138,6 @@ select_insn: BPF_R4, BPF_R5); CONT; - JMP_CALL_ARGS: - BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, - BPF_R3, BPF_R4, - BPF_R5, - insn + insn->off + 1); - CONT; - JMP_TAIL_CALL: { struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1542,23 +1410,6 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn return ___bpf_prog_run(regs, insn, stack); \ } -#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size -#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ -static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ - const struct bpf_insn *insn) \ -{ \ - u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ -\ - FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ - BPF_R1 = r1; \ - BPF_R2 = r2; \ - BPF_R3 = r3; \ - BPF_R4 = r4; \ - BPF_R5 = r5; \ - return ___bpf_prog_run(regs, insn, stack); \ -} - #define EVAL1(FN, X) FN(X) #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) #define EVAL3(FN, X, Y...) 
FN(X) EVAL2(FN, Y) @@ -1570,10 +1421,6 @@ EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); -EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); -EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); -EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); - #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), static unsigned int (*interpreters[])(const void *ctx, @@ -1581,25 +1428,6 @@ static unsigned int (*interpreters[])(const void *ctx, EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) - -#undef PROG_NAME_LIST -#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), -static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, - const struct bpf_insn *insn) = { -EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) -EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) -EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) -}; -#undef PROG_NAME_LIST -void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) -{ - stack_depth = max_t(u32, stack_depth, 1); - insn->off = (s16) insn->imm; - insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - - __bpf_call_base_args; - insn->code = BPF_JMP | BPF_CALL_ARGS; -} - }; #else @@ -1651,16 +1479,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return 0; } -static void bpf_prog_select_func(struct bpf_prog *fp) -{ -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); - fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; -#else - fp->bpf_func = __bpf_prog_ret0_warn; -#endif -} - /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with internal BPF program @@ -1671,13 +1489,13 @@ static void bpf_prog_select_func(struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { - /* In case of BPF to BPF calls, verifier did all the prep - * work with regards to JITing, etc. - */ - if (fp->bpf_func) - goto finalize; +#ifndef CONFIG_BPF_JIT_ALWAYS_ON + u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); - bpf_prog_select_func(fp); + fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0_warn; +#endif /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1685,28 +1503,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. 
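With this hunk, bpf_prog_select_runtime() picks one of the interpreter variants generated by the EVAL*() expansions above: stack_depth is clamped to at least 1, rounded up to a multiple of 32, and turned into an index into interpreters[]. The index math in isolation:

#include <stdio.h>

#define round_up(x, y)  ((((x) + (y) - 1) / (y)) * (y))

static unsigned int interp_idx(unsigned int stack_depth)
{
        if (stack_depth < 1)
                stack_depth = 1;        /* max_t(u32, stack_depth, 1) */
        /* 1..32 -> slot 0 (32-byte stack), 33..64 -> slot 1, ... 512 -> 15 */
        return round_up(stack_depth, 32) / 32 - 1;
}

int main(void)
{
        printf("%u %u %u\n", interp_idx(0), interp_idx(33), interp_idx(512));
        return 0;
}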
*/ - if (!bpf_prog_is_dev_bound(fp->aux)) { - *err = bpf_prog_alloc_jited_linfo(fp); - if (*err) - return fp; - - fp = bpf_int_jit_compile(fp); - if (!fp->jited) { - bpf_prog_free_jited_linfo(fp); + fp = bpf_int_jit_compile(fp); #ifdef CONFIG_BPF_JIT_ALWAYS_ON - *err = -ENOTSUPP; - return fp; -#endif - } else { - bpf_prog_free_unused_jited_linfo(fp); - } - } else { - *err = bpf_prog_offload_compile(fp); - if (*err) - return fp; + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; } - -finalize: +#endif bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -1751,8 +1554,7 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { if (prog_cnt) return kzalloc(sizeof(struct bpf_prog_array) + - sizeof(struct bpf_prog_array_item) * - (prog_cnt + 1), + sizeof(struct bpf_prog *) * (prog_cnt + 1), flags); return &empty_prog_array.hdr; @@ -1766,53 +1568,14 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } -static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, - u32 *prog_ids, - u32 request_cnt) -{ - struct bpf_prog_array_item *item; - int i = 0; - item = rcu_dereference(array)->items; - for (; item->prog; item++) { - if (item->prog == &dummy_bpf_prog.prog) - continue; - prog_ids[i] = item->prog->aux->id; - if (++i == request_cnt) { - item++; - break; - } - } - return !!(item->prog); -} - -int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, - u32 *prog_ids, u32 request_cnt, - u32 *prog_cnt) -{ - u32 cnt = 0; - - if (array) - cnt = bpf_prog_array_length(array); - - *prog_cnt = cnt; - - /* return early if user requested only program count or nothing to copy */ - if (!request_cnt || !cnt) - return 0; - - /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ - return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC - : 0; -} - -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, struct bpf_prog *old_prog) { - struct bpf_prog_array_item *item = array->items; + struct bpf_prog **prog = progs->progs; - for (; item->prog; item++) - if (item->prog == old_prog) { - WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); + for (; *prog; prog++) + if (*prog == old_prog) { + WRITE_ONCE(*prog, &dummy_bpf_prog.prog); break; } } @@ -1823,24 +1586,20 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; - struct bpf_prog_array_item *existing; + struct bpf_prog **existing_prog; struct bpf_prog_array *array; - bool found_exclude = false; int new_prog_idx = 0; /* Figure out how many existing progs we need to carry over to * the new array. 
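bpf_prog_array_copy() below, back in its pre-bpf_prog_array_item form, treats the array as a NULL-terminated vector: count everything that is neither the excluded program nor the dummy placeholder, allocate cnt + 1 slots, copy, optionally append the included program, and NULL-terminate. The same shape in miniature with toy prog structs; the dummy-placeholder handling is omitted:

#include <stdio.h>
#include <stdlib.h>

struct prog { int id; };

/* copy a NULL-terminated vector, dropping exclude, appending include */
static struct prog **array_copy(struct prog **old, struct prog *exclude,
                                struct prog *include)
{
        struct prog **p, **arr;
        unsigned int cnt = 0, i = 0;

        for (p = old; *p; p++)
                if (*p != exclude)
                        cnt++;
        arr = calloc(cnt + !!include + 1, sizeof(*arr)); /* +1: terminator */
        if (!arr)
                return NULL;
        for (p = old; *p; p++)
                if (*p != exclude)
                        arr[i++] = *p;
        if (include)
                arr[i++] = include;
        arr[i] = NULL;                  /* calloc did this, but be explicit */
        return arr;
}

int main(void)
{
        struct prog a = { 1 }, b = { 2 }, c = { 3 };
        struct prog *old[] = { &a, &b, NULL };
        struct prog **new = array_copy(old, &a, &c);

        for (; new && *new; new++)
                printf("%d ", (*new)->id);      /* 2 3 */
        printf("\n");
        return 0;
}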
*/ if (old_array) { - existing = old_array->items; - for (; existing->prog; existing++) { - if (existing->prog == exclude_prog) { - found_exclude = true; - continue; - } - if (existing->prog != &dummy_bpf_prog.prog) + existing_prog = old_array->progs; + for (; *existing_prog; existing_prog++) { + if (*existing_prog != exclude_prog && + *existing_prog != &dummy_bpf_prog.prog) carry_prog_cnt++; - if (existing->prog == include_prog) + if (*existing_prog == include_prog) return -EEXIST; } } @@ -1863,94 +1622,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, /* Fill in the new prog array */ if (carry_prog_cnt) { - existing = old_array->items; - for (; existing->prog; existing++) { - if (existing->prog != exclude_prog && - existing->prog != &dummy_bpf_prog.prog) { - array->items[new_prog_idx++].prog = - existing->prog; - } - } + existing_prog = old_array->progs; + for (; *existing_prog; existing_prog++) + if (*existing_prog != exclude_prog && + *existing_prog != &dummy_bpf_prog.prog) + array->progs[new_prog_idx++] = *existing_prog; } if (include_prog) - array->items[new_prog_idx++].prog = include_prog; - array->items[new_prog_idx].prog = NULL; + array->progs[new_prog_idx++] = include_prog; + array->progs[new_prog_idx] = NULL; *new_array = array; return 0; } -int bpf_prog_array_length(struct bpf_prog_array __rcu *array) -{ - struct bpf_prog_array_item *item; - u32 cnt = 0; - - rcu_read_lock(); - item = rcu_dereference(array)->items; - for (; item->prog; item++) - if (item->prog != &dummy_bpf_prog.prog) - cnt++; - rcu_read_unlock(); - return cnt; -} - -bool bpf_prog_array_is_empty(struct bpf_prog_array *array) -{ - struct bpf_prog_array_item *item; - for (item = array->items; item->prog; item++) - if (item->prog != &dummy_bpf_prog.prog) - return false; - return true; -} - -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, - __u32 __user *prog_ids, u32 cnt) -{ - unsigned long err = 0; - bool nospc; - u32 *ids; - /* users of this function are doing: - * cnt = bpf_prog_array_length(); - * if (cnt > 0) - * bpf_prog_array_copy_to_user(..., cnt); - * so below kcalloc doesn't need extra cnt > 0 check, but - * bpf_prog_array_length() releases rcu lock and - * prog array could have been swapped with empty or larger array, - * so always copy 'cnt' prog_ids to the user. 
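
The race note above explains why the removed bpf_prog_array_copy_to_user() always copies exactly 'cnt' ids: the array may have been swapped between the caller's length query and the copy. A user-space sketch of that contract, with plain buffers standing in for the RCU-protected array and the user pointer (the -ENOSPC signalling is kept, the -EFAULT path dropped):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Copy at most request_cnt ids; report whether entries remained. */
static bool copy_ids(const unsigned int *ids, size_t nr_ids,
		     unsigned int *out, size_t request_cnt)
{
	size_t n = nr_ids < request_cnt ? nr_ids : request_cnt;

	memcpy(out, ids, n * sizeof(*out));	/* kernel: copy_to_user() */
	return nr_ids > request_cnt;		/* true -> caller sees -ENOSPC */
}

int main(void)
{
	/* The array grew to three entries after the caller sized for two. */
	unsigned int ids[] = { 11, 12, 13 };
	unsigned int out[2];
	bool nospc = copy_ids(ids, 3, out, 2);

	printf("copied %u,%u nospc=%d\n", out[0], out[1], nospc);
	return 0;
}
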
- * In a rare race the user will see zero prog_ids - */ - ids = kcalloc(cnt, sizeof(u32), GFP_USER); - if (!ids) - return -ENOMEM; - rcu_read_lock(); - nospc = bpf_prog_array_copy_core(array, ids, cnt); - rcu_read_unlock(); - err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); - kfree(ids); - if (err) - return -EFAULT; - if (nospc) - return -ENOSPC; - return 0; -} - static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; - int i; aux = container_of(work, struct bpf_prog_aux, work); - if (bpf_prog_is_dev_bound(aux)) - bpf_prog_offload_destroy(aux->prog); - - - for (i = 0; i < aux->func_cnt; i++) - bpf_jit_free(aux->func[i]); - if (aux->func_cnt) { - kfree(aux->func); - bpf_prog_unlock_free(aux->prog); - } else { - bpf_jit_free(aux->prog); - } + bpf_jit_free(aux->prog); } /* Free internal BPF program */ @@ -1993,8 +1683,6 @@ BPF_CALL_0(bpf_user_rnd_u32) const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; -const struct bpf_func_proto bpf_spin_lock_proto __weak; -const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; @@ -2005,8 +1693,7 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; -const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; -const struct bpf_func_proto bpf_get_local_storage_proto __weak; +const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c deleted file mode 100644 index 0b2dcf0990e0..000000000000 --- a/kernel/bpf/cpumap.c +++ /dev/null @@ -1,555 +0,0 @@ -/* bpf/cpumap.c - * - * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. - * Released under terms in GPL version 2. See COPYING. - */ - -/* The 'cpumap' is primarily used as a backend map for XDP BPF helper - * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. - * - * Unlike devmap which redirects XDP frames out another NIC device, - * this map type redirects raw XDP frames to another CPU. The remote - * CPU will do SKB-allocation and call the normal network stack. - * - * This is a scalability and isolation mechanism, that allow - * separating the early driver network XDP layer, from the rest of the - * netstack, and assigning dedicated CPUs for this stage. This - * basically allows for 10G wirespeed pre-filtering via bpf. - */ -#include -#include -#include - -#include -#include -#include -#include - -/* General idea: XDP packets getting XDP redirected to another CPU, - * will maximum be stored/queued for one driver ->poll() call. It is - * guaranteed that setting flush bit and flush operation happen on - * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() - * which queue in bpf_cpu_map_entry contains packets. 
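
The "General idea" comment above hinges on a per-CPU flush bitmap: enqueueing for an entry sets that entry's bit on the current CPU, and the flush at the end of the same driver ->poll() call visits only the set bits. A minimal sketch of that bookkeeping, with a single unsigned long modelling one CPU's bitmap (the real map sizes it from max_entries):

#include <stdio.h>

int main(void)
{
	unsigned long flush_needed = 0;	/* this CPU's bitmap */

	flush_needed |= 1UL << 3;	/* enqueued a packet for map slot 3 */
	flush_needed |= 1UL << 5;	/* enqueued a packet for map slot 5 */

	/* Flush pass: only slots with pending packets are touched. */
	for (unsigned int bit = 0; bit < 8 * sizeof(flush_needed); bit++)
		if (flush_needed & (1UL << bit)) {
			printf("flushing bulk queue for slot %u\n", bit);
			flush_needed &= ~(1UL << bit);
		}
	return 0;
}
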
- */ - -#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ -struct xdp_bulk_queue { - void *q[CPU_MAP_BULK_SIZE]; - unsigned int count; -}; - -/* Struct for every remote "destination" CPU in map */ -struct bpf_cpu_map_entry { - u32 qsize; /* Queue size placeholder for map lookup */ - - /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ - struct xdp_bulk_queue __percpu *bulkq; - - /* Queue with potential multi-producers, and single-consumer kthread */ - struct ptr_ring *queue; - struct task_struct *kthread; - struct work_struct kthread_stop_wq; - - atomic_t refcnt; /* Control when this struct can be free'ed */ - struct rcu_head rcu; -}; - -struct bpf_cpu_map { - struct bpf_map map; - /* Below members specific for map type */ - struct bpf_cpu_map_entry **cpu_map; - unsigned long __percpu *flush_needed; -}; - -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq); - -static u64 cpu_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); -} - -static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) -{ - struct bpf_cpu_map *cmap; - int err = -ENOMEM; - u64 cost; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - /* check sanity of attributes */ - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) - return ERR_PTR(-EINVAL); - - cmap = kzalloc(sizeof(*cmap), GFP_USER); - if (!cmap) - return ERR_PTR(-ENOMEM); - - bpf_map_init_from_attr(&cmap->map, attr); - - /* Pre-limit array size based on NR_CPUS, not final CPU check */ - if (cmap->map.max_entries > NR_CPUS) { - err = -E2BIG; - goto free_cmap; - } - - /* make sure page count doesn't overflow */ - cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_cmap; - cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_precharge_memlock(cmap->map.pages); - if (ret) { - err = ret; - goto free_cmap; - } - - /* A per cpu bitfield with a bit per possible CPU in map */ - cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), - __alignof__(unsigned long)); - if (!cmap->flush_needed) - goto free_cmap; - - /* Alloc array for possible remote "destination" CPUs */ - cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * - sizeof(struct bpf_cpu_map_entry *), - cmap->map.numa_node); - if (!cmap->cpu_map) - goto free_percpu; - - return &cmap->map; -free_percpu: - free_percpu(cmap->flush_needed); -free_cmap: - kfree(cmap); - return ERR_PTR(err); -} - -void __cpu_map_queue_destructor(void *ptr) -{ - /* The tear-down procedure should have made sure that queue is - * empty. See __cpu_map_entry_replace() and work-queue - * invoked cpu_map_kthread_stop(). Catch any broken behaviour - * gracefully and warn once. 
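
cpu_map_alloc() above pre-charges the map against the memlock limit before allocating anything: one pointer per map slot plus one flush bitmap per possible CPU, rounded up to whole pages. A sketch of that accounting; BITS_TO_LONGS and PAGE_SIZE are local stand-ins for the kernel macros, and the 64-slot/8-CPU numbers are purely illustrative:

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_LONG	64
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
#define PAGE_SIZE	4096ULL

int main(void)
{
	uint32_t max_entries = 64;	/* map slots (bounded by NR_CPUS) */
	unsigned int nr_cpus = 8;	/* num_possible_cpus() */

	uint64_t bitmap = BITS_TO_LONGS(max_entries) * sizeof(unsigned long);
	uint64_t cost = (uint64_t)max_entries * sizeof(void *);

	cost += bitmap * nr_cpus;	/* one flush bitmap per CPU */
	uint64_t pages = (cost + PAGE_SIZE - 1) / PAGE_SIZE;

	printf("cost=%llu bytes -> %llu page(s) charged to memlock\n",
	       (unsigned long long)cost, (unsigned long long)pages);
	return 0;
}
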
- */ - if (WARN_ON_ONCE(ptr)) - page_frag_free(ptr); -} - -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - if (atomic_dec_and_test(&rcpu->refcnt)) { - /* The queue should be empty at this point */ - ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); - kfree(rcpu->queue); - kfree(rcpu); - } -} - -static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - atomic_inc(&rcpu->refcnt); -} - -/* called from workqueue, to workaround syscall using preempt_disable */ -static void cpu_map_kthread_stop(struct work_struct *work) -{ - struct bpf_cpu_map_entry *rcpu; - - rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); - - /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, - * as it waits until all in-flight call_rcu() callbacks complete. - */ - rcu_barrier(); - - /* kthread_stop will wake_up_process and wait for it to complete */ - kthread_stop(rcpu->kthread); -} - -static int cpu_map_kthread_run(void *data) -{ - struct bpf_cpu_map_entry *rcpu = data; - - set_current_state(TASK_INTERRUPTIBLE); - - /* When kthread gives stop order, then rcpu have been disconnected - * from map, thus no new packets can enter. Remaining in-flight - * per CPU stored packets are flushed to this queue. Wait honoring - * kthread_stop signal until queue is empty. - */ - while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { - struct xdp_pkt *xdp_pkt; - - schedule(); - /* Do work */ - while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) { - /* For now just "refcnt-free" */ - page_frag_free(xdp_pkt); - } - __set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - - put_cpu_map_entry(rcpu); - return 0; -} - -struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) -{ - gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; - struct bpf_cpu_map_entry *rcpu; - int numa, err; - - /* Have map->numa_node, but choose node of redirect target CPU */ - numa = cpu_to_node(cpu); - - rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa); - if (!rcpu) - return NULL; - - /* Alloc percpu bulkq */ - rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq), - sizeof(void *), gfp); - if (!rcpu->bulkq) - goto free_rcu; - - /* Alloc queue */ - rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); - if (!rcpu->queue) - goto free_bulkq; - - err = ptr_ring_init(rcpu->queue, qsize, gfp); - if (err) - goto free_queue; - - rcpu->qsize = qsize; - - /* Setup kthread */ - rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, - "cpumap/%d/map:%d", cpu, map_id); - if (IS_ERR(rcpu->kthread)) - goto free_ptr_ring; - - get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ - get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ - - /* Make sure kthread runs on a single CPU */ - kthread_bind(rcpu->kthread, cpu); - wake_up_process(rcpu->kthread); - - return rcpu; - -free_ptr_ring: - ptr_ring_cleanup(rcpu->queue, NULL); -free_queue: - kfree(rcpu->queue); -free_bulkq: - free_percpu(rcpu->bulkq); -free_rcu: - kfree(rcpu); - return NULL; -} - -void __cpu_map_entry_free(struct rcu_head *rcu) -{ - struct bpf_cpu_map_entry *rcpu; - int cpu; - - /* This cpu_map_entry have been disconnected from map and one - * RCU graze-period have elapsed. Thus, XDP cannot queue any - * new packets and cannot change/set flush_needed that can - * find this entry. 
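
The loop in cpu_map_kthread_run() above encodes the shutdown rule that matters here: the consumer may only exit once a stop has been requested *and* the queue has been drained, so packets already queued are never dropped. A single-threaded sketch of that exit condition, with a plain array standing in for the ptr_ring and a bool for kthread_should_stop():

#include <stdbool.h>
#include <stdio.h>

#define QLEN 4

struct ring { int q[QLEN]; int head, tail; };

static bool ring_empty(struct ring *r) { return r->head == r->tail; }
static int ring_pop(struct ring *r) { return r->q[r->head++ % QLEN]; }

int main(void)
{
	struct ring r = { .q = { 1, 2, 3 }, .head = 0, .tail = 3 };
	bool should_stop = true;	/* stop already requested */

	/* Keep draining after the stop order until the queue is empty. */
	while (!should_stop || !ring_empty(&r)) {
		if (!ring_empty(&r))
			printf("consumed packet %d\n", ring_pop(&r));
		/* the real kthread would schedule() here when idle */
	}
	return 0;
}
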
- */ - rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); - - /* Flush remaining packets in percpu bulkq */ - for_each_online_cpu(cpu) { - struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); - - /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(rcpu, bq); - } - free_percpu(rcpu->bulkq); - /* Cannot kthread_stop() here, last put free rcpu resources */ - put_cpu_map_entry(rcpu); -} - -/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to - * ensure any driver rcu critical sections have completed, but this - * does not guarantee a flush has happened yet. Because driver side - * rcu_read_lock/unlock only protects the running XDP program. The - * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a - * pending flush op doesn't fail. - * - * The bpf_cpu_map_entry is still used by the kthread, and there can - * still be pending packets (in queue and percpu bulkq). A refcnt - * makes sure to last user (kthread_stop vs. call_rcu) free memory - * resources. - * - * The rcu callback __cpu_map_entry_free flush remaining packets in - * percpu bulkq to queue. Due to caller map_delete_elem() disable - * preemption, cannot call kthread_stop() to make sure queue is empty. - * Instead a work_queue is started for stopping kthread, - * cpu_map_kthread_stop, which waits for an RCU graze period before - * stopping kthread, emptying the queue. - */ -void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, - u32 key_cpu, struct bpf_cpu_map_entry *rcpu) -{ - struct bpf_cpu_map_entry *old_rcpu; - - old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu); - if (old_rcpu) { - call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); - INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); - schedule_work(&old_rcpu->kthread_stop_wq); - } -} - -int cpu_map_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - u32 key_cpu = *(u32 *)key; - - if (key_cpu >= map->max_entries) - return -EINVAL; - - /* notice caller map_delete_elem() use preempt_disable() */ - __cpu_map_entry_replace(cmap, key_cpu, NULL); - return 0; -} - -int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - struct bpf_cpu_map_entry *rcpu; - - /* Array index key correspond to CPU number */ - u32 key_cpu = *(u32 *)key; - /* Value is the queue size */ - u32 qsize = *(u32 *)value; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(key_cpu >= cmap->map.max_entries)) - return -E2BIG; - if (unlikely(map_flags == BPF_NOEXIST)) - return -EEXIST; - if (unlikely(qsize > 16384)) /* sanity limit on qsize */ - return -EOVERFLOW; - - /* Make sure CPU is a valid possible cpu */ - if (!cpu_possible(key_cpu)) - return -ENODEV; - - if (qsize == 0) { - rcpu = NULL; /* Same as deleting */ - } else { - /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ - rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); - if (!rcpu) - return -ENOMEM; - } - rcu_read_lock(); - __cpu_map_entry_replace(cmap, key_cpu, rcpu); - rcu_read_unlock(); - return 0; -} - -void cpu_map_free(struct bpf_map *map) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - int cpu; - u32 i; - - /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, - * so the bpf programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete. 
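
__cpu_map_entry_alloc() above takes two references up front, one owned by the cpu_map[] slot and one by the kthread, and the entry is freed only when the last of those owners drops its reference. A sketch of that two-owner scheme, using C11 atomics in place of the kernel's atomic_t helpers:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	atomic_int refcnt;
};

static void entry_put(struct entry *e, const char *who)
{
	if (atomic_fetch_sub(&e->refcnt, 1) == 1) {
		printf("%s dropped the last reference, freeing\n", who);
		free(e);
	}
}

int main(void)
{
	struct entry *e = malloc(sizeof(*e));

	atomic_init(&e->refcnt, 0);
	atomic_fetch_add(&e->refcnt, 1);	/* ref held by cpu_map[] slot */
	atomic_fetch_add(&e->refcnt, 1);	/* ref held by the kthread */

	entry_put(e, "map slot");	/* entry replaced in the map */
	entry_put(e, "kthread");	/* kthread_stop() completed */
	return 0;
}
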
The rcu critical section only guarantees - * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map. - * It does __not__ ensure pending flush operations (if any) are - * complete. - */ - synchronize_rcu(); - - /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. - */ - for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); - - while (!bitmap_empty(bitmap, cmap->map.max_entries)) - cond_resched(); - } - - /* For cpu_map the remote CPUs can still be using the entries - * (struct bpf_cpu_map_entry). - */ - for (i = 0; i < cmap->map.max_entries; i++) { - struct bpf_cpu_map_entry *rcpu; - - rcpu = READ_ONCE(cmap->cpu_map[i]); - if (!rcpu) - continue; - - /* bq flush and cleanup happens after RCU graze-period */ - __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ - } - free_percpu(cmap->flush_needed); - bpf_map_area_free(cmap->cpu_map); - kfree(cmap); -} - -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - struct bpf_cpu_map_entry *rcpu; - - if (key >= map->max_entries) - return NULL; - - rcpu = READ_ONCE(cmap->cpu_map[key]); - return rcpu; -} - -static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) -{ - struct bpf_cpu_map_entry *rcpu = - __cpu_map_lookup_elem(map, *(u32 *)key); - - return rcpu ? &rcpu->qsize : NULL; -} - -static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - u32 index = key ? *(u32 *)key : U32_MAX; - u32 *next = next_key; - - if (index >= cmap->map.max_entries) { - *next = 0; - return 0; - } - - if (index == cmap->map.max_entries - 1) - return -ENOENT; - *next = index + 1; - return 0; -} - -const struct bpf_map_ops cpu_map_ops = { - .map_alloc = cpu_map_alloc, - .map_free = cpu_map_free, - .map_delete_elem = cpu_map_delete_elem, - .map_update_elem = cpu_map_update_elem, - .map_lookup_elem = cpu_map_lookup_elem, - .map_get_next_key = cpu_map_get_next_key, - .map_check_btf = map_check_no_btf, -}; - -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq) -{ - struct ptr_ring *q; - int i; - - if (unlikely(!bq->count)) - return 0; - - q = rcpu->queue; - spin_lock(&q->producer_lock); - - for (i = 0; i < bq->count; i++) { - void *xdp_pkt = bq->q[i]; - int err; - - err = __ptr_ring_produce(q, xdp_pkt); - if (err) { - /* Free xdp_pkt */ - page_frag_free(xdp_pkt); - } - } - bq->count = 0; - spin_unlock(&q->producer_lock); - - return 0; -} - -/* Notice: Will change in later patch */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; -}; - -/* Runs under RCU-read-side, plus in softirq under NAPI protection. - * Thus, safe percpu variable access. - */ -int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) -{ - struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); - - if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(rcpu, bq); - - /* Notice, xdp_buff/page MUST be queued here, long enough for - * driver to code invoking us to finished, due to driver - * (e.g. ixgbe) recycle tricks based on page-refcnt. - * - * Thus, incoming xdp_pkt is always queued here (else we race - * with another CPU on page-refcnt and remaining driver code). 
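
__cpu_map_entry_replace() above swaps the slot pointer atomically and defers all teardown of the old entry (an RCU callback, then a workqueue to stop the kthread). A loose sketch of the swap-then-defer shape; atomic_exchange() models xchg(), and the immediate free stands in for work that call_rcu() would postpone past the grace period:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int *old_entry = malloc(sizeof(int));
	int *new_entry = malloc(sizeof(int));
	_Atomic(int *) slot = old_entry;

	int *prev = atomic_exchange(&slot, new_entry);	/* xchg() */
	if (prev) {
		/* kernel: call_rcu(&prev->rcu, __cpu_map_entry_free), plus
		 * schedule_work() to stop the old entry's kthread */
		printf("deferring free of replaced entry %p\n", (void *)prev);
		free(prev);
	}
	free(new_entry);
	return 0;
}
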
- * Queue time is very short, as the driver will invoke the flush
- * operation when completing the napi->poll call.
- */
- bq->q[bq->count++] = xdp_pkt;
- return 0;
-}
-
-void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
-{
- struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
-
- __set_bit(bit, bitmap);
-}
-
-void __cpu_map_flush(struct bpf_map *map)
-{
- struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
- u32 bit;
-
- /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
- * and __cpu_map_flush() happen on the same CPU. Thus, the percpu
- * bitmap indicates which percpu bulkqs have packets.
- */
- for_each_set_bit(bit, bitmap, map->max_entries) {
- struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
- struct xdp_bulk_queue *bq;
-
- /* This is possible if the entry is removed by user space
- * between the xdp redirect and the flush op.
- */
- if (unlikely(!rcpu))
- continue;
-
- __clear_bit(bit, bitmap);
-
- /* Flush all frames in bulkq to the real queue */
- bq = this_cpu_ptr(rcpu->bulkq);
- bq_flush_to_queue(rcpu, bq);
-
- /* If already running, costs spin_lock_irqsave + smp_mb */
- wake_up_process(rcpu->kthread);
- }
-}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 18ec86effea5..1a846a636ae1 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -46,12 +46,6 @@
 * notifier hook walks the map we know that new dev references cannot be
 * added by the user because core infrastructure ensures dev_get_by_index()
 * calls will fail at this point.
- *
- * The devmap_hash type is a map type which interprets keys as ifindexes and
- * indexes these using a hashmap. This allows maps that use ifindex as key to be
- * densely packed instead of having holes in the lookup array for unused
- * ifindexes. The setup and packet enqueue/send code is shared between the two
- * types of devmap; only the lookup and insertion is different.
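
bq_enqueue() above batches packets per CPU so the shared ring's producer lock is taken once per bulk of up to CPU_MAP_BULK_SIZE packets rather than once per packet. A sketch of that bulking, with printf standing in for the locked ptr_ring producer:

#include <stdio.h>

#define CPU_MAP_BULK_SIZE 8	/* one cache line of pointers on 64-bit */

struct bulk_queue {
	void *q[CPU_MAP_BULK_SIZE];
	unsigned int count;
};

static void bq_flush(struct bulk_queue *bq)
{
	for (unsigned int i = 0; i < bq->count; i++)
		printf("produce pkt %p into ptr_ring\n", bq->q[i]);
	bq->count = 0;
}

static void bq_enqueue(struct bulk_queue *bq, void *pkt)
{
	if (bq->count == CPU_MAP_BULK_SIZE)
		bq_flush(bq);	/* amortizes the ring's producer lock */
	bq->q[bq->count++] = pkt;
}

int main(void)
{
	struct bulk_queue bq = { .count = 0 };
	int pkts[10];

	for (int i = 0; i < 10; i++)
		bq_enqueue(&bq, &pkts[i]);
	bq_flush(&bq);	/* the driver's flush at napi->poll completion */
	return 0;
}
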
*/ #include #include @@ -61,7 +55,6 @@ struct bpf_dtab_netdev { struct net_device *dev; - struct hlist_node index_hlist; struct bpf_dtab *dtab; unsigned int bit; struct rcu_head rcu; @@ -72,30 +65,11 @@ struct bpf_dtab { struct bpf_dtab_netdev **netdev_map; unsigned long __percpu *flush_needed; struct list_head list; - - /* these are only used for DEVMAP_HASH type maps */ - struct hlist_head *dev_index_head; - spinlock_t index_lock; - unsigned int items; - u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static struct hlist_head *dev_map_create_hash(unsigned int entries) -{ - int i; - struct hlist_head *hash; - - hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); - if (hash != NULL) - for (i = 0; i < entries; i++) - INIT_HLIST_HEAD(&hash[i]); - - return hash; -} - static u64 dev_map_bitmap_size(const union bpf_attr *attr) { return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); @@ -115,16 +89,17 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - /* Lookup returns a pointer straight to dev->ifindex, so make sure the - * verifier prevents writes from the BPF side - */ - attr->map_flags |= BPF_F_RDONLY_PROG; - dtab = kzalloc(sizeof(*dtab), GFP_USER); if (!dtab) return ERR_PTR(-ENOMEM); - bpf_map_init_from_attr(&dtab->map, attr); + /* mandatory map attributes */ + dtab->map.map_type = attr->map_type; + dtab->map.key_size = attr->key_size; + dtab->map.value_size = attr->value_size; + dtab->map.max_entries = attr->max_entries; + dtab->map.map_flags = attr->map_flags; + dtab->map.numa_node = bpf_map_attr_numa_node(attr); /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); @@ -134,16 +109,6 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); - - if (!dtab->n_buckets) { /* Overflow check */ - err = -EINVAL; - goto free_dtab; - } - cost += sizeof(struct hlist_head) * dtab->n_buckets; - } - /* if map size is larger than memlock limit, reject it early */ err = bpf_map_precharge_memlock(dtab->map.pages); if (err) @@ -164,24 +129,13 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab->netdev_map) goto free_dtab; - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); - if (!dtab->dev_index_head) - goto free_map_area; - - spin_lock_init(&dtab->index_lock); - } - spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; -free_map_area: - bpf_map_area_free(dtab->netdev_map); free_dtab: free_percpu(dtab->flush_needed); - kfree(dtab->dev_index_head); kfree(dtab); return ERR_PTR(err); } @@ -233,7 +187,6 @@ static void dev_map_free(struct bpf_map *map) free_percpu(dtab->flush_needed); bpf_map_area_free(dtab->netdev_map); - kfree(dtab->dev_index_head); kfree(dtab); } @@ -254,77 +207,6 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, - int idx) -{ - return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; -} - -static struct bpf_dtab_netdev *__dev_map_hash_lookup_elem_dtab(struct bpf_map *map, u32 key) -{ - struct bpf_dtab *dtab = 
container_of(map, struct bpf_dtab, map); - struct hlist_head *head = dev_map_index_hash(dtab, key); - struct bpf_dtab_netdev *dev; - - hlist_for_each_entry_rcu(dev, head, index_hlist) - if (dev->bit == key) - return dev; - - return NULL; -} - -struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) -{ - struct bpf_dtab_netdev *dev = __dev_map_hash_lookup_elem_dtab(map, key); - - return dev ? dev->dev : NULL; -} - -static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, - void *next_key) -{ - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - u32 idx, *next = next_key; - struct bpf_dtab_netdev *dev, *next_dev; - struct hlist_head *head; - int i = 0; - - if (!key) - goto find_first; - - idx = *(u32 *)key; - - dev = __dev_map_hash_lookup_elem_dtab(map, idx); - if (!dev) - goto find_first; - - next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), - struct bpf_dtab_netdev, index_hlist); - - if (next_dev) { - *next = next_dev->bit; - return 0; - } - - i = idx & (dtab->n_buckets - 1); - i++; - - find_first: - for (; i < dtab->n_buckets; i++) { - head = dev_map_index_hash(dtab, i); - - next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), - struct bpf_dtab_netdev, - index_hlist); - if (next_dev) { - *next = next_dev->bit; - return 0; - } - } - - return -ENOENT; -} - void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); @@ -386,13 +268,6 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } -static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) -{ - struct net_device *dev = __dev_map_hash_lookup_elem(map, *(u32 *)key); - - return dev ? 
&dev->ifindex : NULL; -} - static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_flush) { @@ -442,52 +317,6 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_dtab_netdev *old_dev; - int k = *(u32 *)key; - unsigned long flags; - int ret = -ENOENT; - - spin_lock_irqsave(&dtab->index_lock, flags); - - old_dev = __dev_map_hash_lookup_elem_dtab(map, k); - if (old_dev) { - dtab->items--; - hlist_del_init_rcu(&old_dev->index_hlist); - call_rcu(&old_dev->rcu, __dev_map_entry_free); - ret = 0; - } - spin_unlock_irqrestore(&dtab->index_lock, flags); - - return ret; -} - -static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, - struct bpf_dtab *dtab, - u32 ifindex, - unsigned int idx) -{ - gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; - struct bpf_dtab_netdev *dev; - - dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); - if (!dev) - return ERR_PTR(-ENOMEM); - - dev->dev = dev_get_by_index(net, ifindex); - if (!dev->dev) { - kfree(dev); - return ERR_PTR(-EINVAL); - } - - dev->bit = idx; - dev->dtab = dtab; - - return dev; -} - static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { @@ -507,9 +336,19 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = __dev_map_alloc_node(net, dtab, ifindex, i); - if (IS_ERR(dev)) - return PTR_ERR(dev); + dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, + map->numa_node); + if (!dev) + return -ENOMEM; + + dev->dev = dev_get_by_index(net, ifindex); + if (!dev->dev) { + kfree(dev); + return -EINVAL; + } + + dev->bit = i; + dev->dtab = dtab; } /* Use call_rcu() here to ensure rcu critical sections have completed @@ -523,57 +362,6 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } - -static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, - void *key, void *value, u64 map_flags) -{ - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_dtab_netdev *dev, *old_dev; - u32 ifindex = *(u32 *)value; - u32 idx = *(u32 *)key; - unsigned long flags; - - if (unlikely(map_flags > BPF_EXIST || !ifindex)) - return -EINVAL; - - old_dev = __dev_map_hash_lookup_elem_dtab(map, idx); - if (old_dev && (map_flags & BPF_NOEXIST)) - return -EEXIST; - - dev = __dev_map_alloc_node(net, dtab, ifindex, idx); - if (IS_ERR(dev)) - return PTR_ERR(dev); - - spin_lock_irqsave(&dtab->index_lock, flags); - - if (old_dev) { - hlist_del_rcu(&old_dev->index_hlist); - } else { - if (dtab->items >= dtab->map.max_entries) { - spin_unlock_irqrestore(&dtab->index_lock, flags); - call_rcu(&dev->rcu, __dev_map_entry_free); - return -E2BIG; - } - dtab->items++; - } - - hlist_add_head_rcu(&dev->index_hlist, - dev_map_index_hash(dtab, idx)); - spin_unlock_irqrestore(&dtab->index_lock, flags); - - if (old_dev) - call_rcu(&old_dev->rcu, __dev_map_entry_free); - - return 0; -} - -static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) -{ - return __dev_map_hash_update_elem(current->nsproxy->net_ns, - map, key, value, map_flags); -} - const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -581,16 +369,6 @@ const struct bpf_map_ops dev_map_ops = { .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = 
dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, - .map_check_btf = map_check_no_btf, -}; - -const struct bpf_map_ops dev_map_hash_ops = { - .map_alloc = dev_map_alloc, - .map_free = dev_map_free, - .map_get_next_key = dev_map_hash_get_next_key, - .map_lookup_elem = dev_map_hash_lookup_elem, - .map_update_elem = dev_map_hash_update_elem, - .map_delete_elem = dev_map_hash_delete_elem, }; static int dev_map_notification(struct notifier_block *notifier, diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c deleted file mode 100644 index d70920831285..000000000000 --- a/kernel/bpf/disasm.c +++ /dev/null @@ -1,261 +0,0 @@ -/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -#include - -#include "disasm.h" - -#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) -static const char * const func_id_str[] = { - __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) -}; -#undef __BPF_FUNC_STR_FN - -static const char *__func_get_name(const struct bpf_insn_cbs *cbs, - const struct bpf_insn *insn, - char *buff, size_t len) -{ - BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); - - if (insn->src_reg != BPF_PSEUDO_CALL && - insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && - func_id_str[insn->imm]) - return func_id_str[insn->imm]; - - if (cbs && cbs->cb_call) - return cbs->cb_call(cbs->private_data, insn); - - if (insn->src_reg == BPF_PSEUDO_CALL) - snprintf(buff, len, "%+d", insn->imm); - - return buff; -} - -static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, - const struct bpf_insn *insn, - u64 full_imm, char *buff, size_t len) -{ - if (cbs && cbs->cb_imm) - return cbs->cb_imm(cbs->private_data, insn, full_imm); - snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); - - return buff; -} - -const char *func_id_name(int id) -{ - if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) - return func_id_str[id]; - else - return "unknown"; -} - -const char *const bpf_class_string[8] = { - [BPF_LD] = "ld", - [BPF_LDX] = "ldx", - [BPF_ST] = "st", - [BPF_STX] = "stx", - [BPF_ALU] = "alu", - [BPF_JMP] = "jmp", - [BPF_RET] = "BUG", - [BPF_ALU64] = "alu64", -}; - -const char *const bpf_alu_string[16] = { - [BPF_ADD >> 4] = "+=", - [BPF_SUB >> 4] = "-=", - [BPF_MUL >> 4] = "*=", - [BPF_DIV >> 4] = "/=", - [BPF_OR >> 4] = "|=", - [BPF_AND >> 4] = "&=", - [BPF_LSH >> 4] = "<<=", - [BPF_RSH >> 4] = ">>=", - [BPF_NEG >> 4] = "neg", - [BPF_MOD >> 4] = "%=", - [BPF_XOR >> 4] = "^=", - [BPF_MOV >> 4] = "=", - [BPF_ARSH >> 4] = "s>>=", - [BPF_END >> 4] = "endian", -}; - -static const char *const bpf_ldst_string[] = { - [BPF_W >> 3] = "u32", - [BPF_H >> 3] = "u16", - [BPF_B >> 3] = "u8", - [BPF_DW >> 3] = "u64", -}; - -static const char *const bpf_jmp_string[16] = { - [BPF_JA >> 4] = "jmp", - [BPF_JEQ >> 4] = "==", - [BPF_JGT >> 4] = ">", - [BPF_JLT >> 4] = "<", - [BPF_JGE >> 4] = ">=", - [BPF_JLE >> 4] = "<=", - [BPF_JSET >> 4] = "&", - [BPF_JNE >> 4] = "!=", - [BPF_JSGT >> 4] = "s>", - [BPF_JSLT >> 4] = "s<", - [BPF_JSGE >> 4] = "s>=", - [BPF_JSLE >> 4] = "s<=", - [BPF_CALL >> 4] = "call", 
- [BPF_EXIT >> 4] = "exit", -}; - -static void print_bpf_end_insn(bpf_insn_print_t verbose, - void *private_data, - const struct bpf_insn *insn) -{ - verbose(private_data, "(%02x) r%d = %s%d r%d\n", - insn->code, insn->dst_reg, - BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", - insn->imm, insn->dst_reg); -} - -void print_bpf_insn(const struct bpf_insn_cbs *cbs, - const struct bpf_insn *insn, - bool allow_ptr_leaks) -{ - const bpf_insn_print_t verbose = cbs->cb_print; - u8 class = BPF_CLASS(insn->code); - - if (class == BPF_ALU || class == BPF_ALU64) { - if (BPF_OP(insn->code) == BPF_END) { - if (class == BPF_ALU64) - verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code); - else - print_bpf_end_insn(verbose, cbs->private_data, insn); - } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n", - insn->code, insn->dst_reg, - class == BPF_ALU ? "(u32) " : "", - insn->dst_reg); - } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->src_reg); - } else { - verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->imm); - } - } else if (class == BPF_STX) { - if (BPF_MODE(insn->code) == BPF_MEM) - verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->src_reg); - else if (BPF_MODE(insn->code) == BPF_XADD) - verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, insn->off, - insn->src_reg); - else - verbose(cbs->private_data, "BUG_%02x\n", insn->code); - } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); - return; - } - verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); - } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); - return; - } - verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", - insn->code, insn->dst_reg, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->off); - } else if (class == BPF_LD) { - if (BPF_MODE(insn->code) == BPF_ABS) { - verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM && - BPF_SIZE(insn->code) == BPF_DW) { - /* At this point, we already made sure that the second - * part of the ldimm64 insn is accessible. 
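
The string tables above are indexed straight off the opcode byte: the low three bits select the class and the high four bits the ALU/JMP operation, which is why every lookup uses ">> 4". A sketch of that decoding; the mask and opcode values mirror the UAPI bpf.h definitions, trimmed here to two table entries:

#include <stdio.h>

#define BPF_CLASS(code) ((code) & 0x07)
#define BPF_OP(code)    ((code) & 0xf0)

#define BPF_ALU64 0x07
#define BPF_ADD   0x00
#define BPF_MOV   0xb0

static const char * const alu_str[16] = {
	[BPF_ADD >> 4] = "+=",
	[BPF_MOV >> 4] = "=",
};

int main(void)
{
	unsigned char code = BPF_ALU64 | BPF_MOV | 0x08;	/* mov64 reg,reg */

	printf("class=%#x op-string=\"%s\"\n",
	       BPF_CLASS(code), alu_str[BPF_OP(code) >> 4]);
	return 0;
}
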
- */ - u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || - insn->src_reg == BPF_PSEUDO_MAP_VALUE; - char tmp[64]; - - if (is_ptr && !allow_ptr_leaks) - imm = 0; - - verbose(cbs->private_data, "(%02x) r%d = %s\n", - insn->code, insn->dst_reg, - __func_imm_name(cbs, insn, imm, - tmp, sizeof(tmp))); - } else { - verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); - return; - } - } else if (class == BPF_JMP) { - u8 opcode = BPF_OP(insn->code); - - if (opcode == BPF_CALL) { - char tmp[64]; - if (insn->src_reg == BPF_PSEUDO_CALL) { - verbose(cbs->private_data, "(%02x) call pc%s\n", - insn->code, - __func_get_name(cbs, insn, - tmp, sizeof(tmp))); - } else { - strcpy(tmp, "unknown"); - verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code, - __func_get_name(cbs, insn, - tmp, sizeof(tmp)), - insn->imm); - } - } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose(cbs->private_data, "(%02x) goto pc%+d\n", - insn->code, insn->off); - } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose(cbs->private_data, "(%02x) exit\n", insn->code); - } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->src_reg, insn->off); - } else { - verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->imm, insn->off); - } - } else { - verbose(cbs->private_data, "(%02x) %s\n", - insn->code, bpf_class_string[class]); - } -} diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h deleted file mode 100644 index 786ebe260000..000000000000 --- a/kernel/bpf/disasm.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
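
The ldimm64 branch above reassembles a 64-bit immediate that the instruction encoding splits across the imm fields of two consecutive instructions. A sketch of exactly that reassembly, with a two-element array standing in for the instruction pair:

#include <stdint.h>
#include <stdio.h>

struct insn { int32_t imm; };

int main(void)
{
	struct insn pair[2] = {
		{ .imm = (int32_t)0xdeadbeef },	/* low 32 bits */
		{ .imm = 0x00000042 },		/* high 32 bits */
	};
	uint64_t imm = ((uint64_t)pair[1].imm << 32) | (uint32_t)pair[0].imm;

	printf("ldimm64 value: 0x%llx\n", (unsigned long long)imm);
	return 0;
}
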
- */ - -#ifndef __BPF_DISASM_H__ -#define __BPF_DISASM_H__ - -#include -#include -#include -#ifndef __KERNEL__ -#include -#include -#endif - -extern const char *const bpf_alu_string[16]; -extern const char *const bpf_class_string[8]; - -const char *func_id_name(int id); - -typedef __printf(2, 3) void (*bpf_insn_print_t)(void *private_data, - const char *, ...); -typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, - const struct bpf_insn *insn); -typedef const char *(*bpf_insn_print_imm_t)(void *private_data, - const struct bpf_insn *insn, - __u64 full_imm); -struct bpf_insn_cbs { - bpf_insn_print_t cb_print; - bpf_insn_revmap_call_t cb_call; - bpf_insn_print_imm_t cb_imm; - void *private_data; -}; - -void print_bpf_insn(const struct bpf_insn_cbs *cbs, - const struct bpf_insn *insn, - bool allow_ptr_leaks); - -#endif diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b169df182ee5..773d0805088f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -238,6 +238,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_htab *htab; int err, i; u64 cost; @@ -263,11 +264,20 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (lru && !prealloc) return ERR_PTR(-ENOTSUPP); + if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) + return ERR_PTR(-EINVAL); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); - bpf_map_init_from_attr(&htab->map, attr); + /* mandatory map attributes */ + htab->map.map_type = attr->map_type; + htab->map.key_size = attr->key_size; + htab->map.value_size = attr->value_size; + htab->map.max_entries = attr->max_entries; + htab->map.map_flags = attr->map_flags; + htab->map.numa_node = numa_node; /* check sanity of attributes. 
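
The check added to htab_map_alloc() above rejects an explicit NUMA node for the per-CPU flavors, since per-CPU values are allocated on each CPU's local node regardless of the map's node. A sketch of the check in isolation, with the flag plumbing reduced to two bools:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define NUMA_NO_NODE (-1)

static int check_numa(int numa_node, bool percpu, bool percpu_lru)
{
	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;	/* node pinning makes no sense per-CPU */
	return 0;
}

int main(void)
{
	printf("percpu + node 0 -> %d\n", check_numa(0, true, false));
	printf("plain  + node 0 -> %d\n", check_numa(0, false, false));
	return 0;
}
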
* value_size == 0 may be allowed in the future to use map as a set @@ -692,12 +702,21 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) BITS_PER_LONG == 64; } +static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) +{ + u32 size = htab->map.value_size; + + if (percpu || fd_htab_map_needs_adjust(htab)) + size = round_up(size, 8); + return size; +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { - u32 size = htab->map.value_size; + u32 size = htab_size_value(htab, percpu); bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; @@ -736,13 +755,10 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-ENOMEM); goto dec_count; } - check_and_init_map_lock(&htab->map, - l_new->key + round_up(key_size, 8)); } memcpy(l_new->key, key, key_size); if (percpu) { - size = round_up(size, 8); if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { @@ -760,13 +776,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); - } else if (fd_htab_map_needs_adjust(htab)) { - size = round_up(size, 8); - memcpy(l_new->key + round_up(key_size, 8), value, size); } else { - copy_map_value(&htab->map, - l_new->key + round_up(key_size, 8), - value); + memcpy(l_new->key + round_up(key_size, 8), value, size); } l_new->hash = hash; @@ -779,11 +790,11 @@ dec_count: static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, u64 map_flags) { - if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) + if (l_old && map_flags == BPF_NOEXIST) /* elem already exists */ return -EEXIST; - if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) + if (!l_old && map_flags == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; @@ -824,20 +835,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; - if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { - /* first lookup without the bucket lock didn't find the element, - * but second lookup with the bucket lock found it. - * This case is highly unlikely, but has to be dealt with: - * grab the element lock in addition to the bucket lock - * and update element in place - */ - copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), - value, false); - ret = 0; - goto err; - } - l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, l_old); if (IS_ERR(l_new)) { @@ -950,31 +947,6 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - if (unlikely(map_flags & BPF_F_LOCK)) { - if (unlikely(!map_value_has_spin_lock(map))) - return -EINVAL; - - /* find an element without taking the bucket lock */ - l_old = lookup_nulls_elem_raw(head, hash, key, key_size, - htab->n_buckets); - ret = check_flags(htab, l_old, map_flags); - if (ret) - return ret; - - if (l_old) { - /* grab the element lock and update value in place */ - copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), - value, false); - return 0; - } - - /* fall through, grab the bucket lock and lookup again. - * 99.9% chance that the element won't be found, - * but second lookup under lock has to be done. 
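
check_flags() above is back to the plain three-way update policy: BPF_NOEXIST fails when the element already exists, BPF_EXIST fails when it does not, and BPF_ANY accepts both. A sketch with the UAPI flag values:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum { BPF_ANY, BPF_NOEXIST, BPF_EXIST };	/* UAPI values 0, 1, 2 */

static int check_flags(bool elem_exists, int map_flags)
{
	if (elem_exists && map_flags == BPF_NOEXIST)
		return -EEXIST;	/* elem already exists */
	if (!elem_exists && map_flags == BPF_EXIST)
		return -ENOENT;	/* elem doesn't exist, cannot update */
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       check_flags(true, BPF_NOEXIST),	/* -EEXIST */
	       check_flags(false, BPF_EXIST),	/* -ENOENT */
	       check_flags(false, BPF_ANY));	/* 0 */
	return 0;
}
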
- */ - } - /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&b->lock, flags); @@ -1015,7 +987,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, u32 key_size, hash; int ret; - if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; @@ -1429,5 +1401,4 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, - .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a2cf6744d88a..aac170ea6c7e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -191,130 +191,3 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE, }; - -#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) -static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) -{ - arch_spinlock_t *l = (void *)lock; - union { - __u32 val; - arch_spinlock_t lock; - } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; - compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); - BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); - BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); - arch_spin_lock(l); -} - -static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) -{ - arch_spinlock_t *l = (void *)lock; - arch_spin_unlock(l); -} - -#else -static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) -{ - atomic_t *l = (void *)lock; - BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); - do { - atomic_cond_read_relaxed(l, !VAL); - } while (atomic_xchg(l, 1)); -} - -static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) -{ - atomic_t *l = (void *)lock; - atomic_set_release(l, 0); -} -#endif - -static DEFINE_PER_CPU(unsigned long, irqsave_flags); -notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) -{ - unsigned long flags; - local_irq_save(flags); - __bpf_spin_lock(lock); - __this_cpu_write(irqsave_flags, flags); - return 0; -} - -const struct bpf_func_proto bpf_spin_lock_proto = { - .func = bpf_spin_lock, - .gpl_only = false, - .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_SPIN_LOCK, -}; - -notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) -{ - unsigned long flags; - flags = __this_cpu_read(irqsave_flags); - __bpf_spin_unlock(lock); - local_irq_restore(flags); - return 0; -} - -const struct bpf_func_proto bpf_spin_unlock_proto = { - .func = bpf_spin_unlock, - .gpl_only = false, - .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_SPIN_LOCK, -}; - -void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, - bool lock_src) -{ - struct bpf_spin_lock *lock; - if (lock_src) - lock = src + map->spin_lock_off; - else - lock = dst + map->spin_lock_off; - preempt_disable(); - ____bpf_spin_lock(lock); - copy_map_value(map, dst, src); - ____bpf_spin_unlock(lock); - preempt_enable(); -} - -#ifdef CONFIG_CGROUPS -BPF_CALL_0(bpf_get_current_cgroup_id) -{ - struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id.id; -} -const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { - .func = bpf_get_current_cgroup_id, - .gpl_only = false, - .ret_type = RET_INTEGER, -}; -#endif - -#ifdef CONFIG_CGROUP_BPF -DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); -BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) -{ - /* flags argument is not used now, - * but provides an 
ability to extend the API. - * verifier checks that its value is correct. - */ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage; - void *ptr; - - storage = this_cpu_read(bpf_cgroup_storage[stype]); - - if (stype == BPF_CGROUP_STORAGE_SHARED) - ptr = &READ_ONCE(storage->buf)->data[0]; - else - ptr = this_cpu_ptr(storage->percpu_buf); - return (unsigned long)ptr; -} -const struct bpf_func_proto bpf_get_local_storage_proto = { - .func = bpf_get_local_storage, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_ANYTHING, -}; -#endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 0f27179d8358..556d70b9e731 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,149 +150,39 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } -struct map_iter { - void *key; - bool done; -}; -static struct map_iter *map_iter(struct seq_file *m) +static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, + umode_t mode, const struct inode_operations *iops) { - return m->private; -} -static struct bpf_map *seq_file_to_map(struct seq_file *m) -{ - return file_inode(m->file)->i_private; -} -static void map_iter_free(struct map_iter *iter) -{ - if (iter) { - kfree(iter->key); - kfree(iter); - } -} -static struct map_iter *map_iter_alloc(struct bpf_map *map) -{ - struct map_iter *iter; - iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN); - if (!iter) - goto error; - iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN); - if (!iter->key) - goto error; - return iter; -error: - map_iter_free(iter); - return NULL; -} -static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct bpf_map *map = seq_file_to_map(m); - void *key = map_iter(m)->key; - if (map_iter(m)->done) - return NULL; - if (unlikely(v == SEQ_START_TOKEN)) - goto done; - if (map->ops->map_get_next_key(map, key, key)) { - map_iter(m)->done = true; - return NULL; - } -done: - ++(*pos); - return key; -} -static void *map_seq_start(struct seq_file *m, loff_t *pos) -{ - if (map_iter(m)->done) - return NULL; - return *pos ? map_iter(m)->key : SEQ_START_TOKEN; -} -static void map_seq_stop(struct seq_file *m, void *v) -{ -} -static int map_seq_show(struct seq_file *m, void *v) -{ - struct bpf_map *map = seq_file_to_map(m); - void *key = map_iter(m)->key; - if (unlikely(v == SEQ_START_TOKEN)) { - seq_puts(m, "# WARNING!! The output is for debug purpose only\n"); - seq_puts(m, "# WARNING!! The output format will change\n"); - } else { - map->ops->map_seq_show_elem(map, key, m); - } - return 0; -} -static const struct seq_operations bpffs_map_seq_ops = { - .start = map_seq_start, - .next = map_seq_next, - .show = map_seq_show, - .stop = map_seq_stop, -}; -static int bpffs_map_open(struct inode *inode, struct file *file) -{ - struct bpf_map *map = inode->i_private; - struct map_iter *iter; - struct seq_file *m; - int err; - iter = map_iter_alloc(map); - if (!iter) - return -ENOMEM; - err = seq_open(file, &bpffs_map_seq_ops); - if (err) { - map_iter_free(iter); - return err; - } - m = file->private_data; - m->private = iter; - return 0; -} -static int bpffs_map_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = file->private_data; - map_iter_free(map_iter(m)); - return seq_release(inode, file); -} -/* bpffs_map_fops should only implement the basic - * read operation for a BPF map. 
The purpose is to - * provide a simple user intuitive way to do - * "cat bpffs/pathto/a-pinned-map". - * - * Other operations (e.g. write, lookup...) should be realized by - * the userspace tools (e.g. bpftool) through the - * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update - * interface. - */ -static const struct file_operations bpffs_map_fops = { - .open = bpffs_map_open, - .read = seq_read, - .release = bpffs_map_release, -}; + struct inode *inode; -static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, - const struct inode_operations *iops, - const struct file_operations *fops) -{ - struct inode *dir = dentry->d_parent->d_inode; - struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); + inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = iops; - inode->i_private = raw; - inode->i_fop = fops; + inode->i_private = dentry->d_fsdata; bpf_dentry_finalize(dentry, inode, dir); return 0; } -static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) +static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t devt) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL); -} + enum bpf_type type = MINOR(devt); -static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) -{ - struct bpf_map *map = arg; - return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, - map->btf ? &bpffs_map_fops : NULL); + if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) || + dentry->d_fsdata == NULL) + return -EPERM; + + switch (type) { + case BPF_TYPE_PROG: + return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops); + case BPF_TYPE_MAP: + return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops); + default: + return -EPERM; + } } static struct dentry * @@ -328,6 +218,7 @@ static int bpf_symlink(struct inode *dir, struct dentry *dentry, static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, + .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .symlink = bpf_symlink, .rmdir = simple_rmdir, @@ -343,6 +234,7 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, struct inode *dir; struct path path; umode_t mode; + dev_t devt; int ret; dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); @@ -350,7 +242,9 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, return PTR_ERR(dentry); mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); - ret = security_path_mknod(&path, dentry, mode, 0); + devt = MKDEV(UNNAMED_MAJOR, type); + + ret = security_path_mknod(&path, dentry, mode, devt); if (ret) goto out; @@ -360,16 +254,9 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, goto out; } - switch (type) { - case BPF_TYPE_PROG: - ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); - break; - case BPF_TYPE_MAP: - ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); - break; - default: - ret = -EPERM; - } + dentry->d_fsdata = raw; + ret = vfs_mknod(dir, dentry, mode, devt); + dentry->d_fsdata = NULL; out: done_path_create(&path, dentry); return ret; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c deleted file mode 100644 index 08860b9963c7..000000000000 --- a/kernel/bpf/local_storage.c +++ /dev/null @@ -1,574 +0,0 @@ -//SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include - -DEFINE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); - -#ifdef CONFIG_CGROUP_BPF - -#define LOCAL_STORAGE_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | 
BPF_F_RDONLY | BPF_F_WRONLY) - -struct bpf_cgroup_storage_map { - struct bpf_map map; - - spinlock_t lock; - struct bpf_prog *prog; - struct rb_root root; - struct list_head list; -}; - -static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) -{ - return container_of(map, struct bpf_cgroup_storage_map, map); -} - -static int bpf_cgroup_storage_key_cmp( - const struct bpf_cgroup_storage_key *key1, - const struct bpf_cgroup_storage_key *key2) -{ - if (key1->cgroup_inode_id < key2->cgroup_inode_id) - return -1; - else if (key1->cgroup_inode_id > key2->cgroup_inode_id) - return 1; - else if (key1->attach_type < key2->attach_type) - return -1; - else if (key1->attach_type > key2->attach_type) - return 1; - return 0; -} - -static struct bpf_cgroup_storage *cgroup_storage_lookup( - struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, - bool locked) -{ - struct rb_root *root = &map->root; - struct rb_node *node; - - if (!locked) - spin_lock_bh(&map->lock); - - node = root->rb_node; - while (node) { - struct bpf_cgroup_storage *storage; - - storage = container_of(node, struct bpf_cgroup_storage, node); - - switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { - case -1: - node = node->rb_left; - break; - case 1: - node = node->rb_right; - break; - default: - if (!locked) - spin_unlock_bh(&map->lock); - return storage; - } - } - - if (!locked) - spin_unlock_bh(&map->lock); - - return NULL; -} - -static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, - struct bpf_cgroup_storage *storage) -{ - struct rb_root *root = &map->root; - struct rb_node **new = &(root->rb_node), *parent = NULL; - - while (*new) { - struct bpf_cgroup_storage *this; - - this = container_of(*new, struct bpf_cgroup_storage, node); - - parent = *new; - switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { - case -1: - new = &((*new)->rb_left); - break; - case 1: - new = &((*new)->rb_right); - break; - default: - return -EEXIST; - } - } - - rb_link_node(&storage->node, parent, new); - rb_insert_color(&storage->node, root); - - return 0; -} - -static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) -{ - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; - struct bpf_cgroup_storage *storage; - - storage = cgroup_storage_lookup(map, key, false); - if (!storage) - return NULL; - - return &READ_ONCE(storage->buf)->data[0]; -} - -static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, - void *value, u64 flags) -{ - struct bpf_cgroup_storage_key *key = _key; - struct bpf_cgroup_storage *storage; - struct bpf_storage_buffer *new; - - if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) - return -EINVAL; - - if (unlikely(flags & BPF_NOEXIST)) - return -EINVAL; - - if (unlikely((flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) - return -EINVAL; - - storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, - key, false); - if (!storage) - return -ENOENT; - - if (flags & BPF_F_LOCK) { - copy_map_value_locked(map, storage->buf->data, value, false); - return 0; - } - - new = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, __GFP_ZERO | GFP_USER, - map->numa_node); - if (!new) - return -ENOMEM; - - memcpy(&new->data[0], value, map->value_size); - - new = xchg(&storage->buf, new); - kfree_rcu(new, rcu); - - return 0; -} - -int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, - void *value) -{ - struct bpf_cgroup_storage_map *map = 
map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; - struct bpf_cgroup_storage *storage; - int cpu, off = 0; - u32 size; - rcu_read_lock(); - storage = cgroup_storage_lookup(map, key, false); - if (!storage) { - rcu_read_unlock(); - return -ENOENT; - } - /* per_cpu areas are zero-filled and bpf programs can only - * access 'value_size' of them, so copying rounded areas - * will not leak any kernel data - */ - size = round_up(_map->value_size, 8); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(value + off, - per_cpu_ptr(storage->percpu_buf, cpu), size); - off += size; - } - rcu_read_unlock(); - return 0; -} - -int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, - void *value, u64 map_flags) -{ - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; - struct bpf_cgroup_storage *storage; - int cpu, off = 0; - u32 size; - if (map_flags != BPF_ANY && map_flags != BPF_EXIST) - return -EINVAL; - rcu_read_lock(); - storage = cgroup_storage_lookup(map, key, false); - if (!storage) { - rcu_read_unlock(); - return -ENOENT; - } - /* the user space will provide round_up(value_size, 8) bytes that - * will be copied into per-cpu area. bpf programs can only access - * value_size of it. During lookup the same extra bytes will be - * returned or zeros which were zero-filled by percpu_alloc, - * so no kernel data leaks possible - */ - size = round_up(_map->value_size, 8); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), - value + off, size); - off += size; - } - rcu_read_unlock(); - return 0; -} - -static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, - void *_next_key) -{ - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; - struct bpf_cgroup_storage_key *next = _next_key; - struct bpf_cgroup_storage *storage; - - spin_lock_bh(&map->lock); - - if (list_empty(&map->list)) - goto enoent; - - if (key) { - storage = cgroup_storage_lookup(map, key, true); - if (!storage) - goto enoent; - - storage = list_next_entry(storage, list); - if (!storage) - goto enoent; - } else { - storage = list_first_entry(&map->list, - struct bpf_cgroup_storage, list); - } - - spin_unlock_bh(&map->lock); - next->attach_type = storage->key.attach_type; - next->cgroup_inode_id = storage->key.cgroup_inode_id; - return 0; - -enoent: - spin_unlock_bh(&map->lock); - return -ENOENT; -} - -static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) -{ - int numa_node = bpf_map_attr_numa_node(attr); - struct bpf_cgroup_storage_map *map; - - if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) - return ERR_PTR(-EINVAL); - - if (attr->value_size > PAGE_SIZE) - return ERR_PTR(-E2BIG); - - if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) - /* reserved bits should not be used */ - return ERR_PTR(-EINVAL); - - if (attr->max_entries) - /* max_entries is not used and enforced to be 0 */ - return ERR_PTR(-EINVAL); - - map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), - __GFP_ZERO | GFP_USER, numa_node); - if (!map) - return ERR_PTR(-ENOMEM); - - map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), - PAGE_SIZE) >> PAGE_SHIFT; - - /* copy mandatory map attributes */ - bpf_map_init_from_attr(&map->map, attr); - - spin_lock_init(&map->lock); - map->root = RB_ROOT; - INIT_LIST_HEAD(&map->list); - - return &map->map; -} - -static void cgroup_storage_map_free(struct bpf_map *_map) -{ - struct bpf_cgroup_storage_map *map 
= map_to_storage(_map); - - WARN_ON(!RB_EMPTY_ROOT(&map->root)); - WARN_ON(!list_empty(&map->list)); - - kfree(map); -} - -static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) -{ - return -EINVAL; -} - -static int cgroup_storage_check_btf(const struct bpf_map *map, - const struct btf *btf, - const struct btf_type *key_type, - const struct btf_type *value_type) -{ - struct btf_member *m; - u32 offset, size; - - /* Key is expected to be of struct bpf_cgroup_storage_key type, - * which is: - * struct bpf_cgroup_storage_key { - * __u64 cgroup_inode_id; - * __u32 attach_type; - * }; - */ - /* - * Key_type must be a structure with two fields. - */ - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || - BTF_INFO_VLEN(key_type->info) != 2) - return -EINVAL; - /* - * The first field must be a 64 bit integer at 0 offset. - */ - m = (struct btf_member *)(key_type + 1); - size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id); - if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) - return -EINVAL; - /* - * The second field must be a 32 bit integer at 64 bit offset. - */ - m++; - offset = offsetof(struct bpf_cgroup_storage_key, attach_type); - size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type); - - if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) - return -EINVAL; - - return 0; -} - -static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, - struct seq_file *m) -{ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage_key *key = _key; - struct bpf_cgroup_storage *storage; - int cpu; - rcu_read_lock(); - storage = cgroup_storage_lookup(map_to_storage(map), key, false); - - if (!storage) { - rcu_read_unlock(); - return; - } - - btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); - stype = cgroup_storage_type(map); - - if (stype == BPF_CGROUP_STORAGE_SHARED) { - seq_puts(m, ": "); - btf_type_seq_show(map->btf, map->btf_value_type_id, - &READ_ONCE(storage->buf)->data[0], m); - seq_puts(m, "\n"); - } else { - seq_puts(m, ": {\n"); - for_each_possible_cpu(cpu) { - seq_printf(m, "\tcpu%d: ", cpu); - btf_type_seq_show(map->btf, map->btf_value_type_id, - per_cpu_ptr(storage->percpu_buf, cpu), - m); - seq_puts(m, "\n"); - } - seq_puts(m, "}\n"); - } - rcu_read_unlock(); -} - -const struct bpf_map_ops cgroup_storage_map_ops = { - .map_alloc = cgroup_storage_map_alloc, - .map_free = cgroup_storage_map_free, - .map_get_next_key = cgroup_storage_get_next_key, - .map_lookup_elem = cgroup_storage_lookup_elem, - .map_update_elem = cgroup_storage_update_elem, - .map_delete_elem = cgroup_storage_delete_elem, - .map_check_btf = cgroup_storage_check_btf, - .map_seq_show_elem = cgroup_storage_seq_show_elem, -}; - -int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) -{ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - int ret = -EBUSY; - - spin_lock_bh(&map->lock); - - if (map->prog && map->prog != prog) - goto unlock; - if (prog->aux->cgroup_storage[stype] && - prog->aux->cgroup_storage[stype] != _map) - goto unlock; - - map->prog = prog; - prog->aux->cgroup_storage[stype] = _map; - ret = 0; -unlock: - spin_unlock_bh(&map->lock); - - return ret; -} - -void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) -{ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - - spin_lock_bh(&map->lock); - if (map->prog == 
prog) { - WARN_ON(prog->aux->cgroup_storage[stype] != _map); - map->prog = NULL; - prog->aux->cgroup_storage[stype] = NULL; - } - spin_unlock_bh(&map->lock); -} - -static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) -{ - size_t size; - if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { - size = sizeof(struct bpf_storage_buffer) + map->value_size; - *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, - PAGE_SIZE) >> PAGE_SHIFT; - } else { - size = map->value_size; - *pages = round_up(round_up(size, 8) * num_possible_cpus(), - PAGE_SIZE) >> PAGE_SHIFT; - } - return size; -} - -struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, - enum bpf_cgroup_storage_type stype) -{ - struct bpf_cgroup_storage *storage; - struct bpf_map *map; - gfp_t flags; - size_t size; - u32 pages; - - map = prog->aux->cgroup_storage[stype]; - if (!map) - return NULL; - - size = bpf_cgroup_storage_calculate_size(map, &pages); - if (bpf_map_charge_memlock(map, pages)) - return ERR_PTR(-EPERM); - - storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), - __GFP_ZERO | GFP_USER, map->numa_node); - - if (!storage) - goto enomem; - - flags = __GFP_ZERO | GFP_USER; - if (stype == BPF_CGROUP_STORAGE_SHARED) { - storage->buf = kmalloc_node(size, flags, map->numa_node); - if (!storage->buf) - goto enomem; - } else { - storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); - if (!storage->percpu_buf) - goto enomem; - } - - storage->map = (struct bpf_cgroup_storage_map *)map; - - return storage; -enomem: - bpf_map_uncharge_memlock(map, pages); - kfree(storage); - return ERR_PTR(-ENOMEM); -} - -static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) -{ - struct bpf_cgroup_storage *storage = - container_of(rcu, struct bpf_cgroup_storage, rcu); - kfree(storage->buf); - kfree(storage); -} - -static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) -{ - struct bpf_cgroup_storage *storage = - container_of(rcu, struct bpf_cgroup_storage, rcu); - free_percpu(storage->percpu_buf); - kfree(storage); -} - -void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) -{ - enum bpf_cgroup_storage_type stype; - struct bpf_map *map; - u32 pages; - - if (!storage) - return; - - map = &storage->map->map; - bpf_cgroup_storage_calculate_size(map, &pages); - bpf_map_uncharge_memlock(map, pages); - - stype = cgroup_storage_type(map); - if (stype == BPF_CGROUP_STORAGE_SHARED) - call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); - else - call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); -} - -void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, - struct cgroup *cgroup, - enum bpf_attach_type type) -{ - struct bpf_cgroup_storage_map *map; - - if (!storage) - return; - - storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id.id; - - map = storage->map; - - spin_lock_bh(&map->lock); - WARN_ON(cgroup_storage_insert(map, storage)); - list_add(&storage->list, &map->list); - spin_unlock_bh(&map->lock); -} - -void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) -{ - struct bpf_cgroup_storage_map *map; - struct rb_root *root; - - if (!storage) - return; - - map = storage->map; - - spin_lock_bh(&map->lock); - root = &map->root; - rb_erase(&storage->node, root); - - list_del(&storage->list); - spin_unlock_bh(&map->lock); -} - -#endif diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 3c1877c56d6c..3925794d8188 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -15,7 +15,6 @@ 
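The bpf_percpu_cgroup_storage_copy() removed above sizes the user buffer as round_up(value_size, 8) * num_possible_cpus(), one aligned slot per possible CPU, and the key layout is exactly what cgroup_storage_check_btf() enforced (a 64-bit cgroup inode id followed by a 32-bit attach type). A hedged userspace sketch of such a lookup; read_percpu_storage() and its callers are hypothetical, and the key struct here only mirrors the uapi struct bpf_cgroup_storage_key:

#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <linux/types.h>
#include <sys/syscall.h>

struct example_cgroup_storage_key {
	__u64 cgroup_inode_id;	/* 64-bit integer at offset 0 */
	__u32 attach_type;	/* 32-bit integer at offset 8 */
};

static int read_percpu_storage(int map_fd, struct example_cgroup_storage_key *key,
			       __u32 value_size, long ncpus, void **out)
{
	size_t stride = (value_size + 7) & ~(size_t)7;	/* round_up(value_size, 8) */
	void *buf = calloc(ncpus, stride);		/* one slot per possible CPU */
	union bpf_attr attr;

	if (!buf)
		return -1;
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)key;
	attr.value = (__u64)(unsigned long)buf;
	if (syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) < 0) {
		free(buf);
		return -1;
	}
	*out = buf;	/* caller reads buf + cpu * stride for each cpu */
	return 0;
}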
#include #include #include -#include /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -436,7 +435,13 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) if (!trie) return ERR_PTR(-ENOMEM); - bpf_map_init_from_attr(&trie->map, attr); + /* copy mandatory map attributes */ + trie->map.map_type = attr->map_type; + trie->map.key_size = attr->key_size; + trie->map.value_size = attr->value_size; + trie->map.max_entries = attr->max_entries; + trie->map.map_flags = attr->map_flags; + trie->map.numa_node = bpf_map_attr_numa_node(attr); trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; @@ -507,112 +512,9 @@ out: kfree(trie); } -static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) +static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) { - struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; - struct lpm_trie *trie = container_of(map, struct lpm_trie, map); - struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; - struct lpm_trie_node **node_stack = NULL; - int err = 0, stack_ptr = -1; - unsigned int next_bit; - size_t matchlen = 0; - - /* The get_next_key follows postorder. For the 4 node example in - * the top of this file, the trie_get_next_key() returns the following - * one after another: - * 192.168.0.0/24 - * 192.168.1.0/24 - * 192.168.128.0/24 - * 192.168.0.0/16 - * - * The idea is to return more specific keys before less specific ones. - */ - - /* Empty trie */ - search_root = rcu_dereference(trie->root); - if (!search_root) - return -ENOENT; - - /* For invalid key, find the leftmost node in the trie */ - if (!key || key->prefixlen > trie->max_prefixlen) - goto find_leftmost; - - node_stack = kmalloc_array(trie->max_prefixlen + 1, - sizeof(struct lpm_trie_node *), - GFP_ATOMIC | __GFP_NOWARN); - if (!node_stack) - return -ENOMEM; - - /* Try to find the exact node for the given key */ - for (node = search_root; node;) { - node_stack[++stack_ptr] = node; - matchlen = longest_prefix_match(trie, node, key); - if (node->prefixlen != matchlen || - node->prefixlen == key->prefixlen) - break; - - next_bit = extract_bit(key->data, node->prefixlen); - node = rcu_dereference(node->child[next_bit]); - } - if (!node || node->prefixlen != matchlen || - (node->flags & LPM_TREE_NODE_FLAG_IM)) - goto find_leftmost; - - /* The node with the exactly-matching key has been found, - * find the first node in postorder after the matched node. - */ - node = node_stack[stack_ptr]; - while (stack_ptr > 0) { - parent = node_stack[stack_ptr - 1]; - if (rcu_dereference(parent->child[0]) == node) { - search_root = rcu_dereference(parent->child[1]); - if (search_root) - goto find_leftmost; - } - if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { - next_node = parent; - goto do_copy; - } - - node = parent; - stack_ptr--; - } - - /* did not find anything */ - err = -ENOENT; - goto free_stack; - -find_leftmost: - /* Find the leftmost non-intermediate node, all intermediate nodes - * have exact two children, so this function will never return NULL. 
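The trie_get_next_key() deleted here returns keys in postorder, i.e. more specific prefixes before less specific ones, as the 192.168.0.0/24-before-192.168.0.0/16 example in its comment shows. A hedged userspace sketch of walking an IPv4 trie this way; map_fd is an assumed input, a NULL (or invalid) key makes the kernel start from the leftmost leaf, and the local struct only mirrors the uapi struct bpf_lpm_trie_key for a 4-byte data field:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <linux/types.h>
#include <sys/syscall.h>

struct lpm_key_v4 {	/* struct bpf_lpm_trie_key layout, IPv4 data */
	__u32 prefixlen;
	__u8 data[4];
};

static void walk_trie_v4(int map_fd)
{
	struct lpm_key_v4 key, next;
	union bpf_attr attr;
	int have_key = 0;

	for (;;) {
		memset(&attr, 0, sizeof(attr));
		attr.map_fd = map_fd;
		attr.key = have_key ? (__u64)(unsigned long)&key : 0;
		attr.next_key = (__u64)(unsigned long)&next;
		if (syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)))
			break;	/* -ENOENT once the postorder walk is done */
		printf("%u.%u.%u.%u/%u\n", next.data[0], next.data[1],
		       next.data[2], next.data[3], next.prefixlen);
		key = next;
		have_key = 1;
	}
}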
- */ - for (node = search_root; node;) { - if (node->flags & LPM_TREE_NODE_FLAG_IM) { - node = rcu_dereference(node->child[0]); - } else { - next_node = node; - node = rcu_dereference(node->child[0]); - if (!node) - node = rcu_dereference(next_node->child[1]); - } - } -do_copy: - next_key->prefixlen = next_node->prefixlen; - memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), - next_node->data, trie->data_size); -free_stack: - kfree(node_stack); - return err; -} - -static int trie_check_btf(const struct bpf_map *map, - const struct btf *btf, - const struct btf_type *key_type, - const struct btf_type *value_type) -{ - /* Keys must have struct bpf_lpm_trie_key embedded. */ - return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? - -EINVAL : 0; + return -ENOTSUPP; } const struct bpf_map_ops trie_map_ops = { @@ -622,5 +524,4 @@ const struct bpf_map_ops trie_map_ops = { .map_lookup_elem = trie_lookup_elem, .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, - .map_check_btf = trie_check_btf, }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 1ed945522f0e..1878aace6a5c 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -35,11 +35,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-EINVAL); } - if (map_value_has_spin_lock(inner_map)) { - fdput(f); - return ERR_PTR(-ENOTSUPP); - } - inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ if (inner_map->ops == &array_map_ops) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c deleted file mode 100644 index e60bb5c3465e..000000000000 --- a/kernel/bpf/offload.c +++ /dev/null @@ -1,462 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members - * of all progs. - * RTNL lock cannot be taken when holding this lock. 
- */ -static DECLARE_RWSEM(bpf_devs_lock); -static LIST_HEAD(bpf_prog_offload_devs); -static LIST_HEAD(bpf_map_offload_devs); - -static int bpf_dev_offload_check(struct net_device *netdev) -{ - if (!netdev) - return -EINVAL; - if (!netdev->netdev_ops->ndo_bpf) - return -EOPNOTSUPP; - return 0; -} - -int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) -{ - struct bpf_prog_offload *offload; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (attr->prog_flags) - return -EINVAL; - - offload = kzalloc(sizeof(*offload), GFP_USER); - if (!offload) - return -ENOMEM; - - offload->prog = prog; - - offload->netdev = dev_get_by_index(current->nsproxy->net_ns, - attr->prog_ifindex); - - err = bpf_dev_offload_check(offload->netdev); - if (err) - goto err_maybe_put; - - down_write(&bpf_devs_lock); - if (offload->netdev->reg_state != NETREG_REGISTERED) { - err = -EINVAL; - goto err_unlock; - } - - prog->aux->offload = offload; - list_add_tail(&offload->offloads, &bpf_prog_offload_devs); - dev_put(offload->netdev); - up_write(&bpf_devs_lock); - - return 0; -err_unlock: - up_write(&bpf_devs_lock); -err_maybe_put: - if (offload->netdev) - dev_put(offload->netdev); - kfree(offload); - return err; -} - -static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, - struct netdev_bpf *data) -{ - struct net_device *netdev = prog->aux->offload->netdev; - - ASSERT_RTNL(); - - if (!netdev) - return -ENODEV; - - data->command = cmd; - - return netdev->netdev_ops->ndo_bpf(netdev, data); -} - -int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) -{ - struct netdev_bpf data = {}; - int err; - - data.verifier.prog = env->prog; - - rtnl_lock(); - err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data); - if (err) - goto exit_unlock; - - env->prog->aux->offload->dev_ops = data.verifier.ops; - env->prog->aux->offload->dev_state = true; -exit_unlock: - rtnl_unlock(); - return err; -} - -int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) -{ - struct bpf_prog_offload *offload; - int ret = -ENODEV; - down_read(&bpf_devs_lock); - offload = env->prog->aux->offload; - if (offload->netdev) - ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); - up_read(&bpf_devs_lock); - return ret; -} - -static void __bpf_prog_offload_destroy(struct bpf_prog *prog) -{ - struct bpf_prog_offload *offload = prog->aux->offload; - struct netdev_bpf data = {}; - - data.offload.prog = prog; - - if (offload->dev_state) - WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); - - /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ - bpf_prog_free_id(prog, true); - - offload->dev_state = false; - list_del_init(&offload->offloads); - offload->netdev = NULL; -} - -void bpf_prog_offload_destroy(struct bpf_prog *prog) -{ - struct bpf_prog_offload *offload = prog->aux->offload; - - rtnl_lock(); - down_write(&bpf_devs_lock); - __bpf_prog_offload_destroy(prog); - up_write(&bpf_devs_lock); - rtnl_unlock(); - - kfree(offload); -} - -static int bpf_prog_offload_translate(struct bpf_prog *prog) -{ - struct netdev_bpf data = {}; - int ret; - - data.offload.prog = prog; - - rtnl_lock(); - ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); - rtnl_unlock(); - - return ret; -} - -static unsigned int bpf_prog_warn_on_exec(const void *ctx, - const struct bpf_insn *insn) -{ - WARN(1, "attempt to execute device eBPF program on the host!"); - return 0; -} - -int bpf_prog_offload_compile(struct bpf_prog 
*prog) -{ - prog->bpf_func = bpf_prog_warn_on_exec; - - return bpf_prog_offload_translate(prog); -} - -struct ns_get_path_bpf_prog_args { - struct bpf_prog *prog; - struct bpf_prog_info *info; -}; - -static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) -{ - struct ns_get_path_bpf_prog_args *args = private_data; - struct bpf_prog_aux *aux = args->prog->aux; - struct ns_common *ns; - struct net *net; - - rtnl_lock(); - down_read(&bpf_devs_lock); - - if (aux->offload) { - args->info->ifindex = aux->offload->netdev->ifindex; - net = dev_net(aux->offload->netdev); - get_net(net); - ns = &net->ns; - } else { - args->info->ifindex = 0; - ns = NULL; - } - - up_read(&bpf_devs_lock); - rtnl_unlock(); - - return ns; -} - -int bpf_prog_offload_info_fill(struct bpf_prog_info *info, - struct bpf_prog *prog) -{ - struct ns_get_path_bpf_prog_args args = { - .prog = prog, - .info = info, - }; - struct inode *ns_inode; - struct path ns_path; - int res; - - res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); - if (res) { - if (!info->ifindex) - return -ENODEV; - return res; - } - - ns_inode = ns_path.dentry->d_inode; - info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); - info->netns_ino = ns_inode->i_ino; - path_put(&ns_path); - - return 0; -} - -const struct bpf_verifier_ops bpf_offload_verifier_ops = { -}; - -const struct bpf_prog_ops bpf_offload_prog_ops = { -}; - -static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, - enum bpf_netdev_command cmd) -{ - struct netdev_bpf data = {}; - struct net_device *netdev; - ASSERT_RTNL(); - data.command = cmd; - data.offmap = offmap; - /* Caller must make sure netdev is valid */ - netdev = offmap->netdev; - return netdev->netdev_ops->ndo_bpf(netdev, &data); -} -struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) -{ - struct net *net = current->nsproxy->net_ns; - struct bpf_offloaded_map *offmap; - int err; - if (!capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - if (attr->map_type != BPF_MAP_TYPE_HASH) - return ERR_PTR(-EINVAL); - offmap = kzalloc(sizeof(*offmap), GFP_USER); - if (!offmap) - return ERR_PTR(-ENOMEM); - bpf_map_init_from_attr(&offmap->map, attr); - rtnl_lock(); - down_write(&bpf_devs_lock); - offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); - err = bpf_dev_offload_check(offmap->netdev); - if (err) - goto err_unlock; - err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); - if (err) - goto err_unlock; - list_add_tail(&offmap->offloads, &bpf_map_offload_devs); - up_write(&bpf_devs_lock); - rtnl_unlock(); - return &offmap->map; -err_unlock: - up_write(&bpf_devs_lock); - rtnl_unlock(); - kfree(offmap); - return ERR_PTR(err); -} -static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) -{ - WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); - /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ - bpf_map_free_id(&offmap->map, true); - list_del_init(&offmap->offloads); - offmap->netdev = NULL; -} -void bpf_map_offload_map_free(struct bpf_map *map) -{ - struct bpf_offloaded_map *offmap = map_to_offmap(map); - rtnl_lock(); - down_write(&bpf_devs_lock); - if (offmap->netdev) - __bpf_map_offload_destroy(offmap); - up_write(&bpf_devs_lock); - rtnl_unlock(); - kfree(offmap); -} -int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) -{ - struct bpf_offloaded_map *offmap = map_to_offmap(map); - int ret = -ENODEV; - down_read(&bpf_devs_lock); - if (offmap->netdev) - ret = offmap->dev_ops->map_lookup_elem(offmap, key, value); - 
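All of the bpf_map_offload_*() entry points above funnel into offmap->dev_ops, which the device supplied through its ndo_bpf callback at BPF_OFFLOAD_MAP_ALLOC time; from userspace the binding is requested simply by setting map_ifindex at map creation. A minimal sketch of that step, with the ifindex as an assumed input (note the removed bpf_map_offload_map_alloc() only accepted BPF_MAP_TYPE_HASH):

#include <string.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <sys/syscall.h>

static int create_offloaded_hash(unsigned int ifindex)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_HASH;	/* only map type the offload path accepted */
	attr.key_size = 4;
	attr.value_size = 4;
	attr.max_entries = 256;
	attr.map_ifindex = ifindex;	/* routes all map ops through the netdev's ndo_bpf */

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}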
up_read(&bpf_devs_lock); - return ret; -} -int bpf_map_offload_update_elem(struct bpf_map *map, - void *key, void *value, u64 flags) -{ - struct bpf_offloaded_map *offmap = map_to_offmap(map); - int ret = -ENODEV; - if (unlikely(flags > BPF_EXIST)) - return -EINVAL; - down_read(&bpf_devs_lock); - if (offmap->netdev) - ret = offmap->dev_ops->map_update_elem(offmap, key, value, - flags); - up_read(&bpf_devs_lock); - return ret; -} -int bpf_map_offload_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_offloaded_map *offmap = map_to_offmap(map); - int ret = -ENODEV; - down_read(&bpf_devs_lock); - if (offmap->netdev) - ret = offmap->dev_ops->map_delete_elem(offmap, key); - up_read(&bpf_devs_lock); - return ret; -} -int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct bpf_offloaded_map *offmap = map_to_offmap(map); - int ret = -ENODEV; - down_read(&bpf_devs_lock); - if (offmap->netdev) - ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key); - up_read(&bpf_devs_lock); - return ret; -} - -struct ns_get_path_bpf_map_args { - struct bpf_offloaded_map *offmap; - struct bpf_map_info *info; -}; -static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) -{ - struct ns_get_path_bpf_map_args *args = private_data; - struct ns_common *ns; - struct net *net; - rtnl_lock(); - down_read(&bpf_devs_lock); - if (args->offmap->netdev) { - args->info->ifindex = args->offmap->netdev->ifindex; - net = dev_net(args->offmap->netdev); - get_net(net); - ns = &net->ns; - } else { - args->info->ifindex = 0; - ns = NULL; - } - up_read(&bpf_devs_lock); - rtnl_unlock(); - return ns; -} -int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) -{ - struct ns_get_path_bpf_map_args args = { - .offmap = map_to_offmap(map), - .info = info, - }; - struct inode *ns_inode; - struct path ns_path; - int res; - res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); - if (res) { - if (!info->ifindex) - return -ENODEV; - return res; - } - ns_inode = ns_path.dentry->d_inode; - info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); - info->netns_ino = ns_inode->i_ino; - path_put(&ns_path); - return 0; -} - -bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) -{ - struct bpf_offloaded_map *offmap; - struct bpf_prog_offload *offload; - bool ret; - if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map)) - return false; - if (!bpf_prog_is_dev_bound(prog->aux)) - return true; - down_read(&bpf_devs_lock); - offload = prog->aux->offload; - offmap = map_to_offmap(map); - ret = offload && offload->netdev == offmap->netdev; - up_read(&bpf_devs_lock); - return ret; -} -static void bpf_offload_orphan_all_progs(struct net_device *netdev) -{ - struct bpf_prog_offload *offload, *tmp; - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); -} -static void bpf_offload_orphan_all_maps(struct net_device *netdev) -{ - struct bpf_offloaded_map *offmap, *tmp; - list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) - if (offmap->netdev == netdev) - __bpf_map_offload_destroy(offmap); -} - -static int bpf_offload_notification(struct notifier_block *notifier, - ulong event, void *ptr) -{ - struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - - ASSERT_RTNL(); - - switch (event) { - case NETDEV_UNREGISTER: - /* ignore namespace changes */ - if (netdev->reg_state != NETREG_UNREGISTERING) - break; - - 
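Programs are bound to a device the same way, via prog_ifindex in BPF_PROG_LOAD, which is what the removed bpf_prog_offload_init() consumed; once the netdev unregisters, the notifier in this file orphans the program, so the fd stays open but further operations on it fail. A hedged sketch of such a load, with insns, insn_cnt, and license as assumed inputs:

#include <string.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <linux/types.h>
#include <sys/syscall.h>

static int load_dev_bound_prog(const struct bpf_insn *insns, __u32 insn_cnt,
			       const char *license, __u32 ifindex)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_XDP;
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = insn_cnt;
	attr.license = (__u64)(unsigned long)license;
	attr.prog_ifindex = ifindex;	/* verification and translation go through ndo_bpf */

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}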
down_write(&bpf_devs_lock);
-		bpf_offload_orphan_all_progs(netdev);
-		bpf_offload_orphan_all_maps(netdev);
-		up_write(&bpf_devs_lock);
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block bpf_offload_notifier = {
-	.notifier_call = bpf_offload_notification,
-};
-
-static int __init bpf_offload_init(void)
-{
-	register_netdevice_notifier(&bpf_offload_notifier);
-	return 0;
-}
-
-subsys_initcall(bpf_offload_init);
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
new file mode 100644
index 000000000000..0fffca42dac9
--- /dev/null
+++ b/kernel/bpf/sockmap.c
@@ -0,0 +1,920 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* A BPF sock_map is used to store sock objects. This is primarily used
+ * for doing socket redirect with BPF helper routines.
+ *
+ * A sock map may have BPF programs attached to it, currently a program
+ * used to parse packets and a program to provide a verdict and redirect
+ * decision on the packet are supported. Any programs attached to a sock
+ * map are inherited by sock objects when they are added to the map. If
+ * no BPF programs are attached the sock object may only be used for sock
+ * redirect.
+ *
+ * A sock object may be in multiple maps, but can only inherit a single
+ * parse or verdict program. If adding a sock object to a map would result
+ * in having multiple parsing programs the update will return an EBUSY error.
+ *
+ * For reference this program is similar to devmap used in the XDP context;
+ * reviewing these together may be useful. For an example please review
+ * ./samples/bpf/sockmap/.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define SOCK_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+struct bpf_stab {
+	struct bpf_map map;
+	struct sock **sock_map;
+	struct bpf_prog *bpf_parse;
+	struct bpf_prog *bpf_verdict;
+};
+
+enum smap_psock_state {
+	SMAP_TX_RUNNING,
+};
+
+struct smap_psock_map_entry {
+	struct list_head list;
+	struct sock **entry;
+};
+
+struct smap_psock {
+	struct rcu_head rcu;
+	/* refcnt is used inside sk_callback_lock */
+	u32 refcnt;
+
+	/* datapath variables */
+	struct sk_buff_head rxqueue;
+	bool strp_enabled;
+
+	/* datapath error path cache across tx work invocations */
+	int save_rem;
+	int save_off;
+	struct sk_buff *save_skb;
+
+	struct strparser strp;
+	struct bpf_prog *bpf_parse;
+	struct bpf_prog *bpf_verdict;
+	struct list_head maps;
+
+	/* Back reference used when sock callbacks trigger sockmap operations */
+	struct sock *sock;
+	unsigned long state;
+
+	struct work_struct tx_work;
+	struct work_struct gc_work;
+
+	void (*save_data_ready)(struct sock *sk);
+	void (*save_write_space)(struct sock *sk);
+	void (*save_state_change)(struct sock *sk);
+};
+
+static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
+{
+	return rcu_dereference_sk_user_data(sk);
+}
+
+/* compute the linear packet data range [data, data_end) for skb when
+ * sk_skb type programs are in use.
+ */
+static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
+}
+
+enum __sk_action {
+	__SK_DROP = 0,
+	__SK_PASS,
+	__SK_REDIRECT,
+};
+
+static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
+{
+	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
+	int rc;
+
+	if (unlikely(!prog))
+		return __SK_DROP;
+
+	skb_orphan(skb);
+	/* We need to ensure that BPF metadata for maps is also cleared
+	 * when we orphan the skb so that we don't have the possibility
+	 * to reference a stale map.
+	 */
+	TCP_SKB_CB(skb)->bpf.map = NULL;
+	skb->sk = psock->sock;
+	bpf_compute_data_end_sk_skb(skb);
+	preempt_disable();
+	rc = (*prog->bpf_func)(skb, prog->insnsi);
+	preempt_enable();
+	skb->sk = NULL;
+
+	/* Moving return codes from UAPI namespace into internal namespace */
+	return rc == SK_PASS ?
+		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
+		__SK_DROP;
+}
+
+static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
+{
+	struct sock *sk;
+	int rc;
+
+	rc = smap_verdict_func(psock, skb);
+	switch (rc) {
+	case __SK_REDIRECT:
+		sk = do_sk_redirect_map(skb);
+		if (likely(sk)) {
+			struct smap_psock *peer = smap_psock_sk(sk);
+
+			if (likely(peer &&
+				   test_bit(SMAP_TX_RUNNING, &peer->state) &&
+				   !sock_flag(sk, SOCK_DEAD) &&
+				   sock_writeable(sk))) {
+				skb_set_owner_w(skb, sk);
+				skb_queue_tail(&peer->rxqueue, skb);
+				schedule_work(&peer->tx_work);
+				break;
+			}
+		}
+		/* Fall through and free skb otherwise */
+	case __SK_DROP:
+	default:
+		kfree_skb(skb);
+	}
+}
+
+static void smap_report_sk_error(struct smap_psock *psock, int err)
+{
+	struct sock *sk = psock->sock;
+
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+
+/* Called with lock_sock(sk) held */
+static void smap_state_change(struct sock *sk)
+{
+	struct smap_psock_map_entry *e, *tmp;
+	struct smap_psock *psock;
+	struct socket_wq *wq;
+	struct sock *osk;
+
+	rcu_read_lock();
+
+	/* Allowing transitions into the established and syn_recv states allows
+	 * for early binding of sockets to a smap object before the connection
+	 * is established.
+	 */
+	switch (sk->sk_state) {
+	case TCP_SYN_SENT:
+	case TCP_SYN_RECV:
+	case TCP_ESTABLISHED:
+		break;
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSING:
+	case TCP_LAST_ACK:
+	case TCP_FIN_WAIT1:
+	case TCP_FIN_WAIT2:
+	case TCP_LISTEN:
+		break;
+	case TCP_CLOSE:
+		/* Only release if the map entry is in fact the sock in
+		 * question. There is a case where the operator deletes
+		 * the sock from the map, but the TCP sock is closed before
+		 * the psock is detached. Use cmpxchg to verify that the
+		 * correct sock is removed.
+ */ + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + break; + write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } + write_unlock_bh(&sk->sk_callback_lock); + break; + default: + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + break; + smap_report_sk_error(psock, EPIPE); + break; + } + + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + rcu_read_unlock(); +} + +static void smap_read_sock_strparser(struct strparser *strp, + struct sk_buff *skb) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = container_of(strp, struct smap_psock, strp); + smap_do_verdict(psock, skb); + rcu_read_unlock(); +} + +/* Called with lock held on socket */ +static void smap_data_ready(struct sock *sk) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (likely(psock)) { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->strp); + write_unlock_bh(&sk->sk_callback_lock); + } + rcu_read_unlock(); +} + +static void smap_tx_work(struct work_struct *w) +{ + struct smap_psock *psock; + struct sk_buff *skb; + int rem, off, n; + + psock = container_of(w, struct smap_psock, tx_work); + + /* lock sock to avoid losing sk_socket at some point during loop */ + lock_sock(psock->sock); + if (psock->save_skb) { + skb = psock->save_skb; + rem = psock->save_rem; + off = psock->save_off; + psock->save_skb = NULL; + goto start; + } + + while ((skb = skb_dequeue(&psock->rxqueue))) { + rem = skb->len; + off = 0; +start: + do { + if (likely(psock->sock->sk_socket)) + n = skb_send_sock_locked(psock->sock, + skb, off, rem); + else + n = -EINVAL; + if (n <= 0) { + if (n == -EAGAIN) { + /* Retry when space is available */ + psock->save_skb = skb; + psock->save_rem = rem; + psock->save_off = off; + goto out; + } + /* Hard errors break pipe and stop xmit */ + smap_report_sk_error(psock, n ? -n : EPIPE); + clear_bit(SMAP_TX_RUNNING, &psock->state); + kfree_skb(skb); + goto out; + } + rem -= n; + off += n; + } while (rem); + kfree_skb(skb); + } +out: + release_sock(psock->sock); +} + +static void smap_write_space(struct sock *sk) +{ + struct smap_psock *psock; + void (*write_space)(struct sock *sk); + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state))) + schedule_work(&psock->tx_work); + write_space = psock->save_write_space; + rcu_read_unlock(); + write_space(sk); +} + +static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) +{ + if (!psock->strp_enabled) + return; + sk->sk_data_ready = psock->save_data_ready; + sk->sk_write_space = psock->save_write_space; + sk->sk_state_change = psock->save_state_change; + psock->save_data_ready = NULL; + psock->save_write_space = NULL; + psock->save_state_change = NULL; + strp_stop(&psock->strp); + psock->strp_enabled = false; +} + +static void smap_destroy_psock(struct rcu_head *rcu) +{ + struct smap_psock *psock = container_of(rcu, + struct smap_psock, rcu); + + /* Now that a grace period has passed there is no longer + * any reference to this sock in the sockmap so we can + * destroy the psock, strparser, and bpf programs. 
But because we use workqueue sync operations we cannot
+	 * do it in rcu context
+	 */
+	schedule_work(&psock->gc_work);
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
+{
+	psock->refcnt--;
+	if (psock->refcnt)
+		return;
+
+	smap_stop_sock(psock, sock);
+	clear_bit(SMAP_TX_RUNNING, &psock->state);
+	rcu_assign_sk_user_data(sock, NULL);
+	call_rcu_sched(&psock->rcu, smap_destroy_psock);
+}
+
+static int smap_parse_func_strparser(struct strparser *strp,
+				     struct sk_buff *skb)
+{
+	struct smap_psock *psock;
+	struct bpf_prog *prog;
+	int rc;
+
+	rcu_read_lock();
+	psock = container_of(strp, struct smap_psock, strp);
+	prog = READ_ONCE(psock->bpf_parse);
+
+	if (unlikely(!prog)) {
+		rcu_read_unlock();
+		return skb->len;
+	}
+
+	/* Attach the socket for the bpf program to use if needed. We can do
+	 * this because strparser clones the skb before handing it to an
+	 * upper layer, meaning skb_orphan has been called. We NULL sk on the
+	 * way out to ensure we don't trigger a BUG_ON in skb/sk operations
+	 * later and because we are not charging the memory of this skb to
+	 * any socket yet.
+	 */
+	skb->sk = psock->sock;
+	bpf_compute_data_end_sk_skb(skb);
+	rc = (*prog->bpf_func)(skb, prog->insnsi);
+	skb->sk = NULL;
+	rcu_read_unlock();
+	return rc;
+}
+
+
+static int smap_read_sock_done(struct strparser *strp, int err)
+{
+	return err;
+}
+
+static int smap_init_sock(struct smap_psock *psock,
+			  struct sock *sk)
+{
+	static const struct strp_callbacks cb = {
+		.rcv_msg = smap_read_sock_strparser,
+		.parse_msg = smap_parse_func_strparser,
+		.read_sock_done = smap_read_sock_done,
+	};
+
+	return strp_init(&psock->strp, sk, &cb);
+}
+
+static void smap_init_progs(struct smap_psock *psock,
+			    struct bpf_stab *stab,
+			    struct bpf_prog *verdict,
+			    struct bpf_prog *parse)
+{
+	struct bpf_prog *orig_parse, *orig_verdict;
+
+	orig_parse = xchg(&psock->bpf_parse, parse);
+	orig_verdict = xchg(&psock->bpf_verdict, verdict);
+
+	if (orig_verdict)
+		bpf_prog_put(orig_verdict);
+	if (orig_parse)
+		bpf_prog_put(orig_parse);
+}
+
+static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
+{
+	if (sk->sk_data_ready == smap_data_ready)
+		return;
+	psock->save_data_ready = sk->sk_data_ready;
+	psock->save_write_space = sk->sk_write_space;
+	psock->save_state_change = sk->sk_state_change;
+	sk->sk_data_ready = smap_data_ready;
+	sk->sk_write_space = smap_write_space;
+	sk->sk_state_change = smap_state_change;
+	psock->strp_enabled = true;
+}
+
+static void sock_map_remove_complete(struct bpf_stab *stab)
+{
+	bpf_map_area_free(stab->sock_map);
+	kfree(stab);
+}
+
+static void smap_gc_work(struct work_struct *w)
+{
+	struct smap_psock_map_entry *e, *tmp;
+	struct smap_psock *psock;
+
+	psock = container_of(w, struct smap_psock, gc_work);
+
+	/* no callback lock needed because we already detached sockmap ops */
+	if (psock->strp_enabled)
+		strp_done(&psock->strp);
+
+	cancel_work_sync(&psock->tx_work);
+	__skb_queue_purge(&psock->rxqueue);
+
+	/* At this point all strparser and xmit work must be complete */
+	if (psock->bpf_parse)
+		bpf_prog_put(psock->bpf_parse);
+	if (psock->bpf_verdict)
+		bpf_prog_put(psock->bpf_verdict);
+
+	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+		list_del(&e->list);
+		kfree(e);
+	}
+
+	sock_put(psock->sock);
+	kfree(psock);
+}
+
+static struct smap_psock *smap_init_psock(struct sock *sock,
+					  struct bpf_stab *stab)
+{
+	struct smap_psock *psock;
+
+	psock = kzalloc_node(sizeof(struct smap_psock),
+			     GFP_ATOMIC | __GFP_NOWARN,
stab->map.numa_node);
+	if (!psock)
+		return ERR_PTR(-ENOMEM);
+
+	psock->sock = sock;
+	skb_queue_head_init(&psock->rxqueue);
+	INIT_WORK(&psock->tx_work, smap_tx_work);
+	INIT_WORK(&psock->gc_work, smap_gc_work);
+	INIT_LIST_HEAD(&psock->maps);
+	psock->refcnt = 1;
+
+	rcu_assign_sk_user_data(sock, psock);
+	sock_hold(sock);
+	return psock;
+}
+
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_stab *stab;
+	int err = -EINVAL;
+	u64 cost;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+		return ERR_PTR(-EINVAL);
+
+	if (attr->value_size > KMALLOC_MAX_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	stab = kzalloc(sizeof(*stab), GFP_USER);
+	if (!stab)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	stab->map.map_type = attr->map_type;
+	stab->map.key_size = attr->key_size;
+	stab->map.value_size = attr->value_size;
+	stab->map.max_entries = attr->max_entries;
+	stab->map.map_flags = attr->map_flags;
+	stab->map.numa_node = bpf_map_attr_numa_node(attr);
+
+	/* make sure page count doesn't overflow */
+	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_stab;
+
+	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* if map size is larger than memlock limit, reject it early */
+	err = bpf_map_precharge_memlock(stab->map.pages);
+	if (err)
+		goto free_stab;
+
+	err = -ENOMEM;
+	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
+					    sizeof(struct sock *),
+					    stab->map.numa_node);
+	if (!stab->sock_map)
+		goto free_stab;
+
+	return &stab->map;
+free_stab:
+	kfree(stab);
+	return ERR_PTR(err);
+}
+
+static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+{
+	struct smap_psock_map_entry *e, *tmp;
+
+	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+		if (e->entry == entry) {
+			list_del(&e->list);
+			break;
+		}
+	}
+}
+
+static void sock_map_free(struct bpf_map *map)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	int i;
+
+	synchronize_rcu();
+
+	/* At this point no update, lookup or delete operations can happen.
+	 * However, be aware that we can still get socket state event updates
+	 * and data ready callbacks that reference the psock from sk_user_data.
+	 * Also psock worker threads are still in-flight. So smap_release_sock
+	 * will only free the psock after cancel_sync on the worker threads
+	 * and a grace period expires, to ensure the psock is really safe to
+	 * remove.
+	 */
+	rcu_read_lock();
+	for (i = 0; i < stab->map.max_entries; i++) {
+		struct smap_psock *psock;
+		struct sock *sock;
+
+		sock = xchg(&stab->sock_map[i], NULL);
+		if (!sock)
+			continue;
+
+		write_lock_bh(&sock->sk_callback_lock);
+		psock = smap_psock_sk(sock);
+		/* This check handles a racing sock event that can get the
+		 * sk_callback_lock before this case but after xchg happens
+		 * causing the refcnt to hit zero and sock user data (psock)
+		 * to be null and queued for garbage collection.
+		 */
+		if (likely(psock)) {
+			smap_list_remove(psock, &stab->sock_map[i]);
+			smap_release_sock(psock, sock);
+		}
+		write_unlock_bh(&sock->sk_callback_lock);
+	}
+	rcu_read_unlock();
+
+	sock_map_remove_complete(stab);
+}
+
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	u32 i = key ?
*(u32 *)key : U32_MAX;
+	u32 *next = (u32 *)next_key;
+
+	if (i >= stab->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (i == stab->map.max_entries - 1)
+		return -ENOENT;
+
+	*next = i + 1;
+	return 0;
+}
+
+struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	return READ_ONCE(stab->sock_map[key]);
+}
+
+static int sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct smap_psock *psock;
+	int k = *(u32 *)key;
+	struct sock *sock;
+
+	if (k >= map->max_entries)
+		return -EINVAL;
+
+	sock = xchg(&stab->sock_map[k], NULL);
+	if (!sock)
+		return -EINVAL;
+
+	write_lock_bh(&sock->sk_callback_lock);
+	psock = smap_psock_sk(sock);
+	if (!psock)
+		goto out;
+
+	if (psock->bpf_parse)
+		smap_stop_sock(psock, sock);
+	smap_list_remove(psock, &stab->sock_map[k]);
+	smap_release_sock(psock, sock);
+out:
+	write_unlock_bh(&sock->sk_callback_lock);
+	return 0;
+}
+
+/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
+ * done inside rcu critical sections. This ensures on updates that the psock
+ * will not be released via smap_release_sock() until concurrent updates/deletes
+ * complete. All operations operate on sock_map using cmpxchg and xchg
+ * operations to ensure we do not get stale references. Any reads into the
+ * map must be done with READ_ONCE() because of this.
+ *
+ * A psock is destroyed via call_rcu and after any worker threads are cancelled
+ * and synced, so we are certain all references from the update/lookup/delete
+ * operations as well as references in the data path are no longer in use.
+ *
+ * Psocks may exist in multiple maps, but only a single set of parse/verdict
+ * programs may be inherited from the maps it belongs to. A reference count
+ * is kept with the total number of references to the psock from all maps. The
+ * psock will not be released until this reaches zero. The psock and sock
+ * user data use the sk_callback_lock to protect critical data structures
+ * from concurrent access. This allows us to avoid two updates from modifying
+ * the user data in sock, and the lock is required anyway for modifying
+ * callbacks; we simply increase its scope slightly.
+ *
+ * Rules to follow,
+ * - psock must always be read inside RCU critical section
+ * - sk_user_data must only be modified inside sk_callback_lock and read
+ *   inside RCU critical section.
+ * - psock->maps list must only be read & modified inside sk_callback_lock
+ * - sock_map must use READ_ONCE and (cmp)xchg operations
+ * - BPF verdict/parse programs must use READ_ONCE and xchg operations
+ */
+static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+				    struct bpf_map *map,
+				    void *key, u64 flags)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct smap_psock_map_entry *e = NULL;
+	struct bpf_prog *verdict, *parse;
+	struct sock *osock, *sock;
+	struct smap_psock *psock;
+	u32 i = *(u32 *)key;
+	int err;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	if (unlikely(i >= stab->map.max_entries))
+		return -E2BIG;
+
+	sock = READ_ONCE(stab->sock_map[i]);
+	if (flags == BPF_EXIST && !sock)
+		return -ENOENT;
+	else if (flags == BPF_NOEXIST && sock)
+		return -EEXIST;
+
+	sock = skops->sk;
+
+	/* 1. If the sock map has BPF programs, those will be inherited by the
+	 * sock being added. 
If the sock is already attached to BPF programs,
+	 * this results in an error.
+	 */
+	verdict = READ_ONCE(stab->bpf_verdict);
+	parse = READ_ONCE(stab->bpf_parse);
+
+	if (parse && verdict) {
+		/* bpf prog refcnt may be zero if a concurrent attach operation
+		 * removes the program after the above READ_ONCE() but before
+		 * we increment the refcnt. If this is the case, abort with an
+		 * error.
+		 */
+		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+		if (IS_ERR(verdict))
+			return PTR_ERR(verdict);
+
+		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+		if (IS_ERR(parse)) {
+			bpf_prog_put(verdict);
+			return PTR_ERR(parse);
+		}
+	}
+
+	write_lock_bh(&sock->sk_callback_lock);
+	psock = smap_psock_sk(sock);
+
+	/* 2. Do not allow inheriting programs if psock exists and has
+	 * already inherited programs. This would create confusion on
+	 * which parser/verdict program is running. If no psock exists
+	 * create one. Inside sk_callback_lock to ensure concurrent create
+	 * doesn't update user data.
+	 */
+	if (psock) {
+		if (READ_ONCE(psock->bpf_parse) && parse) {
+			err = -EBUSY;
+			goto out_progs;
+		}
+		psock->refcnt++;
+	} else {
+		psock = smap_init_psock(sock, stab);
+		if (IS_ERR(psock)) {
+			err = PTR_ERR(psock);
+			goto out_progs;
+		}
+
+		set_bit(SMAP_TX_RUNNING, &psock->state);
+	}
+
+	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+	if (!e) {
+		err = -ENOMEM;
+		goto out_progs;
+	}
+	e->entry = &stab->sock_map[i];
+
+	/* 3. At this point we have a reference to a valid psock that is
+	 * running. Attach any BPF programs needed.
+	 */
+	if (parse && verdict && !psock->strp_enabled) {
+		err = smap_init_sock(psock, sock);
+		if (err)
+			goto out_free;
+		smap_init_progs(psock, stab, verdict, parse);
+		smap_start_sock(psock, sock);
+	}
+
+	/* 4. Place psock in sockmap for use and stop any programs on
+	 * the old sock assuming it's not the same sock we are replacing
+	 * it with. Because we can only have a single set of programs, if
+	 * old_sock has a strp we can stop it.
+ */ + list_add_tail(&e->list, &psock->maps); + write_unlock_bh(&sock->sk_callback_lock); + + osock = xchg(&stab->sock_map[i], sock); + if (osock) { + struct smap_psock *opsock = smap_psock_sk(osock); + + write_lock_bh(&osock->sk_callback_lock); + if (osock != sock && parse) + smap_stop_sock(opsock, osock); + smap_list_remove(opsock, &stab->sock_map[i]); + smap_release_sock(opsock, osock); + write_unlock_bh(&osock->sk_callback_lock); + } + return 0; +out_free: + smap_release_sock(psock, sock); +out_progs: + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); + write_unlock_bh(&sock->sk_callback_lock); + kfree(e); + return err; +} + +int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *orig; + + if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP)) + return -EINVAL; + + switch (type) { + case BPF_SK_SKB_STREAM_PARSER: + orig = xchg(&stab->bpf_parse, prog); + break; + case BPF_SK_SKB_STREAM_VERDICT: + orig = xchg(&stab->bpf_verdict, prog); + break; + default: + return -EOPNOTSUPP; + } + + if (orig) + bpf_prog_put(orig); + + return 0; +} + +static void *sock_map_lookup(struct bpf_map *map, void *key) +{ + return NULL; +} + +static int sock_map_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_sock_ops_kern skops; + u32 fd = *(u32 *)value; + struct socket *socket; + int err; + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + skops.sk = socket->sk; + if (!skops.sk) { + fput(socket->file); + return -EINVAL; + } + + if (skops.sk->sk_type != SOCK_STREAM || + skops.sk->sk_protocol != IPPROTO_TCP) { + fput(socket->file); + return -EOPNOTSUPP; + } + + err = sock_map_ctx_update_elem(&skops, map, key, flags); + fput(socket->file); + return err; +} + +static void sock_map_release(struct bpf_map *map) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *orig; + + orig = xchg(&stab->bpf_parse, NULL); + if (orig) + bpf_prog_put(orig); + orig = xchg(&stab->bpf_verdict, NULL); + if (orig) + bpf_prog_put(orig); +} + +const struct bpf_map_ops sock_map_ops = { + .map_alloc = sock_map_alloc, + .map_free = sock_map_free, + .map_lookup_elem = sock_map_lookup, + .map_get_next_key = sock_map_get_next_key, + .map_update_elem = sock_map_update_elem, + .map_delete_elem = sock_map_delete_elem, + .map_release_uref = sock_map_release, +}; + +BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return sock_map_ctx_update_elem(bpf_sock, map, key, flags); +} + +const struct bpf_func_proto bpf_sock_map_update_proto = { + .func = bpf_sock_map_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index ed71857961cd..f14d8865d75b 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -94,10 +94,14 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_smap; - bpf_map_init_from_attr(&smap->map, attr); + smap->map.map_type = attr->map_type; + smap->map.key_size = attr->key_size; smap->map.value_size = value_size; + smap->map.max_entries = attr->max_entries; + smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; 
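Userspace drives the sockmap above in two steps that mirror sock_map_prog() and sock_map_update_elem(): the parser and verdict programs are attached to the map fd with BPF_PROG_ATTACH, and each TCP socket is then inserted with its fd as the 4-byte value. A hedged sketch, assuming map_fd, the two program fds, and tcp_fd already exist:

#include <string.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <linux/types.h>
#include <sys/syscall.h>

static int sys_bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

static int sockmap_setup(int map_fd, int parser_fd, int verdict_fd, int tcp_fd)
{
	union bpf_attr attr;
	__u32 key = 0, value = tcp_fd;
	int ret;

	/* Programs are attached to the map, not to a socket or cgroup. */
	memset(&attr, 0, sizeof(attr));
	attr.target_fd = map_fd;
	attr.attach_bpf_fd = parser_fd;
	attr.attach_type = BPF_SK_SKB_STREAM_PARSER;
	ret = sys_bpf(BPF_PROG_ATTACH, &attr);
	if (ret)
		return ret;

	attr.attach_bpf_fd = verdict_fd;
	attr.attach_type = BPF_SK_SKB_STREAM_VERDICT;
	ret = sys_bpf(BPF_PROG_ATTACH, &attr);
	if (ret)
		return ret;

	/* Slot 0 now inherits both programs; the value is the socket fd. */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&value;
	attr.flags = BPF_ANY;
	return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr);
}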
smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + smap->map.numa_node = bpf_map_attr_numa_node(attr); err = bpf_map_precharge_memlock(smap->map.pages); if (err) @@ -279,5 +283,4 @@ const struct bpf_map_ops stack_map_ops = { .map_lookup_elem = stack_map_lookup_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, - .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 261067c99afa..3ae2ea263613 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -11,7 +11,6 @@ */ #include #include -#include #include #include #include @@ -24,10 +23,6 @@ #include #include #include -#include -#include -#include -#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -65,9 +60,9 @@ static const struct bpf_map_ops * const bpf_map_types[] = { * copy_from_user() call. However, this is not a concern since this function is * meant to be a future-proofing of bits. */ -int bpf_check_uarg_tail_zero(void __user *uaddr, - size_t expected_size, - size_t actual_size) +static int check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) { unsigned char __user *addr; unsigned char __user *end; @@ -97,35 +92,18 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, return 0; } -const struct bpf_map_ops bpf_map_offload_ops = { - .map_alloc = bpf_map_offload_map_alloc, - .map_free = bpf_map_offload_map_free, - .map_check_btf = map_check_no_btf, -}; - static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { - const struct bpf_map_ops *ops; struct bpf_map *map; - int err; - if (attr->map_type >= ARRAY_SIZE(bpf_map_types)) - return ERR_PTR(-EINVAL); - ops = bpf_map_types[attr->map_type]; - if (!ops) + if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || + !bpf_map_types[attr->map_type]) return ERR_PTR(-EINVAL); - if (attr->map_ifindex) - ops = &bpf_map_offload_ops; - if (ops->map_alloc_check) { - err = ops->map_alloc_check(attr); - if (err) - return ERR_PTR(err); - } - map = ops->map_alloc(attr); + map = bpf_map_types[attr->map_type]->map_alloc(attr); if (IS_ERR(map)) return map; - map->ops = ops; + map->ops = bpf_map_types[attr->map_type]; map->map_type = attr->map_type; return map; } @@ -154,16 +132,6 @@ void bpf_map_area_free(void *area) kvfree(area); } -void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) -{ - map->map_type = attr->map_type; - map->key_size = attr->key_size; - map->value_size = attr->value_size; - map->max_entries = attr->max_entries; - map->map_flags = attr->map_flags; - map->numa_node = bpf_map_attr_numa_node(attr); -} - int bpf_map_precharge_memlock(u32 pages) { struct user_struct *user = get_current_user(); @@ -177,58 +145,32 @@ int bpf_map_precharge_memlock(u32 pages) return 0; } -static int bpf_charge_memlock(struct user_struct *user, u32 pages) +static int bpf_map_charge_memlock(struct bpf_map *map) { - unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + struct user_struct *user = get_current_user(); + unsigned long memlock_limit; - if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { - atomic_long_sub(pages, &user->locked_vm); + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + atomic_long_add(map->pages, &user->locked_vm); + + if (atomic_long_read(&user->locked_vm) > memlock_limit) { + atomic_long_sub(map->pages, &user->locked_vm); + free_uid(user); return -EPERM; } + map->user = user; return 0; } -static void 
bpf_uncharge_memlock(struct user_struct *user, u32 pages) -{ - atomic_long_sub(pages, &user->locked_vm); -} -static int bpf_map_init_memlock(struct bpf_map *map) -{ - struct user_struct *user = get_current_user(); - int ret; - - ret = bpf_charge_memlock(user, map->pages); - if (ret) { - free_uid(user); - return ret; - } - map->user = user; - return ret; -} - -static void bpf_map_release_memlock(struct bpf_map *map) +static void bpf_map_uncharge_memlock(struct bpf_map *map) { struct user_struct *user = map->user; - bpf_uncharge_memlock(user, map->pages); + atomic_long_sub(map->pages, &user->locked_vm); free_uid(user); } -int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) -{ - int ret; - ret = bpf_charge_memlock(map->user, pages); - if (ret) - return ret; - map->pages += pages; - return ret; -} -void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) -{ - bpf_uncharge_memlock(map->user, pages); - map->pages -= pages; -} - static int bpf_map_alloc_id(struct bpf_map *map) { int id; @@ -245,25 +187,16 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ? 0 : id; } -void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) +static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { unsigned long flags; - /* Offloaded maps are removed from the IDR store when their device - * disappears - even if someone holds an fd to them they are unusable, - * the memory is gone, all ops will fail; they are simply waiting for - * refcnt to drop to be freed. - */ - if (!map->id) - return; - if (do_idr_lock) spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); - map->id = 0; if (do_idr_lock) spin_unlock_irqrestore(&map_idr_lock, flags); @@ -276,7 +209,7 @@ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - bpf_map_release_memlock(map); + bpf_map_uncharge_memlock(map); security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); @@ -298,7 +231,6 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) if (atomic_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map, do_idr_lock); - btf_put(map->btf); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } @@ -421,75 +353,7 @@ int bpf_get_file_flag(int flags) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. - * Return 0 on success and < 0 on error. 
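bpf_map_charge_memlock() above accounts map->pages against the calling user's RLIMIT_MEMLOCK and fails with -EPERM once user->locked_vm exceeds rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT, so loaders conventionally raise that limit before creating maps. A minimal sketch; raising to infinity is an assumption appropriate only for privileged loaders:

#include <sys/resource.h>

static int bump_memlock_rlimit(void)
{
	struct rlimit r = {
		.rlim_cur = RLIM_INFINITY,	/* assumption: privileged loader */
		.rlim_max = RLIM_INFINITY,
	};

	/* All subsequent map creations are charged against this limit. */
	return setrlimit(RLIMIT_MEMLOCK, &r);
}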
- */ -static int bpf_obj_name_cpy(char *dst, const char *src) -{ - const char *end = src + BPF_OBJ_NAME_LEN; - - memset(dst, 0, BPF_OBJ_NAME_LEN); - - /* Copy all isalnum() and '_' char */ - while (src < end && *src) { - if (!isalnum(*src) && *src != '_') - return -EINVAL; - *dst++ = *src++; - } - - /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ - if (src == end) - return -EINVAL; - - return 0; -} - -int map_check_no_btf(const struct bpf_map *map, - const struct btf *btf, - const struct btf_type *key_type, - const struct btf_type *value_type) -{ - return -ENOTSUPP; -} - -static int map_check_btf(struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) -{ - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; - int ret = 0; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - - if (!key_type || key_size != map->key_size) - return -EINVAL; - - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size != map->value_size) - return -EINVAL; - - map->spin_lock_off = btf_find_spin_lock(btf, value_type); - - if (map_value_has_spin_lock(map)) { - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_SK_STORAGE) - return -ENOTSUPP; - if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > - map->value_size) { - WARN_ONCE(1, - "verifier bug spin_lock_off %d value_size %d\n", - map->spin_lock_off, map->value_size); - return -EFAULT; - } - } - - if (map->ops->map_check_btf) - ret = map->ops->map_check_btf(map, btf, key_type, value_type); - - return ret; -} - -#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id +#define BPF_MAP_CREATE_LAST_FIELD numa_node /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -516,45 +380,14 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = bpf_obj_name_cpy(map->name, attr->map_name); - if (err) - goto free_map_nouncharge; - atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - if (bpf_map_support_seq_show(map) && - (attr->btf_key_type_id || attr->btf_value_type_id)) { - struct btf *btf; - - if (!attr->btf_key_type_id || !attr->btf_value_type_id) { - err = -EINVAL; - goto free_map_nouncharge; - } - btf = btf_get_by_fd(attr->btf_fd); - if (IS_ERR(btf)) { - err = PTR_ERR(btf); - goto free_map_nouncharge; - } - - err = map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); - if (err) { - btf_put(btf); - goto free_map_nouncharge; - } - map->btf = btf; - map->btf_key_type_id = attr->btf_key_type_id; - map->btf_value_type_id = attr->btf_value_type_id; - } else { - map->spin_lock_off = -EINVAL; - } - err = security_bpf_map_alloc(map); if (err) goto free_map_nouncharge; - err = bpf_map_init_memlock(map); + err = bpf_map_charge_memlock(map); if (err) goto free_map_sec; @@ -578,11 +411,10 @@ static int map_create(union bpf_attr *attr) return err; free_map: - bpf_map_release_memlock(map); + bpf_map_uncharge_memlock(map); free_map_sec: security_bpf_map_free(map); free_map_nouncharge: - btf_put(map->btf); map->ops->map_free(map); return err; } @@ -659,7 +491,7 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value static int map_lookup_elem(union bpf_attr *attr) { @@ -675,9 +507,6 @@ static int map_lookup_elem(union bpf_attr *attr) if 
(CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; - if (attr->flags & ~BPF_F_LOCK) - return -EINVAL; - f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) @@ -688,12 +517,6 @@ static int map_lookup_elem(union bpf_attr *attr) goto err_put; } - if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { - err = -EINVAL; - goto err_put; - } - key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -702,8 +525,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) value_size = sizeof(u32); @@ -715,20 +537,13 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_lookup_elem(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_update(map, key, value, - attr->flags); } else if (IS_FD_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { @@ -739,15 +554,8 @@ static int map_lookup_elem(union bpf_attr *attr) ptr = map->ops->map_lookup_elem_sys_only(map, key); else ptr = map->ops->map_lookup_elem(map, key); - if (ptr) { - if (attr->flags & BPF_F_LOCK) - /* lock 'ptr' and copy everything but lock */ - copy_map_value_locked(map, value, ptr, true); - else - copy_map_value(map, value, ptr); - /* mask lock, since value wasn't zero inited */ - check_and_init_map_lock(map, value); - } + if (ptr) + memcpy(value, ptr, value_size); rcu_read_unlock(); err = ptr ? 
0 : -ENOENT; } @@ -808,12 +616,6 @@ static int map_update_elem(union bpf_attr *attr) goto err_put; } - if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { - err = -EINVAL; - goto err_put; - } - key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -822,8 +624,7 @@ static int map_update_elem(union bpf_attr *attr) if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else value_size = map->value_size; @@ -837,15 +638,6 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* Need to create a kthread, thus must support schedule */ - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_update_elem(map, key, value, attr->flags); - goto out; - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { - err = map->ops->map_update_elem(map, key, value, attr->flags); - goto out; - } - /* must increment bpf_prog_active to avoid kprobe+bpf triggering from * inside bpf map update or delete otherwise deadlocks are possible */ @@ -878,7 +670,6 @@ static int map_update_elem(union bpf_attr *attr) preempt_enable(); maybe_wait_bpf_programs(map); -out: if (!err) trace_bpf_map_update_elem(map, ufd, key, value); free_value: @@ -920,11 +711,6 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_delete_elem(map, key); - goto out; - } - preempt_disable(); __this_cpu_inc(bpf_prog_active); rcu_read_lock(); @@ -934,7 +720,6 @@ static int map_delete_elem(union bpf_attr *attr) preempt_enable(); maybe_wait_bpf_programs(map); -out: if (!err) trace_bpf_map_delete_elem(map, ufd, key); kfree(key); @@ -984,15 +769,9 @@ static int map_get_next_key(union bpf_attr *attr) if (!next_key) goto free_key; - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_get_next_key(map, key, next_key); - goto out; - } - rcu_read_lock(); err = map->ops->map_get_next_key(map, key, next_key); rcu_read_unlock(); -out: if (err) goto free_next_key; @@ -1012,9 +791,9 @@ err_put: return err; } -static const struct bpf_prog_ops * const bpf_prog_types[] = { -#define BPF_PROG_TYPE(_id, _name) \ - [_id] = & _name ## _prog_ops, +static const struct bpf_verifier_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _ops) \ + [_id] = &_ops, #define BPF_MAP_TYPE(_id, _ops) #include #undef BPF_PROG_TYPE @@ -1026,10 +805,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) return -EINVAL; - if (!bpf_prog_is_dev_bound(prog->aux)) - prog->aux->ops = bpf_prog_types[type]; - else - prog->aux->ops = &bpf_offload_prog_ops; + prog->aux->ops = bpf_prog_types[type]; prog->type = type; return 0; } @@ -1037,16 +813,8 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) /* drop refcnt on maps used by eBPF program and free auxilary data */ static void free_used_maps(struct bpf_prog_aux *aux) { - enum bpf_cgroup_storage_type stype; int i; - for_each_cgroup_storage_type(stype) { - if (!aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(aux->prog, - aux->cgroup_storage[stype]); - } - for (i = 0; i < aux->used_map_cnt; i++) bpf_map_put(aux->used_maps[i]); @@ -1115,13 +883,9 @@ static int 
bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) +static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { - /* cBPF to eBPF migrations are currently not in the idr store. - * Offloaded programs are removed from the store when their device - * disappears - even if someone grabs an fd to them they are unusable, - * simply waiting for refcnt to drop to be freed. - */ + /* cBPF to eBPF migrations are currently not in the idr store. */ if (!prog->aux->id) return; @@ -1132,7 +896,6 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) idr_remove(&prog_idr, prog->aux->id); - prog->aux->id = 0; if (do_idr_lock) spin_unlock_bh(&prog_idr_lock); else @@ -1156,10 +919,6 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del(prog); - btf_put(prog->aux->btf); - kvfree(prog->aux->func_info); - bpf_prog_free_linfo(prog); - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } @@ -1276,7 +1035,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1284,7 +1043,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (attach_type && (prog->type != *attach_type || prog->aux->offload)) { + if (type && prog->type != *type) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1310,90 +1069,10 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) } EXPORT_SYMBOL_GPL(bpf_prog_get_type); -/* Initially all BPF programs could be loaded w/o specifying - * expected_attach_type. Later for some of them specifying expected_attach_type - * at load time became required so that program could be validated properly. - * Programs of types that are allowed to be loaded both w/ and w/o (for - * backward compatibility) expected_attach_type, should have the default attach - * type assigned to expected_attach_type for the latter case, so that it can be - * validated later at attach time. - * - * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if - * prog type requires it but has some attach types that have to be backward - * compatible. - */ -static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) -{ - switch (attr->prog_type) { - case BPF_PROG_TYPE_CGROUP_SOCK: - /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't - * exist so checking for non-zero is the way to go here. 
- */ - if (!attr->expected_attach_type) - attr->expected_attach_type = - BPF_CGROUP_INET_SOCK_CREATE; - break; - } -} - -static int -bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, - enum bpf_attach_type expected_attach_type) -{ - switch (prog_type) { - case BPF_PROG_TYPE_CGROUP_SOCK: - switch (expected_attach_type) { - case BPF_CGROUP_INET_SOCK_CREATE: - case BPF_CGROUP_INET4_POST_BIND: - case BPF_CGROUP_INET6_POST_BIND: - return 0; - default: - return -EINVAL; - } - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - switch (expected_attach_type) { - case BPF_CGROUP_INET4_BIND: - case BPF_CGROUP_INET6_BIND: - case BPF_CGROUP_INET4_CONNECT: - case BPF_CGROUP_INET6_CONNECT: - case BPF_CGROUP_UDP4_SENDMSG: - case BPF_CGROUP_UDP6_SENDMSG: - case BPF_CGROUP_UDP4_RECVMSG: - case BPF_CGROUP_UDP6_RECVMSG: - return 0; - default: - return -EINVAL; - } - case BPF_PROG_TYPE_CGROUP_SOCKOPT: - switch (expected_attach_type) { - case BPF_CGROUP_SETSOCKOPT: - case BPF_CGROUP_GETSOCKOPT: - return 0; - default: - return -EINVAL; - } - default: - return 0; - } -} - -static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, - enum bpf_attach_type attach_type) -{ - switch (prog->type) { - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - case BPF_PROG_TYPE_CGROUP_SOCKOPT: - return attach_type == prog->expected_attach_type ? 0 : -EINVAL; - default: - return 0; - } -} - /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD line_info_cnt +#define BPF_PROG_LOAD_LAST_FIELD prog_flags -static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) +static int bpf_prog_load(union bpf_attr *attr) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog; @@ -1428,17 +1107,11 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) !capable(CAP_SYS_ADMIN)) return -EPERM; - bpf_prog_load_fixup_attach_type(attr); - if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) - return -EINVAL; - /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; - prog->expected_attach_type = attr->expected_attach_type; - err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; @@ -1460,27 +1133,17 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 
1 : 0; - if (attr->prog_ifindex) { - err = bpf_prog_offload_init(prog, attr); - if (err) - goto free_prog; - } - /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) goto free_prog; - prog->aux->load_time = ktime_get_boot_ns(); - err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); - if (err) - goto free_prog; - /* run eBPF verifier */ - err = bpf_check(&prog, attr, uattr); + err = bpf_check(&prog, attr); if (err < 0) goto free_used_maps; + /* eBPF program is ready to be JITed */ prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; @@ -1512,7 +1175,6 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return err; free_used_maps: - bpf_prog_free_linfo(prog); free_used_maps(prog->aux); free_prog: bpf_prog_uncharge_memlock(prog); @@ -1543,89 +1205,44 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -struct bpf_raw_tracepoint { - struct bpf_raw_event_map *btp; - struct bpf_prog *prog; -}; - -static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) -{ - struct bpf_raw_tracepoint *raw_tp = filp->private_data; - - if (raw_tp->prog) { - bpf_probe_unregister(raw_tp->btp, raw_tp->prog); - bpf_prog_put(raw_tp->prog); - } - kfree(raw_tp); - return 0; -} - -static const struct file_operations bpf_raw_tp_fops = { - .release = bpf_raw_tracepoint_release, - .read = bpf_dummy_read, - .write = bpf_dummy_write, -}; - -#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd -static int bpf_raw_tracepoint_open(const union bpf_attr *attr) -{ - struct bpf_raw_tracepoint *raw_tp; - struct bpf_raw_event_map *btp; - struct bpf_prog *prog; - char tp_name[128]; - int tp_fd, err; - - if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name), - sizeof(tp_name) - 1) < 0) - return -EFAULT; - tp_name[sizeof(tp_name) - 1] = 0; - - btp = bpf_find_raw_tracepoint(tp_name); - if (!btp) - return -ENOENT; - - raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); - if (!raw_tp) - return -ENOMEM; - raw_tp->btp = btp; - - prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto out_free_tp; - } - - if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && - prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { - err = -EINVAL; - goto out_put_prog; - } - - err = bpf_probe_register(raw_tp->btp, prog); - if (err) - goto out_put_prog; - - raw_tp->prog = prog; - tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, - O_CLOEXEC); - if (tp_fd < 0) { - bpf_probe_unregister(raw_tp->btp, prog); - err = tp_fd; - goto out_put_prog; - } - return tp_fd; - -out_put_prog: - bpf_prog_put(prog); -out_free_tp: - kfree(raw_tp); - return err; -} - #ifdef CONFIG_CGROUP_BPF #define BPF_PROG_ATTACH_LAST_FIELD attach_flags +static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) +{ + struct bpf_prog *prog = NULL; + int ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int err; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (attach) { + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_SK_SKB); + if (IS_ERR(prog)) { + fdput(f); + return PTR_ERR(prog); + } + } + + err = sock_map_prog(map, prog, attr->attach_type); + if (err) { + fdput(f); + if (prog) + bpf_prog_put(prog); + return err; + } + + fdput(f); + return 0; +} + #define BPF_F_ATTACH_MASK \ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) @@ -1651,43 +1268,14 @@ static int 
bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: - case BPF_CGROUP_INET4_POST_BIND: - case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; - case BPF_CGROUP_INET4_BIND: - case BPF_CGROUP_INET6_BIND: - case BPF_CGROUP_INET4_CONNECT: - case BPF_CGROUP_INET6_CONNECT: - case BPF_CGROUP_UDP4_SENDMSG: - case BPF_CGROUP_UDP6_SENDMSG: - case BPF_CGROUP_UDP4_RECVMSG: - case BPF_CGROUP_UDP6_RECVMSG: - ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; - break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; - case BPF_CGROUP_DEVICE: - ptype = BPF_PROG_TYPE_CGROUP_DEVICE; - break; - case BPF_SK_MSG_VERDICT: - ret = sock_map_get_from_fd(attr, prog); - break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - ret = sock_map_get_from_fd(attr, prog); - break; - case BPF_FLOW_DISSECTOR: - ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; - break; - case BPF_CGROUP_SYSCTL: - ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; - break; - case BPF_CGROUP_GETSOCKOPT: - case BPF_CGROUP_SETSOCKOPT: - ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; - break; + return sockmap_get_from_fd(attr, true); default: return -EINVAL; } @@ -1696,11 +1284,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); - if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { - bpf_prog_put(prog); - return -EINVAL; - } - cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) { bpf_prog_put(prog); @@ -1737,40 +1320,14 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: - case BPF_CGROUP_INET4_POST_BIND: - case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; - case BPF_CGROUP_INET4_BIND: - case BPF_CGROUP_INET6_BIND: - case BPF_CGROUP_INET4_CONNECT: - case BPF_CGROUP_INET6_CONNECT: - case BPF_CGROUP_UDP4_SENDMSG: - case BPF_CGROUP_UDP6_SENDMSG: - case BPF_CGROUP_UDP4_RECVMSG: - case BPF_CGROUP_UDP6_RECVMSG: - ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; - break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; - case BPF_CGROUP_DEVICE: - ptype = BPF_PROG_TYPE_CGROUP_DEVICE; - break; - case BPF_SK_MSG_VERDICT: - return sock_map_get_from_fd(attr, NULL); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sock_map_get_from_fd(attr, NULL); - case BPF_FLOW_DISSECTOR: - return skb_flow_dissector_bpf_prog_detach(attr); - case BPF_CGROUP_SYSCTL: - ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; - break; - case BPF_CGROUP_GETSOCKOPT: - case BPF_CGROUP_SETSOCKOPT: - ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; - break; + return sockmap_get_from_fd(attr, false); default: return -EINVAL; } @@ -1790,50 +1347,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } -#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt - -static int bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - struct cgroup *cgrp; - int ret; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_QUERY)) - return -EINVAL; - if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) - return -EINVAL; - - switch (attr->query.attach_type) { - case BPF_CGROUP_INET_INGRESS: - case BPF_CGROUP_INET_EGRESS: - case BPF_CGROUP_INET_SOCK_CREATE: - case BPF_CGROUP_INET4_BIND: - case BPF_CGROUP_INET6_BIND: - case BPF_CGROUP_INET4_POST_BIND: - case BPF_CGROUP_INET6_POST_BIND: - case BPF_CGROUP_INET4_CONNECT: - case BPF_CGROUP_INET6_CONNECT: - case BPF_CGROUP_UDP4_SENDMSG: - case 
BPF_CGROUP_UDP6_SENDMSG: - case BPF_CGROUP_UDP4_RECVMSG: - case BPF_CGROUP_UDP6_RECVMSG: - case BPF_CGROUP_SOCK_OPS: - case BPF_CGROUP_SYSCTL: - case BPF_CGROUP_GETSOCKOPT: - case BPF_CGROUP_SETSOCKOPT: - break; - default: - return -EINVAL; - } - cgrp = cgroup_get_from_fd(attr->query.target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - ret = cgroup_bpf_query(cgrp, attr, uattr); - cgroup_put(cgrp); - return ret; -} #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -1956,111 +1469,6 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) return fd; } -static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, - unsigned long addr, u32 *off, - u32 *type) -{ - const struct bpf_map *map; - int i; - - for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { - map = prog->aux->used_maps[i]; - if (map == (void *)addr) { - *type = BPF_PSEUDO_MAP_FD; - return map; - } - if (!map->ops->map_direct_value_meta) - continue; - if (!map->ops->map_direct_value_meta(map, addr, off)) { - *type = BPF_PSEUDO_MAP_VALUE; - return map; - } - } - - return NULL; -} - -static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) -{ - const struct bpf_map *map; - struct bpf_insn *insns; - u32 off, type; - u64 imm; - int i; - - insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), - GFP_USER); - if (!insns) - return insns; - - for (i = 0; i < prog->len; i++) { - if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { - insns[i].code = BPF_JMP | BPF_CALL; - insns[i].imm = BPF_FUNC_tail_call; - /* fall-through */ - } - - if (insns[i].code == (BPF_JMP | BPF_CALL) || - insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { - if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) - insns[i].code = BPF_JMP | BPF_CALL; - if (!bpf_dump_raw_ok()) - insns[i].imm = 0; - continue; - } - - if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) - continue; - - imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; - map = bpf_map_from_imm(prog, imm, &off, &type); - - if (map) { - insns[i].src_reg = type; - insns[i].imm = map->id; - insns[i + 1].imm = off; - continue; - } - - if (!bpf_dump_raw_ok() && - imm == (unsigned long)prog->aux) { - insns[i].imm = 0; - insns[i + 1].imm = 0; - continue; - } - } - - return insns; -} - -static int set_info_rec_size(struct bpf_prog_info *info) -{ - /* - * Ensure info.*_rec_size is the same as kernel expected size - * - * or - * - * Only allow zero *_rec_size if both _rec_size and _cnt are - * zero. In this case, the kernel will set the expected - * _rec_size back to the info. 
- */ - if ((info->func_info_cnt || info->func_info_rec_size) && - info->func_info_rec_size != sizeof(struct bpf_func_info)) - return -EINVAL; - if ((info->line_info_cnt || info->line_info_rec_size) && - info->line_info_rec_size != sizeof(struct bpf_line_info)) - return -EINVAL; - if ((info->jited_line_info_cnt || info->jited_line_info_rec_size) && - info->jited_line_info_rec_size != sizeof(__u64)) - return -EINVAL; - info->func_info_rec_size = sizeof(struct bpf_func_info); - info->line_info_rec_size = sizeof(struct bpf_line_info); - info->jited_line_info_rec_size = sizeof(__u64); - - return 0; -} - - static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -2072,7 +1480,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, u32 ulen; int err; - err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -2083,190 +1491,33 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.type = prog->type; info.id = prog->aux->id; - info.load_time = prog->aux->load_time; - info.created_by_uid = from_kuid_munged(current_user_ns(), - prog->aux->user->uid); - info.gpl_compatible = prog->gpl_compatible; memcpy(info.tag, prog->tag, sizeof(prog->tag)); - memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); - - ulen = info.nr_map_ids; - info.nr_map_ids = prog->aux->used_map_cnt; - ulen = min_t(u32, info.nr_map_ids, ulen); - if (ulen) { - u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); - u32 i; - - for (i = 0; i < ulen; i++) - if (put_user(prog->aux->used_maps[i]->id, - &user_map_ids[i])) - return -EFAULT; - } - - err = set_info_rec_size(&info); - if (err) - return err; if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; - info.nr_jited_ksyms = 0; - info.nr_jited_func_lens = 0; - info.func_info_cnt = 0; - info.line_info_cnt = 0; - info.jited_line_info_cnt = 0; goto done; } ulen = info.jited_prog_len; info.jited_prog_len = prog->jited_len; if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { - uinsns = u64_to_user_ptr(info.jited_prog_insns); - ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; - } else { - info.jited_prog_insns = 0; - } + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; } ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { - struct bpf_insn *insns_sanitized; - bool fault; - if (prog->blinded && !bpf_dump_raw_ok()) { - info.xlated_prog_insns = 0; - goto done; - } - insns_sanitized = bpf_insn_prepare_dump(prog); - if (!insns_sanitized) - return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); - fault = copy_to_user(uinsns, insns_sanitized, ulen); - kfree(insns_sanitized); - if (fault) + if (copy_to_user(uinsns, prog->insnsi, ulen)) return -EFAULT; } - if (bpf_prog_is_dev_bound(prog->aux)) { - err = bpf_prog_offload_info_fill(&info, prog); - if (err) - return err; - } - - ulen = info.nr_jited_ksyms; - info.nr_jited_ksyms = prog->aux->func_cnt; - if (info.nr_jited_ksyms && ulen) { - if (bpf_dump_raw_ok()) { - u64 __user *user_ksyms; - ulong ksym_addr; - u32 i; - - /* copy the address of the kernel symbol - * corresponding to each 
function - */ - ulen = min_t(u32, info.nr_jited_ksyms, ulen); - user_ksyms = u64_to_user_ptr(info.jited_ksyms); - for (i = 0; i < ulen; i++) { - ksym_addr = (ulong) prog->aux->func[i]->bpf_func; - ksym_addr &= PAGE_MASK; - if (put_user((u64) ksym_addr, &user_ksyms[i])) - return -EFAULT; - } - } else { - info.jited_ksyms = 0; - } - } - - ulen = info.nr_jited_func_lens; - info.nr_jited_func_lens = prog->aux->func_cnt; - if (info.nr_jited_func_lens && ulen) { - if (bpf_dump_raw_ok()) { - u32 __user *user_lens; - u32 func_len, i; - - /* copy the JITed image lengths for each function */ - ulen = min_t(u32, info.nr_jited_func_lens, ulen); - user_lens = u64_to_user_ptr(info.jited_func_lens); - for (i = 0; i < ulen; i++) { - func_len = prog->aux->func[i]->jited_len; - if (put_user(func_len, &user_lens[i])) - return -EFAULT; - } - } else { - info.jited_func_lens = 0; - } - } - - if (prog->aux->btf) { - u32 krec_size = sizeof(struct bpf_func_info); - u32 ucnt, urec_size; - - info.btf_id = btf_id(prog->aux->btf); - - ucnt = info.func_info_cnt; - info.func_info_cnt = prog->aux->func_info_cnt; - urec_size = info.func_info_rec_size; - info.func_info_rec_size = krec_size; - if (ucnt) { - /* expect passed-in urec_size is what the kernel expects */ - if (urec_size != info.func_info_rec_size) - return -EINVAL; - - if (bpf_dump_raw_ok()) { - char __user *user_finfo; - user_finfo = u64_to_user_ptr(info.func_info); - ucnt = min_t(u32, info.func_info_cnt, ucnt); - if (copy_to_user(user_finfo, prog->aux->func_info, - krec_size * ucnt)) - return -EFAULT; - } else { - info.func_info_cnt = 0; - } - } - } else { - info.func_info_cnt = 0; - } - - ulen = info.line_info_cnt; - info.line_info_cnt = prog->aux->nr_linfo; - if (info.line_info_cnt && ulen) { - if (bpf_dump_raw_ok()) { - __u8 __user *user_linfo; - user_linfo = u64_to_user_ptr(info.line_info); - ulen = min_t(u32, info.line_info_cnt, ulen); - if (copy_to_user(user_linfo, prog->aux->linfo, - info.line_info_rec_size * ulen)) - return -EFAULT; - } else { - info.line_info = 0; - } - } - ulen = info.jited_line_info_cnt; - if (prog->aux->jited_linfo) - info.jited_line_info_cnt = prog->aux->nr_linfo; - else - info.jited_line_info_cnt = 0; - if (info.jited_line_info_cnt && ulen) { - if (bpf_dump_raw_ok()) { - __u64 __user *user_linfo; - u32 i; - user_linfo = u64_to_user_ptr(info.jited_line_info); - ulen = min_t(u32, info.jited_line_info_cnt, ulen); - for (i = 0; i < ulen; i++) { - if (put_user((__u64)(long)prog->aux->jited_linfo[i], - &user_linfo[i])) - return -EFAULT; - } - } else { - info.jited_line_info = 0; - } - } - done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) @@ -2284,7 +1535,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, u32 info_len = attr->info.info_len; int err; - err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -2296,19 +1547,6 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; - memcpy(info.name, map->name, sizeof(map->name)); - - if (map->btf) { - info.btf_id = btf_id(map->btf); - info.btf_key_type_id = map->btf_key_type_id; - info.btf_value_type_id = map->btf_value_type_id; - } - - if (bpf_map_is_dev_bound(map)) { - err = bpf_map_offload_info_fill(&info, map); - if (err) - return err; - } if (copy_to_user(uinfo, &info, info_len) || 
put_user(info_len, &uattr->info.info_len)) @@ -2339,8 +1577,6 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, else if (f.file->f_op == &bpf_map_fops) err = bpf_map_get_info_by_fd(f.file->private_data, attr, uattr); - else if (f.file->f_op == &btf_fops) - err = btf_get_info_by_fd(f.file->private_data, attr, uattr); else err = -EINVAL; @@ -2348,32 +1584,6 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return err; } -#define BPF_BTF_LOAD_LAST_FIELD btf_log_level - -static int bpf_btf_load(const union bpf_attr *attr) -{ - if (CHECK_ATTR(BPF_BTF_LOAD)) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - return btf_new_fd(attr); -} - -#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id - -static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) -{ - if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - return btf_get_fd_by_id(attr->btf_id); -} - SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -2382,7 +1592,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) return -EPERM; - err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); + err = check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; size = min_t(u32, size, sizeof(attr)); @@ -2413,7 +1623,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = map_get_next_key(&attr); break; case BPF_PROG_LOAD: - err = bpf_prog_load(&attr, uattr); + err = bpf_prog_load(&attr); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); @@ -2428,9 +1638,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_DETACH: err = bpf_prog_detach(&attr); break; - case BPF_PROG_QUERY: - err = bpf_prog_query(&attr, uattr); - break; #endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); @@ -2452,15 +1659,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET_INFO_BY_FD: err = bpf_obj_get_info_by_fd(&attr, uattr); break; - case BPF_RAW_TRACEPOINT_OPEN: - err = bpf_raw_tracepoint_open(&attr); - break; - case BPF_BTF_LOAD: - err = bpf_btf_load(&attr); - break; - case BPF_BTF_GET_FD_BY_ID: - err = bpf_btf_get_fd_by_id(&attr); - break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ee2c13fa1f5d..a55e264cdb54 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,6 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -10,50 +9,17 @@ * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. - * - * The following reference types represent a potential reference to a kernel - * resource which, after first being allocated, must be checked and freed by - * the BPF program: - * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET - * - * When the verifier sees a helper call return a reference type, it allocates a - * pointer id for the reference and stores it in the current function state. 
- * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into - * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type - * passes through a NULL-check conditional. For the branch wherein the state is - * changed to CONST_IMM, the verifier releases the reference. - * - * For each helper function that allocates a reference, such as - * bpf_sk_lookup_tcp(), there is a corresponding release function, such as - * bpf_sk_release(). When a reference type passes into the release function, - * the verifier also releases the reference. If any unchecked or unreleased - * reference remains at the end of the program, the verifier rejects it. */ -#include #include #include #include #include -#include #include #include #include #include #include #include -#include -#include - -#include "disasm.h" - -static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { -#define BPF_PROG_TYPE(_id, _name) \ - [_id] = & _name ## _verifier_ops, -#define BPF_MAP_TYPE(_id, _ops) -#include -#undef BPF_PROG_TYPE -#undef BPF_MAP_TYPE -}; /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. @@ -177,28 +143,7 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 -#define BPF_MAP_PTR_UNPRIV 1UL -#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ - POISON_POINTER_DELTA)) -#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) -static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) -{ - return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON; -} - -static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) -{ - return aux->map_state & BPF_MAP_PTR_UNPRIV; -} - -static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, - const struct bpf_map *map, bool unpriv) -{ - BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); - unpriv |= bpf_map_ptr_unpriv(aux); - aux->map_state = (unsigned long)map | - (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); -} +#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) struct bpf_call_arg_meta { struct bpf_map *map_ptr; @@ -206,129 +151,31 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; - s64 msize_smax_value; - u64 msize_umax_value; - int ptr_id; - int func_id; }; +/* verbose verifier prints what it's seeing + * bpf_check() is called under lock, so no race to access these global vars + */ +static u32 log_level, log_size, log_len; +static char *log_buf; + static DEFINE_MUTEX(bpf_verifier_lock); -void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, - va_list args) -{ - unsigned int n; - - n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); - WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, - "verifier log line truncated - local buffer too short\n"); - - n = min(log->len_total - log->len_used - 1, n); - log->kbuf[n] = '\0'; - - if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) - log->len_used += n; - else - log->ubuf = NULL; -} - /* log_level controls verbosity level of eBPF verifier. - * bpf_verifier_log_write() is used to dump the verification trace to the log, - * so the user can figure out what's wrong with the program + * verbose() is used to dump the verification trace to the log, so the user + * can figure out what's wrong with the program */ -__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, - const char *fmt, ...) 
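This part of the hunk trades the per-environment log for the older scheme: four file-scope globals plus a verbose() helper that appends with vscnprintf() and goes quiet once the buffer fills. Below is a compile-on-its-own userspace sketch of that bounded-append pattern; the name vlog_append and the 4 KiB size are illustrative choices of mine, and vsnprintf() stands in for the kernel's vscnprintf(), which returns the bytes actually stored rather than the would-be length, hence the explicit clamp here.

        #include <stdarg.h>
        #include <stdio.h>

        static char log_buf[4096];                  /* backing store, size illustrative */
        static unsigned int log_size = sizeof(log_buf);
        static unsigned int log_len;                /* bytes used so far */

        /* Append one formatted message; silently drop output once the buffer
         * is full, mirroring the early return in the restored verbose().
         */
        static void vlog_append(const char *fmt, ...)
        {
                va_list args;
                int n;

                if (log_len >= log_size - 1)        /* full: keep room for the NUL */
                        return;

                va_start(args, fmt);
                n = vsnprintf(log_buf + log_len, log_size - log_len, fmt, args);
                va_end(args);

                if (n > 0) {
                        unsigned int room = log_size - log_len - 1;

                        log_len += ((unsigned int)n < room) ? (unsigned int)n : room;
                }
        }

As the restored comment notes, bpf_check() runs under bpf_verifier_lock, so there is exactly one writer and the unsynchronized globals are safe; the per-env log being removed above achieved the same by keeping all log state inside bpf_verifier_env instead.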
+static __printf(1, 2) void verbose(const char *fmt, ...) { va_list args; - if (!bpf_verifier_log_needed(&env->log)) + if (log_level == 0 || log_len >= log_size - 1) return; va_start(args, fmt); - bpf_verifier_vlog(&env->log, fmt, args); + log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); va_end(args); } -EXPORT_SYMBOL_GPL(bpf_verifier_log_write); - -__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) -{ - struct bpf_verifier_env *env = private_data; - va_list args; - - if (!bpf_verifier_log_needed(&env->log)) - return; - - va_start(args, fmt); - bpf_verifier_vlog(&env->log, fmt, args); - va_end(args); -} - -static bool type_is_pkt_pointer(enum bpf_reg_type type) -{ - return type == PTR_TO_PACKET || - type == PTR_TO_PACKET_META; -} - -static bool type_is_sk_pointer(enum bpf_reg_type type) -{ - return type == PTR_TO_SOCKET || - type == PTR_TO_SOCK_COMMON || - type == PTR_TO_TCP_SOCK; -} - -static bool reg_type_may_be_null(enum bpf_reg_type type) -{ - return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL; -} - -static bool type_is_refcounted(enum bpf_reg_type type) -{ - return type == PTR_TO_SOCKET; -} - -static bool type_is_refcounted_or_null(enum bpf_reg_type type) -{ - return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL; -} - -static bool reg_is_refcounted(const struct bpf_reg_state *reg) -{ - return type_is_refcounted(reg->type); -} - -static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) -{ - return reg->type == PTR_TO_MAP_VALUE && - map_value_has_spin_lock(reg->map_ptr); -} - -static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) -{ - return type_is_refcounted_or_null(reg->type); -} - -static bool arg_type_is_refcounted(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_SOCKET; -} - -/* Determine whether the function releases some resources allocated by another - * function call. The first reference type argument will be assumed to be - * released by release_reference(). 
- */ -static bool is_release_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_sk_release; -} - -static bool is_acquire_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_sk_lookup_tcp || - func_id == BPF_FUNC_sk_lookup_udp; -} /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { @@ -340,71 +187,51 @@ static const char * const reg_type_str[] = { [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", [PTR_TO_STACK] = "fp", [PTR_TO_PACKET] = "pkt", - [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", - [PTR_TO_FLOW_KEYS] = "flow_keys", - [PTR_TO_SOCKET] = "sock", - [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", - [PTR_TO_SOCK_COMMON] = "sock_common", - [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", - [PTR_TO_TCP_SOCK] = "tcp_sock", - [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", - [PTR_TO_TP_BUFFER] = "tp_buffer", }; -static void print_liveness(struct bpf_verifier_env *env, - enum bpf_reg_liveness live) +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +static const char *func_id_name(int id) { - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) - verbose(env, "_"); - if (live & REG_LIVE_READ) - verbose(env, "r"); - if (live & REG_LIVE_WRITTEN) - verbose(env, "w"); + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; } -static struct bpf_func_state *func(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg) +static void print_verifier_state(struct bpf_verifier_state *state) { - struct bpf_verifier_state *cur = env->cur_state; - - return cur->frame[reg->frameno]; -} - -static void print_verifier_state(struct bpf_verifier_env *env, - const struct bpf_func_state *state) -{ - const struct bpf_reg_state *reg; + struct bpf_reg_state *reg; enum bpf_reg_type t; int i; - if (state->frameno) - verbose(env, " frame%d:", state->frameno); - for (i = 0; i < MAX_BPF_REG; i++) { reg = &state->regs[i]; t = reg->type; if (t == NOT_INIT) continue; - verbose(env, " R%d", i); - print_liveness(env, reg->live); - verbose(env, "=%s", reg_type_str[t]); + verbose(" R%d=%s", i, reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ - verbose(env, "%lld", reg->var_off.value + reg->off); - if (t == PTR_TO_STACK) - verbose(env, ",call_%d", func(env, reg)->callsite); + verbose("%lld", reg->var_off.value + reg->off); } else { - verbose(env, "(id=%d", reg->id); + verbose("(id=%d", reg->id); if (t != SCALAR_VALUE) - verbose(env, ",off=%d", reg->off); - if (type_is_pkt_pointer(t)) - verbose(env, ",r=%d", reg->range); + verbose(",off=%d", reg->off); + if (t == PTR_TO_PACKET) + verbose(",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) - verbose(env, ",ks=%d,vs=%d", + verbose(",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { @@ -412,194 +239,262 @@ static void print_verifier_state(struct bpf_verifier_env *env, * could be a pointer whose offset is too big * for reg->off */ - verbose(env, ",imm=%llx", reg->var_off.value); + verbose(",imm=%llx", reg->var_off.value); } else { if (reg->smin_value != reg->umin_value && reg->smin_value != S64_MIN) - verbose(env, ",smin_value=%lld", + 
verbose(",smin_value=%lld", (long long)reg->smin_value); if (reg->smax_value != reg->umax_value && reg->smax_value != S64_MAX) - verbose(env, ",smax_value=%lld", + verbose(",smax_value=%lld", (long long)reg->smax_value); if (reg->umin_value != 0) - verbose(env, ",umin_value=%llu", + verbose(",umin_value=%llu", (unsigned long long)reg->umin_value); if (reg->umax_value != U64_MAX) - verbose(env, ",umax_value=%llu", + verbose(",umax_value=%llu", (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, ",var_off=%s", tn_buf); + verbose(",var_off=%s", tn_buf); } } - verbose(env, ")"); + verbose(")"); } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) { - verbose(env, " fp%d", - (-i - 1) * BPF_REG_SIZE); - print_liveness(env, state->stack[i].spilled_ptr.live); - verbose(env, "=%s", + if (state->stack[i].slot_type[0] == STACK_SPILL) + verbose(" fp%d=%s", + (-i - 1) * BPF_REG_SIZE, reg_type_str[state->stack[i].spilled_ptr.type]); + } + verbose("\n"); +} + +static const char *const bpf_class_string[] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +static const char *const bpf_alu_string[16] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[16] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JLT >> 4] = "<", + [BPF_JGE >> 4] = ">=", + [BPF_JLE >> 4] = "<=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSLT >> 4] = "s<", + [BPF_JSGE >> 4] = "s>=", + [BPF_JSLE >> 4] = "s<=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_insn(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_SRC(insn->code) == BPF_X) + verbose("(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + else + verbose("(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? 
"(u32) " : "", + insn->imm); + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose("BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_st_%02x\n", insn->code); + return; } - } - if (state->acquired_refs && state->refs[0].id) { - verbose(env, " refs=%d", state->refs[0].id); - for (i = 1; i < state->acquired_refs; i++) - if (state->refs[i].id) - verbose(env, ",%d", state->refs[i].id); - } - verbose(env, "\n"); -} - -#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE) \ -static int copy_##NAME##_state(struct bpf_func_state *dst, \ - const struct bpf_func_state *src) \ -{ \ - if (!src->FIELD) \ - return 0; \ - if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) { \ - /* internal bug, make state invalid to reject the program */ \ - memset(dst, 0, sizeof(*dst)); \ - return -EFAULT; \ - } \ - memcpy(dst->FIELD, src->FIELD, \ - sizeof(*src->FIELD) * (src->COUNT / SIZE)); \ - return 0; \ -} - -/* copy_reference_state() */ -COPY_STATE_FN(reference, acquired_refs, refs, 1) -/* copy_stack_state() */ -COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) -#undef COPY_STATE_FN -#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \ -static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \ - bool copy_old) \ -{ \ - u32 old_size = state->COUNT; \ - struct bpf_##NAME##_state *new_##FIELD; \ - int slot = size / SIZE; \ - \ - if (size <= old_size || !size) { \ - if (copy_old) \ - return 0; \ - state->COUNT = slot * SIZE; \ - if (!size && old_size) { \ - kfree(state->FIELD); \ - state->FIELD = NULL; \ - } \ - return 0; \ - } \ - new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \ - GFP_KERNEL); \ - if (!new_##FIELD) \ - return -ENOMEM; \ - if (copy_old) { \ - if (state->FIELD) \ - memcpy(new_##FIELD, state->FIELD, \ - sizeof(*new_##FIELD) * (old_size / SIZE)); \ - memset(new_##FIELD + old_size / SIZE, 0, \ - sizeof(*new_##FIELD) * (size - old_size) / SIZE); \ - } \ - state->COUNT = slot * SIZE; \ - kfree(state->FIELD); \ - state->FIELD = new_##FIELD; \ - return 0; \ -} -/* realloc_reference_state() */ -REALLOC_STATE_FN(reference, acquired_refs, refs, 1) -/* realloc_stack_state() */ -REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) -#undef REALLOC_STATE_FN - -/* do_check() starts with zero-sized stack in struct bpf_verifier_state to - * make it consume minimal amount of memory. check_stack_write() access from - * the program calls into realloc_func_state() to grow the stack size. - * Note there is a non-zero 'parent' pointer inside bpf_verifier_state - * which realloc_stack_state() copies over. It points to previous - * bpf_verifier_state which is never reallocated. - */ -static int realloc_func_state(struct bpf_func_state *state, int stack_size, - int refs_size, bool copy_old) -{ - int err = realloc_reference_state(state, refs_size, copy_old); - if (err) - return err; - return realloc_stack_state(state, stack_size, copy_old); -} - -/* Acquire a pointer id from the env and update the state->refs to include - * this new pointer reference. 
- * On success, returns a valid pointer id to associate with the register - * On failure, returns a negative errno. - */ -static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) -{ - struct bpf_func_state *state = cur_func(env); - int new_ofs = state->acquired_refs; - int id, err; - err = realloc_reference_state(state, state->acquired_refs + 1, true); - if (err) - return err; - id = ++env->id_gen; - state->refs[new_ofs].id = id; - state->refs[new_ofs].insn_idx = insn_idx; - return id; -} - -/* release function corresponding to acquire_reference_state(). Idempotent. */ -static int release_reference_state(struct bpf_func_state *state, int ptr_id) -{ - int i, last_idx; - - last_idx = state->acquired_refs - 1; - for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].id == ptr_id) { - if (last_idx && i != last_idx) - memcpy(&state->refs[i], &state->refs[last_idx], - sizeof(*state->refs)); - memset(&state->refs[last_idx], 0, sizeof(*state->refs)); - state->acquired_refs--; - return 0; + verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_ldx_%02x\n", insn->code); + return; } + verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose("(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. 
+ */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !env->allow_ptr_leaks) + imm = 0; + + verbose("(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); + } else { + verbose("BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose("(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose("(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose("(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose("(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); } - return -EINVAL; } -static int transfer_reference_state(struct bpf_func_state *dst, - struct bpf_func_state *src) +static int copy_stack_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) { - int err = realloc_reference_state(dst, src->acquired_refs, false); - if (err) - return err; - err = copy_reference_state(dst, src); - if (err) - return err; + if (!src->stack) + return 0; + if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { + /* internal bug, make state invalid to reject the program */ + memset(dst, 0, sizeof(*dst)); + return -EFAULT; + } + memcpy(dst->stack, src->stack, + sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); return 0; } -static void free_func_state(struct bpf_func_state *state) +/* do_check() starts with zero-sized stack in struct bpf_verifier_state to + * make it consume minimal amount of memory. check_stack_write() access from + * the program calls into realloc_verifier_state() to grow the stack size. + * Note there is a non-zero 'parent' pointer inside bpf_verifier_state + * which this function copies over. 
It points to previous bpf_verifier_state + * which is never reallocated + */ +static int realloc_verifier_state(struct bpf_verifier_state *state, int size, + bool copy_old) { - kfree(state->refs); + u32 old_size = state->allocated_stack; + struct bpf_stack_state *new_stack; + int slot = size / BPF_REG_SIZE; + + if (size <= old_size || !size) { + if (copy_old) + return 0; + state->allocated_stack = slot * BPF_REG_SIZE; + if (!size && old_size) { + kfree(state->stack); + state->stack = NULL; + } + return 0; + } + new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), + GFP_KERNEL); + if (!new_stack) + return -ENOMEM; + if (copy_old) { + if (state->stack) + memcpy(new_stack, state->stack, + sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); + memset(new_stack + old_size / BPF_REG_SIZE, 0, + sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); + } + state->allocated_stack = slot * BPF_REG_SIZE; kfree(state->stack); - kfree(state); + state->stack = new_stack; + return 0; } static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { - int i; - - for (i = 0; i <= state->curframe; i++) { - free_func_state(state->frame[i]); - state->frame[i] = NULL; - } - + kfree(state->stack); if (free_self) kfree(state); } @@ -607,56 +502,18 @@ static void free_verifier_state(struct bpf_verifier_state *state, /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ -static int copy_func_state(struct bpf_func_state *dst, - const struct bpf_func_state *src) +static int copy_verifier_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) { int err; - err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs, - false); - if (err) - return err; - - memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs)); - err = copy_reference_state(dst, src); - + err = realloc_verifier_state(dst, src->allocated_stack, false); if (err) return err; + memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); return copy_stack_state(dst, src); } -static int copy_verifier_state(struct bpf_verifier_state *dst_state, - const struct bpf_verifier_state *src) -{ - struct bpf_func_state *dst; - int i, err; - - /* if dst has more stack frames then src frame, free them */ - for (i = src->curframe + 1; i <= dst_state->curframe; i++) { - free_func_state(dst_state->frame[i]); - dst_state->frame[i] = NULL; - } - - dst_state->curframe = src->curframe; - dst_state->parent = src->parent; - dst_state->active_spin_lock = src->active_spin_lock; - - for (i = 0; i <= src->curframe; i++) { - dst = dst_state->frame[i]; - if (!dst) { - dst = kzalloc(sizeof(*dst), GFP_KERNEL); - if (!dst) - return -ENOMEM; - dst_state->frame[i] = dst; - } - err = copy_func_state(dst, src->frame[i]); - if (err) - return err; - } - - return 0; -} - static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx) { @@ -706,7 +563,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, if (err) goto err; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose(env, "BPF program is too complex\n"); + verbose("BPF program is too complex\n"); goto err; } return &elem->st; @@ -720,10 +577,6 @@ err: static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -#define CALLEE_SAVED_REGS 5 -static const int callee_saved[CALLEE_SAVED_REGS] = { - BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 -}; static void __mark_reg_not_init(struct 
bpf_reg_state *reg); @@ -748,36 +601,10 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } -static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) -{ - return type_is_pkt_pointer(reg->type); -} - -static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) -{ - return reg_is_pkt_pointer(reg) || - reg->type == PTR_TO_PACKET_END; -} - -/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ -static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, - enum bpf_reg_type which) -{ - /* The register can already have a range from prior markings. - * This is fine as long as it hasn't been advanced from its - * origin. - */ - return reg->type == which && - reg->id == 0 && - reg->off == 0 && - tnum_equals_const(reg->var_off, 0); -} - -static void mark_reg_known_zero(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, u32 regno) +static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose(env, "mark_reg_known_zero(env, regs, %u)\n", regno); + verbose("mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -859,15 +686,13 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) reg->id = 0; reg->off = 0; reg->var_off = tnum_unknown; - reg->frameno = 0; __mark_reg_unbounded(reg); } -static void mark_reg_unknown(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, u32 regno) +static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose(env, "mark_reg_unknown(env, regs, %u)\n", regno); + verbose("mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -882,11 +707,10 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg) reg->type = NOT_INIT; } -static void mark_reg_not_init(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, u32 regno) +static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose(env, "mark_reg_not_init(env, regs, %u)\n", regno); + verbose("mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -895,36 +719,22 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(regs + regno); } -static void init_reg_state(struct bpf_verifier_env *env, - struct bpf_func_state *state) +static void init_reg_state(struct bpf_reg_state *regs) { - struct bpf_reg_state *regs = state->regs; int i; for (i = 0; i < MAX_BPF_REG; i++) { - mark_reg_not_init(env, regs, i); + mark_reg_not_init(regs, i); regs[i].live = REG_LIVE_NONE; } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; - mark_reg_known_zero(env, regs, BPF_REG_FP); - regs[BPF_REG_FP].frameno = state->frameno; + mark_reg_known_zero(regs, BPF_REG_FP); /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; - mark_reg_known_zero(env, regs, BPF_REG_1); -} - -#define BPF_MAIN_FUNC (-1) -static void init_func_state(struct bpf_verifier_env *env, - struct bpf_func_state *state, - int callsite, int frameno, int subprogno) -{ - state->callsite = callsite; - state->frameno = frameno; - state->subprogno = subprogno; - init_reg_state(env, state); + mark_reg_known_zero(regs, BPF_REG_1); } 
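
To make the entry conditions that init_reg_state() establishes concrete, here is a compact, self-contained sketch. The enum values and TOY_* names are simplified stand-ins for the kernel's bpf_reg_type and register layout, not real kernel identifiers:

	#include <stdio.h>

	#define TOY_MAX_REG 11		/* R0..R10, as in BPF */
	#define TOY_REG_FP  10		/* frame pointer register */

	enum toy_reg_type {
		TOY_NOT_INIT,
		TOY_SCALAR,
		TOY_PTR_TO_STACK,
		TOY_PTR_TO_CTX,
	};

	struct toy_reg { enum toy_reg_type type; };

	/* At program entry only R1 (ctx) and R10 (frame ptr) are readable. */
	static void toy_init_reg_state(struct toy_reg *regs)
	{
		int i;

		for (i = 0; i < TOY_MAX_REG; i++)
			regs[i].type = TOY_NOT_INIT;
		regs[TOY_REG_FP].type = TOY_PTR_TO_STACK;
		regs[1].type = TOY_PTR_TO_CTX;
	}

	int main(void)
	{
		struct toy_reg regs[TOY_MAX_REG];

		toy_init_reg_state(regs);
		/* reading any register other than R1/R10 here is !read_ok */
		printf("r1=%d r10=%d\n", regs[1].type, regs[TOY_REG_FP].type);
		return 0;
	}

Every other register must be written before it is read, which is exactly what the "R%d !read_ok" check later enforces.
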
enum reg_arg_type { @@ -933,225 +743,51 @@ enum reg_arg_type { DST_OP_NO_MARK /* same as above, check only, don't mark */ }; -static int cmp_subprogs(const void *a, const void *b) +static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) { - return ((struct bpf_subprog_info *)a)->start - - ((struct bpf_subprog_info *)b)->start; -} - -static int find_subprog(struct bpf_verifier_env *env, int off) -{ - struct bpf_subprog_info *p; - - p = bsearch(&off, env->subprog_info, env->subprog_cnt, - sizeof(env->subprog_info[0]), cmp_subprogs); - if (!p) - return -ENOENT; - - return p - env->subprog_info; -} - -static int add_subprog(struct bpf_verifier_env *env, int off) -{ - int insn_cnt = env->prog->len; - int ret; - if (off >= insn_cnt || off < 0) { - verbose(env, "call to invalid destination\n"); - return -EINVAL; - } - - ret = find_subprog(env, off); - if (ret >= 0) - return 0; - - if (env->subprog_cnt > BPF_MAX_SUBPROGS) { - verbose(env, "too many subprograms\n"); - return -E2BIG; - } - - env->subprog_info[env->subprog_cnt++].start = off; - sort(env->subprog_info, env->subprog_cnt, - sizeof(env->subprog_info[0]), cmp_subprogs, NULL); - - return 0; -} -static int check_subprogs(struct bpf_verifier_env *env) -{ - int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; - struct bpf_subprog_info *subprog = env->subprog_info; - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - - /* Add entry function. */ - ret = add_subprog(env, 0); - if (ret < 0) - return ret; - - /* determine subprog starts. The end is one before the next starts */ - for (i = 0; i < insn_cnt; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) - continue; - if (!env->allow_ptr_leaks) { - verbose(env, "function calls to other bpf functions are allowed for root only\n"); - return -EPERM; - } - if (bpf_prog_is_dev_bound(env->prog->aux)) { - verbose(env, "funcation calls in offloaded programs are not supported yet\n"); - return -EINVAL; - } - ret = add_subprog(env, i + insn[i].imm + 1); - if (ret < 0) - return ret; - } - - if (env->log.level > 1) - for (i = 0; i < env->subprog_cnt; i++) - verbose(env, "func#%d @%d\n", i, subprog[i].start); - - /* now check that all jumps are within the same subprog */ - subprog_start = 0; - if (env->subprog_cnt == cur_subprog + 1) - subprog_end = insn_cnt; - else - subprog_end = subprog[cur_subprog + 1].start; - for (i = 0; i < insn_cnt; i++) { - u8 code = insn[i].code; - if (BPF_CLASS(code) != BPF_JMP) - goto next; - if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) - goto next; - off = i + insn[i].off + 1; - if (off < subprog_start || off >= subprog_end) { - verbose(env, "jump out of range from insn %d to %d\n", i, off); - return -EINVAL; - } -next: - if (i == subprog_end - 1) { - /* to avoid fall-through from one subprog into another - * the last insn of the subprog should be either exit - * or unconditional jump back - */ - if (code != (BPF_JMP | BPF_EXIT) && - code != (BPF_JMP | BPF_JA)) { - verbose(env, "last insn is not an exit or jmp\n"); - return -EINVAL; - } - cur_subprog++; - subprog_start = subprog_end; - if (env->subprog_cnt == cur_subprog + 1) - subprog_end = insn_cnt; - else - subprog_end = subprog[cur_subprog + 1].start; - } - } - return 0; -} - -struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - u32 regno) -{ - struct bpf_verifier_state *tmp = NULL; - - /* 'parent' could be a 
state of caller and - * 'state' could be a state of callee. In such case - * parent->curframe < state->curframe - * and it's ok for r1 - r5 registers - * - * 'parent' could be a callee's state after it bpf_exit-ed. - * In such case parent->curframe > state->curframe - * and it's ok for r0 only - */ - if (parent->curframe == state->curframe || - (parent->curframe < state->curframe && - regno >= BPF_REG_1 && regno <= BPF_REG_5) || - (parent->curframe > state->curframe && - regno == BPF_REG_0)) - return parent; - - if (parent->curframe > state->curframe && - regno >= BPF_REG_6) { - /* for callee saved regs we have to skip the whole chain - * of states that belong to callee and mark as LIVE_READ - * the registers before the call - */ - tmp = parent; - while (tmp && tmp->curframe != state->curframe) { - tmp = tmp->parent; - } - if (!tmp) - goto bug; - parent = tmp; - } else { - goto bug; - } - - return parent; -bug: - verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); - verbose(env, "regno %d parent frame %d current frame %d\n", - regno, parent->curframe, state->curframe); - return 0; -} - -static int mark_reg_read(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - u32 regno) -{ - bool writes = parent == state->parent; /* Observe write marks */ + struct bpf_verifier_state *parent = state->parent; if (regno == BPF_REG_FP) /* We don't need to worry about FP liveness because it's read-only */ - return 0; + return; while (parent) { /* if read wasn't screened by an earlier write ... */ - if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) + if (state->regs[regno].live & REG_LIVE_WRITTEN) break; - parent = skip_callee(env, state, parent, regno); - if (!parent) - return -EFAULT; /* ... 
then we depend on parent's value */ - parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; + parent->regs[regno].live |= REG_LIVE_READ; state = parent; parent = state->parent; - writes = true; } - return 0; } static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs = env->cur_state->regs; if (regno >= MAX_BPF_REG) { - verbose(env, "R%d is invalid\n", regno); + verbose("R%d is invalid\n", regno); return -EINVAL; } if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (regs[regno].type == NOT_INIT) { - verbose(env, "R%d !read_ok\n", regno); + verbose("R%d !read_ok\n", regno); return -EACCES; } - return mark_reg_read(env, vstate, vstate->parent, regno); + mark_reg_read(env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { - verbose(env, "frame pointer is read only\n"); + verbose("frame pointer is read only\n"); return -EACCES; } regs[regno].live |= REG_LIVE_WRITTEN; if (t == DST_OP) - mark_reg_unknown(env, regs, regno); + mark_reg_unknown(regs, regno); } return 0; } @@ -1164,16 +800,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: - case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: - case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: - case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: - case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: - case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: return true; default: return false; @@ -1184,15 +812,13 @@ static bool is_spillable_regtype(enum bpf_reg_type type) * stack boundary and alignment are checked in check_mem_access() */ static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_func_state *state, /* func where register points to */ - int off, int size, int value_regno, int insn_idx) + struct bpf_verifier_state *state, int off, + int size, int value_regno, int insn_idx) { - struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; - enum bpf_reg_type type; - err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), - state->acquired_refs, true); + err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); if (err) return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -1201,27 +827,21 @@ static int check_stack_write(struct bpf_verifier_env *env, if (!env->allow_ptr_leaks && state->stack[spi].slot_type[0] == STACK_SPILL && size != BPF_REG_SIZE) { - verbose(env, "attempt to corrupt spilled pointer on stack\n"); + verbose("attempt to corrupt spilled pointer on stack\n"); return -EACCES; } - cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0 && - is_spillable_regtype((type = cur->regs[value_regno].type))) { + is_spillable_regtype(state->regs[value_regno].type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { - verbose(env, "invalid size of register spill\n"); + verbose("invalid size of register spill\n"); return -EACCES; } - if (state != cur && type == PTR_TO_STACK) { - verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); - return -EINVAL; - } - /* save register state */ - state->stack[spi].spilled_ptr = 
cur->regs[value_regno]; + state->stack[spi].spilled_ptr = state->regs[value_regno]; state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) { @@ -1242,7 +862,7 @@ static int check_stack_write(struct bpf_verifier_env *env, * into two different stack slots, since verifier * cannot sanitize them */ - verbose(env, "insn %d cannot access two stack slots fp%d and fp%d", + verbose("insn %d cannot access two stack slots fp%d and fp%d", insn_idx, *poff, soff); return -EINVAL; } @@ -1261,99 +881,63 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -/* registers of every function are unique and mark_reg_read() propagates - * the liveness in the following cases: - * - from callee into caller for R1 - R5 that were used as arguments - * - from caller into callee for R0 that used as result of the call - * - from caller to the same caller skipping states of the callee for R6 - R9, - * since R6 - R9 are callee saved by implicit function prologue and - * caller's R6 != callee's R6, so when we propagate liveness up to - * parent states we need to skip callee states for R6 - R9. - * - * stack slot marking is different, since stacks of caller and callee are - * accessible in both (since caller can pass a pointer to caller's stack to - * callee which can pass it to another function), hence mark_stack_slot_read() - * has to propagate the stack liveness to all parent states at given frame number. - * Consider code: - * f1() { - * ptr = fp - 8; - * *ptr = ctx; - * call f2 { - * .. = *ptr; - * } - * .. = *ptr; - * } - * First *ptr is reading from f1's stack and mark_stack_slot_read() has - * to mark liveness at the f1's frame and not f2's frame. - * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has - * to propagate liveness to f2 states at f1's frame level and further into - * f1 states at f1's frame level until write into that stack slot - */ -static void mark_stack_slot_read(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - int slot, int frameno) +static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot) { - bool writes = parent == state->parent; /* Observe write marks */ + struct bpf_verifier_state *parent = state->parent; while (parent) { /* if read wasn't screened by an earlier write ... */ - if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) + if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... 
then we depend on parent's value */ - parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; + parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; - writes = true; } } -static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_func_state *reg_state /* func where register points to */, - int off, int size, int value_regno) +static int check_stack_read(struct bpf_verifier_state *state, int off, int size, + int value_regno) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; u8 *stype; - if (reg_state->allocated_stack <= slot) { - verbose(env, "invalid read from stack off %d+0 size %d\n", + if (state->allocated_stack <= slot) { + verbose("invalid read from stack off %d+0 size %d\n", off, size); return -EACCES; } - stype = reg_state->stack[spi].slot_type; + stype = state->stack[spi].slot_type; if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose(env, "invalid size of register spill\n"); + verbose("invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { - verbose(env, "corrupted spill memory\n"); + verbose("corrupted spill memory\n"); return -EACCES; } } if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); + state->regs[value_regno] = state->stack[spi].spilled_ptr; + mark_stack_slot_read(state, spi); } return 0; } else { for (i = 0; i < size; i++) { if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { - verbose(env, "invalid read from stack off %d+%d size %d\n", + verbose("invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(state->regs, value_regno); return 0; } } @@ -1370,13 +954,13 @@ static int check_stack_access(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d", + verbose("variable stack access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } if (off >= 0 || off < -MAX_BPF_STACK) { - verbose(env, "invalid stack off=%d size=%d\n", off, size); + verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; } @@ -1391,7 +975,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, struct bpf_map *map = regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { - verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", + verbose("invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; } @@ -1402,8 +986,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -1411,8 +994,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and 
max_value to off * to make sure our theoretical access will be safe. */ - if (env->log.level) - print_verifier_state(env, state); + if (log_level) + print_verifier_state(state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a @@ -1424,14 +1007,13 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, (reg->smin_value == S64_MIN || (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || reg->smin_value + off < 0)) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size); if (err) { - verbose(env, "R%d min value is outside of the array range\n", regno); + verbose("R%d min value is outside of the array range\n", regno); return err; } @@ -1440,29 +1022,13 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size); if (err) - verbose(env, "R%d max value is outside of the array range\n", regno); - - if (map_value_has_spin_lock(reg->map_ptr)) { - u32 lock = reg->map_ptr->spin_lock_off; - /* if any part of struct bpf_spin_lock can be touched by - * load/store reject this program. - * To check that [x1, x2) overlaps with [y1, y2) - * it is sufficient to check x1 < y2 && y1 < x2. - */ - if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && - lock < reg->umax_value + off + size) { - verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); - return -EACCES; - } - } - + verbose("R%d max value is outside of the array range\n", regno); return err; } @@ -1484,17 +1050,11 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: - case BPF_PROG_TYPE_SK_MSG: - case BPF_PROG_TYPE_FLOW_DISSECTOR: if (meta) return meta->pkt_access; env->seen_direct_write = true; return true; - case BPF_PROG_TYPE_CGROUP_SOCKOPT: - if (t == BPF_WRITE) - env->seen_direct_write = true; - return true; default: return false; } @@ -1507,7 +1067,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *reg = ®s[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; } @@ -1530,13 +1090,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, * detail to prove they're safe. 
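
Both map-access branches above reduce to one interval test: the lowest reachable offset must not underflow the value and the highest reachable end must not overrun it. A self-contained sketch of that test, with toy fields standing in for the verifier's smin_value/umax_value tracking (names here are illustrative, not the kernel's):

	#include <stdint.h>
	#include <stdbool.h>

	struct toy_bounds {
		int64_t  smin;	/* smallest value the register may hold */
		uint64_t umax;	/* largest value the register may hold */
	};

	static bool toy_map_access_ok(const struct toy_bounds *reg, int off,
				      int size, uint32_t value_size)
	{
		if (off < 0 || size <= 0)
			return false;
		/* unbounded maximums are rejected outright, much like the
		 * verifier's BPF_MAX_VAR_OFF cutoff; this also sidesteps
		 * overflow in the additions below */
		if (reg->umax > UINT32_MAX)
			return false;
		/* lowest possible offset must not underflow the value */
		if (reg->smin == INT64_MIN || reg->smin + off < 0)
			return false;
		/* highest possible end must stay within the value */
		if (reg->umax + off + (uint64_t)size > value_size)
			return false;
		return true;
	}

The special-casing of the minimum mirrors the signed-comparison caveat in the comment above: a register whose floor is unknown can always be nudged below zero.
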
*/ if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_packet_access(env, regno, off, size); if (err) { - verbose(env, "R%d offset is outside of the packet\n", regno); + verbose("R%d offset is outside of the packet\n", regno); return err; } return err; @@ -1554,8 +1114,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, if (env->analyzer_ops) return 0; - if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, env->prog, &info)) { + if (env->prog->aux->ops->is_valid_access && + env->prog->aux->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1572,55 +1132,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return 0; } - verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size); - return -EACCES; -} - -static int check_flow_keys_access(struct bpf_verifier_env *env, int off, - int size) -{ - if (size < 0 || off < 0 || - (u64)off + size > sizeof(struct bpf_flow_keys)) { - return -EACCES; - } - return 0; -} - -static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, - u32 regno, int off, int size, - enum bpf_access_type t) -{ - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; - struct bpf_insn_access_aux info = {}; - bool valid; - - if (reg->smin_value < 0) { - return -EACCES; - } - - switch (reg->type) { - case PTR_TO_SOCK_COMMON: - valid = bpf_sock_common_is_valid_access(off, size, t, &info); - break; - case PTR_TO_SOCKET: - valid = bpf_sock_is_valid_access(off, size, t, &info); - break; - case PTR_TO_TCP_SOCK: - valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); - break; - default: - valid = false; - } - - if (valid) { - env->insn_aux_data[insn_idx].ctx_field_size = - info.ctx_field_size; - return 0; - } - - verbose(env, "R%d invalid %s access off=%d size=%d\n", - regno, reg_type_str[reg->type], off, size); + verbose("invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; } @@ -1633,11 +1145,6 @@ static bool __is_pointer_value(bool allow_ptr_leaks, return reg->type != SCALAR_VALUE; } -static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) -{ - return cur_regs(env) + regno; -} - static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); @@ -1650,12 +1157,6 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_CTX; } -static bool is_sk_reg(struct bpf_verifier_env *env, int regno) -{ - const struct bpf_reg_state *reg = reg_state(env, regno); - return type_is_sk_pointer(reg->type); -} - static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = cur_regs(env) + regno; @@ -1663,8 +1164,7 @@ static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_PACKET; } -static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, +static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, int off, int size, bool strict) { struct tnum reg_off; @@ -1689,7 +1189,7 @@ 
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "misaligned packet access off %d+%s+%d+%d size %d\n", + verbose("misaligned packet access off %d+%s+%d+%d size %d\n", ip_align, tn_buf, reg->off, off, size); return -EACCES; } @@ -1697,8 +1197,7 @@ static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, return 0; } -static int check_generic_ptr_alignment(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, +static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, const char *pointer_desc, int off, int size, bool strict) { @@ -1713,7 +1212,7 @@ static int check_generic_ptr_alignment(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", + verbose("misaligned %saccess off %s+%d+%d size %d\n", pointer_desc, tn_buf, reg->off, off, size); return -EACCES; } @@ -1730,14 +1229,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, switch (reg->type) { case PTR_TO_PACKET: - case PTR_TO_PACKET_META: - /* Special case, because of NET_IP_ALIGN. Given metadata sits - * right in front, treat it the very same way. - */ - return check_pkt_ptr_alignment(env, reg, off, size, strict); - case PTR_TO_FLOW_KEYS: - pointer_desc = "flow keys "; - break; + /* special case, because of NET_IP_ALIGN */ + return check_pkt_ptr_alignment(reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1752,120 +1245,12 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, */ strict = true; break; - case PTR_TO_SOCKET: - pointer_desc = "sock "; - break; - case PTR_TO_SOCK_COMMON: - pointer_desc = "sock_common "; - break; - case PTR_TO_TCP_SOCK: - pointer_desc = "tcp_sock "; - break; default: break; } - return check_generic_ptr_alignment(env, reg, pointer_desc, off, size, - strict); + return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); } -static int update_stack_depth(struct bpf_verifier_env *env, - const struct bpf_func_state *func, - int off) -{ - u16 stack = env->subprog_info[func->subprogno].stack_depth; - - if (stack >= -off) - return 0; - - /* update known max for given subprogram */ - env->subprog_info[func->subprogno].stack_depth = -off; - return 0; -} - -/* starting from main bpf function walk all instructions of the function - * and recursively walk all callees that given function can call. - * Ignore jump and exit insns. - * Since recursion is prevented by check_cfg() this algorithm - * only needs a local stack of MAX_CALL_FRAMES to remember callsites - */ -static int check_max_stack_depth(struct bpf_verifier_env *env) -{ - int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; - struct bpf_subprog_info *subprog = env->subprog_info; - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - int ret_insn[MAX_CALL_FRAMES]; - int ret_prog[MAX_CALL_FRAMES]; - -process_func: - /* round up to 32-bytes, since this is granularity - * of interpreter stack size - */ - depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); - if (depth > MAX_BPF_STACK) { - verbose(env, "combined stack size of %d calls is %d. 
Too large\n", - frame + 1, depth); - return -EACCES; - } - -continue_func: - if (env->subprog_cnt == idx + 1) - subprog_end = insn_cnt; - else - subprog_end = subprog[idx + 1].start; - for (; i < subprog_end; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) - continue; - /* remember insn and function to return to */ - ret_insn[frame] = i + 1; - ret_prog[frame] = idx; - /* find the callee */ - i = i + insn[i].imm + 1; - idx = find_subprog(env, i); - if (idx < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - i); - return -EFAULT; - } - frame++; - if (frame >= MAX_CALL_FRAMES) { - WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); - return -EFAULT; - } - goto process_func; - } - /* end of for() loop means the last insn of the 'subprog' - * was reached. Doesn't matter whether it was JA or EXIT - */ - if (frame == 0) - return 0; - - depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); - frame--; - i = ret_insn[frame]; - idx = ret_prog[frame]; - goto continue_func; -} - -#ifndef CONFIG_BPF_JIT_ALWAYS_ON -static int get_callee_stack_depth(struct bpf_verifier_env *env, - const struct bpf_insn *insn, int idx) -{ - int start = idx + insn->imm + 1, subprog; - subprog = find_subprog(env, start); - - if (subprog < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - start); - return -EFAULT; - } - - return env->subprog_info[subprog].stack_depth; -} -#endif - static int check_ctx_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno) { @@ -1874,7 +1259,7 @@ static int check_ctx_reg(struct bpf_verifier_env *env, */ if (reg->off) { - verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", + verbose("dereference of modified ctx ptr R%d off=%d disallowed\n", regno, reg->off); return -EACCES; } @@ -1883,40 +1268,13 @@ static int check_ctx_reg(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + verbose("variable ctx access var_off=%s disallowed\n", tn_buf); return -EACCES; } return 0; } -static int check_tp_buffer_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int regno, int off, int size) -{ - if (off < 0) { - verbose(env, - "R%d invalid tracepoint buffer access: off=%d, size=%d", - regno, off, size); - return -EACCES; - } - - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, - "R%d invalid variable buffer offset: off=%d, var_off=%s", - regno, off, tn_buf); - return -EACCES; - } - - if (off + size > env->prog->aux->max_tp_access) - env->prog->aux->max_tp_access = off + size; - - return 0; -} - - /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -1950,9 +1308,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once) { + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; - struct bpf_func_state *state; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -1970,20 +1328,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - 
verbose(env, "R%d leaks addr into map\n", value_regno); + verbose("R%d leaks addr into map\n", value_regno); return -EACCES; } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); + mark_reg_unknown(regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose(env, "R%d leaks addr into ctx\n", value_regno); + verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } err = check_ctx_reg(env, reg, regno); @@ -1996,13 +1354,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * PTR_TO_PACKET[_END]. In the latter case, we know * the offset is zero. */ - if (reg_type == SCALAR_VALUE) { - mark_reg_unknown(env, regs, value_regno); - } else { - mark_reg_known_zero(env, regs, value_regno); - if (reg_type_may_be_null(reg_type)) - regs[value_regno].id = ++env->id_gen; - } + if (reg_type == SCALAR_VALUE) + mark_reg_unknown(regs, value_regno); + else + mark_reg_known_zero(regs, value_regno); regs[value_regno].id = 0; regs[value_regno].off = 0; regs[value_regno].range = 0; @@ -2015,49 +1370,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; - state = func(env, reg); - err = update_stack_depth(env, state, off); - if (err) - return err; + if (env->prog->aux->stack_depth < -off) + env->prog->aux->stack_depth = -off; if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, value_regno, insn_idx); else - err = check_stack_read(env, state, off, size, - value_regno); - } else if (reg_is_pkt_pointer(reg)) { + err = check_stack_read(state, off, size, value_regno); + } else if (reg->type == PTR_TO_PACKET) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { + verbose("cannot write into packet\n"); return -EACCES; } if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into packet\n", value_regno); return -EACCES; } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_FLOW_KEYS) { - if (t == BPF_WRITE && value_regno >= 0 && - is_pointer_value(env, value_regno)) { - return -EACCES; - } - - err = check_flow_keys_access(env, off, size); - } else if (type_is_sk_pointer(reg->type)) { - if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); - return -EACCES; - } - err = check_sock_access(env, insn_idx, regno, off, size, t); - if (!err && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_TP_BUFFER) { - err = check_tp_buffer_access(env, reg, regno, off, size); - if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); + mark_reg_unknown(regs, value_regno); } else { + verbose("R%d invalid mem access '%s'\n", + regno, reg_type_str[reg->type]); return -EACCES; } @@ -2075,7 +1411,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || insn->imm != 0) { - verbose(env, "BPF_XADD uses reserved fields\n"); + verbose("BPF_XADD uses reserved fields\n"); return -EINVAL; } @@ -2090,14 +1426,13 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return err; if 
(is_pointer_value(env, insn->src_reg)) { - verbose(env, "R%d leaks addr into mem\n", insn->src_reg); + verbose("R%d leaks addr into mem\n", insn->src_reg); return -EACCES; } if (is_ctx_reg(env, insn->dst_reg) || - is_pkt_reg(env, insn->dst_reg) || - is_sk_reg(env, insn->dst_reg)) { - verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", + is_pkt_reg(env, insn->dst_reg)) { + verbose("BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ? "context" : "packet"); return -EACCES; @@ -2115,9 +1450,9 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins } /* Does this register contain a constant zero? */ -static bool register_is_null(struct bpf_reg_state *reg) +static bool register_is_null(struct bpf_reg_state reg) { - return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); + return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0); } /* when register 'regno' is passed into function that will read 'access_size' @@ -2130,39 +1465,42 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = cur_regs(env) + regno; - struct bpf_func_state *state = func(env, reg); + struct bpf_verifier_state *state = env->cur_state; + struct bpf_reg_state *regs = state->regs; int off, i, slot, spi; - if (reg->type != PTR_TO_STACK) { + if (regs[regno].type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && - register_is_null(reg)) + register_is_null(regs[regno])) return 0; - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[regs[regno].type], reg_type_str[PTR_TO_STACK]); return -EACCES; } /* Only allow fixed-offset stack reads */ - if (!tnum_is_const(reg->var_off)) { + if (!tnum_is_const(regs[regno].var_off)) { char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable stack read R%d var_off=%s\n", + tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); + verbose("invalid variable stack read R%d var_off=%s\n", regno, tn_buf); return -EACCES; } - off = reg->off + reg->var_off.value; + off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size <= 0) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", + verbose("invalid stack type R%d off=%d access_size=%d\n", regno, off, access_size); return -EACCES; } + if (env->prog->aux->stack_depth < -off) + env->prog->aux->stack_depth = -off; + if (meta && meta->raw_mode) { meta->access_size = access_size; meta->regno = regno; @@ -2175,13 +1513,12 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (state->allocated_stack <= slot || state->stack[spi].slot_type[slot % BPF_REG_SIZE] != STACK_MISC) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", + verbose("invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; } } - - return update_stack_depth(env, state, off); + return 0; } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, @@ -2192,10 +1529,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (reg->type) { case PTR_TO_PACKET: - case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size); - case 
PTR_TO_FLOW_KEYS: - return check_flow_keys_access(env, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size); default: /* scalar_value|ptr_to_stack or invalid ptr */ @@ -2204,119 +1538,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } -/* Implementation details: - * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL - * Two bpf_map_lookups (even with the same key) will have different reg->id. - * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after - * value_or_null->value transition, since the verifier only cares about - * the range of access to valid map value pointer and doesn't care about actual - * address of the map element. - * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps - * reg->id > 0 after value_or_null->value transition. By doing so - * two bpf_map_lookups will be considered two different pointers that - * point to different bpf_spin_locks. - * The verifier allows taking only one bpf_spin_lock at a time to avoid - * dead-locks. - * Since only one bpf_spin_lock is allowed the checks are simpler than - * reg_is_refcounted() logic. The verifier needs to remember only - * one spin_lock instead of array of acquired_refs. - * cur_state->active_spin_lock remembers which map value element got locked - * and clears it after bpf_spin_unlock. - */ -static int process_spin_lock(struct bpf_verifier_env *env, int regno, - bool is_lock) -{ - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_verifier_state *cur = env->cur_state; - bool is_const = tnum_is_const(reg->var_off); - struct bpf_map *map = reg->map_ptr; - u64 val = reg->var_off.value; - if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "R%d is not a pointer to map_value\n", regno); - return -EINVAL; - } - if (!is_const) { - verbose(env, - "R%d doesn't have constant offset. 
bpf_spin_lock has to be at the constant offset\n", - regno); - return -EINVAL; - } - if (!map->btf) { - verbose(env, - "map '%s' has to have BTF in order to use bpf_spin_lock\n", - map->name); - return -EINVAL; - } - if (!map_value_has_spin_lock(map)) { - if (map->spin_lock_off == -E2BIG) - verbose(env, - "map '%s' has more than one 'struct bpf_spin_lock'\n", - map->name); - else if (map->spin_lock_off == -ENOENT) - verbose(env, - "map '%s' doesn't have 'struct bpf_spin_lock'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_spin_lock is mangled\n", - map->name); - return -EINVAL; - } - if (map->spin_lock_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", - val + reg->off); - return -EINVAL; - } - if (is_lock) { - if (cur->active_spin_lock) { - verbose(env, - "Locking two bpf_spin_locks are not allowed\n"); - return -EINVAL; - } - cur->active_spin_lock = reg->id; - } else { - if (!cur->active_spin_lock) { - verbose(env, "bpf_spin_unlock without taking a lock\n"); - return -EINVAL; - } - if (cur->active_spin_lock != reg->id) { - verbose(env, "bpf_spin_unlock of different lock\n"); - return -EINVAL; - } - cur->active_spin_lock = 0; - } - - return 0; -} - -static bool arg_type_is_mem_ptr(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_MEM || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_UNINIT_MEM; -} - -static bool arg_type_is_mem_size(enum bpf_arg_type type) -{ - return type == ARG_CONST_SIZE || - type == ARG_CONST_SIZE_OR_ZERO; -} - -static bool arg_type_is_int_ptr(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_INT || - type == ARG_PTR_TO_LONG; -} - -static int int_ptr_type_to_size(enum bpf_arg_type type) -{ - if (type == ARG_PTR_TO_INT) - return sizeof(u32); - else if (type == ARG_PTR_TO_LONG) - return sizeof(u64); - return -EINVAL; -} - static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -2334,29 +1555,22 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) { - verbose(env, "R%d leaks addr into helper function\n", regno); + verbose("R%d leaks addr into helper function\n", regno); return -EACCES; } return 0; } - if (type_is_pkt_pointer(type) && + if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta, BPF_READ)) { - verbose(env, "helper access to the packet is not allowed\n"); + verbose("helper access to the packet is not allowed\n"); return -EACCES; } if (arg_type == ARG_PTR_TO_MAP_KEY || - arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { + arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) - /* final test in check_stack_boundary() */; - else if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) + if (type != PTR_TO_PACKET && type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -2374,58 +1588,21 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_ctx_reg(env, reg, regno); if (err < 0) return err; - } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { - expected_type = PTR_TO_SOCK_COMMON; - /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ - if (!type_is_sk_pointer(type)) - goto err_type; - } else if (arg_type == 
ARG_PTR_TO_SOCKET) { - expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; - if (meta->ptr_id || !reg->id) { - verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n", - meta->ptr_id, reg->id); - return -EFAULT; - } - meta->ptr_id = reg->id; - } else if (arg_type == ARG_PTR_TO_SOCKET) { - expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { - if (meta->func_id == BPF_FUNC_spin_lock) { - if (process_spin_lock(env, regno, true)) - return -EACCES; - } else if (meta->func_id == BPF_FUNC_spin_unlock) { - if (process_spin_lock(env, regno, false)) - return -EACCES; - } else { - verbose(env, "verifier internal error\n"); - return -EFAULT; - } - } else if (arg_type_is_mem_ptr(arg_type)) { + } else if (arg_type == ARG_PTR_TO_MEM || + arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking. */ - if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MEM_OR_NULL) + if (register_is_null(*reg)) /* final test in check_stack_boundary() */; - else if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && + else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; - } else if (arg_type_is_int_ptr(arg_type)) { - expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) - goto err_type; } else { - verbose(env, "unsupported arg_type %d\n", arg_type); + verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; } @@ -2443,36 +1620,45 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * we have to check map_key here. Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->key\n"); + verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_helper_mem_access(env, regno, - meta->map_ptr->key_size, false, - NULL); - } else if (arg_type == ARG_PTR_TO_MAP_VALUE || - (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && - !register_is_null(reg)) || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + if (type == PTR_TO_PACKET) + err = check_packet_access(env, regno, reg->off, + meta->map_ptr->key_size); + else + err = check_stack_boundary(env, regno, + meta->map_ptr->key_size, + false, NULL); + } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->value\n"); + verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE); - err = check_helper_mem_access(env, regno, - meta->map_ptr->value_size, false, - meta); - } else if (arg_type_is_mem_size(arg_type)) { + if (type == PTR_TO_PACKET) + err = check_packet_access(env, regno, reg->off, + meta->map_ptr->value_size); + else + err = check_stack_boundary(env, regno, + meta->map_ptr->value_size, + false, NULL); + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* remember the mem_size which may be used later - * to refine return values. 
+ /* bpf_xxx(..., buf, len) call will access 'len' bytes + * from stack pointer 'buf'. Check it + * note: regno == len, regno - 1 == buf */ - meta->msize_smax_value = reg->smax_value; - meta->msize_umax_value = reg->umax_value; + if (regno == 0) { + /* kernel subsystem misconfigured verifier */ + verbose("ARG_CONST_SIZE cannot be first argument\n"); + return -EACCES; + } /* The register is SCALAR_VALUE; the access check * happens using its boundaries. @@ -2487,7 +1673,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta = NULL; if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", + verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", regno); return -EACCES; } @@ -2501,30 +1687,23 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", regno); return -EACCES; } err = check_helper_mem_access(env, regno - 1, reg->umax_value, zero_size_allowed, meta); - } else if (arg_type_is_int_ptr(arg_type)) { - int size = int_ptr_type_to_size(arg_type); - err = check_helper_mem_access(env, regno, size, false, meta); - if (err) - return err; - err = check_ptr_alignment(env, reg, 0, size, true); } return err; err_type: - verbose(env, "R%d type=%s expected=%s\n", regno, + verbose("R%d type=%s expected=%s\n", regno, reg_type_str[type], reg_type_str[expected_type]); return -EACCES; } -static int check_map_func_compatibility(struct bpf_verifier_env *env, - struct bpf_map *map, int func_id) +static int check_map_func_compatibility(struct bpf_map *map, int func_id) { if (!map) return 0; @@ -2549,19 +1728,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; - case BPF_MAP_TYPE_CGROUP_STORAGE: - case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: - if (func_id != BPF_FUNC_get_local_storage) - goto error; - break; + /* devmap returns a pointer to a live net_device ifindex that we cannot + * allow to be modified from bpf side. So do not allow lookup elements + * for now. 
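
The map/helper pairing rules enforced in this function are essentially a two-way whitelist: each special-purpose map admits only certain helpers, and each such helper admits only certain maps. A condensed sketch of the idea, using toy identifiers rather than the kernel's enums:

	#include <stdbool.h>

	enum toy_map  { TOY_MAP_PROG_ARRAY, TOY_MAP_DEVMAP, TOY_MAP_SOCKMAP };
	enum toy_func { TOY_FUNC_tail_call, TOY_FUNC_redirect_map,
			TOY_FUNC_sk_redirect_map };

	/* Each helper is valid only for specific map types, and vice versa. */
	static bool toy_map_func_compatible(enum toy_map map, enum toy_func func)
	{
		switch (func) {
		case TOY_FUNC_tail_call:
			return map == TOY_MAP_PROG_ARRAY;
		case TOY_FUNC_redirect_map:
			return map == TOY_MAP_DEVMAP;
		case TOY_FUNC_sk_redirect_map:
			return map == TOY_MAP_SOCKMAP;
		}
		return false;
	}

Checking in both directions (by map type and by func_id) is what lets the real function catch a mismatch regardless of which side was misused.
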
+ */ case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: - if (func_id != BPF_FUNC_redirect_map && - func_id != BPF_FUNC_map_lookup_elem) - goto error; - break; - /* Restrict bpf side of cpumap, open when use-cases appear */ - case BPF_MAP_TYPE_CPUMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; @@ -2573,20 +1744,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKMAP: if (func_id != BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && - func_id != BPF_FUNC_map_delete_elem && - func_id != BPF_FUNC_msg_redirect_map) - goto error; - break; - case BPF_MAP_TYPE_SK_STORAGE: - if (func_id != BPF_FUNC_sk_storage_get && - func_id != BPF_FUNC_sk_storage_delete) - goto error; - break; - case BPF_MAP_TYPE_SOCKHASH: - if (func_id != BPF_FUNC_sk_redirect_hash && - func_id != BPF_FUNC_sock_hash_update && - func_id != BPF_FUNC_map_delete_elem && - func_id != BPF_FUNC_msg_redirect_hash) + func_id != BPF_FUNC_map_delete_elem) goto error; break; default: @@ -2598,10 +1756,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; - if (env->subprog_cnt > 1) { - verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); - return -EINVAL; - } break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: @@ -2618,30 +1772,15 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_redirect_map: - if (map->map_type != BPF_MAP_TYPE_DEVMAP && - map->map_type != BPF_MAP_TYPE_DEVMAP_HASH) + if (map->map_type != BPF_MAP_TYPE_DEVMAP) goto error; break; case BPF_FUNC_sk_redirect_map: - case BPF_FUNC_msg_redirect_map: - case BPF_FUNC_sock_map_update: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; - case BPF_FUNC_sk_redirect_hash: - case BPF_FUNC_msg_redirect_hash: - case BPF_FUNC_sock_hash_update: - if (map->map_type != BPF_MAP_TYPE_SOCKHASH) - goto error; - break; - case BPF_FUNC_get_local_storage: - if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && - map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) - goto error; - break; - case BPF_FUNC_sk_storage_get: - case BPF_FUNC_sk_storage_delete: - if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + case BPF_FUNC_sock_map_update: + if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; default: @@ -2650,12 +1789,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, return 0; error: - verbose(env, "cannot pass map_type %d into func %s#%d\n", + verbose("cannot pass map_type %d into func %s#%d\n", map->map_type, func_id_name(func_id), func_id); return -EINVAL; } -static bool check_raw_mode_ok(const struct bpf_func_proto *fn) +static int check_raw_mode(const struct bpf_func_proto *fn) { int count = 0; @@ -2669,294 +1808,36 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn) count++; if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; - /* We only support one arg being in raw mode at the moment, - * which is sufficient for the helper functions we have - * right now. 
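
The single-raw-argument rule being counted in check_raw_mode() distills to a few lines. A stand-in sketch with a hypothetical prototype struct (the real bpf_func_proto spells the five argument types out as separate fields):

	#include <stdbool.h>

	enum toy_arg { TOY_ARG_DONTCARE, TOY_ARG_UNINIT_MEM };

	struct toy_proto { enum toy_arg arg[5]; };

	/* Only one argument may be "raw" (written, never read) per helper. */
	static bool toy_raw_mode_ok(const struct toy_proto *fn)
	{
		int i, count = 0;

		for (i = 0; i < 5; i++)
			if (fn->arg[i] == TOY_ARG_UNINIT_MEM)
				count++;
		return count <= 1;
	}

One raw argument is sufficient for the helpers that exist, and allowing more would complicate the stack bookkeeping for no current benefit.
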
- */ - return count <= 1; -} -static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, - enum bpf_arg_type arg_next) -{ - return (arg_type_is_mem_ptr(arg_curr) && - !arg_type_is_mem_size(arg_next)) || - (!arg_type_is_mem_ptr(arg_curr) && - arg_type_is_mem_size(arg_next)); -} - -static bool check_arg_pair_ok(const struct bpf_func_proto *fn) -{ - /* bpf_xxx(..., buf, len) call will access 'len' - * bytes from memory 'buf'. Both arg types need - * to be paired, so make sure there's no buggy - * helper function specification. - */ - if (arg_type_is_mem_size(fn->arg1_type) || - arg_type_is_mem_ptr(fn->arg5_type) || - check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || - check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || - check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || - check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) - return false; - return true; -} - -static bool check_refcount_ok(const struct bpf_func_proto *fn) -{ - int count = 0; - if (arg_type_is_refcounted(fn->arg1_type)) - count++; - if (arg_type_is_refcounted(fn->arg2_type)) - count++; - if (arg_type_is_refcounted(fn->arg3_type)) - count++; - if (arg_type_is_refcounted(fn->arg4_type)) - count++; - if (arg_type_is_refcounted(fn->arg5_type)) - count++; - /* We only support one arg being unreferenced at the moment, - * which is sufficient for the helper functions we have right now. - */ - return count <= 1; -} - -static int check_func_proto(const struct bpf_func_proto *fn) -{ - return check_raw_mode_ok(fn) && - check_arg_pair_ok(fn) && - check_refcount_ok(fn) ? 0 : -EINVAL; + return count > 1 ? -EINVAL : 0; } /* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, * so turn them into unknown SCALAR_VALUE. */ -static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, - struct bpf_func_state *state) -{ - struct bpf_reg_state *regs = state->regs, *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - if (reg_is_pkt_pointer_any(®s[i])) - mark_reg_unknown(env, regs, i); - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg_is_pkt_pointer_any(reg)) - __mark_reg_unknown(reg); - } -} - static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_verifier_state *vstate = env->cur_state; - int i; - - for (i = 0; i <= vstate->curframe; i++) - __clear_all_pkt_pointers(env, vstate->frame[i]); -} - -static void release_reg_references(struct bpf_verifier_env *env, - struct bpf_func_state *state, int id) -{ + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; + for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].id == id) - mark_reg_unknown(env, regs, i); - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) + if (regs[i].type == PTR_TO_PACKET || + regs[i].type == PTR_TO_PACKET_END) + mark_reg_unknown(regs, i); + + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - if (reg_is_refcounted(reg) && reg->id == id) - __mark_reg_unknown(reg); + reg = &state->stack[i].spilled_ptr; + if (reg->type != PTR_TO_PACKET && + reg->type != PTR_TO_PACKET_END) + continue; + __mark_reg_unknown(reg); } } -/* The pointer with the specified id has released its reference to kernel - * resources. Identify all copies of the same pointer and clear the reference. 
- */ -static int release_reference(struct bpf_verifier_env *env, - struct bpf_call_arg_meta *meta) -{ - struct bpf_verifier_state *vstate = env->cur_state; - int i; - for (i = 0; i <= vstate->curframe; i++) - release_reg_references(env, vstate->frame[i], meta->ptr_id); - - return release_reference_state(cur_func(env), meta->ptr_id); -} - - -static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx) -{ - struct bpf_verifier_state *state = env->cur_state; - struct bpf_func_state *caller, *callee; - int i, err, subprog, target_insn; - - if (state->curframe >= MAX_CALL_FRAMES) { - verbose(env, "the call stack of %d frames is too deep\n", - state->curframe); - return -E2BIG; - } - - target_insn = *insn_idx + insn->imm; - subprog = find_subprog(env, target_insn + 1); - - if (subprog < 0) { - verbose(env, "verifier bug. No program starts at insn %d\n", - target_insn + 1); - return -EFAULT; - } - - caller = state->frame[state->curframe]; - if (state->frame[state->curframe + 1]) { - verbose(env, "verifier bug. Frame %d already allocated\n", - state->curframe + 1); - return -EFAULT; - } - - callee = kzalloc(sizeof(*callee), GFP_KERNEL); - if (!callee) - return -ENOMEM; - - state->frame[state->curframe + 1] = callee; - /* callee cannot access r0, r6 - r9 for reading and has to write - * into its own stack before reading from it. - * callee can read/write into caller's stack - */ - init_func_state(env, callee, - /* remember the callsite, it will be used by bpf_exit */ - *insn_idx /* callsite */, - state->curframe + 1 /* frameno within this callchain */, - subprog /* subprog number within this prog */); - - /* Transfer references to the callee */ - err = transfer_reference_state(callee, caller); - if (err) - return err; - - /* copy r1 - r5 args that callee can access */ - - for (i = BPF_REG_1; i <= BPF_REG_5; i++) - callee->regs[i] = caller->regs[i]; - /* after the call regsiters r0 - r5 were scratched */ - - for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, caller->regs, caller_saved[i]); - check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); - } - /* only increment it after check_reg_arg() finished */ - state->curframe++; - /* and go analyze first insn of the callee */ - *insn_idx = target_insn; - if (env->log.level) { - verbose(env, "caller:\n"); - print_verifier_state(env, caller); - verbose(env, "callee:\n"); - print_verifier_state(env, callee); - } - - return 0; -} - -static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) -{ - struct bpf_verifier_state *state = env->cur_state; - struct bpf_func_state *caller, *callee; - struct bpf_reg_state *r0; - int err; - callee = state->frame[state->curframe]; - r0 = &callee->regs[BPF_REG_0]; - - if (r0->type == PTR_TO_STACK) { - /* technically it's ok to return caller's stack pointer - * (or caller's caller's pointer) back to the caller, - * since these pointers are valid. 
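
The frame push performed by check_func_call() above is mechanical enough to model standalone. A toy sketch (plain ints stand in for full bpf_reg_state slots; that simplification is an assumption of this example): the callee receives r1-r5 as arguments and nothing else readable, and the caller's r0-r5 are scratched across the call.

#include <string.h>

#define MAX_BPF_REG 11

struct func_state {
        int regs[MAX_BPF_REG];  /* 0 plays the role of "not readable" */
        int callsite;
};

static void push_frame(struct func_state *caller, struct func_state *callee,
                       int callsite)
{
        int i;

        memset(callee, 0, sizeof(*callee));        /* r0, r6-r9 unreadable */
        callee->callsite = callsite;               /* bpf_exit returns here */
        for (i = 1; i <= 5; i++)
                callee->regs[i] = caller->regs[i]; /* argument registers */
        for (i = 0; i <= 5; i++)
                caller->regs[i] = 0;               /* caller-saved scratched */
}
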
Only current stack - * pointer will be invalid as soon as function exits, - * but let's be conservative - */ - verbose(env, "cannot return stack pointer to the caller\n"); - return -EINVAL; - } - - state->curframe--; - caller = state->frame[state->curframe]; - /* return to the caller whatever r0 had in the callee */ - caller->regs[BPF_REG_0] = *r0; - - /* Transfer references to the caller */ - err = transfer_reference_state(caller, callee); - if (err) - return err; - - *insn_idx = callee->callsite + 1; - if (env->log.level) { - verbose(env, "returning from callee:\n"); - print_verifier_state(env, callee); - verbose(env, "to caller at %d:\n", *insn_idx); - print_verifier_state(env, caller); - } - - /* clear everything in the callee */ - free_func_state(callee); - state->frame[state->curframe + 1] = NULL; - - return 0; -} - -static int -record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, - int func_id, int insn_idx) -{ - struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; - if (func_id != BPF_FUNC_tail_call && - func_id != BPF_FUNC_map_lookup_elem) - return 0; - if (meta->map_ptr == NULL) { - verbose(env, "kernel subsystem misconfigured verifier\n"); - return -EINVAL; - } - if (!BPF_MAP_PTR(aux->map_state)) - bpf_map_ptr_store(aux, meta->map_ptr, - meta->map_ptr->unpriv_array); - else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr) - bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - meta->map_ptr->unpriv_array); - return 0; -} - -static int check_reference_leak(struct bpf_verifier_env *env) -{ - struct bpf_func_state *state = cur_func(env); - int i; - for (i = 0; i < state->acquired_refs; i++) { - verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", - state->refs[i].id, state->refs[i].insn_idx); - } - return state->acquired_refs ? 
-EINVAL : 0; -} - -static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, - int func_id, - struct bpf_call_arg_meta *meta) -{ - struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; - if (ret_type != RET_INTEGER || - (func_id != BPF_FUNC_get_stack && - func_id != BPF_FUNC_probe_read_str)) - return; - - ret_reg->smax_value = meta->msize_smax_value; - ret_reg->umax_value = meta->msize_umax_value; - __reg_deduce_bounds(ret_reg); - __reg_bound_offset(ret_reg); -} - - -static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; @@ -2966,21 +1847,21 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); + verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } - if (env->ops->get_func_proto) - fn = env->ops->get_func_proto(func_id, env->prog); + if (env->prog->aux->ops->get_func_proto) + fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); + verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { - verbose(env, "cannot call GPL only function from proprietary program\n"); + verbose("cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ -2989,14 +1870,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn); + /* We only support one arg being in raw mode at the moment, which + * is sufficient for the helper functions we have right now. + */ + err = check_raw_mode(fn); if (err) { - verbose(env, "kernel subsystem misconfigured func %s#%d\n", + verbose("kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); return err; } - meta.func_id = func_id; /* check args */ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); if (err) @@ -3004,7 +1887,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; - + if (func_id == BPF_FUNC_tail_call) { + if (meta.map_ptr == NULL) { + verbose("verifier bug\n"); + return -EINVAL; + } + env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -3015,10 +1904,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; - err = record_func_map(env, &meta, func_id, insn_idx); - if (err) - return err; - /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. 
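
The removed do_refine_retval_range() above rests on one observation worth spelling out: helpers like bpf_get_stack() and bpf_probe_read_str() can never return more bytes than the size argument they were given, so R0's upper bounds may be taken from that argument's. A standalone sketch (field names trimmed down; an assumption of this example):

#include <stdint.h>

struct retval_bounds {
        int64_t smax;
        uint64_t umax;
};

/* R0 was just marked unknown by the call, so its upper bounds can simply
 * be set from the bounds tracked for the helper's size argument.
 */
static void refine_retval_range(struct retval_bounds *r0,
                                const struct retval_bounds *size_arg)
{
        r0->smax = size_arg->smax;
        r0->umax = size_arg->umax;
}
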
*/ @@ -3029,93 +1914,48 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return err; } - if (func_id == BPF_FUNC_tail_call) { - err = check_reference_leak(env); - if (err) { - verbose(env, "tail_call would lead to reference leak\n"); - return err; - } - } else if (is_release_function(func_id)) { - err = release_reference(env, &meta); - if (err) { - verbose(env, "func %s#%d reference has not been acquired before\n", - func_id_name(func_id), func_id); - return err; - } - } - regs = cur_regs(env); - - /* check that flags argument in get_local_storage(map, flags) is 0, - * this is required because get_local_storage() can't return an error. - */ - if (func_id == BPF_FUNC_get_local_storage && - !register_is_null(®s[BPF_REG_2])) { - return -EINVAL; - } - /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, regs, caller_saved[i]); + mark_reg_not_init(regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ - mark_reg_unknown(env, regs, BPF_REG_0); + mark_reg_unknown(regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || - fn->ret_type == RET_PTR_TO_MAP_VALUE) { - if (fn->ret_type == RET_PTR_TO_MAP_VALUE) - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - else - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + struct bpf_insn_aux_data *insn_aux; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ - mark_reg_known_zero(env, regs, BPF_REG_0); + mark_reg_known_zero(regs, BPF_REG_0); regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ if (meta.map_ptr == NULL) { - verbose(env, "kernel subsystem misconfigured verifier\n"); + verbose("kernel subsystem misconfigured verifier\n"); return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; - } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - if (is_acquire_function(func_id)) { - int id = acquire_reference_state(env, insn_idx); - if (id < 0) - return id; - /* For release_reference() */ - regs[BPF_REG_0].id = id; - } else { - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = ++env->id_gen; - } - } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; - } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; + insn_aux = &env->insn_aux_data[insn_idx]; + if (!insn_aux->map_ptr) + insn_aux->map_ptr = meta.map_ptr; + else if (insn_aux->map_ptr != meta.map_ptr) + insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { - verbose(env, "unknown return type %d of func %s#%d\n", + verbose("unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } - do_refine_retval_range(regs, fn->ret_type, func_id, &meta); - - err = check_map_func_compatibility(env, meta.map_ptr, 
func_id); + err = check_map_func_compatibility(meta.map_ptr, func_id); if (err) return err; @@ -3153,25 +1993,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, s64 smin = reg->smin_value; if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { - verbose(env, "math between %s pointer and %lld is not allowed\n", + verbose("math between %s pointer and %lld is not allowed\n", reg_type_str[type], val); return false; } if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { - verbose(env, "%s pointer offset %d is not allowed\n", + verbose("%s pointer offset %d is not allowed\n", reg_type_str[type], reg->off); return false; } if (smin == S64_MIN) { - verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", + verbose("math between %s pointer and register with unbounded min value is not allowed\n", reg_type_str[type]); return false; } if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { - verbose(env, "value %lld makes %s pointer be out of bounds\n", + verbose("value %lld makes %s pointer be out of bounds\n", smin, reg_type_str[type]); return false; } @@ -3367,27 +2207,27 @@ static int sanitize_err(struct bpf_verifier_env *env, switch (reason) { case REASON_BOUNDS: - verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n", + verbose("R%d has unknown scalar with mixed signed bounds, %s\n", off_reg == dst_reg ? dst : src, err); break; case REASON_TYPE: - verbose(env, "R%d has pointer with unsupported alu operation, %s\n", + verbose("R%d has pointer with unsupported alu operation, %s\n", off_reg == dst_reg ? src : dst, err); break; case REASON_PATHS: - verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n", + verbose("R%d tried to %s from different maps, paths or scalars, %s\n", dst, op, err); break; case REASON_LIMIT: - verbose(env, "R%d tried to %s beyond pointer bounds, %s\n", + verbose("R%d tried to %s beyond pointer bounds, %s\n", dst, op, err); break; case REASON_STACK: - verbose(env, "R%d could not be pushed for speculative verification, %s\n", + verbose("R%d could not be pushed for speculative verification, %s\n", dst, err); break; default: - verbose(env, "verifier internal error: unknown reason (%d)\n", + verbose("verifier internal error: unknown reason (%d)\n", reason); break; } @@ -3411,14 +2251,14 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, case PTR_TO_STACK: if (check_stack_access(env, dst_reg, dst_reg->off + dst_reg->var_off.value, 1)) { - verbose(env, "R%d stack pointer arithmetic goes out of range, " + verbose("R%d stack pointer arithmetic goes out of range, " "prohibited for !root\n", dst); return -EACCES; } break; case PTR_TO_MAP_VALUE: if (check_map_access(env, dst, dst_reg->off, 1)) { - verbose(env, "R%d pointer arithmetic of map value goes out of range, " + verbose("R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; } @@ -3440,9 +2280,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs, *dst_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@ -3450,7 
+2288,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); - u32 dst = insn->dst_reg, src = insn->src_reg; + u32 dst = insn->dst_reg; int ret; dst_reg = ®s[dst]; @@ -3466,36 +2304,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ - verbose(env, "R%d 32-bit pointer arithmetic prohibited\n", + verbose("R%d 32-bit pointer arithmetic prohibited\n", dst); return -EACCES; } - switch (ptr_reg->type) { - case PTR_TO_MAP_VALUE_OR_NULL: - verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", - dst, reg_type_str[ptr_reg->type]); + if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { + verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + dst); return -EACCES; - case CONST_PTR_TO_MAP: - case PTR_TO_PACKET_END: - case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: - case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: - case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: - verbose(env, "R%d pointer arithmetic on %s prohibited\n", - dst, reg_type_str[ptr_reg->type]); + } + if (ptr_reg->type == CONST_PTR_TO_MAP) { + verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + dst); + return -EACCES; + } + if (ptr_reg->type == PTR_TO_PACKET_END) { + verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + dst); return -EACCES; - case PTR_TO_MAP_VALUE: - if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { - verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", - off_reg == dst_reg ? dst : src); - return -EACCES; - } - /* fall-through */ - default: - break; } /* In case of 'scalar += pointer', dst_reg inherits pointer type and id. @@ -3560,7 +2387,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; dst_reg->raw = ptr_reg->raw; - if (reg_is_pkt_pointer(ptr_reg)) { + if (ptr_reg->type == PTR_TO_PACKET) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ dst_reg->raw = 0; @@ -3569,7 +2396,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_SUB: if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ - verbose(env, "R%d tried to subtract pointer from scalar\n", + verbose("R%d tried to subtract pointer from scalar\n", dst); return -EACCES; } @@ -3578,7 +2405,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * be able to deal with it. 
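
The signed bounds updates in adjust_ptr_min_max_vals() lean on an overflow test. A self-contained version of that check (mirroring the kernel's signed_add_overflows() helper): do the addition in the unsigned domain, where wraparound is defined, then detect signed overflow from the direction of the result.

#include <stdbool.h>
#include <stdint.h>

static bool signed_add_overflows(int64_t a, int64_t b)
{
        int64_t res = (int64_t)((uint64_t)a + (uint64_t)b);

        if (b < 0)
                return res > a; /* adding a negative must not increase a */
        return res < a;         /* adding a positive must not decrease a */
}
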
*/ if (ptr_reg->type == PTR_TO_STACK) { - verbose(env, "R%d subtraction from stack pointer prohibited\n", + verbose("R%d subtraction from stack pointer prohibited\n", dst); return -EACCES; } @@ -3619,7 +2446,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; dst_reg->raw = ptr_reg->raw; - if (reg_is_pkt_pointer(ptr_reg)) { + if (ptr_reg->type == PTR_TO_PACKET) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) @@ -3630,12 +2457,12 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_OR: case BPF_XOR: /* bitwise ops on pointers are troublesome. */ - verbose(env, "R%d bitwise operator %s on pointer prohibited\n", + verbose("R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ - verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", + verbose("R%d pointer arithmetic with %s operator prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; } @@ -3845,7 +2672,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(env, regs, insn->dst_reg); + mark_reg_unknown(regs, insn->dst_reg); break; } /* We lose all sign bit information (except what we can pick @@ -3873,7 +2700,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(env, regs, insn->dst_reg); + mark_reg_unknown(regs, insn->dst_reg); break; } /* BPF_RSH is an unsigned shift. 
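
For a shift by a known constant, the bound arithmetic referred to above is short enough to show in full. A minimal sketch (unsigned bounds only; omitting the tnum update is an assumption of this example):

#include <stdint.h>

/* BPF_RSH by a known k: unsigned bounds shift down with the value. Shifts
 * of 64 or more (or by a possibly negative amount) are undefined in eBPF,
 * so the verifier marks the destination unknown instead of guessing.
 */
static int scalar_rsh_const(uint64_t *umin, uint64_t *umax, unsigned int k)
{
        if (k >= 64)
                return -1;      /* caller must mark_reg_unknown() */
        *umin >>= k;
        *umax >>= k;
        return 0;
}
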
If the value in dst_reg might @@ -3903,7 +2730,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; default: - mark_reg_unknown(env, regs, insn->dst_reg); + mark_reg_unknown(regs, insn->dst_reg); break; } @@ -3924,9 +2751,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); @@ -3943,10 +2768,10 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * pointer subtraction */ if (opcode == BPF_SUB && env->allow_ptr_leaks) { - mark_reg_unknown(env, regs, insn->dst_reg); + mark_reg_unknown(regs, insn->dst_reg); return 0; } - verbose(env, "R%d pointer %s pointer prohibited\n", + verbose("R%d pointer %s pointer prohibited\n", insn->dst_reg, bpf_alu_string[opcode >> 4]); return -EACCES; @@ -3977,13 +2802,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, state); - verbose(env, "verifier internal error: unexpected ptr_reg\n"); + print_verifier_state(env->cur_state); + verbose("verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, state); - verbose(env, "verifier internal error: no src_reg\n"); + print_verifier_state(env->cur_state); + verbose("verifier internal error: no src_reg\n"); return -EINVAL; } return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); @@ -4001,14 +2826,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) != 0 || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { - verbose(env, "BPF_NEG uses reserved fields\n"); + verbose("BPF_NEG uses reserved fields\n"); return -EINVAL; } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || BPF_CLASS(insn->code) == BPF_ALU64) { - verbose(env, "BPF_END uses reserved fields\n"); + verbose("BPF_END uses reserved fields\n"); return -EINVAL; } } @@ -4019,7 +2844,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (is_pointer_value(env, insn->dst_reg)) { - verbose(env, "R%d pointer arithmetic prohibited\n", + verbose("R%d pointer arithmetic prohibited\n", insn->dst_reg); return -EACCES; } @@ -4033,7 +2858,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose(env, "BPF_MOV uses reserved fields\n"); + verbose("BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -4043,7 +2868,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose(env, "BPF_MOV uses reserved fields\n"); + verbose("BPF_MOV uses reserved fields\n"); return -EINVAL; } } @@ -4063,11 +2888,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { - verbose(env, "R%d partial copy of pointer\n", 
+ verbose("R%d partial copy of pointer\n", insn->src_reg); return -EACCES; } - mark_reg_unknown(env, regs, insn->dst_reg); + mark_reg_unknown(regs, insn->dst_reg); coerce_reg_to_size(®s[insn->dst_reg], 4); } } else { @@ -4085,14 +2910,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else if (opcode > BPF_END) { - verbose(env, "invalid BPF_ALU opcode %x\n", opcode); + verbose("invalid BPF_ALU opcode %x\n", opcode); return -EINVAL; } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose(env, "BPF_ALU uses reserved fields\n"); + verbose("BPF_ALU uses reserved fields\n"); return -EINVAL; } /* check src1 operand */ @@ -4101,7 +2926,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose(env, "BPF_ALU uses reserved fields\n"); + verbose("BPF_ALU uses reserved fields\n"); return -EINVAL; } } @@ -4113,12 +2938,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if ((opcode == BPF_MOD || opcode == BPF_DIV) && BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { - verbose(env, "div by zero\n"); + verbose("div by zero\n"); return -EINVAL; } if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { - verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); + verbose("BPF_ARSH not supported for 32 bit ALU\n"); return -EINVAL; } @@ -4127,7 +2952,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; if (insn->imm < 0 || insn->imm >= size) { - verbose(env, "invalid shift %d\n", insn->imm); + verbose("invalid shift %d\n", insn->imm); return -EINVAL; } } @@ -4143,15 +2968,13 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, +static void find_good_pkt_pointers(struct bpf_verifier_state *state, struct bpf_reg_state *dst_reg, - enum bpf_reg_type type, bool range_right_open) { - struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg; u16 new_range; - int i, j; + int i; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -4217,18 +3040,16 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
*/ for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == type && regs[i].id == dst_reg->id) + if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (j = 0; j <= vstate->curframe; j++) { - state = vstate->frame[j]; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); - } + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + reg = &state->stack[i].spilled_ptr; + if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + reg->range = max(reg->range, new_range); } } @@ -4431,11 +3252,12 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src, } } -static void mark_ptr_or_null_reg(struct bpf_func_state *state, - struct bpf_reg_state *reg, u32 id, - bool is_null) +static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, + bool is_null) { - if (reg_type_may_be_null(reg->type) && reg->id == id) { + struct bpf_reg_state *reg = ®s[regno]; + + if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { /* Old offset (both fixed and variable parts) should * have been known-zero, because we don't allow pointer * arithmetic on pointers that might be NULL. @@ -4448,170 +3270,56 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (reg->map_ptr->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else { - reg->type = PTR_TO_MAP_VALUE; - } - } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { - reg->type = PTR_TO_SOCKET; - } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { - reg->type = PTR_TO_SOCK_COMMON; - } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { - reg->type = PTR_TO_TCP_SOCK; - } - - if (is_null || !(reg_is_refcounted(reg) || - reg_may_point_to_spin_lock(reg))) { - /* We don't need id from this point onwards anymore, - * thus we should better reset it, so that state - * pruning has chances to take effect. - */ - reg->id = 0; + } else if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = PTR_TO_MAP_VALUE; } + /* We don't need id from this point onwards anymore, thus we + * should better reset it, so that state pruning has chances + * to take effect. + */ + reg->id = 0; } } /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. 
*/ -static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, - bool is_null) +static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, + bool is_null) { - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg, *regs = state->regs; + struct bpf_reg_state *regs = state->regs; u32 id = regs[regno].id; - int i, j; - - if (reg_is_refcounted_or_null(®s[regno]) && is_null) - release_reference_state(state, id); + int i; for (i = 0; i < MAX_BPF_REG; i++) - mark_ptr_or_null_reg(state, ®s[i], id, is_null); + mark_map_reg(regs, i, id, is_null); - for (j = 0; j <= vstate->curframe; j++) { - state = vstate->frame[j]; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - mark_ptr_or_null_reg(state, reg, id, is_null); - } + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); } } -static bool try_match_pkt_pointers(const struct bpf_insn *insn, - struct bpf_reg_state *dst_reg, - struct bpf_reg_state *src_reg, - struct bpf_verifier_state *this_branch, - struct bpf_verifier_state *other_branch) -{ - if (BPF_SRC(insn->code) != BPF_X) - return false; - - switch (BPF_OP(insn->code)) { - case BPF_JGT: - if ((dst_reg->type == PTR_TO_PACKET && - src_reg->type == PTR_TO_PACKET_END) || - (dst_reg->type == PTR_TO_PACKET_META && - reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { - /* pkt_data' > pkt_end, pkt_meta' > pkt_data */ - find_good_pkt_pointers(this_branch, dst_reg, - dst_reg->type, false); - } else if ((dst_reg->type == PTR_TO_PACKET_END && - src_reg->type == PTR_TO_PACKET) || - (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && - src_reg->type == PTR_TO_PACKET_META)) { - /* pkt_end > pkt_data', pkt_data > pkt_meta' */ - find_good_pkt_pointers(other_branch, src_reg, - src_reg->type, true); - } else { - return false; - } - break; - case BPF_JLT: - if ((dst_reg->type == PTR_TO_PACKET && - src_reg->type == PTR_TO_PACKET_END) || - (dst_reg->type == PTR_TO_PACKET_META && - reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { - /* pkt_data' < pkt_end, pkt_meta' < pkt_data */ - find_good_pkt_pointers(other_branch, dst_reg, - dst_reg->type, true); - } else if ((dst_reg->type == PTR_TO_PACKET_END && - src_reg->type == PTR_TO_PACKET) || - (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && - src_reg->type == PTR_TO_PACKET_META)) { - /* pkt_end < pkt_data', pkt_data > pkt_meta' */ - find_good_pkt_pointers(this_branch, src_reg, - src_reg->type, false); - } else { - return false; - } - break; - case BPF_JGE: - if ((dst_reg->type == PTR_TO_PACKET && - src_reg->type == PTR_TO_PACKET_END) || - (dst_reg->type == PTR_TO_PACKET_META && - reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { - /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ - find_good_pkt_pointers(this_branch, dst_reg, - dst_reg->type, true); - } else if ((dst_reg->type == PTR_TO_PACKET_END && - src_reg->type == PTR_TO_PACKET) || - (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && - src_reg->type == PTR_TO_PACKET_META)) { - /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ - find_good_pkt_pointers(other_branch, src_reg, - src_reg->type, false); - } else { - return false; - } - break; - case BPF_JLE: - if ((dst_reg->type == PTR_TO_PACKET && - src_reg->type == PTR_TO_PACKET_END) || - (dst_reg->type == PTR_TO_PACKET_META && - reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { - /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ - 
find_good_pkt_pointers(other_branch, dst_reg, - dst_reg->type, false); - } else if ((dst_reg->type == PTR_TO_PACKET_END && - src_reg->type == PTR_TO_PACKET) || - (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && - src_reg->type == PTR_TO_PACKET_META)) { - /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ - find_good_pkt_pointers(this_branch, src_reg, - src_reg->type, true); - } else { - return false; - } - break; - default: - return false; - } - - return true; -} - static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *this_branch = env->cur_state; - struct bpf_verifier_state *other_branch; - struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; - struct bpf_reg_state *dst_reg, *other_branch_regs; + struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; + struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; if (opcode > BPF_JSLE) { - verbose(env, "invalid BPF_JMP opcode %x\n", opcode); + verbose("invalid BPF_JMP opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose(env, "BPF_JMP uses reserved fields\n"); + verbose("BPF_JMP uses reserved fields\n"); return -EINVAL; } @@ -4621,13 +3329,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; if (is_pointer_value(env, insn->src_reg)) { - verbose(env, "R%d pointer comparison prohibited\n", + verbose("R%d pointer comparison prohibited\n", insn->src_reg); return -EACCES; } } else { if (insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_JMP uses reserved fields\n"); + verbose("BPF_JMP uses reserved fields\n"); return -EINVAL; } } @@ -4664,7 +3372,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (!other_branch) return -EFAULT; - other_branch_regs = other_branch->frame[other_branch->curframe]->regs; /* detect if we are comparing against a constant value so we can adjust * our min/max values for our dst register. 
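
For comparison against a constant, the min/max adjustment described here is only a couple of assignments per opcode. The unsigned BPF_JGT case as a standalone sketch (assuming K < U64_MAX, since 'dst > U64_MAX' can never be taken; the signed and reversed-operand cases follow the same pattern):

#include <stdint.h>

struct scalar_bounds {
        uint64_t umin, umax;
};

/* 'if (dst > K)': the taken branch learns dst >= K + 1, the fall-through
 * branch learns dst <= K.
 */
static void set_min_max_jgt(struct scalar_bounds *taken,
                            struct scalar_bounds *fallthru, uint64_t K)
{
        if (taken->umin < K + 1)
                taken->umin = K + 1;
        if (fallthru->umax > K)
                fallthru->umax = K;
}
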
* this is only legit if both are scalars (or pointers to the same @@ -4676,61 +3383,103 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && regs[insn->src_reg].type == SCALAR_VALUE) { if (tnum_is_const(regs[insn->src_reg].var_off)) - reg_set_min_max(&other_branch_regs[insn->dst_reg], + reg_set_min_max(&other_branch->regs[insn->dst_reg], dst_reg, regs[insn->src_reg].var_off.value, opcode); else if (tnum_is_const(dst_reg->var_off)) - reg_set_min_max_inv(&other_branch_regs[insn->src_reg], + reg_set_min_max_inv(&other_branch->regs[insn->src_reg], ®s[insn->src_reg], dst_reg->var_off.value, opcode); else if (opcode == BPF_JEQ || opcode == BPF_JNE) /* Comparing for equality, we can combine knowledge */ - reg_combine_min_max(&other_branch_regs[insn->src_reg], - &other_branch_regs[insn->dst_reg], + reg_combine_min_max(&other_branch->regs[insn->src_reg], + &other_branch->regs[insn->dst_reg], ®s[insn->src_reg], ®s[insn->dst_reg], opcode); } } else if (dst_reg->type == SCALAR_VALUE) { - reg_set_min_max(&other_branch_regs[insn->dst_reg], + reg_set_min_max(&other_branch->regs[insn->dst_reg], dst_reg, insn->imm, opcode); } /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - reg_type_may_be_null(dst_reg->type)) { - /* Mark all identical registers in each branch as either + dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { + /* Mark all identical map registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ - mark_ptr_or_null_regs(this_branch, insn->dst_reg, - opcode == BPF_JNE); - mark_ptr_or_null_regs(other_branch, insn->dst_reg, - opcode == BPF_JEQ); - } else if (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg], - this_branch, other_branch) && - is_pointer_value(env, insn->dst_reg)) { - verbose(env, "R%d pointer comparison prohibited\n", insn->dst_reg); + mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); + mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + /* pkt_data' > pkt_end */ + find_good_pkt_pointers(this_branch, dst_reg, false); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + /* pkt_end > pkt_data' */ + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], true); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + /* pkt_data' < pkt_end */ + find_good_pkt_pointers(other_branch, dst_reg, true); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + /* pkt_end < pkt_data' */ + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], false); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + /* pkt_data' >= pkt_end */ + find_good_pkt_pointers(this_branch, dst_reg, true); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + /* pkt_end >= pkt_data' */ + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], false); + 
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + /* pkt_data' <= pkt_end */ + find_good_pkt_pointers(other_branch, dst_reg, false); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + /* pkt_end <= pkt_data' */ + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], true); + } else if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; } - if (env->log.level) - print_verifier_state(env, this_branch->frame[this_branch->curframe]); + if (log_level) + print_verifier_state(this_branch); return 0; } +/* return the map pointer stored inside BPF_LD_IMM64 instruction */ +static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) +{ + u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; + + return (struct bpf_map *) (unsigned long) imm64; +} + /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map; int err; if (BPF_SIZE(insn->code) != BPF_DW) { - verbose(env, "invalid BPF_LD_IMM insn\n"); + verbose("invalid BPF_LD_IMM insn\n"); return -EINVAL; } if (insn->off != 0) { - verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); + verbose("BPF_LD_IMM64 uses reserved fields\n"); return -EINVAL; } @@ -4746,23 +3495,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - map = env->used_maps[aux->map_index]; - mark_reg_known_zero(env, regs, insn->dst_reg); - - regs[insn->dst_reg].map_ptr = map; - - if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - regs[insn->dst_reg].off = aux->map_off; - if (map_value_has_spin_lock(map)) - regs[insn->dst_reg].id = ++env->id_gen; - } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; - } else { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; - } + /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ + BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); return 0; } @@ -4772,7 +3509,6 @@ static bool may_access_skb(enum bpf_prog_type type) case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: - case BPF_PROG_TYPE_CGROUP_SKB: return true; default: return false; @@ -4802,26 +3538,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) int i, err; if (!may_access_skb(env->prog->type)) { - verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); - return -EINVAL; - } - - if (env->subprog_cnt > 1) { - /* when program has LD_ABS insn JITs and interpreter assume - * that r1 == ctx == skb which is not the case for callees - * that can have arbitrary arguments. It's problematic - * for main prog as well since JITs would need to analyze - * all functions in order to make proper register save/restore - * decisions in the main prog. 
Hence disallow LD_ABS with calls - */ - verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); + verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); + verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -4830,23 +3554,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; - /* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as - * gen_ld_abs() may terminate the program at runtime, leading to - * reference leak. - */ - err = check_reference_leak(env); - if (err) { - verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n"); - return err; - } - - if (env->cur_state->active_spin_lock) { - verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); - return -EINVAL; - } - if (regs[ctx_reg].type != PTR_TO_CTX) { - verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; } @@ -4863,7 +3572,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, regs, caller_saved[i]); + mark_reg_not_init(regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -4871,50 +3580,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * the value fetched from the packet. * Already marked as written above. */ - mark_reg_unknown(env, regs, BPF_REG_0); - return 0; -} - -static int check_return_code(struct bpf_verifier_env *env) -{ - struct bpf_reg_state *reg; - struct tnum range = tnum_range(0, 1); - - switch (env->prog->type) { - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) - range = tnum_range(1, 1); - case BPF_PROG_TYPE_CGROUP_SKB: - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_SOCK_OPS: - case BPF_PROG_TYPE_CGROUP_SOCKOPT: - break; - default: - return 0; - } - - reg = cur_regs(env) + BPF_REG_0; - if (reg->type != SCALAR_VALUE) { - verbose(env, "At program exit the register R0 is not a known value (%s)\n", - reg_type_str[reg->type]); - return -EINVAL; - } - - if (!tnum_in(range, reg->var_off)) { - char tn_buf[48]; - - verbose(env, "At program exit the register R0 "); - if (!tnum_is_unknown(reg->var_off)) { - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "has value %s", tn_buf); - } else { - verbose(env, "has unknown scalar value"); - } - tnum_strn(tn_buf, sizeof(tn_buf), range); - verbose(env, " should have been in %s\n", tn_buf); - return -EINVAL; - } + mark_reg_unknown(regs, BPF_REG_0); return 0; } @@ -4978,7 +3644,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { - verbose(env, "jump out of range from insn %d to %d\n", t, w); + verbose("jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -4995,13 +3661,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - verbose(env, "back-edge from insn %d to %d\n", t, 
w); + verbose("back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { - verbose(env, "insn state internal bug\n"); + verbose("insn state internal bug\n"); return -EFAULT; } return 0; @@ -5017,10 +3683,6 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; - ret = check_subprogs(env); - if (ret < 0) - return ret; - insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; @@ -5053,14 +3715,6 @@ peek_stack: goto err_free; if (t + 1 < insn_cnt) env->explored_states[t + 1] = STATE_LIST_MARK; - if (insns[t].src_reg == BPF_PSEUDO_CALL) { - env->explored_states[t] = STATE_LIST_MARK; - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - } } else if (opcode == BPF_JA) { if (BPF_SRC(insns[t].code) != BPF_K) { ret = -EINVAL; @@ -5107,7 +3761,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; if (cur_stack-- <= 0) { - verbose(env, "pop stack internal bug\n"); + verbose("pop stack internal bug\n"); ret = -EFAULT; goto err_free; } @@ -5116,7 +3770,7 @@ mark_explored: check_state: for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { - verbose(env, "unreachable insn %d\n", i); + verbose("unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } @@ -5129,255 +3783,6 @@ err_free: return ret; } -/* The minimum supported BTF func info size */ -#define MIN_BPF_FUNCINFO_SIZE 8 -#define MAX_FUNCINFO_REC_SIZE 252 -static int check_btf_func(struct bpf_verifier_env *env, - const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - u32 i, nfuncs, urec_size, min_size, prev_offset; - u32 krec_size = sizeof(struct bpf_func_info); - struct bpf_func_info *krecord; - const struct btf_type *type; - struct bpf_prog *prog; - const struct btf *btf; - void __user *urecord; - int ret = 0; - nfuncs = attr->func_info_cnt; - - if (!nfuncs) - return 0; - - if (nfuncs != env->subprog_cnt) { - verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); - return -EINVAL; - } - - urec_size = attr->func_info_rec_size; - if (urec_size < MIN_BPF_FUNCINFO_SIZE || - urec_size > MAX_FUNCINFO_REC_SIZE || - urec_size % sizeof(u32)) { - verbose(env, "invalid func info rec size %u\n", urec_size); - return -EINVAL; - } - - prog = env->prog; - btf = prog->aux->btf; - - urecord = u64_to_user_ptr(attr->func_info); - min_size = min_t(u32, krec_size, urec_size); - - krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); - if (!krecord) - return -ENOMEM; - - for (i = 0; i < nfuncs; i++) { - ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); - if (ret) { - if (ret == -E2BIG) { - verbose(env, "nonzero tailing record in func info"); - /* set the size kernel expects so loader can zero - * out the rest of the record. 
- */ - if (put_user(min_size, &uattr->func_info_rec_size)) - ret = -EFAULT; - } - goto err_free; - } - - if (copy_from_user(&krecord[i], urecord, min_size)) { - ret = -EFAULT; - goto err_free; - } - - /* check insn_offset */ - if (i == 0) { - if (krecord[i].insn_offset) { - verbose(env, - "nonzero insn_offset %u for the first func info record", - krecord[i].insn_offset); - ret = -EINVAL; - goto err_free; - } - } else if (krecord[i].insn_offset <= prev_offset) { - verbose(env, - "same or smaller insn offset (%u) than previous func info record (%u)", - krecord[i].insn_offset, prev_offset); - ret = -EINVAL; - goto err_free; - } - - if (env->subprog_info[i].start != krecord[i].insn_offset) { - verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); - ret = -EINVAL; - goto err_free; - } - - /* check type_id */ - type = btf_type_by_id(btf, krecord[i].type_id); - if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) { - verbose(env, "invalid type id %d in func info", - krecord[i].type_id); - ret = -EINVAL; - goto err_free; - } - - prev_offset = krecord[i].insn_offset; - urecord += urec_size; - } - - prog->aux->func_info = krecord; - prog->aux->func_info_cnt = nfuncs; - return 0; - -err_free: - kvfree(krecord); - return ret; -} - -static void adjust_btf_func(struct bpf_verifier_env *env) -{ - int i; - if (!env->prog->aux->func_info) - return; - for (i = 0; i < env->subprog_cnt; i++) - env->prog->aux->func_info[i].insn_offset = env->subprog_info[i].start; -} - -#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \ - sizeof(((struct bpf_line_info *)(0))->line_col)) -#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE -static int check_btf_line(struct bpf_verifier_env *env, - const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0; - struct bpf_subprog_info *sub; - struct bpf_line_info *linfo; - struct bpf_prog *prog; - const struct btf *btf; - void __user *ulinfo; - int err; - - nr_linfo = attr->line_info_cnt; - - if (!nr_linfo) - return 0; - - rec_size = attr->line_info_rec_size; - if (rec_size < MIN_BPF_LINEINFO_SIZE || - rec_size > MAX_LINEINFO_REC_SIZE || - rec_size & (sizeof(u32) - 1)) - return -EINVAL; - - /* Need to zero it in case the userspace may - * pass in a smaller bpf_line_info object. - */ - linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info), - GFP_KERNEL | __GFP_NOWARN); - if (!linfo) - return -ENOMEM; - - prog = env->prog; - btf = prog->aux->btf; - s = 0; - sub = env->subprog_info; - ulinfo = u64_to_user_ptr(attr->line_info); - expected_size = sizeof(struct bpf_line_info); - ncopy = min_t(u32, expected_size, rec_size); - - for (i = 0; i < nr_linfo; i++) { - err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size); - if (err) { - if (err == -E2BIG) { - verbose(env, "nonzero tailing record in line_info"); - if (put_user(expected_size, - &uattr->line_info_rec_size)) - err = -EFAULT; - } - goto err_free; - } - if (copy_from_user(&linfo[i], ulinfo, ncopy)) { - err = -EFAULT; - goto err_free; - } - /* - * Check insn_off to ensure - * 1) strictly increasing AND - * 2) bounded by prog->len - * - * The linfo[0].insn_off == 0 check logically falls into - * the later "missing bpf_line_info for func..." case - * because the first linfo[0].insn_off must be the - * first sub also and the first sub must have - * subprog_info[0].start == 0. 
- */ - if ((i && linfo[i].insn_off <= prev_offset) || - linfo[i].insn_off >= prog->len) { - verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n", - i, linfo[i].insn_off, prev_offset, - prog->len); - err = -EINVAL; - goto err_free; - } - if (!btf_name_by_offset(btf, linfo[i].line_off) || - !btf_name_by_offset(btf, linfo[i].file_name_off)) { - verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); - err = -EINVAL; - goto err_free; - } - if (s != env->subprog_cnt) { - if (linfo[i].insn_off == sub[s].start) { - sub[s].linfo_idx = i; - s++; - } else if (sub[s].start < linfo[i].insn_off) { - verbose(env, "missing bpf_line_info for func#%u\n", s); - err = -EINVAL; - goto err_free; - } - } - prev_offset = linfo[i].insn_off; - ulinfo += rec_size; - } - - if (s != env->subprog_cnt) { - verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n", - env->subprog_cnt - s, s); - err = -EINVAL; - goto err_free; - } - - prog->aux->linfo = linfo; - prog->aux->nr_linfo = nr_linfo; - return 0; - -err_free: - kvfree(linfo); - return err; -} - -static int check_btf_info(struct bpf_verifier_env *env, - const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - struct btf *btf; - int err; - if (!attr->func_info_cnt && !attr->line_info_cnt) - return 0; - btf = btf_get_by_fd(attr->prog_btf_fd); - if (IS_ERR(btf)) - return PTR_ERR(btf); - env->prog->aux->btf = btf; - err = check_btf_func(env, attr, uattr); - if (err) - return err; - err = check_btf_line(env, attr, uattr); - if (err) - return err; - return 0; -} - /* check %cur's range satisfies %old's */ static bool range_within(struct bpf_reg_state *old, struct bpf_reg_state *cur) @@ -5428,19 +3833,11 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct idpair *idmap) { - bool equal; - if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ return true; - equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; - if (rold->type == PTR_TO_STACK) - /* two stack pointers are equal only if they're pointing to - * the same stack frame, since fp-8 in foo != fp-8 in bar - */ - return equal && rold->frameno == rcur->frameno; - if (equal) + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0) return true; if (rold->type == NOT_INIT) @@ -5467,11 +3864,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. 
- * 'id' is not compared, since it's only used for maps with - * bpf_spin_lock inside map element and in such cases if - * the rest of the prog is valid for one map element then - * it's valid for all map elements regardless of the key - * used in bpf_map_lookup() + * We don't care about the 'id' value, because nothing + * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && @@ -5490,9 +3884,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; /* Check our ids match any regs they're supposed to */ return check_ids(rold->id, rcur->id, idmap); - case PTR_TO_PACKET_META: case PTR_TO_PACKET: - if (rcur->type != rold->type) + if (rcur->type != PTR_TO_PACKET) return false; /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are @@ -5515,14 +3908,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, tnum_in(rold->var_off, rcur->var_off); case PTR_TO_CTX: case CONST_PTR_TO_MAP: + case PTR_TO_STACK: case PTR_TO_PACKET_END: - case PTR_TO_FLOW_KEYS: - case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: - case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: - case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -5536,8 +3923,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } -static bool stacksafe(struct bpf_func_state *old, - struct bpf_func_state *cur, +static bool stacksafe(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, struct idpair *idmap) { int i, spi; @@ -5587,14 +3974,6 @@ static bool stacksafe(struct bpf_func_state *old, return true; } -static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur) -{ - if (old->acquired_refs != cur->acquired_refs) - return false; - return !memcmp(old->refs, cur->refs, - sizeof(*old->refs) * old->acquired_refs); -} - /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -5621,13 +4000,20 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur) * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool func_states_equal(struct bpf_func_state *old, - struct bpf_func_state *cur) +static bool states_equal(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) { struct idpair *idmap; bool ret = false; int i; + /* Verification state from speculative execution simulation + * must never prune a non-speculative execution one. + */ + if (old->speculative && !cur->speculative) + return false; + idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); /* If we failed to allocate the idmap, just say it's not safe */ if (!idmap) @@ -5640,92 +4026,77 @@ static bool func_states_equal(struct bpf_func_state *old, if (!stacksafe(old, cur, idmap)) goto out_free; - - if (!refsafe(old, cur)) - goto out_free; - ret = true; - out_free: kfree(idmap); return ret; } -static bool states_equal(struct bpf_verifier_env *env, - struct bpf_verifier_state *old, - struct bpf_verifier_state *cur) +/* A write screens off any subsequent reads; but write marks come from the + * straight-line code between a state and its parent. 
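
The screening rule in that comment reduces, per register, to three tests. A sketch of one register's worth of do_propagate_liveness() (the flag values are illustrative stand-ins, not necessarily the kernel's):

#include <stdbool.h>

#define REG_LIVE_READ    0x1
#define REG_LIVE_WRITTEN 0x2

/* A read in 'state' reaches 'parent' only if the parent doesn't already
 * carry the mark and, on the straight-line edge, no intervening write
 * screened the register off. Returns true if the parent state changed.
 */
static bool propagate_one(int state_live, int *parent_live, bool straight_line)
{
        if (*parent_live & REG_LIVE_READ)
                return false;           /* nothing new to record */
        if (straight_line && (state_live & REG_LIVE_WRITTEN))
                return false;           /* write screens off the read */
        if (state_live & REG_LIVE_READ) {
                *parent_live |= REG_LIVE_READ;
                return true;
        }
        return false;
}
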
When we arrive at a + * jump target (in the first iteration of the propagate_liveness() loop), + * we didn't arrive by the straight-line code, so read marks in state must + * propagate to parent regardless of state's write marks. + */ +static bool do_propagate_liveness(const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent) { + bool writes = parent == state->parent; /* Observe write marks */ + bool touched = false; /* any changes made? */ int i; - if (old->curframe != cur->curframe) - return false; - - if (old->active_spin_lock != cur->active_spin_lock) - return false; - - /* for states to be equal callsites have to be the same - * and all frame states need to be equivalent - */ - for (i = 0; i <= old->curframe; i++) { - if (old->frame[i]->callsite != cur->frame[i]->callsite) - return false; - if (!func_states_equal(old->frame[i], cur->frame[i])) - return false; - } - - return true; -} - -/* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at an - * equivalent state (jump target or such) we didn't arrive by the straight-line - * code, so read marks in the state must propagate to the parent regardless - * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() and mark_stack_slot_read() is for. - */ -static int propagate_liveness(struct bpf_verifier_env *env, - const struct bpf_verifier_state *vstate, - struct bpf_verifier_state *vparent) -{ - int i, frame, err = 0; - struct bpf_func_state *state, *parent; - - if (vparent->curframe != vstate->curframe) { - WARN(1, "propagate_live: parent frame %d current frame %d\n", - vparent->curframe, vstate->curframe); - return -EFAULT; - } - + if (!parent) + return touched; /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); /* We don't need to worry about FP liveness because it's read-only */ for (i = 0; i < BPF_REG_FP; i++) { - if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) + if (parent->regs[i].live & REG_LIVE_READ) continue; - if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, vstate, vparent, i); - if (err) - return err; + if (writes && (state->regs[i].live & REG_LIVE_WRITTEN)) + continue; + if (state->regs[i].live & REG_LIVE_READ) { + parent->regs[i].live |= REG_LIVE_READ; + touched = true; } } - /* ... 
and stack slots */ - for (frame = 0; frame <= vstate->curframe; frame++) { - state = vstate->frame[frame]; - parent = vparent->frame[frame]; - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) - continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) - mark_stack_slot_read(env, vstate, vparent, i, frame); + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].slot_type[0] != STACK_SPILL) + continue; + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) + continue; + if (writes && + (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) + continue; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { + parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; + touched = true; } } - return err; + return touched; +} + +/* "parent" is "a state from which we reach the current state", but initially + * it is not the state->parent (i.e. "the state whose straight-line code leads + * to the current state"), instead it is the state that happened to arrive at + * a (prunable) equivalent of the current state. See comment above + * do_propagate_liveness() for consequences of this. + * This function is just a more efficient way of calling mark_reg_read() or + * mark_stack_slot_read() on each reg in "parent" that is read in "state", + * though it requires that parent != state->parent in the call arguments. + */ +static void propagate_liveness(const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent) +{ + while (do_propagate_liveness(state, parent)) { + /* Something changed, so we need to feed those changes onward */ + state = parent; + parent = state->parent; + } } static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) @@ -5733,7 +4104,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state; - int i, j, err; + int i, err; sl = env->explored_states[insn_idx]; if (!sl) @@ -5754,9 +4125,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - err = propagate_liveness(env, &sl->state, cur); - if (err) - return err; + propagate_liveness(&sl->state, cur); return 1; } sl = sl->next; @@ -5764,10 +4133,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* there were no equivalent states, remember current one. * technically the current state is not proven to be safe yet, - * but it will either reach outer most bpf_exit (which means it's safe) - * or it will be rejected. Since there are no loops, we won't be - * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) - * again on the way to bpf_exit + * but it will either reach bpf_exit (which means it's safe) or + * it will be rejected. 
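The writes/REG_LIVE_WRITTEN interplay in do_propagate_liveness() above boils down to one predicate per register or stack slot. A reduced, standalone restatement (the flag values match enum bpf_reg_liveness in include/linux/bpf_verifier.h; the function name is ours):

	#include <stdbool.h>

	enum reg_liveness {
		REG_LIVE_NONE    = 0,
		REG_LIVE_READ    = 1,	/* read before being (re)written */
		REG_LIVE_WRITTEN = 2,	/* written by the straight-line code */
	};

	/* should this read mark flow up to the parent state? */
	static bool propagate_read(int live, bool straight_line)
	{
		/* on the straight-line path a write screens off later reads:
		 * the value came from this state, not from the parent
		 */
		if (straight_line && (live & REG_LIVE_WRITTEN))
			return false;
		return live & REG_LIVE_READ;
	}

Here straight_line corresponds to the writes flag above (parent == state->parent); on the first iteration after pruning, the two states are not in that relationship, so write marks are deliberately ignored.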
Since there are no loops, we won't be + * seeing this 'insn_idx' instruction again on the way to bpf_exit */ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) @@ -5791,49 +4159,20 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) */ for (i = 0; i < BPF_REG_FP; i++) - cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; - /* all stack frames are accessible from callee, clear them all */ - for (j = 0; j <= cur->curframe; j++) { - struct bpf_func_state *frame = cur->frame[j]; - for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) - if (frame->stack[i].slot_type[0] == STACK_SPILL) - frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; - } + cur->regs[i].live = REG_LIVE_NONE; + for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) + if (cur->stack[i].slot_type[0] == STACK_SPILL) + cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; return 0; } -/* Return true if it's OK to have the same insn return a different type. */ -static bool reg_type_mismatch_ok(enum bpf_reg_type type) +static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) { - switch (type) { - case PTR_TO_CTX: - case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: - case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: - case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: - return false; - default: - return true; - } -} -/* If an instruction was previously used with particular pointer types, then we - * need to be careful to avoid cases such as the below, where it may be ok - * for one branch accessing the pointer, but not ok for the other branch: - * - * R1 = sock_ptr - * goto X; - * ... - * R1 = some_other_valid_ptr; - * goto X; - * ... - * R2 = *(u32 *)(R1 + 0); - */ -static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) -{ - return src != prev && (!reg_type_mismatch_ok(src) || - !reg_type_mismatch_ok(prev)); + if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) + return 0; + + return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); } static int do_check(struct bpf_verifier_env *env) @@ -5841,34 +4180,23 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len, i; + int insn_cnt = env->prog->len; int insn_processed = 0; bool do_print_state = false; state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); if (!state) return -ENOMEM; - - state->curframe = 0; - state->parent = NULL; - state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); - if (!state->frame[0]) { - kfree(state); - return -ENOMEM; - } env->cur_state = state; - init_func_state(env, state->frame[0], - BPF_MAIN_FUNC /* callsite */, - 0 /* frameno */, - 0 /* subprogno, zero == main subprog */); - + init_reg_state(state->regs); + state->parent = NULL; for (;;) { struct bpf_insn *insn; u8 class; int err; if (env->insn_idx >= insn_cnt) { - verbose(env, "invalid insn idx %d insn_cnt %d\n", + verbose("invalid insn idx %d insn_cnt %d\n", env->insn_idx, insn_cnt); return -EFAULT; } @@ -5877,7 +4205,7 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose(env, "BPF program is too large. Processed %d insn\n", + verbose("BPF program is too large. 
Processed %d insn\n", insn_processed); return -E2BIG; } @@ -5887,14 +4215,14 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (env->log.level) { + if (log_level) { if (do_print_state) - verbose(env, "\nfrom %d to %d%s: safe\n", + verbose("\nfrom %d to %d%s: safe\n", env->prev_insn_idx, env->insn_idx, env->cur_state->speculative ? " (speculative execution)" : ""); else - verbose(env, "%d: safe\n", env->insn_idx); + verbose("%d: safe\n", env->insn_idx); } goto process_bpf_exit; } @@ -5902,34 +4230,26 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (env->log.level > 1 || - (env->log.level && do_print_state)) { - if (env->log.level > 1) - verbose(env, "%d:", env->insn_idx); + if (log_level > 1 || (log_level && do_print_state)) { + if (log_level > 1) + verbose("%d:", env->insn_idx); else - verbose(env, "\nfrom %d to %d%s:", + verbose("\nfrom %d to %d%s:", env->prev_insn_idx, env->insn_idx, env->cur_state->speculative ? " (speculative execution)" : ""); - print_verifier_state(env, state->frame[state->curframe]); + print_verifier_state(env->cur_state); do_print_state = false; } - if (env->log.level) { - const struct bpf_insn_cbs cbs = { - .cb_print = verbose, - .private_data = env, - }; - verbose(env, "%d: ", env->insn_idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + if (log_level) { + verbose("%d: ", env->insn_idx); + print_bpf_insn(env, insn); } - if (bpf_prog_is_dev_bound(env->prog->aux)) { - err = bpf_prog_offload_verify_insn(env, env->insn_idx, - env->prev_insn_idx); - if (err) - return err; - } + err = ext_analyzer_insn_hook(env, env->insn_idx, env->prev_insn_idx); + if (err) + return err; regs = cur_regs(env); env->insn_aux_data[env->insn_idx].seen = true; @@ -5971,7 +4291,10 @@ static int do_check(struct bpf_verifier_env *env) * save type to validate intersecting paths */ *prev_src_type = src_reg_type; - } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) { + + } else if (src_reg_type != *prev_src_type && + (src_reg_type == PTR_TO_CTX || + *prev_src_type == PTR_TO_CTX)) { /* ABuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: @@ -5979,7 +4302,7 @@ static int do_check(struct bpf_verifier_env *env) * src_reg == stack|map in some other branch. * Reject it. 
*/ - verbose(env, "same insn cannot be used with different pointers\n"); + verbose("same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -6016,15 +4339,17 @@ static int do_check(struct bpf_verifier_env *env) if (*prev_dst_type == NOT_INIT) { *prev_dst_type = dst_reg_type; - } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) { - verbose(env, "same insn cannot be used with different pointers\n"); + } else if (dst_reg_type != *prev_dst_type && + (dst_reg_type == PTR_TO_CTX || + *prev_dst_type == PTR_TO_CTX)) { + verbose("same insn cannot be used with different pointers\n"); return -EINVAL; } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_ST uses reserved fields\n"); + verbose("BPF_ST uses reserved fields\n"); return -EINVAL; } /* check src operand */ @@ -6033,7 +4358,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_ctx_reg(env, insn->dst_reg)) { - verbose(env, "BPF_ST stores into R%d context is not allowed\n", + verbose("BPF_ST stores into R%d context is not allowed\n", insn->dst_reg); return -EACCES; } @@ -6051,23 +4376,13 @@ static int do_check(struct bpf_verifier_env *env) if (opcode == BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || - (insn->src_reg != BPF_REG_0 && - insn->src_reg != BPF_PSEUDO_CALL) || + insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose(env, "BPF_CALL uses reserved fields\n"); + verbose("BPF_CALL uses reserved fields\n"); return -EINVAL; } - if (env->cur_state->active_spin_lock && - (insn->src_reg == BPF_PSEUDO_CALL || - insn->imm != BPF_FUNC_spin_unlock)) { - verbose(env, "function calls are not allowed while holding a lock\n"); - return -EINVAL; - } - if (insn->src_reg == BPF_PSEUDO_CALL) - err = check_func_call(env, insn, &env->insn_idx); - else - err = check_helper_call(env, insn->imm, env->insn_idx); + err = check_call(env, insn->imm, env->insn_idx); if (err) return err; @@ -6076,7 +4391,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose(env, "BPF_JA uses reserved fields\n"); + verbose("BPF_JA uses reserved fields\n"); return -EINVAL; } @@ -6088,28 +4403,10 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose(env, "BPF_EXIT uses reserved fields\n"); + verbose("BPF_EXIT uses reserved fields\n"); return -EINVAL; } - if (env->cur_state->active_spin_lock) { - verbose(env, "bpf_spin_unlock is missing\n"); - return -EINVAL; - } - - if (state->curframe) { - /* exit from nested function */ - err = prepare_func_exit(env, &env->insn_idx); - if (err) - return err; - do_print_state = true; - continue; - } - - err = check_reference_leak(env); - if (err) - return err; - /* eBPF calling convetion is such that R0 is used * to return the value from eBPF program. 
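A concrete program that trips the "same insn cannot be used with different pointers" rejection above can be written with the insn macros from include/linux/filter.h. This example is hypothetical (it assumes a privileged socket-filter load, where reading four bytes at ctx offset 0 is valid): the fall-through path reaches the final load with r2 == ctx, the taken path reaches it with r2 == stack, and the second visit of that insn is rejected:

	#include <linux/filter.h>

	static const struct bpf_insn two_pointer_types[] = {
		BPF_ST_MEM(BPF_W, BPF_REG_10, -8, 0),		/* init fp[-8] */
		BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),	/* r3 = *(u32 *)(ctx + 0) */
		BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0, 2),		/* branch on unknown r3 */
		BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),		/* r2 = ctx */
		BPF_JMP_A(2),
		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),		/* r2 = frame pointer */
		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),		/* r2 = fp - 8 */
		BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0),	/* same insn, two ptr types */
		BPF_EXIT_INSN(),
	};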
* Make sure that it's readable at this time @@ -6121,13 +4418,10 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_pointer_value(env, BPF_REG_0)) { - verbose(env, "R0 leaks addr as return value\n"); + verbose("R0 leaks addr as return value\n"); return -EACCES; } - err = check_return_code(env); - if (err) - return err; process_bpf_exit: err = pop_stack(env, &env->prev_insn_idx, &env->insn_idx); if (err < 0) { @@ -6159,27 +4453,19 @@ process_bpf_exit: env->insn_idx++; env->insn_aux_data[env->insn_idx].seen = true; } else { - verbose(env, "invalid BPF_LD mode\n"); + verbose("invalid BPF_LD mode\n"); return -EINVAL; } } else { - verbose(env, "unknown insn class %d\n", class); + verbose("unknown insn class %d\n", class); return -EINVAL; } env->insn_idx++; } - verbose(env, "processed %d insns, stack depth ", insn_processed); - for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_info[i].stack_depth; - - verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt) - verbose(env, "+"); - } - verbose(env, "\n"); - env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; + verbose("processed %d insns, stack depth %d\n", + insn_processed, env->prog->aux->stack_depth); return 0; } @@ -6191,21 +4477,7 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } -static bool is_tracing_prog_type(enum bpf_prog_type type) -{ - switch (type) { - case BPF_PROG_TYPE_KPROBE: - case BPF_PROG_TYPE_TRACEPOINT: - case BPF_PROG_TYPE_PERF_EVENT: - case BPF_PROG_TYPE_RAW_TRACEPOINT: - return true; - default: - return false; - } -} - -static int check_map_prog_compatibility(struct bpf_verifier_env *env, - struct bpf_map *map, +static int check_map_prog_compatibility(struct bpf_map *map, struct bpf_prog *prog) { @@ -6216,38 +4488,18 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, */ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { if (!check_map_prealloc(map)) { - verbose(env, "perf_event programs can only use preallocated hash map\n"); + verbose("perf_event programs can only use preallocated hash map\n"); return -EINVAL; } if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) { - verbose(env, "perf_event programs can only use preallocated inner hash map\n"); + verbose("perf_event programs can only use preallocated inner hash map\n"); return -EINVAL; } } - - if ((is_tracing_prog_type(prog->type) || - prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && - map_value_has_spin_lock(map)) { - verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); - return -EINVAL; - } - - if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && - !bpf_offload_dev_match(prog, map)) { - verbose(env, "offload device mismatch between prog and map\n"); - return -EINVAL; - } - return 0; } -static bool bpf_map_is_cgroup_storage(struct bpf_map *map) -{ - return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); -} - /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ @@ -6264,100 +4516,61 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { - verbose(env, "BPF_LDX uses reserved fields\n"); + verbose("BPF_LDX uses reserved fields\n"); return -EINVAL; } if (BPF_CLASS(insn->code) == BPF_STX && ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_XADD) || 
insn->imm != 0)) { - verbose(env, "BPF_STX uses reserved fields\n"); + verbose("BPF_STX uses reserved fields\n"); return -EINVAL; } if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { - struct bpf_insn_aux_data *aux; struct bpf_map *map; struct fd f; - u64 addr; if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { - verbose(env, "invalid bpf_ld_imm64 insn\n"); + verbose("invalid bpf_ld_imm64 insn\n"); return -EINVAL; } - if (insn[0].src_reg == 0) + if (insn->src_reg == 0) /* valid generic load 64-bit imm */ goto next_insn; - /* In final convert_pseudo_ld_imm64() step, this is - * converted into regular 64-bit imm load insn. - */ - if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD && - insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) || - (insn[0].src_reg == BPF_PSEUDO_MAP_FD && - insn[1].imm != 0)) { - verbose(env, - "unrecognized bpf_ld_imm64 insn\n"); + if (insn->src_reg != BPF_PSEUDO_MAP_FD) { + verbose("unrecognized bpf_ld_imm64 insn\n"); + return -EINVAL; } f = fdget(insn->imm); map = __bpf_map_get(f); if (IS_ERR(map)) { - verbose(env, "fd %d is not pointing to valid bpf_map\n", + verbose("fd %d is not pointing to valid bpf_map\n", insn->imm); return PTR_ERR(map); } - err = check_map_prog_compatibility(env, map, env->prog); + err = check_map_prog_compatibility(map, env->prog); if (err) { fdput(f); return err; } - aux = &env->insn_aux_data[i]; - - if (insn->src_reg == BPF_PSEUDO_MAP_FD) { - addr = (unsigned long)map; - } else { - u32 off = insn[1].imm; - if (off >= BPF_MAX_VAR_OFF) { - verbose(env, "direct value offset of %u is not allowed\n", off); - fdput(f); - return -EINVAL; - } - - if (!map->ops->map_direct_value_addr) { - verbose(env, "no direct value access support for this map type\n"); - fdput(f); - return -EINVAL; - } - - err = map->ops->map_direct_value_addr(map, &addr, off); - if (err) { - verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", - map->value_size, off); - fdput(f); - return err; - } - - aux->map_off = off; - addr += off; - } - - insn[0].imm = (u32)addr; - insn[1].imm = addr >> 32; + /* store map pointer inside BPF_LD_IMM64 instruction */ + insn[0].imm = (u32) (unsigned long) map; + insn[1].imm = ((u64) (unsigned long) map) >> 32; /* check whether we recorded this map already */ - for (j = 0; j < env->used_map_cnt; j++) { + for (j = 0; j < env->used_map_cnt; j++) if (env->used_maps[j] == map) { - aux->map_index = j; fdput(f); goto next_insn; } - } if (env->used_map_cnt >= MAX_USED_MAPS) { fdput(f); @@ -6374,16 +4587,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) fdput(f); return PTR_ERR(map); } - - aux->map_index = env->used_map_cnt; env->used_maps[env->used_map_cnt++] = map; - if (bpf_map_is_cgroup_storage(map) && - bpf_cgroup_storage_assign(env->prog, map)) { - fdput(f); - return -EBUSY; - } - fdput(f); next_insn: insn++; @@ -6401,16 +4606,8 @@ next_insn: /* drop refcnt of maps used by the rejected program */ static void release_maps(struct bpf_verifier_env *env) { - enum bpf_cgroup_storage_type stype; int i; - for_each_cgroup_storage_type(stype) { - if (!env->prog->aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(env->prog, - env->prog->aux->cgroup_storage[stype]); - } - for (i = 0; i < env->used_map_cnt; i++) bpf_map_put(env->used_maps[i]); } @@ -6452,18 +4649,6 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, return 0; } -static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) -{ - int i; - 
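On the loader side, the pseudo instruction handled above is produced by a macro that include/linux/filter.h already provides; replace_map_fd_with_map_ptr() then overwrites the imm pair with the two halves of the struct bpf_map pointer, and the interpreter's LD_IMM_DW handler reassembles insn[0].imm | ((u64) insn[1].imm << 32). A sketch; the fd value is a placeholder for whatever bpf(BPF_MAP_CREATE, ...) returned:

	#include <linux/filter.h>

	static const struct bpf_insn use_map[] = {
		/* ld_imm64 r1, <map>; src_reg == BPF_PSEUDO_MAP_FD */
		BPF_LD_MAP_FD(BPF_REG_1, 4 /* placeholder map fd */),
		BPF_MOV64_IMM(BPF_REG_0, 0),
		BPF_EXIT_INSN(),
	};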
if (len == 1) - return; - for (i = 0; i < env->subprog_cnt; i++) { - if (env->subprog_info[i].start < off) - continue; - env->subprog_info[i].start += len - 1; - } -} - static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@ -6474,7 +4659,6 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return NULL; if (adjust_insn_aux_data(env, new_prog->len, off, len)) return NULL; - adjust_subprog_starts(env, off, len); return new_prog; } @@ -6502,7 +4686,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->ops; + const struct bpf_verifier_ops *ops = env->prog->aux->ops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -6515,7 +4699,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { - verbose(env, "bpf verifier is misconfigured\n"); + verbose("bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); @@ -6527,13 +4711,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - if (bpf_prog_is_dev_bound(env->prog->aux)) + if (!ops->convert_ctx_access) return 0; insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { - bpf_convert_ctx_access_t convert_ctx_access; if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || @@ -6575,22 +4758,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; } - switch (env->insn_aux_data[i + delta].ptr_type) { - case PTR_TO_CTX: - if (!ops->convert_ctx_access) - continue; - convert_ctx_access = ops->convert_ctx_access; - break; - case PTR_TO_SOCKET: - case PTR_TO_SOCK_COMMON: - convert_ctx_access = bpf_sock_convert_ctx_access; - break; - case PTR_TO_TCP_SOCK: - convert_ctx_access = bpf_tcp_sock_convert_ctx_access; - break; - default: + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; - } + ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; size = BPF_LDST_BYTES(insn); @@ -6605,7 +4775,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 size_code; if (type == BPF_WRITE) { - verbose(env, "bpf verifier narrow ctx access misconfigured\n"); + verbose("bpf verifier narrow ctx access misconfigured\n"); return -EINVAL; } @@ -6620,11 +4790,11 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } target_size = 0; - cnt = convert_ctx_access(type, insn, insn_buf, env->prog, - &target_size); + cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog, + &target_size); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || (ctx_field_size && !target_size)) { - verbose(env, "bpf verifier is misconfigured\n"); + verbose("bpf verifier is misconfigured\n"); return -EINVAL; } @@ -6651,196 +4821,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return 0; } -static int jit_subprogs(struct bpf_verifier_env *env) -{ - struct bpf_prog *prog = env->prog, **func, *tmp; - int i, j, subprog_start, subprog_end = 0, len, subprog; - struct bpf_insn *insn; - void *old_bpf_func; - int err; - - if (env->subprog_cnt <= 1) - return 0; - - for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code 
!= (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) - continue; - subprog = find_subprog(env, i + insn->imm + 1); - if (subprog < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - i + insn->imm + 1); - return -EFAULT; - } - /* temporarily remember subprog id inside insn instead of - * aux_data, since next loop will split up all insns into funcs - */ - insn->off = subprog; - /* remember original imm in case JIT fails and fallback - * to interpreter will be needed - */ - env->insn_aux_data[i].call_imm = insn->imm; - /* point imm to __bpf_call_base+1 from JITs point of view */ - insn->imm = 1; - } - - err = bpf_prog_alloc_jited_linfo(prog); - if (err) - goto out_undo_insn; - err = -ENOMEM; - - func = kzalloc(sizeof(prog) * env->subprog_cnt, GFP_KERNEL); - if (!func) - goto out_undo_insn; - - for (i = 0; i < env->subprog_cnt; i++) { - subprog_start = subprog_end; - if (env->subprog_cnt == i + 1) - subprog_end = prog->len; - else - subprog_end = env->subprog_info[i + 1].start; - - len = subprog_end - subprog_start; - func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); - - if (!func[i]) - goto out_free; - - memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], - len * sizeof(struct bpf_insn)); - func[i]->len = len; - func[i]->is_func = 1; - func[i]->aux->func_idx = i; - /* the btf and func_info will be freed only at prog->aux */ - func[i]->aux->btf = prog->aux->btf; - func[i]->aux->func_info = prog->aux->func_info; - - /* Use bpf_prog_F_tag to indicate functions in stack traces. - * Long term would need debug info to populate names - */ - func[i]->aux->name[0] = 'F'; - func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; - func[i]->jit_requested = 1; - func[i]->aux->linfo = prog->aux->linfo; - func[i]->aux->nr_linfo = prog->aux->nr_linfo; - func[i]->aux->jited_linfo = prog->aux->jited_linfo; - func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; - func[i] = bpf_int_jit_compile(func[i]); - if (!func[i]->jited) { - err = -ENOTSUPP; - goto out_free; - } - cond_resched(); - } - /* at this point all bpf functions were successfully JITed - * now populate all bpf_calls with correct addresses and - * run last pass of JIT - */ - for (i = 0; i < env->subprog_cnt; i++) { - insn = func[i]->insnsi; - for (j = 0; j < func[i]->len; j++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) - continue; - subprog = insn->off; - insn->off = 0; - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) - func[subprog]->bpf_func - - __bpf_call_base; - } - } - for (i = 0; i < env->subprog_cnt; i++) { - old_bpf_func = func[i]->bpf_func; - tmp = bpf_int_jit_compile(func[i]); - if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { - verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); - err = -ENOTSUPP; - goto out_free; - } - cond_resched(); - } - /* finally lock prog and jit images for all functions and - * populate kallsysm - */ - for (i = 0; i < env->subprog_cnt; i++) { - bpf_prog_lock_ro(func[i]); - bpf_prog_kallsyms_add(func[i]); - } - - /* Last step: make now unused interpreter insns from main - * prog consistent for later dump requests, so they can - * later look the same as if they were interpreted only. - */ - for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) - continue; - insn->off = env->insn_aux_data[i].call_imm; - /* Upon error here we cannot fall back to interpreter but - * need a hard reject of the program. 
Thus -EFAULT is
-		 * propagated in any case.
-		 */
-		subprog = find_subprog(env, i + insn->off + 1);
-		insn->imm = subprog;
-	}
-
-	prog->jited = 1;
-	prog->bpf_func = func[0]->bpf_func;
-	prog->aux->func = func;
-	prog->aux->func_cnt = env->subprog_cnt;
-	bpf_prog_free_unused_jited_linfo(prog);
-	return 0;
-out_free:
-	for (i = 0; i < env->subprog_cnt; i++)
-		if (func[i])
-			bpf_jit_free(func[i]);
-	kfree(func);
-out_undo_insn:
-	/* cleanup main prog to be interpreted */
-	prog->jit_requested = 0;
-	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-		if (insn->code != (BPF_JMP | BPF_CALL) ||
-		    insn->src_reg != BPF_PSEUDO_CALL)
-			continue;
-		insn->off = 0;
-		insn->imm = env->insn_aux_data[i].call_imm;
-	}
-	bpf_prog_free_jited_linfo(prog);
-	return err;
-}
-
-static int fixup_call_args(struct bpf_verifier_env *env)
-{
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
-	struct bpf_prog *prog = env->prog;
-	struct bpf_insn *insn = prog->insnsi;
-	int i, depth;
-#endif
-	int err = 0;
-
-	if (env->prog->jit_requested &&
-	    !bpf_prog_is_dev_bound(env->prog->aux)) {
-		err = jit_subprogs(env);
-		if (err == 0)
-			return 0;
-		if (err == -EFAULT)
-			return err;
-	}
-#ifndef CONFIG_BPF_JIT_ALWAYS_ON
-	for (i = 0; i < prog->len; i++, insn++) {
-		if (insn->code != (BPF_JMP | BPF_CALL) ||
-		    insn->src_reg != BPF_PSEUDO_CALL)
-			continue;
-		depth = get_callee_stack_depth(env, insn, i);
-		if (depth < 0)
-			return depth;
-		bpf_patch_call_args(insn, depth);
-	}
-	err = 0;
-#endif
-	return err;
-}
-
 /* fixup insn->imm field of bpf_call instructions
  * and inline eligible helpers as explicit sequence of BPF instructions
  *
@@ -6852,11 +4832,11 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 	struct bpf_insn *insn = prog->insnsi;
 	const struct bpf_func_proto *fn;
 	const int insn_cnt = prog->len;
-	struct bpf_insn_aux_data *aux;
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
 	struct bpf_map *map_ptr;
 	int i, cnt, delta = 0;
+	struct bpf_insn_aux_data *aux;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
@@ -6956,9 +4936,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 		if (insn->code != (BPF_JMP | BPF_CALL))
 			continue;
 
-		if (insn->src_reg == BPF_PSEUDO_CALL)
-			continue;
-
 		if (insn->imm == BPF_FUNC_get_route_realm)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
@@ -6980,22 +4957,19 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			insn->imm = 0;
 			insn->code = BPF_JMP | BPF_TAIL_CALL;
 
-			aux = &env->insn_aux_data[i + delta];
-			if (!bpf_map_ptr_unpriv(aux))
-				continue;
-
 			/* instead of changing every JIT dealing with tail_call
 			 * emit two extra insns:
 			 * if (index >= max_entries) goto out;
 			 * index &= array->index_mask;
 			 * to avoid out-of-bounds cpu speculation
 			 */
-			if (bpf_map_ptr_poisoned(aux)) {
-				verbose(env, "tail_call abusing map_ptr\n");
+			map_ptr = env->insn_aux_data[i + delta].map_ptr;
+			if (map_ptr == BPF_MAP_PTR_POISON) {
+				verbose("tail_call abusing map_ptr\n");
 				return -EINVAL;
 			}
-
-			map_ptr = BPF_MAP_PTR(aux->map_state);
-
+			if (!map_ptr->unpriv_array)
+				continue;
 			insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
						  map_ptr->max_entries, 2);
 			insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -7017,18 +4991,16 @@
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 		 * handlers are currently limited to 64 bit only.
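At run time, the two insn_buf entries built above behave like the following C; this is an illustration only (struct bpf_array and its index_mask field are the kernel's, the helper name is ours):

	/* what the emitted insns do with the tail-call index in r3 */
	static bool tail_call_index_ok(const struct bpf_array *array, u32 *index)
	{
		if (*index >= array->map.max_entries)
			return false;		/* BPF_JGE taken: the tail call fails */
		*index &= array->index_mask;	/* BPF_AND: clamp the index */
		return true;
	}

Because index_mask is roundup_pow_of_two(max_entries) - 1, the AND keeps the access inside the array even if the preceding bounds check is speculatively mispredicted.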
*/ - if (prog->jit_requested && BITS_PER_LONG == 64 && + if (ebpf_jit_enabled() && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_map_lookup_elem) { - aux = &env->insn_aux_data[i + delta]; - if (bpf_map_ptr_poisoned(aux)) - goto patch_call_imm; - map_ptr = BPF_MAP_PTR(aux->map_state); - if (!map_ptr->ops->map_gen_lookup) + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON || + !map_ptr->ops->map_gen_lookup) goto patch_call_imm; cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { - verbose(env, "bpf verifier is misconfigured\n"); + verbose("bpf verifier is misconfigured\n"); return -EINVAL; } @@ -7067,12 +5039,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = env->ops->get_func_proto(insn->imm, env->prog); + fn = prog->aux->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ if (!fn->func) { - verbose(env, "kernel subsystem misconfigured func %s#%d\n", + verbose("kernel subsystem misconfigured func %s#%d\n", func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -7105,10 +5077,9 @@ static void free_states(struct bpf_verifier_env *env) kfree(env->explored_states); } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, - union bpf_attr __user *uattr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - struct bpf_verifier_log *log; + char __user *log_ubuf = NULL; struct bpf_verifier_env *env; int ret = -EINVAL; @@ -7118,7 +5089,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; - log = &env->log; env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * (*prog)->len); @@ -7126,7 +5096,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (!env->insn_aux_data) goto err_free_env; env->prog = *prog; - env->ops = bpf_verifier_ops[env->prog->type]; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -7135,29 +5104,29 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, /* user requested verbose verifier output * and supplied buffer to store the verification trace */ - log->level = attr->log_level; - log->ubuf = (char __user *) (unsigned long) attr->log_buf; - log->len_total = attr->log_size; + log_level = attr->log_level; + log_ubuf = (char __user *) (unsigned long) attr->log_buf; + log_size = attr->log_size; + log_len = 0; ret = -EINVAL; - /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || - !log->level || !log->ubuf) + /* log_* values have to be sane */ + if (log_size < 128 || log_size > UINT_MAX >> 8 || + log_level == 0 || log_ubuf == NULL) goto err_unlock; ret = -ENOMEM; + log_buf = vmalloc(log_size); + if (!log_buf) + goto err_unlock; + } else { + log_level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - if (bpf_prog_is_dev_bound(env->prog->aux)) { - ret = bpf_prog_offload_verifier_prep(env); - if (ret) - goto err_unlock; - } - ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; @@ -7169,15 +5138,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (!env->explored_states) goto skip_full_check; - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - 
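From the loader's point of view, the log handling above imposes a simple contract: pass all three log fields or none, with 128 <= log_size < UINT_MAX >> 8, and expect a NUL-terminated trace copied back. A minimal userspace sketch (prog_type and the 64 KiB buffer are arbitrary choices, not mandated by the patch):

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static char license[] = "GPL";
	static char log_buf[65536];

	static int load_prog(const struct bpf_insn *insns, unsigned int insn_cnt)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
		attr.insns     = (__u64) (unsigned long) insns;
		attr.insn_cnt  = insn_cnt;
		attr.license   = (__u64) (unsigned long) license;
		attr.log_level = 1;			/* request the verifier trace */
		attr.log_size  = sizeof(log_buf);	/* sane per the checks above */
		attr.log_buf   = (__u64) (unsigned long) log_buf;

		return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	}

On failure, log_buf holds the verbose() output, including the trailing zero that bpf_check() copies back.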
ret = check_cfg(env); if (ret < 0) goto skip_full_check; - ret = check_btf_info(env, attr, uattr); - if (ret < 0) - goto skip_full_check; + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); if (env->cur_state) { @@ -7192,9 +5157,6 @@ skip_full_check: if (ret == 0) sanitize_dead_code(env); - if (ret == 0) - ret = check_max_stack_depth(env); - if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); @@ -7202,16 +5164,17 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (ret == 0) - ret = fixup_call_args(env); - - if (log->level && bpf_verifier_log_full(log)) { + if (log_level && log_len >= log_size - 1) { + BUG_ON(log_len >= log_size); + /* verifier log exceeded user supplied buffer */ ret = -ENOSPC; + /* fall through to return what was recorded */ } - if (log->level && !log->ubuf) { + /* copy verifier log back to user space including trailing zero */ + if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { ret = -EFAULT; - goto err_release_maps; + goto free_log_buf; } if (ret == 0 && env->used_map_cnt) { @@ -7222,7 +5185,7 @@ skip_full_check: if (!env->prog->aux->used_maps) { ret = -ENOMEM; - goto err_release_maps; + goto free_log_buf; } memcpy(env->prog->aux->used_maps, env->used_maps, @@ -7235,10 +5198,9 @@ skip_full_check: convert_pseudo_ld_imm64(env); } - if (ret == 0) - adjust_btf_func(env); - -err_release_maps: +free_log_buf: + if (log_level) + vfree(log_buf); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_used_maps() will release them. @@ -7269,13 +5231,14 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, if (!env->insn_aux_data) goto err_free_env; env->prog = prog; - env->ops = bpf_verifier_ops[env->prog->type]; env->analyzer_ops = ops; env->analyzer_priv = priv; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); + log_level = 0; + env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 604773a53d8b..37784813af85 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6125,14 +6125,4 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, mutex_unlock(&cgroup_mutex); return ret; } -int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_query(cgrp, attr, uattr); - mutex_unlock(&cgroup_mutex); - return ret; -} #endif /* CONFIG_CGROUP_BPF */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 55845a733f5c..d3497bdd3413 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5104,8 +5104,6 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon rcu_read_unlock(); return 0; } - case PERF_EVENT_IOC_QUERY_BPF: - return perf_event_query_prog_array(event, (void __user *)arg); default: return -ENOTTY; } @@ -7107,7 +7105,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, { struct path ns_path; struct inode *ns_inode; - int error; + void *error; error = ns_get_path(&ns_path, task, ns_ops); if (!error) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d723b6d03356..7ddfb056693b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -468,8 +468,7 @@ static const struct 
bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto * -tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -510,15 +509,12 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_prandom_u32_proto; case BPF_FUNC_probe_read_str: return &bpf_probe_read_str_proto; - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; default: return NULL; } } -static const struct bpf_func_proto * -kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -526,13 +522,12 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; default: - return tracing_func_proto(func_id, prog); + return tracing_func_proto(func_id); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) @@ -551,14 +546,11 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -const struct bpf_verifier_ops kprobe_verifier_ops = { +const struct bpf_verifier_ops kprobe_prog_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; -const struct bpf_prog_ops kprobe_prog_ops = { -}; - BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@ -606,8 +598,7 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto * -tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -615,12 +606,11 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; default: - return tracing_func_proto(func_id, prog); + return tracing_func_proto(func_id); } } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) @@ -634,118 +624,12 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -const struct bpf_verifier_ops tracepoint_verifier_ops = { +const struct bpf_verifier_ops tracepoint_prog_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; -const struct bpf_prog_ops tracepoint_prog_ops = { -}; - -/* - * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp - * to avoid potential recursive reuse issue when/if tracepoints are added - * inside bpf_*_event_output and/or bpf_get_stack_id - */ -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); -BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, - struct bpf_map *, map, u64, flags, void *, data, u64, size) -{ - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); - - 
perf_fetch_caller_regs(regs); - return ____bpf_perf_event_output(regs, map, flags, data, size); -} - -static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { - .func = bpf_perf_event_output_raw_tp, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE_OR_ZERO, -}; - -BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, - struct bpf_map *, map, u64, flags) -{ - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); - - perf_fetch_caller_regs(regs); - /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ - return bpf_get_stackid((unsigned long) regs, (unsigned long) map, - flags, 0, 0); -} - -static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { - .func = bpf_get_stackid_raw_tp, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, -}; - -static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_perf_event_output: - return &bpf_perf_event_output_proto_raw_tp; - case BPF_FUNC_get_stackid: - return &bpf_get_stackid_proto_raw_tp; - default: - return tracing_func_proto(func_id, prog); - } -} - -static bool raw_tp_prog_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - /* largest tracepoint in the kernel has 12 args */ - if (off < 0 || off >= sizeof(__u64) * 12) - return false; - if (type != BPF_READ) - return false; - if (off % size != 0) - return false; - return true; -} - -const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { - .get_func_proto = raw_tp_prog_func_proto, - .is_valid_access = raw_tp_prog_is_valid_access, -}; - -const struct bpf_prog_ops raw_tracepoint_prog_ops = { -}; - -static bool raw_tp_writable_prog_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - if (off == 0) { - if (size != sizeof(u64) || type != BPF_READ) - return false; - info->reg_type = PTR_TO_TP_BUFFER; - } - return raw_tp_prog_is_valid_access(off, size, type, prog, info); -} - -const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { - .get_func_proto = raw_tp_prog_func_proto, - .is_valid_access = raw_tp_writable_prog_is_valid_access, -}; - -const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { -}; - static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, @@ -800,19 +684,14 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops perf_event_verifier_ops = { +const struct bpf_verifier_ops perf_event_prog_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; -const struct bpf_prog_ops perf_event_prog_ops = { -}; - static DEFINE_MUTEX(bpf_event_mutex); -#define BPF_TRACE_MAX_PROGS 64 - int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { @@ -823,24 +702,20 @@ int perf_event_attach_bpf_prog(struct perf_event *event, mutex_lock(&bpf_event_mutex); if (event->prog) - goto unlock; + goto out; - 
old_array = event->tp_event->prog_array; - if (old_array && - bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { - ret = -E2BIG; - goto unlock; - } + old_array = rcu_dereference_protected(event->tp_event->prog_array, + lockdep_is_held(&bpf_event_mutex)); ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); if (ret < 0) - goto unlock; + goto out; /* set the new array to event->tp_event and set event->prog */ event->prog = prog; rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free(old_array); -unlock: +out: mutex_unlock(&bpf_event_mutex); return ret; } @@ -854,9 +729,11 @@ void perf_event_detach_bpf_prog(struct perf_event *event) mutex_lock(&bpf_event_mutex); if (!event->prog) - goto unlock; + goto out; + + old_array = rcu_dereference_protected(event->tp_event->prog_array, + lockdep_is_held(&bpf_event_mutex)); - old_array = event->tp_event->prog_array; ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); @@ -868,152 +745,6 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_put(event->prog); event->prog = NULL; -unlock: +out: mutex_unlock(&bpf_event_mutex); } - -int perf_event_query_prog_array(struct perf_event *event, void __user *info) -{ - struct perf_event_query_bpf __user *uquery = info; - struct perf_event_query_bpf query = {}; - u32 *ids, prog_cnt, ids_len; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; - if (copy_from_user(&query, uquery, sizeof(query))) - return -EFAULT; - ids_len = query.ids_len; - if (ids_len > BPF_TRACE_MAX_PROGS) - return -E2BIG; - ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN); - if (!ids) - return -ENOMEM; - /* - * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which - * is required when user only wants to check for uquery->prog_cnt. - * There is no need to check for it since the case is handled - * gracefully in bpf_prog_array_copy_info. - */ - mutex_lock(&bpf_event_mutex); - ret = bpf_prog_array_copy_info(event->tp_event->prog_array, - ids, - ids_len, - &prog_cnt); - mutex_unlock(&bpf_event_mutex); - - if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || - copy_to_user(uquery->ids, ids, ids_len * sizeof(u32))) - ret = -EFAULT; - kfree(ids); - - return ret; -} - -extern struct bpf_raw_event_map __start__bpf_raw_tp[]; -extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; - -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) -{ - struct bpf_raw_event_map *btp = __start__bpf_raw_tp; - - for (; btp < __stop__bpf_raw_tp; btp++) { - if (!strcmp(btp->tp->name, name)) - return btp; - } - return NULL; -} - -static __always_inline -void __bpf_trace_run(struct bpf_prog *prog, u64 *args) -{ - rcu_read_lock(); - preempt_disable(); - (void) BPF_PROG_RUN(prog, args); - preempt_enable(); - rcu_read_unlock(); -} - -#define UNPACK(...) __VA_ARGS__ -#define REPEAT_1(FN, DL, X, ...) FN(X) -#define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__) -#define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__) -#define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__) -#define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__) -#define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__) -#define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__) -#define REPEAT_8(FN, DL, X, ...) 
FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__) -#define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__) -#define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__) -#define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__) -#define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__) -#define REPEAT(X, FN, DL, ...) REPEAT_##X(FN, DL, __VA_ARGS__) - -#define SARG(X) u64 arg##X -#define COPY(X) args[X] = arg##X - -#define __DL_COM (,) -#define __DL_SEM (;) - -#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 - -#define BPF_TRACE_DEFN_x(x) \ - void bpf_trace_run##x(struct bpf_prog *prog, \ - REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ - { \ - u64 args[x]; \ - REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ - __bpf_trace_run(prog, args); \ - } \ - EXPORT_SYMBOL_GPL(bpf_trace_run##x) -BPF_TRACE_DEFN_x(1); -BPF_TRACE_DEFN_x(2); -BPF_TRACE_DEFN_x(3); -BPF_TRACE_DEFN_x(4); -BPF_TRACE_DEFN_x(5); -BPF_TRACE_DEFN_x(6); -BPF_TRACE_DEFN_x(7); -BPF_TRACE_DEFN_x(8); -BPF_TRACE_DEFN_x(9); -BPF_TRACE_DEFN_x(10); -BPF_TRACE_DEFN_x(11); -BPF_TRACE_DEFN_x(12); - -static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) -{ -struct tracepoint *tp = btp->tp; - - /* - * check that program doesn't access arguments beyond what's - * available in this tracepoint - */ - if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) - return -EINVAL; - - if (prog->aux->max_tp_access > btp->writable_size) - return -EINVAL; - - return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); -} - -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) -{ - int err; - - mutex_lock(&bpf_event_mutex); - err = __bpf_probe_register(btp, prog); - mutex_unlock(&bpf_event_mutex); - return err; -} - -int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) -{ - int err; - - mutex_lock(&bpf_event_mutex); - err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); - mutex_unlock(&bpf_event_mutex); - return err; -} diff --git a/net/Kconfig b/net/Kconfig index d3b44dba74ec..7806b964776c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -51,9 +51,6 @@ config NET_INGRESS config NET_EGRESS bool -config SKB_EXTENSIONS - bool - menu "Networking options" source "net/packet/Kconfig" @@ -301,11 +298,8 @@ config BPF_JIT config BPF_STREAM_PARSER bool "enable BPF STREAM_PARSER" - depends on INET depends on BPF_SYSCALL - depends on CGROUP_BPF select STREAM_PARSER - select NET_SOCK_MSG ---help--- Enabling this allows a stream parser to be used with BPF_MAP_TYPE_SOCKMAP. @@ -440,14 +434,6 @@ config GRO_CELLS bool default n -config NET_SOCK_MSG - bool - default n - help - The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or - ULPs (upper layer modules, e.g. TLS) to process L7 application data - with the help of BPF programs. 
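The REPEAT()/SARG()/COPY() machinery deleted from kernel/trace/bpf_trace.c above is dense; after preprocessing, BPF_TRACE_DEFN_x(3) expanded to roughly this (whitespace tidied):

	void bpf_trace_run3(struct bpf_prog *prog, u64 arg0, u64 arg1, u64 arg2)
	{
		u64 args[3];

		args[0] = arg0;
		args[1] = arg1;
		args[2] = arg2;
		__bpf_trace_run(prog, args);
	}
	EXPORT_SYMBOL_GPL(bpf_trace_run3);

One such body is stamped out for every arity from 1 to 12, matching the largest tracepoint in the tree.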
- config NET_DEVLINK tristate "Network physical/parent device Netlink interface" help diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 9f942cccffad..a8736c68fa14 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,7 +10,6 @@ #include #include #include -#include static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx) { @@ -136,7 +135,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (is_l2) __skb_push(skb, hh_len); if (is_direct_pkt_access) - bpf_compute_data_pointers(skb); + bpf_compute_data_end(skb); retval = bpf_test_run(prog, skb, repeat, &duration); if (!is_l2) { if (skb_headroom(skb) < hh_len) { @@ -175,7 +174,6 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, xdp.data_hard_start = data; xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; - xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; retval = bpf_test_run(prog, &xdp, repeat, &duration); diff --git a/net/core/Makefile b/net/core/Makefile index 2ce40e69aae0..475570161efb 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -11,11 +11,10 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o net_ipc_log.o dev_monitor.o xdp.o + fib_notifier.o net_ipc_log.o dev_monitor.o obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o -obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o @@ -28,9 +27,7 @@ obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_SOCKEV_NLMCAST) += sockev_nlmcast.o -obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o -obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c deleted file mode 100644 index 3462809d3dbe..000000000000 --- a/net/core/bpf_sk_storage.c +++ /dev/null @@ -1,812 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2019 Facebook */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -static inline void *__compat_kvcalloc(size_t n, size_t size, gfp_t flags) -{ - return kvmalloc_array(n, size, flags | __GFP_ZERO); -} -#define kvcalloc __compat_kvcalloc - -static atomic_t cache_idx; - -struct bucket { - struct hlist_head list; - raw_spinlock_t lock; -}; - -/* Thp map is not the primary owner of a bpf_sk_storage_elem. - * Instead, the sk->sk_bpf_storage is. - * - * The map (bpf_sk_storage_map) is for two purposes - * 1. Define the size of the "sk local storage". It is - * the map's value_size. - * - * 2. Maintain a list to keep track of all elems such - * that they can be cleaned up during the map destruction. - * - * When a bpf local storage is being looked up for a - * particular sk, the "bpf_map" pointer is actually used - * as the "key" to search in the list of elem in - * sk->sk_bpf_storage. - * - * Hence, consider sk->sk_bpf_storage is the mini-map - * with the "bpf_map" pointer as the searching key. - */ -struct bpf_sk_storage_map { - struct bpf_map map; - /* Lookup elem does not require accessing the map. 
- * - * Updating/Deleting requires a bucket lock to - * link/unlink the elem from the map. Having - * multiple buckets to improve contention. - */ - struct bucket *buckets; - u32 bucket_log; - u16 elem_size; - u16 cache_idx; -}; - -struct bpf_sk_storage_data { - /* smap is used as the searching key when looking up - * from sk->sk_bpf_storage. - * - * Put it in the same cacheline as the data to minimize - * the number of cachelines access during the cache hit case. - */ - struct bpf_sk_storage_map __rcu *smap; - u8 data[0] __aligned(8); -}; - -/* Linked to bpf_sk_storage and bpf_sk_storage_map */ -struct bpf_sk_storage_elem { - struct hlist_node map_node; /* Linked to bpf_sk_storage_map */ - struct hlist_node snode; /* Linked to bpf_sk_storage */ - struct bpf_sk_storage __rcu *sk_storage; - struct rcu_head rcu; - /* 8 bytes hole */ - /* The data is stored in aother cacheline to minimize - * the number of cachelines access during a cache hit. - */ - struct bpf_sk_storage_data sdata ____cacheline_aligned; -}; - -#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata) -#define SDATA(_SELEM) (&(_SELEM)->sdata) -#define BPF_SK_STORAGE_CACHE_SIZE 16 - -struct bpf_sk_storage { - struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE]; - struct hlist_head list; /* List of bpf_sk_storage_elem */ - struct sock *sk; /* The sk that owns the the above "list" of - * bpf_sk_storage_elem. - */ - struct rcu_head rcu; - raw_spinlock_t lock; /* Protect adding/removing from the "list" */ -}; - -static struct bucket *select_bucket(struct bpf_sk_storage_map *smap, - struct bpf_sk_storage_elem *selem) -{ - return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; -} - -static int omem_charge(struct sock *sk, unsigned int size) -{ - /* same check as in sock_kmalloc() */ - if (size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { - atomic_add(size, &sk->sk_omem_alloc); - return 0; - } - - return -ENOMEM; -} - -static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem) -{ - return !hlist_unhashed(&selem->snode); -} - -static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem) -{ - return !hlist_unhashed(&selem->map_node); -} - -static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap, - struct sock *sk, void *value, - bool charge_omem) -{ - struct bpf_sk_storage_elem *selem; - - if (charge_omem && omem_charge(sk, smap->elem_size)) - return NULL; - - selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (selem) { - if (value) - memcpy(SDATA(selem)->data, value, smap->map.value_size); - return selem; - } - - if (charge_omem) - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); - - return NULL; -} - -/* sk_storage->lock must be held and selem->sk_storage == sk_storage. - * The caller must ensure selem->smap is still valid to be - * dereferenced for its smap->elem_size and smap->cache_idx. - */ -static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage, - struct bpf_sk_storage_elem *selem, - bool uncharge_omem) -{ - struct bpf_sk_storage_map *smap; - bool free_sk_storage; - struct sock *sk; - - smap = rcu_dereference(SDATA(selem)->smap); - sk = sk_storage->sk; - - /* All uncharging on sk->sk_omem_alloc must be done first. - * sk may be freed once the last selem is unlinked from sk_storage. 
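The "mini-map keyed by the bpf_map pointer" idea from the struct comments above reduces to a short list walk; a sketch reusing the types and the SDATA() accessor just defined (RCU read-side locking elided):

	static struct bpf_sk_storage_elem *
	find_selem(struct bpf_sk_storage *sk_storage,
		   const struct bpf_sk_storage_map *smap)
	{
		struct bpf_sk_storage_elem *selem;

		/* the owning map's pointer is the lookup key */
		hlist_for_each_entry_rcu(selem, &sk_storage->list, snode)
			if (rcu_access_pointer(SDATA(selem)->smap) == smap)
				return selem;
		return NULL;
	}

__sk_storage_lookup() below performs exactly this walk, fronted by the per-map cache[smap->cache_idx] slot so a hit costs a single dependent load.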
- if (uncharge_omem) - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); - - free_sk_storage = hlist_is_singular_node(&selem->snode, - &sk_storage->list); - if (free_sk_storage) { - atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc); - sk_storage->sk = NULL; - /* After this RCU_INIT, sk may be freed and cannot be used */ - RCU_INIT_POINTER(sk->sk_bpf_storage, NULL); - - /* sk_storage is not freed now. sk_storage->lock is - * still held and raw_spin_unlock_bh(&sk_storage->lock) - * will be done by the caller. - * - * Although the unlock will be done under - * rcu_read_lock(), it is more intuitive to - * read if kfree_rcu(sk_storage, rcu) is done - * after the raw_spin_unlock_bh(&sk_storage->lock). - * - * Hence, a "bool free_sk_storage" is returned - * to the caller which then calls the kfree_rcu() - * after unlock. - */ - } - hlist_del_init_rcu(&selem->snode); - if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) == - SDATA(selem)) - RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL); - - kfree_rcu(selem, rcu); - - return free_sk_storage; -} - -static void selem_unlink_sk(struct bpf_sk_storage_elem *selem) -{ - struct bpf_sk_storage *sk_storage; - bool free_sk_storage = false; - - if (unlikely(!selem_linked_to_sk(selem))) - /* selem has already been unlinked from sk */ - return; - - sk_storage = rcu_dereference(selem->sk_storage); - raw_spin_lock_bh(&sk_storage->lock); - if (likely(selem_linked_to_sk(selem))) - free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); - raw_spin_unlock_bh(&sk_storage->lock); - - if (free_sk_storage) - kfree_rcu(sk_storage, rcu); -} - -/* sk_storage->lock must be held and sk_storage->list cannot be empty */ -static void __selem_link_sk(struct bpf_sk_storage *sk_storage, - struct bpf_sk_storage_elem *selem) -{ - RCU_INIT_POINTER(selem->sk_storage, sk_storage); - hlist_add_head(&selem->snode, &sk_storage->list); -} - -static void selem_unlink_map(struct bpf_sk_storage_elem *selem) -{ - struct bpf_sk_storage_map *smap; - struct bucket *b; - - if (unlikely(!selem_linked_to_map(selem))) - /* selem has already been unlinked from smap */ - return; - - smap = rcu_dereference(SDATA(selem)->smap); - b = select_bucket(smap, selem); - raw_spin_lock_bh(&b->lock); - if (likely(selem_linked_to_map(selem))) - hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_bh(&b->lock); -} - -static void selem_link_map(struct bpf_sk_storage_map *smap, - struct bpf_sk_storage_elem *selem) -{ - struct bucket *b = select_bucket(smap, selem); - - raw_spin_lock_bh(&b->lock); - RCU_INIT_POINTER(SDATA(selem)->smap, smap); - hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_bh(&b->lock); -} - -static void selem_unlink(struct bpf_sk_storage_elem *selem) -{ - /* Always unlink from map before unlinking from sk_storage - * because selem will be freed after successfully unlinked from - * the sk_storage. - */
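The ordering contract in selem_unlink() above, leave the map's bucket list first because the socket-side unlink may be the one that frees the element, can be shown with a minimal userspace sketch (plain doubly-linked nodes, no RCU or locking; all names hypothetical):

	#include <stddef.h>
	#include <stdlib.h>

	struct node { struct node *prev, *next; };

	static void node_del(struct node *n)
	{
		if (n->prev)
			n->prev->next = n->next;
		if (n->next)
			n->next->prev = n->prev;
		n->prev = n->next = NULL;
	}

	struct elem {
		struct node map_node;	/* lives on the map's bucket list */
		struct node sk_node;	/* lives on the socket's storage list */
	};

	static void elem_unlink_and_free(struct elem *e)
	{
		node_del(&e->map_node);	/* step 1: leave the map */
		node_del(&e->sk_node);	/* step 2: leaving the socket frees e */
		free(e);		/* kfree_rcu() in the real code */
	}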
- selem_unlink_map(selem); - selem_unlink_sk(selem); -} - -static struct bpf_sk_storage_data * -__sk_storage_lookup(struct bpf_sk_storage *sk_storage, - struct bpf_sk_storage_map *smap, - bool cacheit_lockit) -{ - struct bpf_sk_storage_data *sdata; - struct bpf_sk_storage_elem *selem; - - /* Fast path (cache hit) */ - sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]); - if (sdata && rcu_access_pointer(sdata->smap) == smap) - return sdata; - - /* Slow path (cache miss) */ - hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) - if (rcu_access_pointer(SDATA(selem)->smap) == smap) - break; - - if (!selem) - return NULL; - - sdata = SDATA(selem); - if (cacheit_lockit) { - /* spinlock is needed to avoid racing with the - * parallel delete. Otherwise, publishing an already - * deleted sdata to the cache will become a use-after-free - * problem in the next __sk_storage_lookup(). - */ - raw_spin_lock_bh(&sk_storage->lock); - if (selem_linked_to_sk(selem)) - rcu_assign_pointer(sk_storage->cache[smap->cache_idx], - sdata); - raw_spin_unlock_bh(&sk_storage->lock); - } - - return sdata; -} - -static struct bpf_sk_storage_data * -sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) -{ - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_map *smap; - - sk_storage = rcu_dereference(sk->sk_bpf_storage); - if (!sk_storage) - return NULL; - - smap = (struct bpf_sk_storage_map *)map; - return __sk_storage_lookup(sk_storage, smap, cacheit_lockit); -} - -static int check_flags(const struct bpf_sk_storage_data *old_sdata, - u64 map_flags) -{ - if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) - /* elem already exists */ - return -EEXIST; - - if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) - /* elem doesn't exist, cannot update it */ - return -ENOENT; - - return 0; -} - -static int sk_storage_alloc(struct sock *sk, - struct bpf_sk_storage_map *smap, - struct bpf_sk_storage_elem *first_selem) -{ - struct bpf_sk_storage *prev_sk_storage, *sk_storage; - int err; - - err = omem_charge(sk, sizeof(*sk_storage)); - if (err) - return err; - - sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN); - if (!sk_storage) { - err = -ENOMEM; - goto uncharge; - } - INIT_HLIST_HEAD(&sk_storage->list); - raw_spin_lock_init(&sk_storage->lock); - sk_storage->sk = sk; - - __selem_link_sk(sk_storage, first_selem); - selem_link_map(smap, first_selem); - /* Publish sk_storage to sk. sk->sk_lock cannot be acquired. - * Hence, atomic ops are used to set sk->sk_bpf_storage - * from NULL to the newly allocated sk_storage ptr. - * - * From now on, the sk->sk_bpf_storage pointer is protected - * by the sk_storage->lock. Hence, when freeing - * the sk->sk_bpf_storage, the sk_storage->lock must - * be held before setting sk->sk_bpf_storage to NULL. - */ - prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage, - NULL, sk_storage); - if (unlikely(prev_sk_storage)) { - selem_unlink_map(first_selem); - err = -EAGAIN; - goto uncharge; - - /* Note that even though first_selem was linked to smap's - * bucket->list, first_selem can be freed immediately - * (instead of kfree_rcu) because - * bpf_sk_storage_map_free() does a - * synchronize_rcu() before walking the bucket->list. - * Hence, no one is accessing selem from the - * bucket->list under rcu_read_lock(). - */ - }
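The cmpxchg() in sk_storage_alloc() is a publish-once pattern: whichever CPU installs the pointer first wins, and the loser backs out and frees its candidate. A standalone C11 sketch of just that step (assumes the candidate came from malloc(); names hypothetical):

	#include <stdatomic.h>
	#include <stdlib.h>

	struct storage { int dummy; };

	static _Atomic(struct storage *) published;

	/* Returns 0 if we published cand, -1 if another thread won the race. */
	static int publish_once(struct storage *cand)
	{
		struct storage *expected = NULL;

		if (atomic_compare_exchange_strong(&published, &expected, cand))
			return 0;	/* cand is now the shared storage */
		free(cand);		/* the -EAGAIN path in the patch */
		return -1;
	}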
- - return 0; - -uncharge: - kfree(sk_storage); - atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc); - return err; -} - -/* sk cannot be going away because it is linking new elem - * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). - * Otherwise, it will become a leak (and other memory issues - * during map destruction). - */ -static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk, - struct bpf_map *map, - void *value, - u64 map_flags) -{ - struct bpf_sk_storage_data *old_sdata = NULL; - struct bpf_sk_storage_elem *selem; - struct bpf_sk_storage *sk_storage; - struct bpf_sk_storage_map *smap; - int err; - - /* BPF_EXIST and BPF_NOEXIST cannot both be set */ - if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || - /* BPF_F_LOCK can only be used in a value with spin_lock */ - unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) - return ERR_PTR(-EINVAL); - - smap = (struct bpf_sk_storage_map *)map; - sk_storage = rcu_dereference(sk->sk_bpf_storage); - if (!sk_storage || hlist_empty(&sk_storage->list)) { - /* Very first elem for this sk */ - err = check_flags(NULL, map_flags); - if (err) - return ERR_PTR(err); - - selem = selem_alloc(smap, sk, value, true); - if (!selem) - return ERR_PTR(-ENOMEM); - - err = sk_storage_alloc(sk, smap, selem); - if (err) { - kfree(selem); - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); - return ERR_PTR(err); - } - - return SDATA(selem); - } - - if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { - /* Hoping to find an old_sdata to do inline update - * such that it can avoid taking the sk_storage->lock - * and changing the lists. - */ - old_sdata = __sk_storage_lookup(sk_storage, smap, false); - err = check_flags(old_sdata, map_flags); - if (err) - return ERR_PTR(err); - if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) { - copy_map_value_locked(map, old_sdata->data, - value, false); - return old_sdata; - } - } - - raw_spin_lock_bh(&sk_storage->lock); - - /* Recheck sk_storage->list under sk_storage->lock */ - if (unlikely(hlist_empty(&sk_storage->list))) { - /* A parallel del is happening and sk_storage is going - * away. It has just been checked before, so very - * unlikely. Return instead of retry to keep things - * simple. - */ - err = -EAGAIN; - goto unlock_err; - } - - old_sdata = __sk_storage_lookup(sk_storage, smap, false); - err = check_flags(old_sdata, map_flags); - if (err) - goto unlock_err; - - if (old_sdata && (map_flags & BPF_F_LOCK)) { - copy_map_value_locked(map, old_sdata->data, value, false); - selem = SELEM(old_sdata); - goto unlock; - } - - /* sk_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during __selem_unlink_sk(). - */
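The flag checks threaded through sk_storage_update() reduce to the usual map-update semantics, which read clearly on their own. A sketch, with local constants mirroring BPF_NOEXIST/BPF_EXIST rather than the UAPI header:

	#include <stdbool.h>
	#include <errno.h>

	#define UPD_NOEXIST 1	/* create only: fail if the elem exists */
	#define UPD_EXIST   2	/* update only: fail if the elem is missing */

	static int check_update_flags(bool elem_exists, unsigned int flags)
	{
		if (elem_exists && flags == UPD_NOEXIST)
			return -EEXIST;
		if (!elem_exists && flags == UPD_EXIST)
			return -ENOENT;
		return 0;	/* plain update-or-create is always allowed */
	}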
- selem = selem_alloc(smap, sk, value, !old_sdata); - if (!selem) { - err = -ENOMEM; - goto unlock_err; - } - - /* First, link the new selem to the map */ - selem_link_map(smap, selem); - - /* Second, link (and publish) the new selem to sk_storage */ - __selem_link_sk(sk_storage, selem); - - /* Third, remove old selem, SELEM(old_sdata) */ - if (old_sdata) { - selem_unlink_map(SELEM(old_sdata)); - __selem_unlink_sk(sk_storage, SELEM(old_sdata), false); - } - -unlock: - raw_spin_unlock_bh(&sk_storage->lock); - return SDATA(selem); - -unlock_err: - raw_spin_unlock_bh(&sk_storage->lock); - return ERR_PTR(err); -} - -static int sk_storage_delete(struct sock *sk, struct bpf_map *map) -{ - struct bpf_sk_storage_data *sdata; - - sdata = sk_storage_lookup(sk, map, false); - if (!sdata) - return -ENOENT; - - selem_unlink(SELEM(sdata)); - - return 0; -} - -/* Called by __sk_destruct() */ -void bpf_sk_storage_free(struct sock *sk) -{ - struct bpf_sk_storage_elem *selem; - struct bpf_sk_storage *sk_storage; - bool free_sk_storage = false; - struct hlist_node *n; - - rcu_read_lock(); - sk_storage = rcu_dereference(sk->sk_bpf_storage); - if (!sk_storage) { - rcu_read_unlock(); - return; - } - - /* Neither the bpf_prog nor the bpf-map's syscall - * could be modifying the sk_storage->list now. - * Thus, no elem can be added-to or deleted-from the - * sk_storage->list by the bpf_prog or by the bpf-map's syscall. - * - * It is racing with bpf_sk_storage_map_free() alone - * when unlinking elem from the sk_storage->list and - * the map's bucket->list. - */ - raw_spin_lock_bh(&sk_storage->lock); - hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) { - /* Always unlink from map before unlinking from - * sk_storage. - */ - selem_unlink_map(selem); - free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); - } - raw_spin_unlock_bh(&sk_storage->lock); - rcu_read_unlock(); - - if (free_sk_storage) - kfree_rcu(sk_storage, rcu); -} - -static void bpf_sk_storage_map_free(struct bpf_map *map) -{ - struct bpf_sk_storage_elem *selem; - struct bpf_sk_storage_map *smap; - struct bucket *b; - unsigned int i; - - smap = (struct bpf_sk_storage_map *)map; - - synchronize_rcu(); - - /* bpf prog and the userspace can no longer access this map - * now. No new selem (of this map) can be added - * to the sk->sk_bpf_storage or to the map bucket's list. - * - * The elem of this map can be cleaned up here - * or - * by bpf_sk_storage_free() during __sk_destruct(). - */ - for (i = 0; i < (1U << smap->bucket_log); i++) { - b = &smap->buckets[i]; - - rcu_read_lock(); - /* No one is adding to b->list now */ - while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)), - struct bpf_sk_storage_elem, - map_node))) { - selem_unlink(selem); - cond_resched_rcu(); - } - rcu_read_unlock(); - } - - /* bpf_sk_storage_free() may still need to access the map. - * e.g. bpf_sk_storage_free() has unlinked selem from the map - * which then made the above while((selem = ...)) loop - * exit immediately. - * - * However, the bpf_sk_storage_free() still needs to access - * the smap->elem_size to do the uncharging in - * __selem_unlink_sk(). - * - * Hence, wait another rcu grace period for the - * bpf_sk_storage_free() to finish.
- */ - synchronize_rcu(); - - kvfree(smap->buckets); - kfree(map); -} - -static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) -{ - if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries || - attr->key_size != sizeof(int) || !attr->value_size || - /* Enforce BTF for userspace sk dumping */ - !attr->btf_key_type_id || !attr->btf_value_type_id) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (attr->value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) || - /* U16_MAX is much more than enough for sk local storage - * considering a tcp_sock is ~2k. - */ - attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem)) - return -E2BIG; - - return 0; -} - -static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) -{ - struct bpf_sk_storage_map *smap; - unsigned int i; - u32 nbuckets; - u64 cost; - - smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); - if (!smap) - return ERR_PTR(-ENOMEM); - bpf_map_init_from_attr(&smap->map, attr); - - smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus())); - nbuckets = 1U << smap->bucket_log; - smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, - GFP_USER | __GFP_NOWARN); - if (!smap->buckets) { - kfree(smap); - return ERR_PTR(-ENOMEM); - } - cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); - - for (i = 0; i < nbuckets; i++) { - INIT_HLIST_HEAD(&smap->buckets[i].list); - raw_spin_lock_init(&smap->buckets[i].lock); - } - - smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; - smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % - BPF_SK_STORAGE_CACHE_SIZE; - smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - return &smap->map; -} - -static int notsupp_get_next_key(struct bpf_map *map, void *key, - void *next_key) -{ - return -ENOTSUPP; -} - -static int bpf_sk_storage_map_check_btf(const struct bpf_map *map, - const struct btf *btf, - const struct btf_type *key_type, - const struct btf_type *value_type) -{ - u32 int_data; - - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) - return -EINVAL; - - return 0; -} - -static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) -{ - struct bpf_sk_storage_data *sdata; - struct socket *sock; - int fd, err; - - fd = *(int *)key; - sock = sockfd_lookup(fd, &err); - if (sock) { - sdata = sk_storage_lookup(sock->sk, map, true); - sockfd_put(sock); - return sdata ? sdata->data : NULL; - } - - return ERR_PTR(err); -} - -static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) -{ - struct bpf_sk_storage_data *sdata; - struct socket *sock; - int fd, err; - - fd = *(int *)key; - sock = sockfd_lookup(fd, &err); - if (sock) { - sdata = sk_storage_update(sock->sk, map, value, map_flags); - sockfd_put(sock); - return IS_ERR(sdata) ? 
PTR_ERR(sdata) : 0; - } - - return err; -} - -static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) -{ - struct socket *sock; - int fd, err; - - fd = *(int *)key; - sock = sockfd_lookup(fd, &err); - if (sock) { - err = sk_storage_delete(sock->sk, map); - sockfd_put(sock); - return err; - } - - return err; -} - -BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, - void *, value, u64, flags) -{ - struct bpf_sk_storage_data *sdata; - - if (flags > BPF_SK_STORAGE_GET_F_CREATE) - return (unsigned long)NULL; - - sdata = sk_storage_lookup(sk, map, true); - if (sdata) - return (unsigned long)sdata->data; - - if (flags == BPF_SK_STORAGE_GET_F_CREATE && - /* Cannot add new elem to a going away sk. - * Otherwise, the new elem may become a leak - * (and also other memory issues during map - * destruction). - */ - refcount_inc_not_zero(&sk->sk_refcnt)) { - sdata = sk_storage_update(sk, map, value, BPF_NOEXIST); - /* sk must be a fullsock (guaranteed by verifier), - * so sock_gen_put() is unnecessary. - */ - sock_put(sk); - return IS_ERR(sdata) ? - (unsigned long)NULL : (unsigned long)sdata->data; - } - - return (unsigned long)NULL; -} - -BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) -{ - if (refcount_inc_not_zero(&sk->sk_refcnt)) { - int err; - - err = sk_storage_delete(sk, map); - sock_put(sk); - return err; - } - - return -ENOENT; -} - -const struct bpf_map_ops sk_storage_map_ops = { - .map_alloc_check = bpf_sk_storage_map_alloc_check, - .map_alloc = bpf_sk_storage_map_alloc, - .map_free = bpf_sk_storage_map_free, - .map_get_next_key = notsupp_get_next_key, - .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, - .map_update_elem = bpf_fd_sk_storage_update_elem, - .map_delete_elem = bpf_fd_sk_storage_delete_elem, - .map_check_btf = bpf_sk_storage_map_check_btf, -}; - -const struct bpf_func_proto bpf_sk_storage_get_proto = { - .func = bpf_sk_storage_get, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_SOCKET, - .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, - .arg4_type = ARG_ANYTHING, -}; - -const struct bpf_func_proto bpf_sk_storage_delete_proto = { - .func = bpf_sk_storage_delete, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_SOCKET, -}; diff --git a/net/core/dev.c b/net/core/dev.c index 9dc4a585c8e8..9a0f4c34c632 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3085,7 +3085,7 @@ sw_checksum: } EXPORT_SYMBOL(skb_csum_hwoffload_help); -static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -3113,6 +3113,9 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device __skb_linearize(skb)) goto out_kfree_skb; + if (validate_xmit_xfrm(skb, features)) + goto out_kfree_skb; + /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. 
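On the checksum comment above: the fixup that follows it is the usual "offload when the device advertises the feature, otherwise finish in software" gate. Roughly, as a sketch with hypothetical flag and field names:

	#include <stdbool.h>

	#define DEV_CAN_CSUM 0x1	/* stand-in for a netdev feature bit */

	struct pkt {
		bool needs_csum;	/* CHECKSUM_PARTIAL in skb terms */
		unsigned int dev_features;
	};

	static void finish_csum(struct pkt *p)
	{
		if (!p->needs_csum)
			return;			/* nothing to complete */
		if (p->dev_features & DEV_CAN_CSUM)
			return;			/* hardware fills it in at xmit time */
		/* a software fallback would compute the checksum here */
		p->needs_csum = false;
	}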
@@ -3129,8 +3132,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device } } - skb = validate_xmit_xfrm(skb, features, again); - return skb; out_kfree_skb: @@ -3140,7 +3141,7 @@ out_null: return NULL; } -struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *next, *head = NULL, *tail; @@ -3151,7 +3152,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d /* in case skb won't be segmented, point to itself */ skb->prev = skb; - skb = validate_xmit_skb(skb, dev, again); + skb = validate_xmit_skb(skb, dev); if (!skb) continue; @@ -3481,7 +3482,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) struct netdev_queue *txq; struct Qdisc *q; int rc = -ENOMEM; - bool again = false; skb_reset_mac_header(skb); @@ -3543,7 +3543,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) XMIT_RECURSION_LIMIT)) goto recursion_alert; - skb = validate_xmit_skb(skb, dev, &again); + skb = validate_xmit_skb(skb, dev); if (!skb) goto out; @@ -3942,35 +3942,11 @@ drop: return NET_RX_DROP; } -static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct netdev_rx_queue *rxqueue; - - rxqueue = dev->_rx; - - if (skb_rx_queue_recorded(skb)) { - u16 index = skb_get_rx_queue(skb); - if (unlikely(index >= dev->real_num_rx_queues)) { - WARN_ONCE(dev->real_num_rx_queues > 1, - "%s received packet on queue %u, but number " - "of RX queues is %u\n", - dev->name, index, dev->real_num_rx_queues); - return rxqueue; /* Return first rxqueue */ - } - - rxqueue += index; - } - - return rxqueue; -} - static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { - struct netdev_rx_queue *rxqueue; - u32 metalen, act = XDP_DROP; struct xdp_buff xdp; + u32 act = XDP_DROP; void *orig_data; int hlen, off; u32 mac_len; @@ -3981,24 +3957,8 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_cloned(skb)) return XDP_PASS; - /* XDP packets must be linear and must have sufficient headroom - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also - * native XDP provides, thus we need to do it here as well. - */ - if (skb_is_nonlinear(skb) || - skb_headroom(skb) < XDP_PACKET_HEADROOM) { - int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); - int troom = skb->tail + skb->data_len - skb->end; - /* In case we have to go down the path and also linearize, - * then lets do the pskb_expand_head() work just once here. - */ - if (pskb_expand_head(skb, - hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, - troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) - goto do_drop; - if (troom > 0 && __skb_linearize(skb)) - goto do_drop; - } + if (skb_linearize(skb)) + goto do_drop; /* The XDP program wants to see the packet starting at the MAC * header.
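The pointer setup that follows this comment is worth seeing in isolation: generic XDP rewinds skb->data by the MAC length so the program sees the frame from the Ethernet header, and data_end covers that header plus the linear payload. A freestanding sketch of the arithmetic (hypothetical struct; linear_len plays the role of skb_headlen()):

	#include <stdint.h>

	struct xdp_view {
		uint8_t *data_hard_start;
		uint8_t *data;
		uint8_t *data_end;
	};

	static void xdp_view_init(struct xdp_view *x, uint8_t *buf,
				  unsigned int headroom, unsigned int mac_len,
				  unsigned int linear_len)
	{
		x->data_hard_start = buf;			/* skb->head */
		x->data = buf + headroom - mac_len;		/* rewound to the MAC header */
		x->data_end = x->data + mac_len + linear_len;	/* end of linear data */
	}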
@@ -4007,13 +3967,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, hlen = skb_headlen(skb) + mac_len; xdp.data = skb->data - mac_len; xdp.data_end = xdp.data + hlen; - xdp.data_meta = xdp.data; xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; - rxqueue = netif_get_rxqueue(skb); - xdp.rxq = &rxqueue->xdp_rxq; - act = bpf_prog_run_xdp(xdp_prog, &xdp); off = xdp.data - orig_data; @@ -4027,11 +3983,8 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); - break; + /* fall through */ case XDP_PASS: - metalen = xdp.data - xdp.data_meta; - if (metalen) - skb_metadata_set(skb, metalen); break; default: @@ -4254,7 +4207,6 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) spin_unlock(root_lock); } } - xfrm_dev_backlog(sd); } #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) @@ -4631,7 +4583,7 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } -static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) +static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; @@ -4860,7 +4812,6 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); - diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); @@ -7191,26 +7142,26 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) +u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id) { - struct netdev_bpf xdp; + struct netdev_xdp xdp; memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG; /* Query must always succeed. 
*/ - WARN_ON(bpf_op(dev, &xdp) < 0); + WARN_ON(xdp_op(dev, &xdp) < 0); if (prog_id) *prog_id = xdp.prog_id; return xdp.prog_attached; } -static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, +static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { - struct netdev_bpf xdp; + struct netdev_xdp xdp; memset(&xdp, 0, sizeof(xdp)); if (flags & XDP_FLAGS_HW_MODE) @@ -7221,7 +7172,7 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, xdp.flags = flags; xdp.prog = prog; - return bpf_op(dev, &xdp); + return xdp_op(dev, &xdp); } /** @@ -7238,24 +7189,24 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, { const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - bpf_op_t bpf_op, bpf_chk; + xdp_op_t xdp_op, xdp_chk; int err; ASSERT_RTNL(); - bpf_op = bpf_chk = ops->ndo_bpf; - if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) + xdp_op = xdp_chk = ops->ndo_xdp; + if (!xdp_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; - if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE)) - bpf_op = generic_xdp_install; - if (bpf_op == bpf_chk) - bpf_chk = generic_xdp_install; + if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) + xdp_op = generic_xdp_install; + if (xdp_op == xdp_chk) + xdp_chk = generic_xdp_install; if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) + if (xdp_chk && __dev_xdp_attached(dev, xdp_chk, NULL)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op, NULL)) + __dev_xdp_attached(dev, xdp_op, NULL)) return -EBUSY; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); @@ -7263,7 +7214,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return PTR_ERR(prog); } - err = dev_xdp_install(dev, bpf_op, extack, flags, prog); + err = dev_xdp_install(dev, xdp_op, extack, flags, prog); if (err < 0 && prog) bpf_prog_put(prog); @@ -7636,12 +7587,12 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); +#ifdef CONFIG_SYSFS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); - int err = 0; BUG_ON(count < 1); @@ -7651,38 +7602,11 @@ static int netif_alloc_rx_queues(struct net_device *dev) dev->_rx = rx; - for (i = 0; i < count; i++) { - rx[i].dev = dev; - - /* XDP RX-queue setup */ - err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); - if (err < 0) - goto err_rxq_info; - } - return 0; - -err_rxq_info: - /* Rollback successful reg's and free other resources */ - while (i--) - xdp_rxq_info_unreg(&rx[i].xdp_rxq); - kfree(dev->_rx); - dev->_rx = NULL; - return err; -} - -static void netif_free_rx_queues(struct net_device *dev) -{ - unsigned int i, count = dev->num_rx_queues; - struct netdev_rx_queue *rx; - /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ - if (!dev->_rx) - return; - - rx = dev->_rx; for (i = 0; i < count; i++) - xdp_rxq_info_unreg(&rx[i].xdp_rxq); - + rx[i].dev = dev; + return 0; } +#endif static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) @@ -7693,7 +7617,9 @@ static void netdev_init_one_queue(struct net_device *dev, queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; +#ifdef CONFIG_BQL dql_init(&queue->dql, HZ); +#endif 
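With the per-queue XDP state gone, netif_alloc_rx_queues() is again a plain allocate-and-backlink loop. Its shape, as a freestanding sketch (calloc() standing in for kvzalloc()):

	#include <stdlib.h>

	struct dev;

	struct rxq {
		struct dev *dev;	/* backpointer, as in netdev_rx_queue */
	};

	static struct rxq *alloc_rxqs(struct dev *d, unsigned int count)
	{
		struct rxq *rx = calloc(count, sizeof(*rx));
		unsigned int i;

		if (!rx)
			return NULL;
		for (i = 0; i < count; i++)
			rx[i].dev = d;
		return rx;
	}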
} static void netif_free_tx_queues(struct net_device *dev) @@ -8326,10 +8252,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (netif_alloc_netdev_queues(dev)) goto free_all; +#ifdef CONFIG_SYSFS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; +#endif strcpy(dev->name, name); dev->name_assign_type = name_assign_type; @@ -8369,7 +8297,9 @@ void free_netdev(struct net_device *dev) might_sleep(); netif_free_tx_queues(dev); - netif_free_rx_queues(dev); +#ifdef CONFIG_SYSFS + kvfree(dev->_rx); +#endif kfree(rcu_dereference_protected(dev->ingress_queue, 1)); @@ -8971,9 +8901,6 @@ static int __init net_dev_init(void) skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); -#ifdef CONFIG_XFRM_OFFLOAD - skb_queue_head_init(&sd->xfrm_backlog); -#endif INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS diff --git a/net/core/filter.c b/net/core/filter.c index 133e8827eff6..9ac44ebb694f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -33,12 +33,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -58,11 +56,6 @@ #include #include #include -#include -#include -#include -#include -#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -1429,7 +1422,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, { int err = __bpf_try_make_writable(skb, write_len); - bpf_compute_data_pointers(skb); + bpf_compute_data_end(skb); return err; } @@ -1520,26 +1513,24 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { - u8 *end = skb_tail_pointer(skb); - u8 *net = skb_network_header(skb); - u8 *mac = skb_mac_header(skb); u8 *ptr; - if (unlikely(offset > 0xffff || len > (end - mac))) + if (unlikely(offset > 0xffff || len > skb_headlen(skb))) goto err_clear; switch (start_header) { case BPF_HDR_START_MAC: - ptr = mac + offset; + ptr = skb_mac_header(skb) + offset; break; case BPF_HDR_START_NET: - ptr = net + offset; + ptr = skb_network_header(skb) + offset; break; default: goto err_clear; } - if (likely(ptr >= mac && ptr + len <= end)) { + if (likely(ptr >= skb_mac_header(skb) && + ptr + len <= skb_tail_pointer(skb))) { memcpy(to, ptr, len); return 0; } @@ -1582,49 +1573,6 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = { .arg2_type = ARG_ANYTHING, }; -static inline int sk_skb_try_make_writable(struct sk_buff *skb, - unsigned int write_len) -{ - int err = __bpf_try_make_writable(skb, write_len); - bpf_compute_data_end_sk_skb(skb); - return err; -} - -BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) -{ - /* Idea is the following: should the needed direct read/write - * test fail during runtime, we can pull in more data and redo - * again, since implicitly, we invalidate previous checks here. - * - * Or, since we know how much we need to make read/writeable, - * this can be done once at the program beginning for direct - * access case. By this we overcome limitations of only current - * headroom being accessible. - */ - return sk_skb_try_make_writable(skb, len ? 
: skb_headlen(skb)); -} - -static const struct bpf_func_proto sk_skb_pull_data_proto = { - .func = sk_skb_pull_data, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, -}; - -BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) -{ - sk = sk_to_full_sk(sk); - return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; -} - -static const struct bpf_func_proto bpf_sk_fullsock_proto = { - .func = bpf_sk_fullsock, - .gpl_only = false, - .ret_type = RET_PTR_TO_SOCKET_OR_NULL, - .arg1_type = ARG_PTR_TO_SOCK_COMMON, -}; - BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -1953,6 +1901,47 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, + struct bpf_map *, map, u32, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + /* If user passes invalid input drop the packet. */ + if (unlikely(flags)) + return SK_DROP; + + tcb->bpf.key = key; + tcb->bpf.flags = flags; + tcb->bpf.map = map; + + return SK_PASS; +} + +struct sock *do_sk_redirect_map(struct sk_buff *skb) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + struct sock *sk = NULL; + + if (tcb->bpf.map) { + sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key); + + tcb->bpf.key = 0; + tcb->bpf.map = NULL; + } + + return sk; +} + +static const struct bpf_func_proto bpf_sk_redirect_map_proto = { + .func = bpf_sk_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -2041,7 +2030,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); - bpf_compute_data_pointers(skb); + bpf_compute_data_end(skb); return ret; } @@ -2063,7 +2052,7 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); - bpf_compute_data_pointers(skb); + bpf_compute_data_end(skb); return ret; } @@ -2268,7 +2257,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); - bpf_compute_data_pointers(skb); + bpf_compute_data_end(skb); return ret; } @@ -2402,7 +2391,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) ret = shrink ? 
bpf_skb_net_shrink(skb, len_diff_abs) : bpf_skb_net_grow(skb, len_diff_abs); - bpf_compute_data_pointers(skb); + bpf_compute_data_end(skb); return ret; } @@ -2455,8 +2444,8 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) return __skb_trim_rcsum(skb, new_len); } -static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, - u64 flags) +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 min_len = __bpf_skb_min_len(skb); @@ -2492,14 +2481,8 @@ static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } - return ret; -} -BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, - u64, flags) -{ - int ret = __bpf_skb_change_tail(skb, new_len, flags); - bpf_compute_data_pointers(skb); + bpf_compute_data_end(skb); return ret; } @@ -2512,25 +2495,8 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) -{ - int ret = __bpf_skb_change_tail(skb, new_len, flags); - bpf_compute_data_end_sk_skb(skb); - return ret; -} - -static const struct bpf_func_proto sk_skb_change_tail_proto = { - .func = sk_skb_change_tail, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_ANYTHING, -}; - -static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, - u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; @@ -2557,15 +2523,8 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, skb_reset_mac_len(skb); } - return ret; -} -BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, - u64, flags) -{ - int ret = __bpf_skb_change_head(skb, head_room, flags); - - bpf_compute_data_pointers(skb); - return ret; + bpf_compute_data_end(skb); + return 0; } static const struct bpf_func_proto bpf_skb_change_head_proto = { @@ -2577,43 +2536,15 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, - u64, flags) -{ - int ret = __bpf_skb_change_head(skb, head_room, flags); - bpf_compute_data_end_sk_skb(skb); - return ret; -} - -static const struct bpf_func_proto sk_skb_change_head_proto = { - .func = sk_skb_change_head, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_ANYTHING, -}; - -static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) -{ - return xdp_data_meta_unsupported(xdp) ? 
0 : - xdp->data - xdp->data_meta; -} - BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { - unsigned long metalen = xdp_get_metalen(xdp); - void *data_start = xdp->data_hard_start + metalen; void *data = xdp->data + offset; - if (unlikely(data < data_start || + if (unlikely(data < xdp->data_hard_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; - if (metalen) - memmove(xdp->data_meta + offset, - xdp->data_meta, metalen); - xdp->data_meta += offset; + xdp->data = data; return 0; } @@ -2626,29 +2557,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) -{ - void *meta = xdp->data_meta + offset; - unsigned long metalen = xdp->data - meta; - if (xdp_data_meta_unsupported(xdp)) - return -ENOTSUPP; - if (unlikely(meta < xdp->data_hard_start || - meta > xdp->data)) - return -EINVAL; - if (unlikely((metalen & (sizeof(__u32) - 1)) || - (metalen > 32))) - return -EACCES; - xdp->data_meta = meta; - return 0; -} -static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { - .func = bpf_xdp_adjust_meta, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, -}; - static int __bpf_tx_xdp(struct net_device *dev, struct bpf_map *map, struct xdp_buff *xdp, @@ -2707,10 +2615,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, goto err; } - if (map->map_type == BPF_MAP_TYPE_DEVMAP) - fwd = __dev_map_lookup_elem(map, index); - else if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) - fwd = __dev_map_hash_lookup_elem(map, index); + fwd = __dev_map_lookup_elem(map, index); if (!fwd) { err = -EINVAL; goto err; @@ -2781,10 +2686,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, map = NULL; goto err; } - if (map->map_type == BPF_MAP_TYPE_DEVMAP) - fwd = __dev_map_lookup_elem(map, index); - else if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) - fwd = __dev_map_hash_lookup_elem(map, index); + fwd = __dev_map_lookup_elem(map, index); } else { fwd = dev_get_by_index_rcu(dev_net(dev), index); } @@ -2866,349 +2768,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .arg3_type = ARG_ANYTHING, }; -#ifdef CONFIG_INET -struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, - int dif, int sdif, u8 family, u8 proto) -{ - bool refcounted = false; - struct sock *sk = NULL; - if (family == AF_INET) { - __be32 src4 = tuple->ipv4.saddr; - __be32 dst4 = tuple->ipv4.daddr; - - if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0, - src4, tuple->ipv4.sport, - dst4, tuple->ipv4.dport, - dif, sdif, &refcounted); - else - sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, - dst4, tuple->ipv4.dport, - dif, sdif, &udp_table, NULL); -#if IS_ENABLED(CONFIG_IPV6) - } else { - struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; - struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; - - if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, - src6, tuple->ipv6.sport, - dst6, tuple->ipv6.dport, - dif, sdif, &refcounted); - else - sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport, - dst6, tuple->ipv6.dport, - dif, sdif, &udp_table, NULL); -#endif - } - if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { - WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); - sk = NULL; - } - return sk; -} -/* bpf_sk_lookup performs the core lookup for different types of sockets, - * taking a 
reference on the socket if it doesn't have the flag SOCK_RCU_FREE. - * Returns the socket as an 'unsigned long' to simplify the casting in the - * callers to satisfy BPF_CALL declarations. - */ -static unsigned long -__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, - u64 flags) -{ - struct sock *sk = NULL; - u8 family = AF_UNSPEC; - struct net *net; - int sdif; - - family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; - - if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) - goto out; - - if (family == AF_INET) - sdif = inet_sdif(skb); - else - sdif = inet6_sdif(skb); - - if (netns_id) { - net = get_net_ns_by_id(caller_net, netns_id); - if (unlikely(!net)) - goto out; - sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); - put_net(net); - } else { - net = caller_net; - sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); - } - if (sk) - sk = sk_to_full_sk(sk); -out: - return (unsigned long) sk; -} - -static unsigned long -bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - u8 proto, u64 netns_id, u64 flags) -{ - struct net *caller_net; - int ifindex; - - if (skb->dev) { - caller_net = dev_net(skb->dev); - ifindex = skb->dev->ifindex; - } else { - caller_net = sock_net(skb->sk); - ifindex = 0; - } - - return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, - proto, netns_id, flags); -} - -BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, - struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) -{ - return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); -} -static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { - .func = bpf_sk_lookup_tcp, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_PTR_TO_SOCKET_OR_NULL, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, - .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_ANYTHING, - .arg5_type = ARG_ANYTHING, -}; -BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, - struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) -{ - return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); -} -static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { - .func = bpf_sk_lookup_udp, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_PTR_TO_SOCKET_OR_NULL, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, - .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_ANYTHING, - .arg5_type = ARG_ANYTHING, -}; -BPF_CALL_1(bpf_sk_release, struct sock *, sk) -{ - if (!sock_flag(sk, SOCK_RCU_FREE)) - sock_gen_put(sk); - return 0; -} -static const struct bpf_func_proto bpf_sk_release_proto = { - .func = bpf_sk_release, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCKET, -}; - -BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, - struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) -{ - struct net *caller_net = dev_net(ctx->rxq->dev); - int ifindex = ctx->rxq->dev->ifindex; - - return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, - IPPROTO_UDP, netns_id, flags); -} - -static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { - .func = bpf_xdp_sk_lookup_udp, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_PTR_TO_SOCKET_OR_NULL, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, - .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_ANYTHING, - .arg5_type = ARG_ANYTHING, -}; - -BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, - 
struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) -{ - struct net *caller_net = dev_net(ctx->rxq->dev); - int ifindex = ctx->rxq->dev->ifindex; - - return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, - IPPROTO_TCP, netns_id, flags); -} - -static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { - .func = bpf_xdp_sk_lookup_tcp, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_PTR_TO_SOCKET_OR_NULL, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, - .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_ANYTHING, - .arg5_type = ARG_ANYTHING, -}; - -BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, - struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) -{ - return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, - IPPROTO_TCP, netns_id, flags); -} - -static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { - .func = bpf_sock_addr_sk_lookup_tcp, - .gpl_only = false, - .ret_type = RET_PTR_TO_SOCKET_OR_NULL, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, - .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_ANYTHING, - .arg5_type = ARG_ANYTHING, -}; - -BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, - struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) -{ - return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, - IPPROTO_UDP, netns_id, flags); -} - -static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { - .func = bpf_sock_addr_sk_lookup_udp, - .gpl_only = false, - .ret_type = RET_PTR_TO_SOCKET_OR_NULL, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, - .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_ANYTHING, - .arg5_type = ARG_ANYTHING, -}; - -bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked)) - return false; - if (off % size != 0) - return false; - switch (off) { - case offsetof(struct bpf_tcp_sock, bytes_received): - case offsetof(struct bpf_tcp_sock, bytes_acked): - return size == sizeof(__u64); - default: - return size == sizeof(__u32); - } -} - -#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT) \ -do { \ - switch (si->off) { \ - case offsetof(md_type, snd_cwnd): \ - CONVERT(snd_cwnd); break; \ - case offsetof(md_type, srtt_us): \ - CONVERT(srtt_us); break; \ - case offsetof(md_type, snd_ssthresh): \ - CONVERT(snd_ssthresh); break; \ - case offsetof(md_type, rcv_nxt): \ - CONVERT(rcv_nxt); break; \ - case offsetof(md_type, snd_nxt): \ - CONVERT(snd_nxt); break; \ - case offsetof(md_type, snd_una): \ - CONVERT(snd_una); break; \ - case offsetof(md_type, mss_cache): \ - CONVERT(mss_cache); break; \ - case offsetof(md_type, ecn_flags): \ - CONVERT(ecn_flags); break; \ - case offsetof(md_type, rate_delivered): \ - CONVERT(rate_delivered); break; \ - case offsetof(md_type, rate_interval_us): \ - CONVERT(rate_interval_us); break; \ - case offsetof(md_type, packets_out): \ - CONVERT(packets_out); break; \ - case offsetof(md_type, retrans_out): \ - CONVERT(retrans_out); break; \ - case offsetof(md_type, total_retrans): \ - CONVERT(total_retrans); break; \ - case offsetof(md_type, segs_in): \ - CONVERT(segs_in); break; \ - case offsetof(md_type, data_segs_in): \ - CONVERT(data_segs_in); break; \ - case offsetof(md_type, segs_out): \ - CONVERT(segs_out); break; \ - case offsetof(md_type, data_segs_out): \ - CONVERT(data_segs_out); break; \ - case 
offsetof(md_type, lost_out): \ - CONVERT(lost_out); break; \ - case offsetof(md_type, sacked_out): \ - CONVERT(sacked_out); break; \ - case offsetof(md_type, bytes_received): \ - CONVERT(bytes_received); break; \ - case offsetof(md_type, bytes_acked): \ - CONVERT(bytes_acked); break; \ - } \ -} while (0) - -u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, u32 *target_size) -{ - struct bpf_insn *insn = insn_buf; - -#define BPF_TCP_SOCK_GET_COMMON(FIELD) \ - do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) > \ - FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ - si->dst_reg, si->src_reg, \ - offsetof(struct tcp_sock, FIELD)); \ - } while (0) - CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock, - BPF_TCP_SOCK_GET_COMMON); - if (insn > insn_buf) - return insn - insn_buf; - - switch (si->off) { - case offsetof(struct bpf_tcp_sock, rtt_min): - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != - sizeof(struct minmax)); - BUILD_BUG_ON(sizeof(struct minmax) < - sizeof(struct minmax_sample)); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct tcp_sock, rtt_min) + - offsetof(struct minmax_sample, v)); - break; - } - return insn - insn_buf; -} - -BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) -{ - sk = sk_to_full_sk(sk); - - if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) - return (unsigned long)sk; - - return (unsigned long)NULL; -} - -const struct bpf_func_proto bpf_tcp_sock_proto = { - .func = bpf_tcp_sock, - .gpl_only = false, - .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, - .arg1_type = ARG_PTR_TO_SOCK_COMMON, -}; - -#endif /* CONFIG_INET */ - bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || @@ -3216,17 +2775,13 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_store_bytes || func == bpf_skb_change_proto || func == bpf_skb_change_head || - func == sk_skb_change_head || func == bpf_skb_change_tail || - func == sk_skb_change_tail || func == bpf_skb_adjust_room || func == bpf_skb_pull_data || - func == sk_skb_pull_data || func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta) + func == bpf_xdp_adjust_head) return true; return false; @@ -3694,52 +3249,6 @@ static const struct bpf_func_proto bpf_setsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; -const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; -EXPORT_SYMBOL_GPL(ipv6_bpf_stub); - -BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, - int, addr_len) -{ -#ifdef CONFIG_INET - struct sock *sk = ctx->sk; - int err; - - /* Binding to port can be expensive so it's prohibited in the helper. - * Only binding to IP is supported. 
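The helper being removed here only ever allowed binding to an address, never to a port. Its IPv4 guard, lifted out as a userspace sketch (the kernel additionally handles AF_INET6 and then calls __inet_bind()):

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>
	#include <errno.h>

	static int check_bind_addr(const struct sockaddr *addr, socklen_t len)
	{
		const struct sockaddr_in *in = (const struct sockaddr_in *)addr;

		if (addr->sa_family != AF_INET)
			return -EAFNOSUPPORT;
		if (len < sizeof(*in))
			return -EINVAL;		/* truncated sockaddr */
		if (in->sin_port != htons(0))
			return -EINVAL;		/* binding to a port is prohibited */
		return 0;
	}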
- */ - err = -EINVAL; - if (addr->sa_family == AF_INET) { - if (addr_len < sizeof(struct sockaddr_in)) - return err; - if (((struct sockaddr_in *)addr)->sin_port != htons(0)) - return err; - return __inet_bind(sk, addr, addr_len, true, false); -#if IS_ENABLED(CONFIG_IPV6) - } else if (addr->sa_family == AF_INET6) { - if (addr_len < SIN6_LEN_RFC2133) - return err; - if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) - return err; - /* ipv6_bpf_stub cannot be NULL, since it's called from - * bpf_cgroup_inet6_connect hook and ipv6 is already loaded - */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); -#endif /* CONFIG_IPV6 */ - } -#endif /* CONFIG_INET */ - - return -EAFNOSUPPORT; -} - -static const struct bpf_func_proto bpf_bind_proto = { - .func = bpf_bind, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, - .arg3_type = ARG_CONST_SIZE, -}; - static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3762,25 +3271,16 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; - default: - break; - } - if (!capable(CAP_SYS_ADMIN)) - return NULL; - switch (func_id) { - case BPF_FUNC_spin_lock: - return &bpf_spin_lock_proto; - case BPF_FUNC_spin_unlock: - return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); default: return NULL; } } static const struct bpf_func_proto * -sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +sock_filter_func_proto(enum bpf_func_id func_id) { switch (func_id) { /* inet and inet6 sockets are created in a process @@ -3788,47 +3288,13 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) */ case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * -sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; - case BPF_FUNC_bind: - switch (prog->expected_attach_type) { - case BPF_CGROUP_INET4_CONNECT: - case BPF_CGROUP_INET6_CONNECT: - return &bpf_bind_proto; - default: - return NULL; - } -#ifdef CONFIG_INET - case BPF_FUNC_sk_lookup_tcp: - return &bpf_sock_addr_sk_lookup_tcp_proto; - case BPF_FUNC_sk_lookup_udp: - return &bpf_sock_addr_sk_lookup_udp_proto; - case BPF_FUNC_sk_release: - return &bpf_sk_release_proto; -#endif /* CONFIG_INET */ - default: - return bpf_base_func_proto(func_id); - } -} - -static const struct bpf_func_proto * -sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +sk_filter_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3844,32 +3310,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } -const struct bpf_func_proto bpf_sk_storage_get_proto __weak; -const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; - static const struct bpf_func_proto * -cg_skb_func_proto(enum bpf_func_id func_id, const struct 
bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; - case BPF_FUNC_sk_fullsock: - return &bpf_sk_fullsock_proto; - case BPF_FUNC_sk_storage_get: - return &bpf_sk_storage_get_proto; - case BPF_FUNC_sk_storage_delete: - return &bpf_sk_storage_delete_proto; -#ifdef CONFIG_INET - case BPF_FUNC_tcp_sock: - return &bpf_tcp_sock_proto; -#endif - default: - return sk_filter_func_proto(func_id, prog); - } -} - -static const struct bpf_func_proto * -tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +tc_cls_act_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3934,27 +3376,13 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; - case BPF_FUNC_sk_fullsock: - return &bpf_sk_fullsock_proto; - case BPF_FUNC_sk_storage_get: - return &bpf_sk_storage_get_proto; - case BPF_FUNC_sk_storage_delete: - return &bpf_sk_storage_delete_proto; - case BPF_FUNC_sk_lookup_tcp: - return &bpf_sk_lookup_tcp_proto; - case BPF_FUNC_sk_lookup_udp: - return &bpf_sk_lookup_udp_proto; - case BPF_FUNC_sk_release: - return &bpf_sk_release_proto; - case BPF_FUNC_tcp_sock: - return &bpf_tcp_sock_proto; default: return bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * -xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +xdp_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -3963,8 +3391,6 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; - case BPF_FUNC_xdp_adjust_meta: - return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: @@ -3974,11 +3400,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } -const struct bpf_func_proto bpf_sock_map_update_proto __weak; -const struct bpf_func_proto bpf_sock_hash_update_proto __weak; - static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +lwt_inout_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -4005,44 +3428,19 @@ lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } static const struct bpf_func_proto * -sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) + sock_ops_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_setsockopt_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; - case BPF_FUNC_sock_hash_update: - return &bpf_sock_hash_update_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } } -const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; -const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; - -static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_msg_redirect_map: - return &bpf_msg_redirect_map_proto; - case BPF_FUNC_msg_redirect_hash: - return &bpf_msg_redirect_hash_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; - default: - return bpf_base_func_proto(func_id); - } -} - -const struct 
bpf_func_proto bpf_sk_redirect_map_proto __weak; -const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; - -static const struct bpf_func_proto * -sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -4050,45 +3448,24 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: - return &sk_skb_pull_data_proto; + return &bpf_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: - return &sk_skb_change_tail_proto; + return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: - return &sk_skb_change_head_proto; + return &bpf_skb_change_head_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; - case BPF_FUNC_sk_redirect_hash: - return &bpf_sk_redirect_hash_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; - case BPF_FUNC_sk_lookup_tcp: - return &bpf_sk_lookup_tcp_proto; - case BPF_FUNC_sk_lookup_udp: - return &bpf_sk_lookup_udp_proto; - case BPF_FUNC_sk_release: - return &bpf_sk_release_proto; default: return bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * -flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_skb_load_bytes: - return &bpf_skb_load_bytes_proto; - default: - return bpf_base_func_proto(func_id); - } -} - -static const struct bpf_func_proto * -lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +lwt_xmit_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_skb_get_tunnel_key: @@ -4117,21 +3494,12 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_l4_csum_replace_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; -#ifdef CONFIG_INET - case BPF_FUNC_sk_lookup_udp: - return &bpf_xdp_sk_lookup_udp_proto; - case BPF_FUNC_sk_lookup_tcp: - return &bpf_xdp_sk_lookup_tcp_proto; - case BPF_FUNC_sk_release: - return &bpf_sk_release_proto; -#endif default: - return lwt_inout_func_proto(func_id, prog); + return lwt_inout_func_proto(func_id); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -4153,20 +3521,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): - case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; break; - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): - if (size != sizeof(__u64)) - return false; - break; - case offsetof(struct __sk_buff, sk): - if (type == BPF_WRITE || size != sizeof(__u64)) - return false; - info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; - break; default: /* Only narrow read access allowed for now. 
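The "narrow read" rule the comment above refers to: a load smaller than the field is tolerated as long as it stays inside the field, while writes must cover the field exactly. As a compact sketch, with offsets taken relative to the field start (the verifier works with absolute context offsets, so this is a simplification):

	#include <stdbool.h>

	static bool ctx_access_ok(unsigned int off, unsigned int size,
				  unsigned int field_size, bool is_write)
	{
		if (is_write)
			return off == 0 && size == field_size;
		return size <= field_size && off + size <= field_size;
	}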
*/ if (type == BPF_WRITE) { @@ -4184,15 +3542,12 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): - case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -4206,19 +3561,16 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return bpf_skb_is_valid_access(off, size, type, prog, info); + return bpf_skb_is_valid_access(off, size, type, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): - case bpf_ctx_range(struct __sk_buff, data_meta): return false; } @@ -4242,102 +3594,33 @@ static bool lwt_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, prog, info); -} - - -/* Attach type specific accesses */ -static bool __sock_filter_check_attach_type(int off, - enum bpf_access_type access_type, - enum bpf_attach_type attach_type) -{ - switch (off) { - case offsetof(struct bpf_sock, bound_dev_if): - case offsetof(struct bpf_sock, mark): - case offsetof(struct bpf_sock, priority): - switch (attach_type) { - case BPF_CGROUP_INET_SOCK_CREATE: - goto full_access; - default: - return false; - } - case bpf_ctx_range(struct bpf_sock, src_ip4): - switch (attach_type) { - case BPF_CGROUP_INET4_POST_BIND: - goto read_only; - default: - return false; - } - case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): - switch (attach_type) { - case BPF_CGROUP_INET6_POST_BIND: - goto read_only; - default: - return false; - } - case bpf_ctx_range(struct bpf_sock, src_port): - switch (attach_type) { - case BPF_CGROUP_INET4_POST_BIND: - case BPF_CGROUP_INET6_POST_BIND: - goto read_only; - default: - return false; - } - } -read_only: - return access_type == BPF_READ; -full_access: - return true; -} - -static bool __sock_filter_check_size(int off, int size, - struct bpf_insn_access_aux *info) -{ - const int size_default = sizeof(__u32); - - switch (off) { - case bpf_ctx_range(struct bpf_sock, src_ip4): - case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): - bpf_ctx_record_field_size(info, size_default); - return bpf_ctx_narrow_access_ok(off, size, size_default); - } - - return size == size_default; -} - -bool bpf_sock_common_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - switch (off) { - case bpf_ctx_range_till(struct bpf_sock, type, priority): - return false; - default: - return bpf_sock_is_valid_access(off, size, type, info); - } -} - -bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - if (off < 0 || off >= sizeof(struct bpf_sock)) - return false; - if (off % size != 0) - return false; - if (!__sock_filter_check_size(off, size, info)) - return false; - return true; + return bpf_skb_is_valid_access(off, size, type, info); } static bool sock_filter_is_valid_access(int off, int 
size, enum bpf_access_type type, - const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - if (!bpf_sock_is_valid_access(off, size, type, info)) + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + case offsetof(struct bpf_sock, mark): + case offsetof(struct bpf_sock, priority): + break; + default: + return false; + } + } + + if (off < 0 || off + size > sizeof(struct bpf_sock)) return false; - return __sock_filter_check_attach_type(off, type, - prog->expected_attach_type); + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; } static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, @@ -4387,7 +3670,6 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { @@ -4407,18 +3689,14 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; - case bpf_ctx_range(struct __sk_buff, data_meta): - info->reg_type = PTR_TO_PACKET_META; - break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } - return bpf_skb_is_valid_access(off, size, type, prog, info); + return bpf_skb_is_valid_access(off, size, type, info); } static bool __is_valid_xdp_access(int off, int size) @@ -4435,7 +3713,6 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) @@ -4445,9 +3722,6 @@ static bool xdp_is_valid_access(int off, int size, case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; - case offsetof(struct xdp_md, data_meta): - info->reg_type = PTR_TO_PACKET_META; - break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -4466,95 +3740,6 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static bool sock_addr_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - const int size_default = sizeof(__u32); - - if (off < 0 || off >= sizeof(struct bpf_sock_addr)) - return false; - if (off % size != 0) - return false; - - /* Disallow access to IPv6 fields from IPv4 context and vice - * versa. 
- */ - switch (off) { - case bpf_ctx_range(struct bpf_sock_addr, user_ip4): - switch (prog->expected_attach_type) { - case BPF_CGROUP_INET4_BIND: - case BPF_CGROUP_INET4_CONNECT: - case BPF_CGROUP_UDP4_SENDMSG: - case BPF_CGROUP_UDP4_RECVMSG: - break; - default: - return false; - } - break; - case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): - switch (prog->expected_attach_type) { - case BPF_CGROUP_INET6_BIND: - case BPF_CGROUP_INET6_CONNECT: - case BPF_CGROUP_UDP6_SENDMSG: - case BPF_CGROUP_UDP6_RECVMSG: - break; - default: - return false; - } - break; - case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): - switch (prog->expected_attach_type) { - case BPF_CGROUP_UDP4_SENDMSG: - break; - default: - return false; - } - break; - case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], - msg_src_ip6[3]): - switch (prog->expected_attach_type) { - case BPF_CGROUP_UDP6_SENDMSG: - break; - default: - return false; - } - break; - } - - switch (off) { - case bpf_ctx_range(struct bpf_sock_addr, user_ip4): - case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): - case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): - case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], - msg_src_ip6[3]): - /* Only narrow read access allowed for now. */ - if (type == BPF_READ) { - bpf_ctx_record_field_size(info, size_default); - if (!bpf_ctx_narrow_access_ok(off, size, size_default)) - return false; - } else { - if (size != size_default) - return false; - } - break; - case bpf_ctx_range(struct bpf_sock_addr, user_port): - if (size != size_default) - return false; - break; - default: - if (type == BPF_READ) { - if (size != size_default) - return false; - } else { - return false; - } - } - - return true; -} - static bool __is_valid_sock_ops_access(int off, int size) { if (off < 0 || off >= sizeof(struct bpf_sock_ops)) @@ -4570,7 +3755,6 @@ static bool __is_valid_sock_ops_access(int off, int size) static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { @@ -4594,15 +3778,8 @@ static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, - const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - switch (off) { - case bpf_ctx_range(struct __sk_buff, tc_classid): - case bpf_ctx_range(struct __sk_buff, data_meta): - return false; - } - if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_index): @@ -4615,76 +3792,17 @@ static bool sk_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, mark): - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): - case bpf_ctx_range(struct __sk_buff, data): - info->reg_type = PTR_TO_PACKET; - break; - case bpf_ctx_range(struct __sk_buff, data_end): - info->reg_type = PTR_TO_PACKET_END; - break; - } - - return bpf_skb_is_valid_access(off, size, type, prog, info); -} - - -static bool sk_msg_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - if (type == BPF_WRITE) - return false; - - switch (off) { - case offsetof(struct sk_msg_md, data): - info->reg_type = PTR_TO_PACKET; - break; - case offsetof(struct sk_msg_md, data_end): - info->reg_type = PTR_TO_PACKET_END; - break; - } - - if (off < 0 || off >= sizeof(struct sk_msg_md)) - return false; - if (off % size != 0) - 
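The sock_addr checks being removed here gate each context field on the program's expected_attach_type: user_ip4, for instance, is only visible to the IPv4 attach points, and the post-bind source fields are read-only. Reduced to its core, the pattern is a per-field switch over attach type and access direction; a toy model with invented enum values:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of gating context fields by attach type: a field
 * may be fully accessible, read-only, or hidden depending on where the
 * program attaches. All names here are made up for the example. */
enum attach_type { ATTACH_SOCK_CREATE, ATTACH_INET4_POST_BIND };
enum access_dir  { ACCESS_READ, ACCESS_WRITE };
enum ctx_field   { FIELD_MARK, FIELD_SRC_IP4 };

static bool field_access_ok(enum ctx_field f, enum access_dir a,
			    enum attach_type t)
{
	switch (f) {
	case FIELD_MARK:	/* settable only while the socket is created */
		return t == ATTACH_SOCK_CREATE;
	case FIELD_SRC_IP4:	/* read-only, and only after a v4 bind */
		return t == ATTACH_INET4_POST_BIND && a == ACCESS_READ;
	}
	return false;
}

int main(void)
{
	printf("%d\n", field_access_ok(FIELD_SRC_IP4, ACCESS_WRITE,
				       ATTACH_INET4_POST_BIND));	/* 0 */
	return 0;
}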
return false; - if (size != sizeof(__u64)) - return false; - - return true; -} - -static bool flow_dissector_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - if (type == BPF_WRITE) { - switch (off) { - case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): - break; - default: - return false; - } - } - - switch (off) { - case bpf_ctx_range(struct __sk_buff, data): - info->reg_type = PTR_TO_PACKET; - break; - case bpf_ctx_range(struct __sk_buff, data_end): - info->reg_type = PTR_TO_PACKET_END; - break; - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): - info->reg_type = PTR_TO_FLOW_KEYS; - break; case bpf_ctx_range(struct __sk_buff, tc_classid): - case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; } - return bpf_skb_is_valid_access(off, size, type, prog, info); + return bpf_skb_is_valid_access(off, size, type, info); } static u32 bpf_convert_ctx_access(enum bpf_access_type type, @@ -4832,15 +3950,6 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, offsetof(struct sk_buff, data)); break; - case offsetof(struct __sk_buff, data_meta): - off = si->off; - off -= offsetof(struct __sk_buff, data_meta); - off += offsetof(struct sk_buff, cb); - off += offsetof(struct bpf_skb_data_end, data_meta); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, - si->src_reg, off); - break; - case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); @@ -4981,31 +4090,17 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct sock_common, skc_num, 2, target_size)); break; - case offsetof(struct __sk_buff, flow_keys): - off = si->off; - off -= offsetof(struct __sk_buff, flow_keys); - off += offsetof(struct sk_buff, cb); - off += offsetof(struct qdisc_skb_cb, flow_keys); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, - si->src_reg, off); - break; - case offsetof(struct __sk_buff, sk): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), - si->dst_reg, si->src_reg, - offsetof(struct sk_buff, sk)); - break; } return insn - insn_buf; } -u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, u32 *target_size) +static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): @@ -5061,43 +4156,6 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; - - case offsetof(struct bpf_sock, src_ip4): - *insn++ = BPF_LDX_MEM( - BPF_SIZE(si->code), si->dst_reg, si->src_reg, - bpf_target_off(struct sock_common, skc_rcv_saddr, - FIELD_SIZEOF(struct sock_common, - skc_rcv_saddr), - target_size)); - break; - - case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): -#if IS_ENABLED(CONFIG_IPV6) - off = si->off; - off -= offsetof(struct bpf_sock, src_ip6[0]); - *insn++ = BPF_LDX_MEM( - BPF_SIZE(si->code), si->dst_reg, si->src_reg, - bpf_target_off( - struct sock_common, - 
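The convert_ctx_access() callbacks in this region are where the virtual context layout meets the real one: each load of a __sk_buff or bpf_sock field is rewritten into one or more instructions against the offsets of struct sk_buff or struct sock. A toy remapping that captures the idea, with both struct layouts invented for the example:

#include <stddef.h>
#include <stdio.h>

/* Toy model of ctx rewriting: programs see user_ctx, the kernel object
 * is kern_obj, and each exposed field is translated to the real offset
 * at verification time. */
struct user_ctx { unsigned int len; unsigned int mark; };
struct kern_obj { void *head; unsigned int mark; unsigned int len; };

static int convert_ctx_off(size_t user_off)
{
	switch (user_off) {
	case offsetof(struct user_ctx, len):
		return (int)offsetof(struct kern_obj, len);
	case offsetof(struct user_ctx, mark):
		return (int)offsetof(struct kern_obj, mark);
	default:
		return -1;	/* field not exposed to this program type */
	}
}

int main(void)
{
	printf("len: %zu -> %d\n", offsetof(struct user_ctx, len),
	       convert_ctx_off(offsetof(struct user_ctx, len)));
	printf("mark: %zu -> %d\n", offsetof(struct user_ctx, mark),
	       convert_ctx_off(offsetof(struct user_ctx, mark)));
	return 0;
}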
skc_v6_rcv_saddr.s6_addr32[0], - FIELD_SIZEOF(struct sock_common, - skc_v6_rcv_saddr.s6_addr32[0]), - target_size) + off); -#else - (void)off; - *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); -#endif - break; - - case offsetof(struct bpf_sock, src_port): - *insn++ = BPF_LDX_MEM( - BPF_FIELD_SIZEOF(struct sock_common, skc_num), - si->dst_reg, si->src_reg, - bpf_target_off(struct sock_common, skc_num, - FIELD_SIZEOF(struct sock_common, - skc_num), - target_size)); - break; } return insn - insn_buf; @@ -5140,11 +4198,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; - case offsetof(struct xdp_md, data_meta): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), - si->dst_reg, si->src_reg, - offsetof(struct xdp_buff, data_meta)); - break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, @@ -5155,169 +4208,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of - * context Structure, F is Field in context structure that contains a pointer - * to Nested Structure of type NS that has the field NF. - * - * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make - * sure that SIZE is not greater than actual size of S.F.NF. - * - * If offset OFF is provided, the load happens from that offset relative to - * offset of NF. - */ -#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ - do { \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ - si->src_reg, offsetof(S, F)); \ - *insn++ = BPF_LDX_MEM( \ - SIZE, si->dst_reg, si->dst_reg, \ - bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ - target_size) \ - + OFF); \ - } while (0) - -#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ - BPF_FIELD_SIZEOF(NS, NF), 0) - -/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to - * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. - * - * It doesn't support SIZE argument though since narrow stores are not - * supported for now. - * - * In addition it uses Temporary Field TF (member of struct S) as the 3rd - * "register" since two registers available in convert_ctx_access are not - * enough: we can't override neither SRC, since it contains value to store, nor - * DST since it contains pointer to context that may be used by later - * instructions. But we need a temporary place to save pointer to nested - * structure whose field we want to store to. 
- */ -#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \ - do { \ - int tmp_reg = BPF_REG_9; \ - if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ - --tmp_reg; \ - if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ - --tmp_reg; \ - *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ - offsetof(S, TF)); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ - si->dst_reg, offsetof(S, F)); \ - *insn++ = BPF_STX_MEM( \ - BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \ - bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ - target_size) \ - + OFF); \ - *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ - offsetof(S, TF)); \ - } while (0) - -#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ - TF) \ - do { \ - if (type == BPF_WRITE) { \ - SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \ - TF); \ - } else { \ - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ - S, NS, F, NF, SIZE, OFF); \ - } \ - } while (0) - -#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ - S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) - -static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, u32 *target_size) -{ - struct bpf_insn *insn = insn_buf; - int off; - - switch (si->off) { - case offsetof(struct bpf_sock_addr, user_family): - SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, - struct sockaddr, uaddr, sa_family); - break; - - case offsetof(struct bpf_sock_addr, user_ip4): - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, - sin_addr, BPF_SIZE(si->code), 0, tmp_reg); - break; - - case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): - off = si->off; - off -= offsetof(struct bpf_sock_addr, user_ip6[0]); - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, - sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, - tmp_reg); - break; - - case offsetof(struct bpf_sock_addr, user_port): - /* To get port we need to know sa_family first and then treat - * sockaddr as either sockaddr_in or sockaddr_in6. - * Though we can simplify since port field has same offset and - * size in both structures. - * Here we check this invariant and use just one of the - * structures if it's true. 
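The scratch-register selection in SOCK_ADDR_STORE_NESTED_FIELD_OFF() above is worth isolating: starting from BPF_REG_9 and decrementing at most twice always yields a register distinct from both src and dst, since only two registers can collide. The same logic on its own:

#include <assert.h>

/* Pick a temporary register that collides with neither src nor dst.
 * Register numbers here are plain ints standing in for BPF_REG_*. */
static int pick_scratch_reg(int src_reg, int dst_reg)
{
	int tmp = 9;			/* start from the highest candidate */

	if (src_reg == tmp || dst_reg == tmp)
		--tmp;
	if (src_reg == tmp || dst_reg == tmp)
		--tmp;
	return tmp;			/* distinct from both by construction */
}

int main(void)
{
	assert(pick_scratch_reg(9, 8) == 7);
	assert(pick_scratch_reg(1, 2) == 9);
	return 0;
}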
- */ - BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != - offsetof(struct sockaddr_in6, sin6_port)); - BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) != - FIELD_SIZEOF(struct sockaddr_in6, sin6_port)); - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, - struct sockaddr_in6, uaddr, - sin6_port, tmp_reg); - break; - - case offsetof(struct bpf_sock_addr, family): - SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, - struct sock, sk, sk_family); - break; - - case offsetof(struct bpf_sock_addr, type): - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct sock, sk, - __sk_flags_offset, BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); - break; - - case offsetof(struct bpf_sock_addr, protocol): - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct sock, sk, - __sk_flags_offset, BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, - SK_FL_PROTO_SHIFT); - break; - - case offsetof(struct bpf_sock_addr, msg_src_ip4): - /* Treat t_ctx as struct in_addr for msg_src_ip4. */ - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct in_addr, t_ctx, - s_addr, BPF_SIZE(si->code), 0, tmp_reg); - break; - - case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], - msg_src_ip6[3]): - off = si->off; - off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); - /* Treat t_ctx as struct in6_addr for msg_src_ip6. */ - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct in6_addr, t_ctx, - s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); - break; - } - - return insn - insn_buf; -} - static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -5476,145 +4366,68 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, u32 *target_size) -{ - struct bpf_insn *insn = insn_buf; - - switch (si->off) { - case offsetof(struct sk_msg_md, data): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), - si->dst_reg, si->src_reg, - offsetof(struct sk_msg, data)); - break; - case offsetof(struct sk_msg_md, data_end): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end), - si->dst_reg, si->src_reg, - offsetof(struct sk_msg, data_end)); - break; - } - - return insn - insn_buf; -} - -const struct bpf_verifier_ops sk_filter_verifier_ops = { +const struct bpf_verifier_ops sk_filter_prog_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -const struct bpf_prog_ops sk_filter_prog_ops = { -}; - -const struct bpf_verifier_ops tc_cls_act_verifier_ops = { +const struct bpf_verifier_ops tc_cls_act_prog_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, -}; - -const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops xdp_verifier_ops = { +const struct bpf_verifier_ops xdp_prog_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = 
xdp_convert_ctx_access, -}; - -const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; -const struct bpf_verifier_ops cg_skb_verifier_ops = { - .get_func_proto = cg_skb_func_proto, +const struct bpf_verifier_ops cg_skb_prog_ops = { + .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, -}; - -const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_inout_verifier_ops = { +const struct bpf_verifier_ops lwt_inout_prog_ops = { .get_func_proto = lwt_inout_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, -}; - -const struct bpf_prog_ops lwt_inout_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_xmit_verifier_ops = { +const struct bpf_verifier_ops lwt_xmit_prog_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, -}; - -const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops cg_sock_verifier_ops = { +const struct bpf_verifier_ops cg_sock_prog_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, - .convert_ctx_access = bpf_sock_convert_ctx_access, + .convert_ctx_access = sock_filter_convert_ctx_access, }; -const struct bpf_prog_ops cg_sock_prog_ops = { -}; - -const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { - .get_func_proto = sock_addr_func_proto, - .is_valid_access = sock_addr_is_valid_access, - .convert_ctx_access = sock_addr_convert_ctx_access, -}; - -const struct bpf_prog_ops cg_sock_addr_prog_ops = { -}; - -const struct bpf_verifier_ops sock_ops_verifier_ops = { +const struct bpf_verifier_ops sock_ops_prog_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, .convert_ctx_access = sock_ops_convert_ctx_access, }; -const struct bpf_prog_ops sock_ops_prog_ops = { -}; - -const struct bpf_verifier_ops sk_skb_verifier_ops = { +const struct bpf_verifier_ops sk_skb_prog_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = sk_skb_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; -const struct bpf_prog_ops sk_skb_prog_ops = { -}; - -const struct bpf_verifier_ops sk_msg_verifier_ops = { - .get_func_proto = sk_msg_func_proto, - .is_valid_access = sk_msg_is_valid_access, - .convert_ctx_access = sk_msg_convert_ctx_access, -}; - -const struct bpf_prog_ops sk_msg_prog_ops = { -}; - -const struct bpf_verifier_ops flow_dissector_verifier_ops = { - .get_func_proto = flow_dissector_func_proto, - .is_valid_access = flow_dissector_is_valid_access, - .convert_ctx_access = bpf_convert_ctx_access, -}; - -const struct bpf_prog_ops flow_dissector_prog_ops = { -}; - int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 886283d20817..b4dddb685fc2 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -22,9 +22,6 @@ #include #include #include -#include - -static DEFINE_MUTEX(flow_dissector_mutex); static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -62,45 +59,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -int 
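The tables being consolidated here are how a program type is wired into the core: one structure of callbacks that both the verifier (access checks, helper resolution) and the runtime (test_run) dispatch through. A reduced model of such an ops table, with names and rules illustrative only:

#include <stdio.h>

struct prog_ops {
	const char *(*helper_name)(int func_id);	/* ~ get_func_proto */
	int (*is_valid_access)(int off, int size);	/* verifier hook */
};

static const char *xdp_helper_name(int func_id)
{
	return func_id == 0 ? "xdp_adjust_head" : "redirect";
}

static int xdp_access_ok(int off, int size)
{
	return off >= 0 && size == 4;	/* toy rule: word-sized reads only */
}

static const struct prog_ops xdp_ops = {
	.helper_name     = xdp_helper_name,
	.is_valid_access = xdp_access_ok,
};

int main(void)
{
	printf("%s / %d\n", xdp_ops.helper_name(0),
	       xdp_ops.is_valid_access(0, 4));
	return 0;
}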
skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) -{ - struct bpf_prog *attached; - struct net *net; - - net = current->nsproxy->net_ns; - mutex_lock(&flow_dissector_mutex); - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); - if (attached) { - /* Only one BPF program can be attached at a time */ - mutex_unlock(&flow_dissector_mutex); - return -EEXIST; - } - rcu_assign_pointer(net->flow_dissector_prog, prog); - mutex_unlock(&flow_dissector_mutex); - return 0; -} - -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) -{ - struct bpf_prog *attached; - struct net *net; - - net = current->nsproxy->net_ns; - mutex_lock(&flow_dissector_mutex); - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); - if (!attached) { - mutex_unlock(&flow_dissector_mutex); - return -ENOENT; - } - bpf_prog_put(attached); - RCU_INIT_POINTER(net->flow_dissector_prog, NULL); - mutex_unlock(&flow_dissector_mutex); - return 0; -} - /** * skb_flow_get_be16 - extract be16 entity * @skb: sk_buff to extract from @@ -450,60 +408,6 @@ static bool skb_flow_dissect_allowed(int *num_hdrs) return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); } -static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, - struct flow_dissector *flow_dissector, - void *target_container) -{ - struct flow_dissector_key_control *key_control; - struct flow_dissector_key_basic *key_basic; - struct flow_dissector_key_addrs *key_addrs; - struct flow_dissector_key_ports *key_ports; - - key_control = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_CONTROL, - target_container); - key_control->thoff = flow_keys->thoff; - if (flow_keys->is_frag) - key_control->flags |= FLOW_DIS_IS_FRAGMENT; - if (flow_keys->is_first_frag) - key_control->flags |= FLOW_DIS_FIRST_FRAG; - if (flow_keys->is_encap) - key_control->flags |= FLOW_DIS_ENCAPSULATION; - - key_basic = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_BASIC, - target_container); - key_basic->n_proto = flow_keys->n_proto; - key_basic->ip_proto = flow_keys->ip_proto; - - if (flow_keys->addr_proto == ETH_P_IP && - dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { - key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, - target_container); - key_addrs->v4addrs.src = flow_keys->ipv4_src; - key_addrs->v4addrs.dst = flow_keys->ipv4_dst; - key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - } else if (flow_keys->addr_proto == ETH_P_IPV6 && - dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { - key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_ADDRS, - target_container); - memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src, - sizeof(key_addrs->v6addrs)); - key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - } - - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS, - target_container); - key_ports->src = flow_keys->sport; - key_ports->dst = flow_keys->dport; - } -} - /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -535,7 +439,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; bool skip_vlan = false; - struct bpf_prog *attached; int 
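The attach/detach pair above is the classic single-slot publication pattern: a mutex serializes writers, the program pointer is published with rcu_assign_pointer() so lockless readers see either NULL or a fully valid program, and a second attach fails with EEXIST. A userspace approximation, using an atomic pointer where the kernel uses RCU:

#include <pthread.h>
#include <stdatomic.h>
#include <errno.h>
#include <stdio.h>

struct prog { const char *name; };

static pthread_mutex_t attach_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic(struct prog *) attached_prog;	/* single attach slot */

static int prog_attach(struct prog *p)
{
	int err = 0;

	pthread_mutex_lock(&attach_lock);
	if (atomic_load(&attached_prog))
		err = -EEXIST;			/* only one program at a time */
	else
		atomic_store(&attached_prog, p);/* publish to readers */
	pthread_mutex_unlock(&attach_lock);
	return err;
}

static int prog_detach(void)
{
	int err = 0;

	pthread_mutex_lock(&attach_lock);
	if (!atomic_load(&attached_prog))
		err = -ENOENT;
	else
		atomic_store(&attached_prog, NULL);
	pthread_mutex_unlock(&attach_lock);
	return err;
}

int main(void)
{
	struct prog p = { "dissector" };

	printf("%d %d %d\n", prog_attach(&p), prog_attach(&p), prog_detach());
	return 0;
}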
num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -576,44 +479,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - rcu_read_lock(); - attached = skb ? rcu_dereference(dev_net(skb->dev)->flow_dissector_prog) - : NULL; - if (attached) { - /* Note that even though the const qualifier is discarded - * throughout the execution of the BPF program, all changes(the - * control block) are reverted after the BPF program returns. - * Therefore, __skb_flow_dissect does not alter the skb. - */ - struct bpf_flow_keys flow_keys = {}; - struct bpf_skb_data_end cb_saved; - struct bpf_skb_data_end *cb; - u32 result; - - cb = (struct bpf_skb_data_end *)skb->cb; - - /* Save Control Block */ - memcpy(&cb_saved, cb, sizeof(cb_saved)); - memset(cb, 0, sizeof(cb_saved)); - - /* Pass parameters to the BPF program */ - cb->qdisc_cb.flow_keys = &flow_keys; - flow_keys.nhoff = nhoff; - - bpf_compute_data_pointers((struct sk_buff *)skb); - result = BPF_PROG_RUN(attached, skb); - - /* Restore state */ - memcpy(cb, &cb_saved, sizeof(cb_saved)); - - __skb_flow_bpf_to_target(&flow_keys, flow_dissector, - target_container); - key_control->thoff = min_t(u16, key_control->thoff, skb->len); - rcu_read_unlock(); - return result == BPF_OK; - } - rcu_read_unlock(); - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 680782c53225..65313c766ab3 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, */ preempt_disable(); rcu_read_lock(); - bpf_compute_data_pointers(skb); + bpf_compute_data_end(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); rcu_read_unlock(); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ed3c304ab418..1aca1f3f2120 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1242,10 +1242,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) *prog_id = generic_xdp_prog->aux->id; return XDP_ATTACHED_SKB; } - if (!ops->ndo_bpf) + if (!ops->ndo_xdp) return XDP_ATTACHED_NONE; - return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id); + return __dev_xdp_attached(dev, ops->ndo_xdp, prog_id); } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7c8965ae0718..cebd33caaa00 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,9 +79,6 @@ struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; -#ifdef CONFIG_SKB_EXTENSIONS -static struct kmem_cache *skbuff_ext_cache __ro_after_init; -#endif int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -642,7 +639,6 @@ void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); #endif - skb_ext_put(skb); } /* Free everything but the sk_buff shell. */ @@ -822,7 +818,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->dev = old->dev; memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); - __skb_ext_copy(new, old); #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif @@ -1540,8 +1535,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); - skb_metadata_clear(skb); - /* It is not generally safe to change skb->truesize. 
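The BPF flow-dissection path removed above works hard to keep __skb_flow_dissect() side-effect free: it saves the skb control block, hands a zeroed cb plus a flow_keys pointer to the program, and restores the original bytes afterwards, exactly as the in-code comment promises. The save/run/restore shape in miniature, with sizes and names invented:

#include <string.h>
#include <stdio.h>

struct ctx { char cb[8]; };

/* Stand-in for the BPF program: it scribbles on the control block. */
static unsigned int run_prog(struct ctx *c)
{
	memcpy(c->cb, "bpf-args", 8);
	return 0;			/* "BPF_OK" */
}

static unsigned int dissect(struct ctx *c)
{
	char saved[sizeof(c->cb)];
	unsigned int ret;

	memcpy(saved, c->cb, sizeof(saved));	/* save control block */
	memset(c->cb, 0, sizeof(saved));	/* clean slate for the program */
	ret = run_prog(c);
	memcpy(c->cb, saved, sizeof(saved));	/* restore caller state */
	return ret;
}

int main(void)
{
	struct ctx c = { "original" };

	dissect(&c);
	printf("%.8s\n", c.cb);		/* prints "original" */
	return 0;
}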
* For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). @@ -3988,38 +3981,6 @@ done: } EXPORT_SYMBOL_GPL(skb_gro_receive); -#ifdef CONFIG_SKB_EXTENSIONS -#define SKB_EXT_ALIGN_VALUE 8 -#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) -static const u8 skb_ext_type_len[] = { -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), -#endif -}; - -static __always_inline unsigned int skb_ext_total_length(void) -{ - return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - skb_ext_type_len[SKB_EXT_BRIDGE_NF] + -#endif - 0; -} - -static void skb_extensions_init(void) -{ - BUILD_BUG_ON(SKB_EXT_NUM >= 8); - BUILD_BUG_ON(skb_ext_total_length() > 255); - skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", - SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), - 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL); -} -#else -static void skb_extensions_init(void) {} -#endif - void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", @@ -4032,7 +3993,6 @@ void __init skb_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - skb_extensions_init(); } static int @@ -5635,113 +5595,3 @@ void skb_condense(struct sk_buff *skb) */ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); } - -#ifdef CONFIG_SKB_EXTENSIONS -static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) -{ - return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); -} - -static struct skb_ext *skb_ext_alloc(void) -{ - struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); - - if (new) { - memset(new->offset, 0, sizeof(new->offset)); - refcount_set(&new->refcnt, 1); - } - - return new; -} - -static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) -{ - struct skb_ext *new; - - if (refcount_read(&old->refcnt) == 1) - return old; - - new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); - if (!new) - return NULL; - - memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); - refcount_set(&new->refcnt, 1); - __skb_ext_put(old); - - return new; -} - -/** - * skb_ext_add - allocate space for given extension, COW if needed - * @skb: buffer - * @id: extension to allocate space for - * - * Allocates enough space for the given extension. - * If the extension is already present, a pointer to that extension - * is returned. - * - * If the skb was cloned, COW applies and the returned memory can be - * modified without changing the extension space of clones buffers. - * - * Returns pointer to the extension or NULL on allocation failure. 
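skb_ext_maybe_cow() in the hunk below implements the usual copy-on-write rule for refcounted blobs: a sole owner writes in place, while anyone else clones first and drops one reference on the original. A compact standalone model, with C11 atomics standing in for refcount_t:

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct ext_block {
	atomic_int refcnt;
	char data[32];
};

static struct ext_block *ext_writable(struct ext_block *old)
{
	struct ext_block *new;

	if (atomic_load(&old->refcnt) == 1)
		return old;			/* sole owner: write in place */

	new = malloc(sizeof(*new));		/* shared: copy before writing */
	if (!new)
		return NULL;
	memcpy(new->data, old->data, sizeof(new->data));
	atomic_init(&new->refcnt, 1);
	atomic_fetch_sub(&old->refcnt, 1);	/* drop our ref on the original */
	return new;
}

int main(void)
{
	struct ext_block a = { .data = "hello" };
	struct ext_block *w;

	atomic_init(&a.refcnt, 2);	/* pretend a clone also holds it */
	w = ext_writable(&a);
	printf("%s, refcnt=%d\n", w == &a ? "in-place" : "copied",
	       atomic_load(&a.refcnt));
	if (w != &a)
		free(w);
	return 0;
}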
- */ -void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) -{ - struct skb_ext *new, *old = NULL; - unsigned int newlen, newoff; - - if (skb->active_extensions) { - old = skb->extensions; - new = skb_ext_maybe_cow(old); - if (!new) - return NULL; - if (__skb_ext_exist(old, id)) { - if (old != new) - skb->extensions = new; - goto set_active; - } - newoff = old->chunks; - } else { - newoff = SKB_EXT_CHUNKSIZEOF(*new); - new = skb_ext_alloc(); - if (!new) - return NULL; - } - - newlen = newoff + skb_ext_type_len[id]; - new->chunks = newlen; - new->offset[id] = newoff; - skb->extensions = new; - -set_active: - skb->active_extensions |= 1 << id; - return skb_ext_get_ptr(new, id); -} - -EXPORT_SYMBOL(skb_ext_add); -void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) -{ - struct skb_ext *ext = skb->extensions; - skb->active_extensions &= ~(1 << id); - if (skb->active_extensions == 0) { - skb->extensions = NULL; - __skb_ext_put(ext); - } -} - -EXPORT_SYMBOL(__skb_ext_del); -void __skb_ext_put(struct skb_ext *ext) -{ - /* If this is last clone, nothing can increment - * it after check passes. Avoids one atomic op. - */ - if (refcount_read(&ext->refcnt) == 1) - goto free_now; - if (!refcount_dec_and_test(&ext->refcnt)) - return; -free_now: - kmem_cache_free(skbuff_ext_cache, ext); -} -EXPORT_SYMBOL(__skb_ext_put); -#endif /* CONFIG_SKB_EXTENSIONS */ diff --git a/net/core/skmsg.c b/net/core/skmsg.c deleted file mode 100644 index ae2b281c9c57..000000000000 --- a/net/core/skmsg.c +++ /dev/null @@ -1,763 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ - -#include -#include -#include - -#include -#include - -static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) -{ - if (msg->sg.end > msg->sg.start && - elem_first_coalesce < msg->sg.end) - return true; - - if (msg->sg.end < msg->sg.start && - (elem_first_coalesce > msg->sg.start || - elem_first_coalesce < msg->sg.end)) - return true; - - return false; -} - -int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, - int elem_first_coalesce) -{ - struct page_frag *pfrag = sk_page_frag(sk); - int ret = 0; - - len -= msg->sg.size; - while (len > 0) { - struct scatterlist *sge; - u32 orig_offset; - int use, i; - - if (!sk_page_frag_refill(sk, pfrag)) - return -ENOMEM; - - orig_offset = pfrag->offset; - use = min_t(int, len, pfrag->size - orig_offset); - if (!sk_wmem_schedule(sk, use)) - return -ENOMEM; - - i = msg->sg.end; - sk_msg_iter_var_prev(i); - sge = &msg->sg.data[i]; - - if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) && - sg_page(sge) == pfrag->page && - sge->offset + sge->length == orig_offset) { - sge->length += use; - } else { - if (sk_msg_full(msg)) { - ret = -ENOSPC; - break; - } - - sge = &msg->sg.data[msg->sg.end]; - sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); - sk_msg_iter_next(msg, end); - } - - sk_mem_charge(sk, use); - msg->sg.size += use; - pfrag->offset += use; - len -= use; - } - - return ret; -} -EXPORT_SYMBOL_GPL(sk_msg_alloc); - -void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes) -{ - int i = msg->sg.start; - - do { - struct scatterlist *sge = sk_msg_elem(msg, i); - - if (bytes < sge->length) { - sge->length -= bytes; - sge->offset += bytes; - sk_mem_uncharge(sk, bytes); - break; - } - - sk_mem_uncharge(sk, sge->length); - bytes -= sge->length; - sge->length = 0; - sge->offset = 0; - sk_msg_iter_var_next(i); - } while (bytes && i != msg->sg.end); 
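sk_msg (below) keeps its scatterlist as a ring, which is why the code walks elements with sk_msg_iter_var_next/prev and compares against msg->sg.end rather than a plain element count. The modular-index arithmetic on its own, with a slot count chosen arbitrarily rather than the kernel's MAX_MSG_FRAGS:

#include <stdio.h>

#define NR_SLOTS 5

static int ring_next(int i) { return (i + 1) % NR_SLOTS; }
static int ring_prev(int i) { return (i + NR_SLOTS - 1) % NR_SLOTS; }

/* Elements in use between start (inclusive) and end (exclusive),
 * accounting for wraparound. */
static int ring_used(int start, int end)
{
	return end >= start ? end - start : NR_SLOTS - start + end;
}

int main(void)
{
	int start = 3, end = 1;		/* a wrapped ring */

	printf("used=%d next(4)=%d prev(0)=%d\n",
	       ring_used(start, end), ring_next(4), ring_prev(0));
	return 0;
}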
- msg->sg.start = i; -} -EXPORT_SYMBOL_GPL(sk_msg_return_zero); - -void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes) -{ - int i = msg->sg.start; - - do { - struct scatterlist *sge = &msg->sg.data[i]; - int uncharge = (bytes < sge->length) ? bytes : sge->length; - - sk_mem_uncharge(sk, uncharge); - bytes -= uncharge; - sk_msg_iter_var_next(i); - } while (i != msg->sg.end); -} -EXPORT_SYMBOL_GPL(sk_msg_return); - -static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, - bool charge) -{ - struct scatterlist *sge = sk_msg_elem(msg, i); - u32 len = sge->length; - - if (charge) - sk_mem_uncharge(sk, len); - if (!msg->skb) - put_page(sg_page(sge)); - memset(sge, 0, sizeof(*sge)); - return len; -} - -static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i, - bool charge) -{ - struct scatterlist *sge = sk_msg_elem(msg, i); - int freed = 0; - - while (msg->sg.size) { - msg->sg.size -= sge->length; - freed += sk_msg_free_elem(sk, msg, i, charge); - sk_msg_iter_var_next(i); - sk_msg_check_to_free(msg, i, msg->sg.size); - sge = sk_msg_elem(msg, i); - } - if (msg->skb) - consume_skb(msg->skb); - sk_msg_init(msg); - return freed; -} - -int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg) -{ - return __sk_msg_free(sk, msg, msg->sg.start, false); -} -EXPORT_SYMBOL_GPL(sk_msg_free_nocharge); - -int sk_msg_free(struct sock *sk, struct sk_msg *msg) -{ - return __sk_msg_free(sk, msg, msg->sg.start, true); -} -EXPORT_SYMBOL_GPL(sk_msg_free); - -static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, - u32 bytes, bool charge) -{ - struct scatterlist *sge; - u32 i = msg->sg.start; - - while (bytes) { - sge = sk_msg_elem(msg, i); - if (!sge->length) - break; - if (bytes < sge->length) { - if (charge) - sk_mem_uncharge(sk, bytes); - sge->length -= bytes; - sge->offset += bytes; - msg->sg.size -= bytes; - break; - } - - msg->sg.size -= sge->length; - bytes -= sge->length; - sk_msg_free_elem(sk, msg, i, charge); - sk_msg_iter_var_next(i); - sk_msg_check_to_free(msg, i, bytes); - } - msg->sg.start = i; -} - -void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes) -{ - __sk_msg_free_partial(sk, msg, bytes, true); -} -EXPORT_SYMBOL_GPL(sk_msg_free_partial); - -void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, - u32 bytes) -{ - __sk_msg_free_partial(sk, msg, bytes, false); -} - -void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) -{ - int trim = msg->sg.size - len; - u32 i = msg->sg.end; - - if (trim <= 0) { - WARN_ON(trim < 0); - return; - } - - sk_msg_iter_var_prev(i); - msg->sg.size = len; - while (msg->sg.data[i].length && - trim >= msg->sg.data[i].length) { - trim -= msg->sg.data[i].length; - sk_msg_free_elem(sk, msg, i, true); - sk_msg_iter_var_prev(i); - if (!trim) - goto out; - } - - msg->sg.data[i].length -= trim; - sk_mem_uncharge(sk, trim); -out: - /* If we trim data before the curr pointer, update copybreak and curr - * so that any future copy operations start at the new copy location. - * However, trimmed data that has not yet been used in a copy op - * does not require an update. 
- */ - if (msg->sg.curr >= i) { - msg->sg.curr = i; - msg->sg.copybreak = msg->sg.data[i].length; - } - sk_msg_iter_var_next(i); - msg->sg.end = i; -} -EXPORT_SYMBOL_GPL(sk_msg_trim); - -int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, - struct sk_msg *msg, u32 bytes) -{ - int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg); - const int to_max_pages = MAX_MSG_FRAGS; - struct page *pages[MAX_MSG_FRAGS]; - ssize_t orig, copied, use, offset; - - orig = msg->sg.size; - while (bytes > 0) { - i = 0; - maxpages = to_max_pages - num_elems; - if (maxpages == 0) { - ret = -EFAULT; - goto out; - } - - copied = iov_iter_get_pages(from, pages, bytes, maxpages, - &offset); - if (copied <= 0) { - ret = -EFAULT; - goto out; - } - - iov_iter_advance(from, copied); - bytes -= copied; - msg->sg.size += copied; - - while (copied) { - use = min_t(int, copied, PAGE_SIZE - offset); - sg_set_page(&msg->sg.data[msg->sg.end], - pages[i], use, offset); - sg_unmark_end(&msg->sg.data[msg->sg.end]); - sk_mem_charge(sk, use); - - offset = 0; - copied -= use; - sk_msg_iter_next(msg, end); - num_elems++; - i++; - } - /* When zerocopy is mixed with sk_msg_*copy* operations we - * may have a copybreak set in this case clear and prefer - * zerocopy remainder when possible. - */ - msg->sg.copybreak = 0; - msg->sg.curr = msg->sg.end; - } -out: - /* Revert iov_iter updates, msg will need to use 'trim' later if it - * also needs to be cleared. - */ - if (ret) - iov_iter_revert(from, msg->sg.size - orig); - return ret; -} -EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter); - -int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, - struct sk_msg *msg, u32 bytes) -{ - int ret = -ENOSPC, i = msg->sg.curr; - struct scatterlist *sge; - u32 copy, buf_size; - void *to; - - do { - sge = sk_msg_elem(msg, i); - /* This is possible if a trim operation shrunk the buffer */ - if (msg->sg.copybreak >= sge->length) { - msg->sg.copybreak = 0; - sk_msg_iter_var_next(i); - if (i == msg->sg.end) - break; - sge = sk_msg_elem(msg, i); - } - - buf_size = sge->length - msg->sg.copybreak; - copy = (buf_size > bytes) ? bytes : buf_size; - to = sg_virt(sge) + msg->sg.copybreak; - msg->sg.copybreak += copy; - if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) - ret = copy_from_iter_nocache(to, copy, from); - else - ret = copy_from_iter(to, copy, from); - if (ret != copy) { - ret = -EFAULT; - goto out; - } - bytes -= copy; - if (!bytes) - break; - msg->sg.copybreak = 0; - sk_msg_iter_var_next(i); - } while (i != msg->sg.end); -out: - msg->sg.curr = i; - return ret; -} -EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); - -static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) -{ - struct sock *sk = psock->sk; - int copied = 0, num_sge; - struct sk_msg *msg; - - msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); - if (unlikely(!msg)) - return -EAGAIN; - if (!sk_rmem_schedule(sk, skb, skb->len)) { - kfree(msg); - return -EAGAIN; - } - - sk_msg_init(msg); - num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len); - if (unlikely(num_sge < 0)) { - kfree(msg); - return num_sge; - } - - sk_mem_charge(sk, skb->len); - copied = skb->len; - msg->sg.start = 0; - msg->sg.end = num_sge == MAX_MSG_FRAGS ? 
0 : num_sge; - msg->skb = skb; - - sk_psock_queue_msg(psock, msg); - sk->sk_data_ready(sk); - return copied; -} - -static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len, bool ingress) -{ - if (ingress) - return sk_psock_skb_ingress(psock, skb); - else - return skb_send_sock_locked(psock->sk, skb, off, len); -} - -static void sk_psock_backlog(struct work_struct *work) -{ - struct sk_psock *psock = container_of(work, struct sk_psock, work); - struct sk_psock_work_state *state = &psock->work_state; - struct sk_buff *skb; - bool ingress; - u32 len, off; - int ret; - - /* Lock sock to avoid losing sk_socket during loop. */ - lock_sock(psock->sk); - if (state->skb) { - skb = state->skb; - len = state->len; - off = state->off; - state->skb = NULL; - goto start; - } - - while ((skb = skb_dequeue(&psock->ingress_skb))) { - len = skb->len; - off = 0; -start: - ingress = tcp_skb_bpf_ingress(skb); - do { - ret = -EIO; - if (likely(psock->sk->sk_socket)) - ret = sk_psock_handle_skb(psock, skb, off, - len, ingress); - if (ret <= 0) { - if (ret == -EAGAIN) { - state->skb = skb; - state->len = len; - state->off = off; - goto end; - } - /* Hard errors break pipe and stop xmit. */ - sk_psock_report_error(psock, ret ? -ret : EPIPE); - sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - kfree_skb(skb); - goto end; - } - off += ret; - len -= ret; - } while (len); - - if (!ingress) - kfree_skb(skb); - } -end: - release_sock(psock->sk); -} - -struct sk_psock *sk_psock_init(struct sock *sk, int node) -{ - struct sk_psock *psock = kzalloc_node(sizeof(*psock), - GFP_ATOMIC | __GFP_NOWARN, - node); - if (!psock) - return NULL; - - psock->sk = sk; - psock->eval = __SK_NONE; - - INIT_LIST_HEAD(&psock->link); - spin_lock_init(&psock->link_lock); - - INIT_WORK(&psock->work, sk_psock_backlog); - INIT_LIST_HEAD(&psock->ingress_msg); - skb_queue_head_init(&psock->ingress_skb); - - sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); - refcount_set(&psock->refcnt, 1); - - rcu_assign_sk_user_data(sk, psock); - sock_hold(sk); - - return psock; -} -EXPORT_SYMBOL_GPL(sk_psock_init); - -struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) -{ - struct sk_psock_link *link; - - spin_lock_bh(&psock->link_lock); - link = list_first_entry_or_null(&psock->link, struct sk_psock_link, - list); - if (link) - list_del(&link->list); - spin_unlock_bh(&psock->link_lock); - return link; -} - -void __sk_psock_purge_ingress_msg(struct sk_psock *psock) -{ - struct sk_msg *msg, *tmp; - - list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { - list_del(&msg->list); - sk_msg_free(psock->sk, msg); - kfree(msg); - } -} - -static void sk_psock_zap_ingress(struct sk_psock *psock) -{ - __skb_queue_purge(&psock->ingress_skb); - __sk_psock_purge_ingress_msg(psock); -} - -static void sk_psock_link_destroy(struct sk_psock *psock) -{ - struct sk_psock_link *link, *tmp; - - list_for_each_entry_safe(link, tmp, &psock->link, list) { - list_del(&link->list); - sk_psock_free_link(link); - } -} - -static void sk_psock_destroy_deferred(struct work_struct *gc) -{ - struct sk_psock *psock = container_of(gc, struct sk_psock, gc); - - /* No sk_callback_lock since already detached. 
*/ - if (psock->parser.enabled) - strp_done(&psock->parser.strp); - - cancel_work_sync(&psock->work); - - psock_progs_drop(&psock->progs); - - sk_psock_link_destroy(psock); - sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock); - - if (psock->sk_redir) - sock_put(psock->sk_redir); - sock_put(psock->sk); - kfree(psock); -} - -void sk_psock_destroy(struct rcu_head *rcu) -{ - struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); - - INIT_WORK(&psock->gc, sk_psock_destroy_deferred); - schedule_work(&psock->gc); -} -EXPORT_SYMBOL_GPL(sk_psock_destroy); - -void sk_psock_drop(struct sock *sk, struct sk_psock *psock) -{ - rcu_assign_sk_user_data(sk, NULL); - sk_psock_cork_free(psock); - sk_psock_restore_proto(sk, psock); - - write_lock_bh(&sk->sk_callback_lock); - if (psock->progs.skb_parser) - sk_psock_stop_strp(sk, psock); - write_unlock_bh(&sk->sk_callback_lock); - sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - - call_rcu_sched(&psock->rcu, sk_psock_destroy); -} -EXPORT_SYMBOL_GPL(sk_psock_drop); - -static int sk_psock_map_verd(int verdict, bool redir) -{ - switch (verdict) { - case SK_PASS: - return redir ? __SK_REDIRECT : __SK_PASS; - case SK_DROP: - default: - break; - } - - return __SK_DROP; -} - -int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, - struct sk_msg *msg) -{ - struct bpf_prog *prog; - int ret; - - preempt_disable(); - rcu_read_lock(); - prog = READ_ONCE(psock->progs.msg_parser); - if (unlikely(!prog)) { - ret = __SK_PASS; - goto out; - } - - sk_msg_compute_data_pointers(msg); - msg->sk = sk; - ret = BPF_PROG_RUN(prog, msg); - ret = sk_psock_map_verd(ret, msg->sk_redir); - psock->apply_bytes = msg->apply_bytes; - if (ret == __SK_REDIRECT) { - if (psock->sk_redir) - sock_put(psock->sk_redir); - psock->sk_redir = msg->sk_redir; - if (!psock->sk_redir) { - ret = __SK_DROP; - goto out; - } - sock_hold(psock->sk_redir); - } -out: - rcu_read_unlock(); - preempt_enable(); - return ret; -} -EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); - -static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, - struct sk_buff *skb) -{ - int ret; - - skb->sk = psock->sk; - bpf_compute_data_end_sk_skb(skb); - preempt_disable(); - ret = BPF_PROG_RUN(prog, skb); - preempt_enable(); - /* strparser clones the skb before handing it to an upper layer, - * meaning skb_orphan has been called. We NULL sk on the way out - * to ensure we don't trigger a BUG_ON() in skb/sk operations - * later and because we are not charging the memory of this skb - * to any socket yet. 
- */ - skb->sk = NULL; - return ret; -} - -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - -static void sk_psock_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) -{ - struct sk_psock *psock_other; - struct sock *sk_other; - bool ingress; - - switch (verdict) { - case __SK_REDIRECT: - sk_other = tcp_skb_bpf_redirect_fetch(skb); - if (unlikely(!sk_other)) - goto out_free; - psock_other = sk_psock(sk_other); - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) - goto out_free; - ingress = tcp_skb_bpf_ingress(skb); - if ((!ingress && sock_writeable(sk_other)) || - (ingress && - atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf)) { - if (!ingress) - skb_set_owner_w(skb, sk_other); - skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); - break; - } - /* fall-through */ - case __SK_DROP: - /* fall-through */ - default: -out_free: - kfree_skb(skb); - } -} - -static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) -{ - struct sk_psock *psock = sk_psock_from_strp(strp); - struct bpf_prog *prog; - int ret = __SK_DROP; - - rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_verdict); - if (likely(prog)) { - skb_orphan(skb); - tcp_skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); - } - rcu_read_unlock(); - sk_psock_verdict_apply(psock, skb, ret); -} - -static int sk_psock_strp_read_done(struct strparser *strp, int err) -{ - return err; -} - -static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) -{ - struct sk_psock *psock = sk_psock_from_strp(strp); - struct bpf_prog *prog; - int ret = skb->len; - - rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_parser); - if (likely(prog)) - ret = sk_psock_bpf_run(psock, prog, skb); - rcu_read_unlock(); - return ret; -} - -/* Called with socket lock held. 
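sk_psock_map_verd() above collapses the program's SK_PASS/SK_DROP return and the presence of a redirect target into one internal action, which sk_psock_verdict_apply() then executes. The mapping isolated, with invented enum values:

#include <stdio.h>

enum verdict { SK_DROP_V, SK_PASS_V };
enum action  { ACT_DROP, ACT_PASS, ACT_REDIRECT };

/* SK_PASS plus a redirect target means "redirect"; anything else,
 * including an unexpected return code, is dropped. */
static enum action map_verdict(enum verdict v, int redir_set)
{
	if (v == SK_PASS_V)
		return redir_set ? ACT_REDIRECT : ACT_PASS;
	return ACT_DROP;
}

int main(void)
{
	printf("%d %d %d\n", map_verdict(SK_PASS_V, 1),
	       map_verdict(SK_PASS_V, 0), map_verdict(SK_DROP_V, 1));
	return 0;
}

Treating every unrecognized verdict as a drop is the safe default: a buggy program can never smuggle a packet past the policy by returning garbage.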
*/ -static void sk_psock_data_ready(struct sock *sk) -{ - struct sk_psock *psock; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); - write_unlock_bh(&sk->sk_callback_lock); - } - rcu_read_unlock(); -} - -static void sk_psock_write_space(struct sock *sk) -{ - struct sk_psock *psock; - void (*write_space)(struct sock *sk); - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))) - schedule_work(&psock->work); - write_space = psock->saved_write_space; - rcu_read_unlock(); - write_space(sk); -} - -int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) -{ - static const struct strp_callbacks cb = { - .rcv_msg = sk_psock_strp_read, - .read_sock_done = sk_psock_strp_read_done, - .parse_msg = sk_psock_strp_parse, - }; - - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); -} - -void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) - return; - - parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_data_ready; - sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; -} - -void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) - return; - - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; -} diff --git a/net/core/sock_map.c b/net/core/sock_map.c deleted file mode 100644 index 3c0e44cb811a..000000000000 --- a/net/core/sock_map.c +++ /dev/null @@ -1,1002 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct bpf_stab { - struct bpf_map map; - struct sock **sks; - struct sk_psock_progs progs; - raw_spinlock_t lock; -}; - -#define SOCK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - -static struct bpf_map *sock_map_alloc(union bpf_attr *attr) -{ - struct bpf_stab *stab; - u64 cost; - int err; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - if (attr->max_entries == 0 || - attr->key_size != 4 || - attr->value_size != 4 || - attr->map_flags & ~SOCK_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); - - stab = kzalloc(sizeof(*stab), GFP_USER); - if (!stab) - return ERR_PTR(-ENOMEM); - - bpf_map_init_from_attr(&stab->map, attr); - raw_spin_lock_init(&stab->lock); - - /* Make sure page count doesn't overflow. 
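sock_map_alloc() below sizes the map defensively: the worst-case byte cost is computed in 64 bits, and anything that would overflow the 32-bit page accounting is rejected before any memory is charged. The guard on its own, with a local constant standing in for the kernel's PAGE_SIZE handling:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_ 4096u

/* Compute the page cost of max_entries pointers, rejecting anything
 * that would overflow 32-bit accounting; mirrors the shape of the
 * check in the hunk below, not its exact kernel interface. */
static int precharge_pages(uint32_t max_entries, uint64_t *pages)
{
	uint64_t cost = (uint64_t)max_entries * sizeof(void *);

	if (cost >= UINT32_MAX - PAGE_SIZE_)
		return -1;				/* would overflow */
	*pages = (cost + PAGE_SIZE_ - 1) / PAGE_SIZE_;	/* round up */
	return 0;
}

int main(void)
{
	uint64_t pages;

	if (precharge_pages(1u << 20, &pages) == 0)
		printf("pages=%llu\n", (unsigned long long)pages);
	return 0;
}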
*/ - cost = (u64) stab->map.max_entries * sizeof(struct sock *); - if (cost >= U32_MAX - PAGE_SIZE) { - err = -EINVAL; - goto free_stab; - } - - stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(stab->map.pages); - if (err) - goto free_stab; - - stab->sks = bpf_map_area_alloc(stab->map.max_entries * - sizeof(struct sock *), - stab->map.numa_node); - if (stab->sks) - return &stab->map; - err = -ENOMEM; -free_stab: - kfree(stab); - return ERR_PTR(err); -} - -int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) -{ - u32 ufd = attr->target_fd; - struct bpf_map *map; - struct fd f; - int ret; - - f = fdget(ufd); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, attr->attach_type); - fdput(f); - return ret; -} - -static void sock_map_sk_acquire(struct sock *sk) - __acquires(&sk->sk_lock.slock) -{ - lock_sock(sk); - preempt_disable(); - rcu_read_lock(); -} - -static void sock_map_sk_release(struct sock *sk) - __releases(&sk->sk_lock.slock) -{ - rcu_read_unlock(); - preempt_enable(); - release_sock(sk); -} - -static void sock_map_add_link(struct sk_psock *psock, - struct sk_psock_link *link, - struct bpf_map *map, void *link_raw) -{ - link->link_raw = link_raw; - link->map = map; - spin_lock_bh(&psock->link_lock); - list_add_tail(&link->list, &psock->link); - spin_unlock_bh(&psock->link_lock); -} - -static void sock_map_del_link(struct sock *sk, - struct sk_psock *psock, void *link_raw) -{ - struct sk_psock_link *link, *tmp; - bool strp_stop = false; - - spin_lock_bh(&psock->link_lock); - list_for_each_entry_safe(link, tmp, &psock->link, list) { - if (link->link_raw == link_raw) { - struct bpf_map *map = link->map; - struct bpf_stab *stab = container_of(map, struct bpf_stab, - map); - if (psock->parser.enabled && stab->progs.skb_parser) - strp_stop = true; - list_del(&link->list); - sk_psock_free_link(link); - } - } - spin_unlock_bh(&psock->link_lock); - if (strp_stop) { - write_lock_bh(&sk->sk_callback_lock); - sk_psock_stop_strp(sk, psock); - write_unlock_bh(&sk->sk_callback_lock); - } -} - -static void sock_map_unref(struct sock *sk, void *link_raw) -{ - struct sk_psock *psock = sk_psock(sk); - - if (likely(psock)) { - sock_map_del_link(sk, psock, link_raw); - sk_psock_put(sk, psock); - } -} - -static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, - struct sock *sk) -{ - struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; - bool skb_progs, sk_psock_is_new = false; - struct sk_psock *psock; - int ret; - - skb_verdict = READ_ONCE(progs->skb_verdict); - skb_parser = READ_ONCE(progs->skb_parser); - skb_progs = skb_parser && skb_verdict; - if (skb_progs) { - skb_verdict = bpf_prog_inc_not_zero(skb_verdict); - if (IS_ERR(skb_verdict)) - return PTR_ERR(skb_verdict); - skb_parser = bpf_prog_inc_not_zero(skb_parser); - if (IS_ERR(skb_parser)) { - bpf_prog_put(skb_verdict); - return PTR_ERR(skb_parser); - } - } - - msg_parser = READ_ONCE(progs->msg_parser); - if (msg_parser) { - msg_parser = bpf_prog_inc_not_zero(msg_parser); - if (IS_ERR(msg_parser)) { - ret = PTR_ERR(msg_parser); - goto out; - } - } - - psock = sk_psock_get(sk); - if (psock) { - if (!sk_has_psock(sk)) { - ret = -EBUSY; - goto out_progs; - } - if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_progs && READ_ONCE(psock->progs.skb_parser))) { - sk_psock_put(sk, psock); - ret = -EBUSY; - goto out_progs; - } - } else { - psock = sk_psock_init(sk, map->numa_node); - if (!psock) { 
- ret = -ENOMEM; - goto out_progs; - } - sk_psock_is_new = true; - } - - if (msg_parser) - psock_set_prog(&psock->progs.msg_parser, msg_parser); - if (sk_psock_is_new) { - ret = tcp_bpf_init(sk); - if (ret < 0) - goto out_drop; - } else { - tcp_bpf_reinit(sk); - } - - write_lock_bh(&sk->sk_callback_lock); - if (skb_progs && !psock->parser.enabled) { - ret = sk_psock_init_strp(sk, psock); - if (ret) { - write_unlock_bh(&sk->sk_callback_lock); - goto out_drop; - } - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); - psock_set_prog(&psock->progs.skb_parser, skb_parser); - sk_psock_start_strp(sk, psock); - } - write_unlock_bh(&sk->sk_callback_lock); - return 0; -out_drop: - sk_psock_put(sk, psock); -out_progs: - if (msg_parser) - bpf_prog_put(msg_parser); -out: - if (skb_progs) { - bpf_prog_put(skb_verdict); - bpf_prog_put(skb_parser); - } - return ret; -} - -static void sock_map_free(struct bpf_map *map) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - int i; - - synchronize_rcu(); - rcu_read_lock(); - raw_spin_lock_bh(&stab->lock); - for (i = 0; i < stab->map.max_entries; i++) { - struct sock **psk = &stab->sks[i]; - struct sock *sk; - - sk = xchg(psk, NULL); - if (sk) - sock_map_unref(sk, psk); - } - raw_spin_unlock_bh(&stab->lock); - rcu_read_unlock(); - - bpf_map_area_free(stab->sks); - kfree(stab); -} - -static void sock_map_release_progs(struct bpf_map *map) -{ - psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); -} - -static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (unlikely(key >= map->max_entries)) - return NULL; - return READ_ONCE(stab->sks[key]); -} - -static void *sock_map_lookup(struct bpf_map *map, void *key) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, - struct sock **psk) -{ - struct sock *sk; - - raw_spin_lock_bh(&stab->lock); - sk = *psk; - if (!sk_test || sk_test == sk) - *psk = NULL; - raw_spin_unlock_bh(&stab->lock); - if (unlikely(!sk)) - return -EINVAL; - sock_map_unref(sk, psk); - return 0; -} - -static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, - void *link_raw) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - __sock_map_delete(stab, sk, link_raw); -} - -static int sock_map_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - u32 i = *(u32 *)key; - struct sock **psk; - - if (unlikely(i >= map->max_entries)) - return -EINVAL; - - psk = &stab->sks[i]; - return __sock_map_delete(stab, NULL, psk); -} - -static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - u32 i = key ? 
*(u32 *)key : U32_MAX; - u32 *key_next = next; - - if (i == stab->map.max_entries - 1) - return -ENOENT; - if (i >= stab->map.max_entries) - *key_next = 0; - else - *key_next = i + 1; - return 0; -} - -static int sock_map_update_common(struct bpf_map *map, u32 idx, - struct sock *sk, u64 flags) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct sk_psock_link *link; - struct sk_psock *psock; - struct sock *osk; - int ret; - - WARN_ON_ONCE(!rcu_read_lock_held()); - if (unlikely(flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(idx >= map->max_entries)) - return -E2BIG; - - link = sk_psock_init_link(); - if (!link) - return -ENOMEM; - - ret = sock_map_link(map, &stab->progs, sk); - if (ret < 0) - goto out_free; - - psock = sk_psock(sk); - WARN_ON_ONCE(!psock); - - raw_spin_lock_bh(&stab->lock); - osk = stab->sks[idx]; - if (osk && flags == BPF_NOEXIST) { - ret = -EEXIST; - goto out_unlock; - } else if (!osk && flags == BPF_EXIST) { - ret = -ENOENT; - goto out_unlock; - } - - sock_map_add_link(psock, link, map, &stab->sks[idx]); - stab->sks[idx] = sk; - if (osk) - sock_map_unref(osk, &stab->sks[idx]); - raw_spin_unlock_bh(&stab->lock); - return 0; -out_unlock: - raw_spin_unlock_bh(&stab->lock); - if (psock) - sk_psock_put(sk, psock); -out_free: - sk_psock_free_link(link); - return ret; -} - -static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) -{ - return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || - ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; -} - -static bool sock_map_sk_is_suitable(const struct sock *sk) -{ - return sk->sk_type == SOCK_STREAM && - sk->sk_protocol == IPPROTO_TCP; -} - -static int sock_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) -{ - u32 ufd = *(u32 *)value; - u32 idx = *(u32 *)key; - struct socket *sock; - struct sock *sk; - int ret; - - sock = sockfd_lookup(ufd, &ret); - if (!sock) - return ret; - sk = sock->sk; - if (!sk) { - ret = -EINVAL; - goto out; - } - if (!sock_map_sk_is_suitable(sk) || - sk->sk_state != TCP_ESTABLISHED) { - ret = -EOPNOTSUPP; - goto out; - } - - sock_map_sk_acquire(sk); - ret = sock_map_update_common(map, idx, sk, flags); - sock_map_sk_release(sk); -out: - fput(sock->file); - return ret; -} - -BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, - struct bpf_map *, map, void *, key, u64, flags) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (likely(sock_map_sk_is_suitable(sops->sk) && - sock_map_op_okay(sops))) - return sock_map_update_common(map, *(u32 *)key, sops->sk, - flags); - return -EOPNOTSUPP; -} - -const struct bpf_func_proto bpf_sock_map_update_proto = { - .func = bpf_sock_map_update, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, - struct bpf_map *, map, u32, key, u64, flags) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); - if (!tcb->bpf.sk_redir) - return SK_DROP; - return SK_PASS; -} - -const struct bpf_func_proto bpf_sk_redirect_map_proto = { - .func = bpf_sk_redirect_map, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_msg_redirect_map, struct 
sk_msg *, msg, - struct bpf_map *, map, u32, key, u64, flags) -{ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - msg->flags = flags; - msg->sk_redir = __sock_map_lookup_elem(map, key); - if (!msg->sk_redir) - return SK_DROP; - return SK_PASS; -} - -const struct bpf_func_proto bpf_msg_redirect_map_proto = { - .func = bpf_msg_redirect_map, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_ANYTHING, -}; - -const struct bpf_map_ops sock_map_ops = { - .map_alloc = sock_map_alloc, - .map_free = sock_map_free, - .map_get_next_key = sock_map_get_next_key, - .map_update_elem = sock_map_update_elem, - .map_delete_elem = sock_map_delete_elem, - .map_lookup_elem = sock_map_lookup, - .map_release_uref = sock_map_release_progs, - .map_check_btf = map_check_no_btf, -}; - -struct bpf_htab_elem { - struct rcu_head rcu; - u32 hash; - struct sock *sk; - struct hlist_node node; - u8 key[0]; -}; - -struct bpf_htab_bucket { - struct hlist_head head; - raw_spinlock_t lock; -}; - -struct bpf_htab { - struct bpf_map map; - struct bpf_htab_bucket *buckets; - u32 buckets_num; - u32 elem_size; - struct sk_psock_progs progs; - atomic_t count; -}; - -static inline u32 sock_hash_bucket_hash(const void *key, u32 len) -{ - return jhash(key, len, 0); -} - -static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab, - u32 hash) -{ - return &htab->buckets[hash & (htab->buckets_num - 1)]; -} - -static struct bpf_htab_elem * -sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, - u32 key_size) -{ - struct bpf_htab_elem *elem; - - hlist_for_each_entry_rcu(elem, head, node) { - if (elem->hash == hash && - !memcmp(&elem->key, key, key_size)) - return elem; - } - - return NULL; -} - -static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - u32 key_size = map->key_size, hash; - struct bpf_htab_bucket *bucket; - struct bpf_htab_elem *elem; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - hash = sock_hash_bucket_hash(key, key_size); - bucket = sock_hash_select_bucket(htab, hash); - elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); - - return elem ? elem->sk : NULL; -} - -static void sock_hash_free_elem(struct bpf_htab *htab, - struct bpf_htab_elem *elem) -{ - atomic_dec(&htab->count); - kfree_rcu(elem, rcu); -} - -static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, - void *link_raw) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_htab_elem *elem_probe, *elem = link_raw; - struct bpf_htab_bucket *bucket; - - WARN_ON_ONCE(!rcu_read_lock_held()); - bucket = sock_hash_select_bucket(htab, elem->hash); - - /* elem may be deleted in parallel from the map, but access here - * is okay since it's going away only after RCU grace period. - * However, we need to check whether it's still present. 
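	 * (If another CPU has already unlinked it, the locked lookup below
	 * finds a different element or nothing and this call is a no-op;
	 * the memory itself is reclaimed only via kfree_rcu() in
	 * sock_hash_free_elem(), i.e. after the grace period.)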
- */ - raw_spin_lock_bh(&bucket->lock); - elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, - elem->key, map->key_size); - if (elem_probe && elem_probe == elem) { - hlist_del_rcu(&elem->node); - sock_map_unref(elem->sk, elem); - sock_hash_free_elem(htab, elem); - } - raw_spin_unlock_bh(&bucket->lock); -} - -static int sock_hash_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - u32 hash, key_size = map->key_size; - struct bpf_htab_bucket *bucket; - struct bpf_htab_elem *elem; - int ret = -ENOENT; - - hash = sock_hash_bucket_hash(key, key_size); - bucket = sock_hash_select_bucket(htab, hash); - - raw_spin_lock_bh(&bucket->lock); - elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); - if (elem) { - hlist_del_rcu(&elem->node); - sock_map_unref(elem->sk, elem); - sock_hash_free_elem(htab, elem); - ret = 0; - } - raw_spin_unlock_bh(&bucket->lock); - return ret; -} - -static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab, - void *key, u32 key_size, - u32 hash, struct sock *sk, - struct bpf_htab_elem *old) -{ - struct bpf_htab_elem *new; - - if (atomic_inc_return(&htab->count) > htab->map.max_entries) { - if (!old) { - atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); - } - } - - new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); - if (!new) { - atomic_dec(&htab->count); - return ERR_PTR(-ENOMEM); - } - memcpy(new->key, key, key_size); - new->sk = sk; - new->hash = hash; - return new; -} - -static int sock_hash_update_common(struct bpf_map *map, void *key, - struct sock *sk, u64 flags) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - u32 key_size = map->key_size, hash; - struct bpf_htab_elem *elem, *elem_new; - struct bpf_htab_bucket *bucket; - struct sk_psock_link *link; - struct sk_psock *psock; - int ret; - - WARN_ON_ONCE(!rcu_read_lock_held()); - if (unlikely(flags > BPF_EXIST)) - return -EINVAL; - - link = sk_psock_init_link(); - if (!link) - return -ENOMEM; - - ret = sock_map_link(map, &htab->progs, sk); - if (ret < 0) - goto out_free; - - psock = sk_psock(sk); - WARN_ON_ONCE(!psock); - - hash = sock_hash_bucket_hash(key, key_size); - bucket = sock_hash_select_bucket(htab, hash); - - raw_spin_lock_bh(&bucket->lock); - elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); - if (elem && flags == BPF_NOEXIST) { - ret = -EEXIST; - goto out_unlock; - } else if (!elem && flags == BPF_EXIST) { - ret = -ENOENT; - goto out_unlock; - } - - elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); - if (IS_ERR(elem_new)) { - ret = PTR_ERR(elem_new); - goto out_unlock; - } - - sock_map_add_link(psock, link, map, elem_new); - /* Add new element to the head of the list, so that - * concurrent search will find it before old elem. 
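	 * The swap is therefore atomic for readers: hlist_add_head_rcu()
	 * publishes the new element before the old one is unlinked, so a
	 * concurrent lookup observes one of the two, never neither, and
	 * the old element is freed only after an RCU grace period.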
- */ - hlist_add_head_rcu(&elem_new->node, &bucket->head); - if (elem) { - hlist_del_rcu(&elem->node); - sock_map_unref(elem->sk, elem); - sock_hash_free_elem(htab, elem); - } - raw_spin_unlock_bh(&bucket->lock); - return 0; -out_unlock: - raw_spin_unlock_bh(&bucket->lock); - sk_psock_put(sk, psock); -out_free: - sk_psock_free_link(link); - return ret; -} - -static int sock_hash_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) -{ - u32 ufd = *(u32 *)value; - struct socket *sock; - struct sock *sk; - int ret; - - sock = sockfd_lookup(ufd, &ret); - if (!sock) - return ret; - sk = sock->sk; - if (!sk) { - ret = -EINVAL; - goto out; - } - if (!sock_map_sk_is_suitable(sk) || - sk->sk_state != TCP_ESTABLISHED) { - ret = -EOPNOTSUPP; - goto out; - } - - sock_map_sk_acquire(sk); - ret = sock_hash_update_common(map, key, sk, flags); - sock_map_sk_release(sk); -out: - fput(sock->file); - return ret; -} - -static int sock_hash_get_next_key(struct bpf_map *map, void *key, - void *key_next) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_htab_elem *elem, *elem_next; - u32 hash, key_size = map->key_size; - struct hlist_head *head; - int i = 0; - - if (!key) - goto find_first_elem; - hash = sock_hash_bucket_hash(key, key_size); - head = &sock_hash_select_bucket(htab, hash)->head; - elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); - if (!elem) - goto find_first_elem; - - elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)), - struct bpf_htab_elem, node); - if (elem_next) { - memcpy(key_next, elem_next->key, key_size); - return 0; - } - - i = hash & (htab->buckets_num - 1); - i++; -find_first_elem: - for (; i < htab->buckets_num; i++) { - head = &sock_hash_select_bucket(htab, i)->head; - elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), - struct bpf_htab_elem, node); - if (elem_next) { - memcpy(key_next, elem_next->key, key_size); - return 0; - } - } - - return -ENOENT; -} - -static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) -{ - struct bpf_htab *htab; - int i, err; - u64 cost; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - if (attr->max_entries == 0 || - attr->key_size == 0 || - attr->value_size != 4 || - attr->map_flags & ~SOCK_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); - if (attr->key_size > MAX_BPF_STACK) - return ERR_PTR(-E2BIG); - - htab = kzalloc(sizeof(*htab), GFP_USER); - if (!htab) - return ERR_PTR(-ENOMEM); - - bpf_map_init_from_attr(&htab->map, attr); - - htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); - htab->elem_size = sizeof(struct bpf_htab_elem) + - round_up(htab->map.key_size, 8); - if (htab->buckets_num == 0 || - htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) { - err = -EINVAL; - goto free_htab; - } - - cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) + - (u64) htab->elem_size * htab->map.max_entries; - if (cost >= U32_MAX - PAGE_SIZE) { - err = -EINVAL; - goto free_htab; - } - - htab->buckets = bpf_map_area_alloc(htab->buckets_num * - sizeof(struct bpf_htab_bucket), - htab->map.numa_node); - if (!htab->buckets) { - err = -ENOMEM; - goto free_htab; - } - - for (i = 0; i < htab->buckets_num; i++) { - INIT_HLIST_HEAD(&htab->buckets[i].head); - raw_spin_lock_init(&htab->buckets[i].lock); - } - - return &htab->map; -free_htab: - kfree(htab); - return ERR_PTR(err); -} - -static void sock_hash_free(struct bpf_map *map) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct 
bpf_htab_bucket *bucket; - struct bpf_htab_elem *elem; - struct hlist_node *node; - int i; - - synchronize_rcu(); - rcu_read_lock(); - for (i = 0; i < htab->buckets_num; i++) { - bucket = sock_hash_select_bucket(htab, i); - raw_spin_lock_bh(&bucket->lock); - hlist_for_each_entry_safe(elem, node, &bucket->head, node) { - hlist_del_rcu(&elem->node); - sock_map_unref(elem->sk, elem); - } - raw_spin_unlock_bh(&bucket->lock); - } - rcu_read_unlock(); - - bpf_map_area_free(htab->buckets); - kfree(htab); -} - -static void sock_hash_release_progs(struct bpf_map *map) -{ - psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs); -} - -BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, - struct bpf_map *, map, void *, key, u64, flags) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (likely(sock_map_sk_is_suitable(sops->sk) && - sock_map_op_okay(sops))) - return sock_hash_update_common(map, key, sops->sk, flags); - return -EOPNOTSUPP; -} - -const struct bpf_func_proto bpf_sock_hash_update_proto = { - .func = bpf_sock_hash_update, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, - struct bpf_map *, map, void *, key, u64, flags) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); - if (!tcb->bpf.sk_redir) - return SK_DROP; - return SK_PASS; -} - -const struct bpf_func_proto bpf_sk_redirect_hash_proto = { - .func = bpf_sk_redirect_hash, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, - struct bpf_map *, map, void *, key, u64, flags) -{ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - msg->flags = flags; - msg->sk_redir = __sock_hash_lookup_elem(map, key); - if (!msg->sk_redir) - return SK_DROP; - return SK_PASS; -} - -const struct bpf_func_proto bpf_msg_redirect_hash_proto = { - .func = bpf_msg_redirect_hash, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -const struct bpf_map_ops sock_hash_ops = { - .map_alloc = sock_hash_alloc, - .map_free = sock_hash_free, - .map_get_next_key = sock_hash_get_next_key, - .map_update_elem = sock_hash_update_elem, - .map_delete_elem = sock_hash_delete_elem, - .map_lookup_elem = sock_map_lookup, - .map_release_uref = sock_hash_release_progs, - .map_check_btf = map_check_no_btf, -}; - -static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_SOCKMAP: - return &container_of(map, struct bpf_stab, map)->progs; - case BPF_MAP_TYPE_SOCKHASH: - return &container_of(map, struct bpf_htab, map)->progs; - default: - break; - } - - return NULL; -} - -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - u32 which) -{ - struct sk_psock_progs *progs = sock_map_progs(map); - - if (!progs) - return -EOPNOTSUPP; - - switch (which) { - case BPF_SK_MSG_VERDICT: - psock_set_prog(&progs->msg_parser, prog); - break; - case BPF_SK_SKB_STREAM_PARSER: - psock_set_prog(&progs->skb_parser, prog); - break; - case 
BPF_SK_SKB_STREAM_VERDICT: - psock_set_prog(&progs->skb_verdict, prog); - break; - default: - return -EOPNOTSUPP; - } - - return 0; -} - -void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link) -{ - switch (link->map->map_type) { - case BPF_MAP_TYPE_SOCKMAP: - return sock_map_delete_from_link(link->map, sk, - link->link_raw); - case BPF_MAP_TYPE_SOCKHASH: - return sock_hash_delete_from_link(link->map, sk, - link->link_raw); - default: - break; - } -} diff --git a/net/core/xdp.c b/net/core/xdp.c deleted file mode 100644 index 229bc5a0ee04..000000000000 --- a/net/core/xdp.c +++ /dev/null @@ -1,67 +0,0 @@ -/* net/core/xdp.c - * - * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. - * Released under terms in GPL version 2. See COPYING. - */ -#include -#include - -#include - -#define REG_STATE_NEW 0x0 -#define REG_STATE_REGISTERED 0x1 -#define REG_STATE_UNREGISTERED 0x2 -#define REG_STATE_UNUSED 0x3 - -void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) -{ - /* Simplify driver cleanup code paths, allow unreg "unused" */ - if (xdp_rxq->reg_state == REG_STATE_UNUSED) - return; - - WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); - - xdp_rxq->reg_state = REG_STATE_UNREGISTERED; - xdp_rxq->dev = NULL; -} -EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); - -static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) -{ - memset(xdp_rxq, 0, sizeof(*xdp_rxq)); -} - -/* Returns 0 on success, negative on failure */ -int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index) -{ - if (xdp_rxq->reg_state == REG_STATE_UNUSED) { - WARN(1, "Driver promised not to register this"); - return -EINVAL; - } - - if (xdp_rxq->reg_state == REG_STATE_REGISTERED) { - WARN(1, "Missing unregister, handled but fix driver"); - xdp_rxq_info_unreg(xdp_rxq); - } - - if (!dev) { - WARN(1, "Missing net_device from driver"); - return -ENODEV; - } - - /* State either UNREGISTERED or NEW */ - xdp_rxq_info_init(xdp_rxq); - xdp_rxq->dev = dev; - xdp_rxq->queue_index = queue_index; - - xdp_rxq->reg_state = REG_STATE_REGISTERED; - return 0; -} -EXPORT_SYMBOL_GPL(xdp_rxq_info_reg); - -void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) -{ - xdp_rxq->reg_state = REG_STATE_UNUSED; -} -EXPORT_SYMBOL_GPL(xdp_rxq_info_unused); diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 532dda77cdb4..c6c8ad1d4b6d 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -60,7 +60,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o -obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d2cc14fff8ae..f6b67140fa9d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -430,32 +430,9 @@ int inet_release(struct socket *sock) EXPORT_SYMBOL(inet_release); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) -{ - struct sock *sk = sock->sk; - int err; - - /* If the socket has its own bind function then use it. (RAW) */ - if (sk->sk_prot->bind) { - return sk->sk_prot->bind(sk, uaddr, addr_len); - } - if (addr_len < sizeof(struct sockaddr_in)) - return -EINVAL; - - /* BPF prog is run before any checks are done so that if the prog - * changes context in a wrong way it will be caught. 
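	 * In other words, the hook was free to rewrite the sockaddr, and
	 * an invalid rewrite (wrong family, short length, forbidden
	 * address) simply failed the regular validation that ran next in
	 * __inet_bind().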
- */ - err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); - if (err) - return err; - - return __inet_bind(sk, uaddr, addr_len, false, true); -} -EXPORT_SYMBOL(inet_bind); - -int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); unsigned short snum; @@ -463,6 +440,15 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 tb_id = RT_TABLE_LOCAL; int err; + /* If the socket has its own bind function then use it. (RAW) */ + if (sk->sk_prot->bind) { + err = sk->sk_prot->bind(sk, uaddr, addr_len); + goto out; + } + err = -EINVAL; + if (addr_len < sizeof(struct sockaddr_in)) + goto out; + if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) * only if s_addr is INADDR_ANY. @@ -505,8 +491,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ - if (with_lock) - lock_sock(sk); + lock_sock(sk); /* Check these errors (active socket, double bind). */ err = -EINVAL; @@ -518,18 +503,11 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if (snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) { - if (sk->sk_prot->get_port(sk, snum)) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - err = -EADDRINUSE; - goto out_release_sock; - } - err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); - if (err) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - goto out_release_sock; - } + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + err = -EADDRINUSE; + goto out_release_sock; } if (inet->inet_rcv_saddr) @@ -542,17 +520,16 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk_dst_reset(sk); err = 0; out_release_sock: - if (with_lock) - release_sock(sk); + release_sock(sk); out: return err; } +EXPORT_SYMBOL(inet_bind); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; - int err; const struct proto *prot; if (addr_len < sizeof(uaddr->sa_family)) @@ -564,12 +541,6 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, if (uaddr->sa_family == AF_UNSPEC) return prot->disconnect(sk, flags); - if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { - err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); - if (err) - return err; - } - if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; return prot->connect(sk, uaddr, addr_len); @@ -650,12 +621,6 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state != TCP_CLOSE) goto out; - if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { - err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); - if (err) - goto out; - } - err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index e883b920f2a1..79fa2d7852ef 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -121,32 +121,14 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; - struct xfrm_offload *xo = 
xfrm_offload(skb); void *tmp; - struct xfrm_state *x; - - if (xo && (xo->flags & XFRM_DEV_RESUME)) - x = skb->sp->xvec[skb->sp->len - 1]; - else - x = skb_dst(skb)->xfrm; + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); kfree(tmp); - - if (xo && (xo->flags & XFRM_DEV_RESUME)) { - if (err) { - XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); - kfree_skb(skb); - return; - } - - skb_push(skb, skb->data - skb_mac_header(skb)); - secpath_reset(skb); - xfrm_dev_resume(skb); - } else { - xfrm_output_resume(skb, err); - } + xfrm_output_resume(skb, err); } /* Move ESP header back into place. */ diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 815b90c9caf8..5be59ccb61aa 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -109,38 +109,78 @@ static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { + __u32 seq; + int err = 0; + struct sk_buff *skb2; struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; + struct sk_buff *segs = ERR_PTR(-EINVAL); netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); if (!xo) - return ERR_PTR(-EINVAL); + goto out; if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) - return ERR_PTR(-EINVAL); + goto out; + + seq = xo->seq.low; x = skb->sp->xvec[skb->sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) - return ERR_PTR(-EINVAL); + goto out; if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) - return ERR_PTR(-EINVAL); + goto out; __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || - (x->xso.dev != skb->dev)) + if (!(features & NETIF_F_HW_ESP)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - xo->flags |= XFRM_GSO_SEGMENT; - return x->outer_mode->gso_segment(x, skb, esp_features); + segs = x->outer_mode->gso_segment(x, skb, esp_features); + if (IS_ERR_OR_NULL(segs)) + goto out; + + __skb_pull(skb, skb->data - skb_mac_header(skb)); + + skb2 = segs; + do { + struct sk_buff *nskb = skb2->next; + + xo = xfrm_offload(skb2); + xo->flags |= XFRM_GSO_SEGMENT; + xo->seq.low = seq; + xo->seq.hi = xfrm_replay_seqhi(x, seq); + + if(!(features & NETIF_F_HW_ESP)) + xo->flags |= CRYPTO_FALLBACK; + + x->outer_mode->xmit(x, skb2); + + err = x->type_offload->xmit(x, skb2, esp_features); + if (err) { + kfree_skb_list(segs); + return ERR_PTR(err); + } + + if (!skb_is_gso(skb2)) + seq++; + else + seq += skb_shinfo(skb2)->gso_segs; + + skb_push(skb2, skb2->mac_len); + skb2 = nskb; + } while (skb2); + +out: + return segs; } static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) @@ -167,7 +207,6 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; - __u32 seq; esp.inplace = true; @@ -206,28 +245,23 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ return esp.nfrags; } - seq = xo->seq.low; - esph = esp.esph; esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { - esph->seq_no = htonl(seq); - if (!skb_is_gso(skb)) - xo->seq.low++; - else - xo->seq.low += skb_shinfo(skb)->gso_segs; + esph->seq_no = htonl(xo->seq.low); + } else { + ip_hdr(skb)->tot_len = htons(skb->len); + 
ip_send_check(ip_hdr(skb)); } - esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32)); - ip_hdr(skb)->tot_len = htons(skb->len); - ip_send_check(ip_hdr(skb)); - if (hw_offload) return 0; + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); + err = esp_output_tail(x, skb, &esp); if (err) return err; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 32066e2375c9..ecf367ae4d7a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -542,7 +542,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); - skb_ext_copy(to, from); #if IS_ENABLED(CONFIG_IP_VS) to->ipvs_property = from->ipvs_property; #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0f358655f884..c55ba90b3a9e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -476,14 +476,6 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) } } -static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, - int target, struct sock *sk) -{ - return (tp->rcv_nxt - tp->copied_seq >= target) || - (sk->sk_prot->stream_memory_read ? - sk->sk_prot->stream_memory_read(sk) : false); -} - /* * Wait for a TCP event. * @@ -555,7 +547,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - if (tcp_stream_is_readable(tp, target, sk)) + if (tp->rcv_nxt - tp->copied_seq >= target) mask |= POLLIN | POLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c deleted file mode 100644 index 80debb0daf37..000000000000 --- a/net/ipv4/tcp_bpf.c +++ /dev/null @@ -1,655 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ - -#include -#include -#include -#include -#include - -#include - -static bool tcp_bpf_stream_read(const struct sock *sk) -{ - struct sk_psock *psock; - bool empty = true; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) - empty = list_empty(&psock->ingress_msg); - rcu_read_unlock(); - return !empty; -} - -static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, - int flags, long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} - -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len) -{ - struct iov_iter *iter = &msg->msg_iter; - int i, ret, copied = 0; - - while (copied != len) { - struct scatterlist *sge; - struct sk_msg *msg_rx; - - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - if (unlikely(!msg_rx)) - break; - - i = msg_rx->sg.start; - do { - struct page *page; - int copy; - - sge = sk_msg_elem(msg_rx, i); - copy = sge->length; - page = sg_page(sge); - if (copied + copy > len) - copy = len - copied; - ret = copy_page_to_iter(page, sge->offset, copy, iter); - if (ret != copy) { - msg_rx->sg.start = i; - return -EFAULT; - } - - copied += copy; - sge->offset += copy; - sge->length -= copy; - sk_mem_uncharge(sk, copy); - if (!sge->length) { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - if (!msg_rx->skb) - put_page(page); - } - - if (copied == len) - break; - } while (i != msg_rx->sg.end); - - 
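		/* One ingress sk_msg has now been copied out, possibly only
		 * partially. A rough model of the loop above, for
		 * illustration only:
		 *
		 *	for each sge from sg.start towards sg.end:
		 *		n = copy_page_to_iter(page, offset, copy, iter);
		 *		sk_mem_uncharge(sk, n);
		 *		advance the sge, drop the page ref once drained
		 *
		 * Partially consumed messages keep their progress in
		 * sg.start below and are resumed by the next recvmsg().
		 */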
msg_rx->sg.start = i; - if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - list_del(&msg_rx->list); - if (msg_rx->skb) - consume_skb(msg_rx->skb); - kfree(msg_rx); - } - } - - return copied; -} -EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); - -int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len) -{ - struct sk_psock *psock; - int copied, ret; - - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - - psock = sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); -msg_bytes_ready: - copied = __tcp_bpf_recvmsg(sk, psock, msg, len); - if (!copied) { - int data, err = 0; - long timeo; - - timeo = sock_rcvtimeo(sk, nonblock); - data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); - if (data) { - if (skb_queue_empty(&sk->sk_receive_queue)) - goto msg_bytes_ready; - release_sock(sk); - sk_psock_put(sk, psock); - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - } - if (err) { - ret = err; - goto out; - } - } - ret = copied; -out: - release_sock(sk); - sk_psock_put(sk, psock); - return ret; -} - -static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, - struct sk_msg *msg, u32 apply_bytes, int flags) -{ - bool apply = apply_bytes; - struct scatterlist *sge; - u32 size, copied = 0; - struct sk_msg *tmp; - int i, ret = 0; - - tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL); - if (unlikely(!tmp)) - return -ENOMEM; - - lock_sock(sk); - tmp->sg.start = msg->sg.start; - i = msg->sg.start; - do { - sge = sk_msg_elem(msg, i); - size = (apply && apply_bytes < sge->length) ? - apply_bytes : sge->length; - if (!sk_wmem_schedule(sk, size)) { - if (!copied) - ret = -ENOMEM; - break; - } - - sk_mem_charge(sk, size); - sk_msg_xfer(tmp, msg, i, size); - copied += size; - if (sge->length) - get_page(sk_msg_page(tmp, i)); - sk_msg_iter_var_next(i); - tmp->sg.end = i; - if (apply) { - apply_bytes -= size; - if (!apply_bytes) - break; - } - } while (i != msg->sg.end); - - if (!ret) { - msg->sg.start = i; - msg->sg.size -= apply_bytes; - sk_psock_queue_msg(psock, tmp); - sk->sk_data_ready(sk); - } else { - sk_msg_free(sk, tmp); - kfree(tmp); - } - - release_sock(sk); - return ret; -} - -static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, - int flags, bool uncharge) -{ - bool apply = apply_bytes; - struct scatterlist *sge; - struct page *page; - int size, ret = 0; - u32 off; - - while (1) { - sge = sk_msg_elem(msg, msg->sg.start); - size = (apply && apply_bytes < sge->length) ? 
- apply_bytes : sge->length; - off = sge->offset; - page = sg_page(sge); - - tcp_rate_check_app_limited(sk); -retry: - ret = do_tcp_sendpages(sk, page, off, size, flags); - if (ret <= 0) - return ret; - if (apply) - apply_bytes -= ret; - msg->sg.size -= ret; - sge->offset += ret; - sge->length -= ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - if (ret != size) { - size -= ret; - off += ret; - goto retry; - } - if (!sge->length) { - put_page(page); - sk_msg_iter_next(msg, start); - sg_init_table(sge, 1); - if (msg->sg.start == msg->sg.end) - break; - } - if (apply && !apply_bytes) - break; - } - - return 0; -} - -static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, - u32 apply_bytes, int flags, bool uncharge) -{ - int ret; - - lock_sock(sk); - ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge); - release_sock(sk); - return ret; -} - -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, - u32 bytes, int flags) -{ - bool ingress = sk_msg_to_ingress(msg); - struct sk_psock *psock = sk_psock_get(sk); - int ret; - - if (unlikely(!psock)) { - sk_msg_free(sk, msg); - return 0; - } - ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) : - tcp_bpf_push_locked(sk, msg, bytes, flags, false); - sk_psock_put(sk, psock); - return ret; -} -EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); - -static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, - struct sk_msg *msg, int *copied, int flags) -{ - bool cork = false, enospc = msg->sg.start == msg->sg.end; - struct sock *sk_redir; - u32 tosend; - int ret; - -more_data: - if (psock->eval == __SK_NONE) - psock->eval = sk_psock_msg_verdict(sk, psock, msg); - - if (msg->cork_bytes && - msg->cork_bytes > msg->sg.size && !enospc) { - psock->cork_bytes = msg->cork_bytes - msg->sg.size; - if (!psock->cork) { - psock->cork = kzalloc(sizeof(*psock->cork), - GFP_ATOMIC | __GFP_NOWARN); - if (!psock->cork) - return -ENOMEM; - } - memcpy(psock->cork, msg, sizeof(*msg)); - return 0; - } - - tosend = msg->sg.size; - if (psock->apply_bytes && psock->apply_bytes < tosend) - tosend = psock->apply_bytes; - - switch (psock->eval) { - case __SK_PASS: - ret = tcp_bpf_push(sk, msg, tosend, flags, true); - if (unlikely(ret)) { - *copied -= sk_msg_free(sk, msg); - break; - } - sk_msg_apply_bytes(psock, tosend); - break; - case __SK_REDIRECT: - sk_redir = psock->sk_redir; - sk_msg_apply_bytes(psock, tosend); - if (psock->cork) { - cork = true; - psock->cork = NULL; - } - sk_msg_return(sk, msg, tosend); - release_sock(sk); - ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); - lock_sock(sk); - if (unlikely(ret < 0)) { - int free = sk_msg_free_nocharge(sk, msg); - - if (!cork) - *copied -= free; - } - if (cork) { - sk_msg_free(sk, msg); - kfree(msg); - msg = NULL; - ret = 0; - } - break; - case __SK_DROP: - default: - sk_msg_free_partial(sk, msg, tosend); - sk_msg_apply_bytes(psock, tosend); - *copied -= tosend; - return -EACCES; - } - - if (likely(!ret)) { - if (!psock->apply_bytes) { - psock->eval = __SK_NONE; - if (psock->sk_redir) { - sock_put(psock->sk_redir); - psock->sk_redir = NULL; - } - } - if (msg && - msg->sg.data[msg->sg.start].page_link && - msg->sg.data[msg->sg.start].length) - goto more_data; - } - return ret; -} - -static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) -{ - struct sk_msg tmp, *msg_tx = NULL; - int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; - int copied = 0, err = 0; - struct sk_psock *psock; - long timeo; - - psock = sk_psock_get(sk); - if (unlikely(!psock)) - return 
tcp_sendmsg(sk, msg, size); - - lock_sock(sk); - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - while (msg_data_left(msg)) { - bool enospc = false; - u32 copy, osize; - - if (sk->sk_err) { - err = -sk->sk_err; - goto out_err; - } - - copy = msg_data_left(msg); - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; - if (psock->cork) { - msg_tx = psock->cork; - } else { - msg_tx = &tmp; - sk_msg_init(msg_tx); - } - - osize = msg_tx->sg.size; - err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1); - if (err) { - if (err != -ENOSPC) - goto wait_for_memory; - enospc = true; - copy = msg_tx->sg.size - osize; - } - - err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, - copy); - if (err < 0) { - sk_msg_trim(sk, msg_tx, osize); - goto out_err; - } - - copied += copy; - if (psock->cork_bytes) { - if (size > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= size; - if (psock->cork_bytes && !enospc) - goto out_err; - /* All cork bytes are accounted, rerun the prog. */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags); - if (unlikely(err < 0)) - goto out_err; - continue; -wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: - err = sk_stream_wait_memory(sk, &timeo); - if (err) { - if (msg_tx && msg_tx != psock->cork) - sk_msg_free(sk, msg_tx); - goto out_err; - } - } -out_err: - if (err < 0) - err = sk_stream_error(sk, msg->msg_flags, err); - release_sock(sk); - sk_psock_put(sk, psock); - return copied ? copied : err; -} - -static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) -{ - struct sk_msg tmp, *msg = NULL; - int err = 0, copied = 0; - struct sk_psock *psock; - bool enospc = false; - - psock = sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_sendpage(sk, page, offset, size, flags); - - lock_sock(sk); - if (psock->cork) { - msg = psock->cork; - } else { - msg = &tmp; - sk_msg_init(msg); - } - - /* Catch case where ring is full and sendpage is stalled. */ - if (unlikely(sk_msg_full(msg))) - goto out_err; - - sk_msg_page_add(msg, page, size, offset); - sk_mem_charge(sk, size); - copied = size; - if (sk_msg_full(msg)) - enospc = true; - if (psock->cork_bytes) { - if (size > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= size; - if (psock->cork_bytes && !enospc) - goto out_err; - /* All cork bytes are accounted, rerun the prog. */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags); -out_err: - release_sock(sk); - sk_psock_put(sk, psock); - return copied ? 
copied : err; -} - -static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_link *link; - - sk_psock_cork_free(psock); - __sk_psock_purge_ingress_msg(psock); - while ((link = sk_psock_link_pop(psock))) { - sk_psock_unlink(sk, link); - sk_psock_free_link(link); - } -} - -static void tcp_bpf_unhash(struct sock *sk) -{ - void (*saved_unhash)(struct sock *sk); - struct sk_psock *psock; - - rcu_read_lock(); - psock = sk_psock(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; - } - - saved_unhash = psock->saved_unhash; - tcp_bpf_remove(sk, psock); - rcu_read_unlock(); - saved_unhash(sk); -} - -static void tcp_bpf_close(struct sock *sk, long timeout) -{ - void (*saved_close)(struct sock *sk, long timeout); - struct sk_psock *psock; - - lock_sock(sk); - rcu_read_lock(); - psock = sk_psock(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - - saved_close = psock->saved_close; - tcp_bpf_remove(sk, psock); - rcu_read_unlock(); - release_sock(sk); - saved_close(sk, timeout); -} - -enum { - TCP_BPF_IPV4, - TCP_BPF_IPV6, - TCP_BPF_NUM_PROTS, -}; - -enum { - TCP_BPF_BASE, - TCP_BPF_TX, - TCP_BPF_NUM_CFGS, -}; - -static struct proto *tcpv6_prot_saved __read_mostly; -static DEFINE_SPINLOCK(tcpv6_prot_lock); -static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS]; - -static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], - struct proto *base) -{ - prot[TCP_BPF_BASE] = *base; - prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash; - prot[TCP_BPF_BASE].close = tcp_bpf_close; - prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; - prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; - - prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; - prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; - prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; -} - -static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops) -{ - if (sk->sk_family == AF_INET6 && - unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { - spin_lock_bh(&tcpv6_prot_lock); - if (likely(ops != tcpv6_prot_saved)) { - tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops); - smp_store_release(&tcpv6_prot_saved, ops); - } - spin_unlock_bh(&tcpv6_prot_lock); - } -} - -static int __init tcp_bpf_v4_build_proto(void) -{ - tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot); - return 0; -} -core_initcall(tcp_bpf_v4_build_proto); - -static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; - int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; - - sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]); -} - -static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; - int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; - - /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed - * or added requiring sk_prot hook updates. We keep original saved - * hooks in this case. - */ - sk->sk_prot = &tcp_bpf_prots[family][config]; -} - -static int tcp_bpf_assert_proto_ops(struct proto *ops) -{ - /* In order to avoid retpoline, we make assumptions when we call - * into ops if e.g. a psock is not present. Make sure they are - * indeed valid assumptions. 
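	 * Concretely, the psock-miss fast paths above call tcp_recvmsg(),
	 * tcp_sendmsg() and tcp_sendpage() directly rather than through
	 * sk->sk_prot, so tcp_bpf_init() refuses sockets whose proto ops
	 * have already been replaced by some other layer.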
- */ - return ops->recvmsg == tcp_recvmsg && - ops->sendmsg == tcp_sendmsg && - ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; -} - -void tcp_bpf_reinit(struct sock *sk) -{ - struct sk_psock *psock; - - sock_owned_by_me(sk); - - rcu_read_lock(); - psock = sk_psock(sk); - tcp_bpf_reinit_sk_prot(sk, psock); - rcu_read_unlock(); -} - -int tcp_bpf_init(struct sock *sk) -{ - struct proto *ops = READ_ONCE(sk->sk_prot); - struct sk_psock *psock; - - sock_owned_by_me(sk); - - rcu_read_lock(); - psock = sk_psock(sk); - if (unlikely(!psock || psock->sk_proto || - tcp_bpf_assert_proto_ops(ops))) { - rcu_read_unlock(); - return -EINVAL; - } - tcp_bpf_check_v6_needs_rebuild(sk, ops); - tcp_bpf_update_sk_prot(sk, psock); - rcu_read_unlock(); - return 0; -} diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b4908f96aa84..35a363a18b94 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -138,21 +138,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) } EXPORT_SYMBOL_GPL(tcp_twsk_unique); -static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) -{ - /* This check is replicated from tcp_v4_connect() and intended to - * prevent BPF program called below from accessing bytes that are out - * of the bound specified by user in addr_len. - */ - if (addr_len < sizeof(struct sockaddr_in)) - return -EINVAL; - - sock_owned_by_me(sk); - - return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); -} - /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -2457,7 +2442,6 @@ struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, - .pre_connect = tcp_v4_pre_connect, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 74d0e661c9e5..cb128fcdb182 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -932,7 +932,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); - DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); struct flowi4 fl4_stack; struct flowi4 *fl4; int ulen = len; @@ -987,7 +986,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* * Get and verify the address. */ - if (usin) { + if (msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { @@ -1041,22 +1041,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled && !connected) { - err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, - (struct sockaddr *)usin, &ipc.addr); - if (err) - goto out_free; - if (usin) { - if (usin->sin_port == 0) { - /* BPF program set invalid port. Reject it. 
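	 * (Port 0 is not a usable UDP destination; returning -EINVAL
	 * below surfaces the bad rewrite to the sender instead of
	 * silently transmitting to it.)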
*/ - err = -EINVAL; - goto out_free; - } - daddr = usin->sin_addr.s_addr; - dport = usin->sin_port; - } - } - saddr = ipc.addr; ipc.addr = faddr = daddr; @@ -1751,10 +1735,6 @@ try_again: sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); - - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, - (struct sockaddr *)sin); } if (udp_sk(sk)->gro_enabled) @@ -1784,19 +1764,6 @@ csum_copy_err: goto try_again; } -int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) -{ - /* This check is replicated from __ip4_datagram_connect() and - * intended to prevent BPF program called below from accessing bytes - * that are out of the bound specified by user in addr_len. - */ - if (addr_len < sizeof(struct sockaddr_in)) - return -EINVAL; - - return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr); -} -EXPORT_SYMBOL(udp_pre_connect); - int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2898,7 +2865,6 @@ struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, .close = udp_lib_close, - .pre_connect = udp_pre_connect, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 0614b4b4015e..e6265e2c274e 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -112,9 +112,11 @@ static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); - if (xo->flags & XFRM_GSO_SEGMENT) + if (xo->flags & XFRM_GSO_SEGMENT) { + skb->network_header = skb->network_header - x->props.header_len; skb->transport_header = skb->network_header + sizeof(struct iphdr); + } skb_reset_mac_len(skb); pskb_pull(skb, skb->mac_len + x->props.header_len); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index afb20eb37f54..888650b0fae8 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -283,8 +283,16 @@ out_rcu_unlock: /* bind for INET6 API */ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { + struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; const struct proto *prot; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + __be32 v4addr = 0; + unsigned short snum; + bool saved_ipv6only; + int addr_type = 0; int err = 0; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ @@ -296,35 +304,11 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; - /* BPF prog is run before any checks are done so that if the prog - * changes context in a wrong way it will be caught. 
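	 * (Mirrors the IPv4 bind path: beyond the SIN6_LEN_RFC2133 length
	 * check, the hook saw the raw sockaddr, and bad rewrites were
	 * caught by the family/address validation in __inet6_bind().)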
- */ - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); - if (err) - return err; - - return __inet6_bind(sk, uaddr, addr_len, false, true); -} -EXPORT_SYMBOL(inet6_bind); - -int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) -{ - struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct net *net = sock_net(sk); - __be32 v4addr = 0; - unsigned short snum; - bool saved_ipv6only; - int addr_type = 0; - int err = 0; - if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; addr_type = ipv6_addr_type(&addr->sin6_addr); - if ((addr_type & IPV6_ADDR_MULTICAST) && sk->sk_type == SOCK_STREAM) + if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) return -EINVAL; snum = ntohs(addr->sin6_port); @@ -332,8 +316,7 @@ int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; - if (with_lock) - lock_sock(sk); + lock_sock(sk); /* Check these errors (active socket, double bind). */ if (sk->sk_state != TCP_CLOSE || inet->inet_num) { @@ -436,20 +419,12 @@ int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. */ - if (snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) { - if (sk->sk_prot->get_port(sk, snum)) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - err = -EADDRINUSE; - goto out; - } - err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); - if (err) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - goto out; - } + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + err = -EADDRINUSE; + goto out; } if (addr_type != IPV6_ADDR_ANY) @@ -460,13 +435,13 @@ int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_dport = 0; inet->inet_daddr = 0; out: - if (with_lock) - release_sock(sk); + release_sock(sk); return err; out_unlock: rcu_read_unlock(); goto out; } +EXPORT_SYMBOL(inet6_bind); int inet6_release(struct socket *sock) { @@ -923,10 +898,6 @@ static const struct ipv6_stub ipv6_stub_impl = { .nd_tbl = &nd_tbl, }; -static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { - .inet6_bind = __inet6_bind, -}; - static int __init inet6_init(void) { struct list_head *r; @@ -1083,7 +1054,6 @@ static int __init inet6_init(void) /* ensure that ipv6 stubs are visible only after ipv6 is ready */ wmb(); ipv6_stub = &ipv6_stub_impl; - ipv6_bpf_stub = &ipv6_bpf_stub_impl; out: return err; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8fcfcab407d8..25c12d0ccd28 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -141,32 +141,14 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; - struct xfrm_offload *xo = xfrm_offload(skb); void *tmp; - - struct xfrm_state *x; - if (xo && (xo->flags & XFRM_DEV_RESUME)) - x = skb->sp->xvec[skb->sp->len - 1]; - else - x = skb_dst(skb)->xfrm; + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); kfree(tmp); - - if (xo && (xo->flags & XFRM_DEV_RESUME)) { - if (err) { - XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); - kfree_skb(skb); - return; - } - - skb_push(skb, skb->data - 
skb_mac_header(skb)); - secpath_reset(skb); - xfrm_dev_resume(skb); - } else { - xfrm_output_resume(skb, err); - } + xfrm_output_resume(skb, err); } /* Move ESP header back into place. */ diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index d5902d19a1c7..7c72b85c9339 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -143,38 +143,78 @@ static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb) static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, netdev_features_t features) { + __u32 seq; + int err = 0; + struct sk_buff *skb2; struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; + struct sk_buff *segs = ERR_PTR(-EINVAL); netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); if (!xo) - return ERR_PTR(-EINVAL); + goto out; if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) - return ERR_PTR(-EINVAL); + goto out; + + seq = xo->seq.low; x = skb->sp->xvec[skb->sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) - return ERR_PTR(-EINVAL); + goto out; if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) - return ERR_PTR(-EINVAL); + goto out; __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || - (x->xso.dev != skb->dev)) + if (!(features & NETIF_F_HW_ESP)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - xo->flags |= XFRM_GSO_SEGMENT; - return x->outer_mode->gso_segment(x, skb, esp_features); + segs = x->outer_mode->gso_segment(x, skb, esp_features); + if (IS_ERR_OR_NULL(segs)) + goto out; + + __skb_pull(skb, skb->data - skb_mac_header(skb)); + + skb2 = segs; + do { + struct sk_buff *nskb = skb2->next; + + xo = xfrm_offload(skb2); + xo->flags |= XFRM_GSO_SEGMENT; + xo->seq.low = seq; + xo->seq.hi = xfrm_replay_seqhi(x, seq); + + if(!(features & NETIF_F_HW_ESP)) + xo->flags |= CRYPTO_FALLBACK; + + x->outer_mode->xmit(x, skb2); + + err = x->type_offload->xmit(x, skb2, esp_features); + if (err) { + kfree_skb_list(segs); + return ERR_PTR(err); + } + + if (!skb_is_gso(skb2)) + seq++; + else + seq += skb_shinfo(skb2)->gso_segs; + + skb_push(skb2, skb2->mac_len); + skb2 = nskb; + } while (skb2); + +out: + return segs; } static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) @@ -193,7 +233,6 @@ static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - int len; int err; int alen; int blksize; @@ -202,7 +241,6 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; - __u32 seq; esp.inplace = true; @@ -238,32 +276,28 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features return esp.nfrags; } - seq = xo->seq.low; - esph = ip_esp_hdr(skb); esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { - esph->seq_no = htonl(seq); - if (!skb_is_gso(skb)) - xo->seq.low++; - else - xo->seq.low += skb_shinfo(skb)->gso_segs; + esph->seq_no = htonl(xo->seq.low); + } else { + int len; + + len = skb->len - sizeof(struct ipv6hdr); + if (len > IPV6_MAXPLEN) + len = 0; + + ipv6_hdr(skb)->payload_len = htons(len); } - esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); - len = skb->len - sizeof(struct ipv6hdr); - - if (len > IPV6_MAXPLEN) - len = 0; - - 
ipv6_hdr(skb)->payload_len = htons(len); - if (hw_offload) return 0; + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); + err = esp6_output_tail(x, skb, &esp); if (err) return err; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2e66900a753d..5f725d4ee73e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -643,7 +643,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); - skb_ext_copy(to, from); skb_copy_secmark(to, from); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4b9d5e509075..756280071544 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -115,21 +115,6 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) ipv6_hdr(skb)->saddr.s6_addr32); } -static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) -{ - /* This check is replicated from tcp_v6_connect() and intended to - * prevent BPF program called below from accessing bytes that are out - * of the bound specified by user in addr_len. - */ - if (addr_len < SIN6_LEN_RFC2133) - return -EINVAL; - - sock_owned_by_me(sk); - - return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr); -} - static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -1979,7 +1964,6 @@ struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, - .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f0405d98444d..e22e15b06dcb 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -460,10 +460,6 @@ try_again: inet6_iif(skb)); } *addr_len = sizeof(*sin6); - - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, - (struct sockaddr *)sin6); } if (udp_sk(sk)->gro_enabled) @@ -1036,25 +1032,6 @@ static void udp_v6_flush_pending_frames(struct sock *sk) } } -static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) -{ - /* The following checks are replicated from __ip6_datagram_connect() - * and intended to prevent BPF program called below from accessing - * bytes that are out of the bound specified by user in addr_len. - */ - if (uaddr->sa_family == AF_INET) { - if (__ipv6_only_sock(sk)) - return -EAFNOSUPPORT; - return udp_pre_connect(sk, uaddr, addr_len); - } - - if (addr_len < SIN6_LEN_RFC2133) - return -EINVAL; - - return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); -} - /** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on @@ -1390,29 +1367,6 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled && !connected) { - err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, - (struct sockaddr *)sin6, &fl6.saddr); - if (err) - goto out_no_dst; - if (sin6) { - if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { - /* BPF program rewrote IPv6-only by IPv4-mapped - * IPv6. It's currently unsupported. - */ - err = -ENOTSUPP; - goto out_no_dst; - } - if (sin6->sin6_port == 0) { - /* BPF program set invalid port. Reject it. 
*/ - err = -EINVAL; - goto out_no_dst; - } - fl6.fl6_dport = sin6->sin6_port; - fl6.daddr = sin6->sin6_addr; - } - } - final_p = fl6_update_dst(&fl6, opt, &final); if (final_p) connected = 0; @@ -1509,7 +1463,6 @@ release_dst: out: dst_release(dst); -out_no_dst: fl6_sock_release(flowlabel); txopt_put(opt_to_free); if (!err) @@ -1657,7 +1610,6 @@ struct proto udpv6_prot = { .name = "UDPv6", .owner = THIS_MODULE, .close = udp_lib_close, - .pre_connect = udpv6_pre_connect, .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 49bbc69e6f4d..02556e356f87 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -112,8 +112,10 @@ static void xfrm6_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); - if (xo->flags & XFRM_GSO_SEGMENT) + if (xo->flags & XFRM_GSO_SEGMENT) { + skb->network_header = skb->network_header - x->props.header_len; skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); + } skb_reset_mac_len(skb); pskb_pull(skb, skb->mac_len + x->props.header_len); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 6a34cad6eab6..72dc268aaa7f 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -212,7 +212,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, } #ifdef CONFIG_XFRM case NFT_META_SECPATH: - nft_reg_store8(dest, secpath_exists(skb)); + nft_reg_store8(dest, !!skb->sp); break; #endif default: diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5d8ab4d411aa..e3edf0c7e2e4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -250,14 +250,12 @@ static int packet_direct_xmit(struct sk_buff *skb) struct sk_buff *orig_skb = skb; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; - bool again = false; - if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) goto drop; - skb = validate_xmit_skb_list(skb, dev, &again); + skb = validate_xmit_skb_list(skb, dev); if (skb != orig_skb) goto drop; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 6bc400025566..bdc8885c0448 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - bpf_compute_data_pointers(skb); + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_pointers(skb); + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(filter, skb); } rcu_read_unlock(); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 052073137987..3a499530f321 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -102,11 +102,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); - bpf_compute_data_pointers(skb); + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_pointers(skb); + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 05c7196ae849..4a76ceeca6fd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -30,7 +30,6 @@ #include #include #include -#include /* Qdisc to use by default */ 
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; @@ -120,8 +119,6 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, if (unlikely(skb)) { /* skb in gso_skb were already validated */ *validate = false; - if (xfrm_offload(skb)) - *validate = true; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { @@ -175,23 +172,13 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; - bool again = false; /* And release qdisc */ spin_unlock(root_lock); /* Note that we validate skb (GSO, checksum, ...) outside of locks */ if (validate) - skb = validate_xmit_skb_list(skb, dev, &again); - -#ifdef CONFIG_XFRM_OFFLOAD - if (unlikely(again)) { - if (root_lock) - spin_lock(root_lock); - dev_requeue_skb(skb, q); - return false; - } -#endif + skb = validate_xmit_skb_list(skb, dev); if (likely(skb)) { HARD_TX_LOCK(dev, txq, smp_processor_id()); diff --git a/net/socket.c b/net/socket.c index 415b918c036c..f938849dc11e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1868,8 +1868,6 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, static int __sys_setsockopt(int fd, int level, int optname, char __user *optval, int optlen) { - mm_segment_t oldfs = get_fs(); - char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -1882,22 +1880,6 @@ static int __sys_setsockopt(int fd, int level, int optname, if (err) goto out_put; - err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, - &optname, optval, &optlen, - &kernel_optval); - if (err < 0) { - goto out_put; - - } else if (err > 0) { - err = 0; - goto out_put; - } - - if (kernel_optval) { - set_fs(KERNEL_DS); - optval = (char __user __force *)kernel_optval; - } - if (level == SOL_SOCKET) err = sock_setsockopt(sock, level, optname, optval, @@ -1906,12 +1888,6 @@ static int __sys_setsockopt(int fd, int level, int optname, err = sock->ops->setsockopt(sock, level, optname, optval, optlen); - - if (kernel_optval) { - set_fs(oldfs); - kfree(kernel_optval); - } - out_put: fput_light(sock->file, fput_needed); } @@ -1934,7 +1910,6 @@ static int __sys_getsockopt(int fd, int level, int optname, { int err, fput_needed; struct socket *sock; - int max_optlen; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { @@ -1942,8 +1917,6 @@ static int __sys_getsockopt(int fd, int level, int optname, if (err) goto out_put; - max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); - if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, @@ -1952,11 +1925,6 @@ static int __sys_getsockopt(int fd, int level, int optname, err = sock->ops->getsockopt(sock, level, optname, optval, optlen); - - err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, - optval, optlen, - max_optlen, err); - out_put: fput_light(sock->file, fput_needed); } diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig index 94da19a2a220..6cff3f6d0c3a 100644 --- a/net/strparser/Kconfig +++ b/net/strparser/Kconfig @@ -1,2 +1,4 @@ + config STREAM_PARSER - def_bool n + tristate + default n diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 6150442639b2..a07df7bb41c2 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -23,107 +23,32 @@ #include #ifdef CONFIG_XFRM_OFFLOAD -struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) +int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t 
features) { int err; - unsigned long flags; struct xfrm_state *x; - struct sk_buff *skb2; - netdev_features_t esp_features = features; - struct softnet_data *sd; struct xfrm_offload *xo = xfrm_offload(skb); - if (!xo) - return skb; + if (skb_is_gso(skb)) + return 0; - if (!(features & NETIF_F_HW_ESP)) - esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); + if (xo) { + x = skb->sp->xvec[skb->sp->len - 1]; + if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) + return 0; - x = skb->sp->xvec[skb->sp->len - 1]; - if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) - return skb; - - local_irq_save(flags); - sd = this_cpu_ptr(&softnet_data); - err = !skb_queue_empty(&sd->xfrm_backlog); - local_irq_restore(flags); - - if (err) { - *again = true; - return skb; - } - - if (skb_is_gso(skb)) { - struct net_device *dev = skb->dev; - if (unlikely(!x->xso.offload_handle || (x->xso.dev != dev))) { - struct sk_buff *segs; - /* Packet got rerouted, fixup features and segment it. */ - esp_features = esp_features & ~(NETIF_F_HW_ESP - | NETIF_F_GSO_ESP); - - segs = skb_gso_segment(skb, esp_features); - if (IS_ERR(segs)) { - kfree_skb(skb); - atomic_long_inc(&dev->tx_dropped); - return NULL; - } else { - consume_skb(skb); - skb = segs; - } - } - } - - if (!skb->next) { x->outer_mode->xmit(x, skb); - xo->flags |= XFRM_DEV_RESUME; - - err = x->type_offload->xmit(x, skb, esp_features); + err = x->type_offload->xmit(x, skb, features); if (err) { - if (err == -EINPROGRESS) - return NULL; - XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); - kfree_skb(skb); - return NULL; + return err; } skb_push(skb, skb->data - skb_mac_header(skb)); - return skb; } - skb2 = skb; - do { - struct sk_buff *nskb = skb2->next; - skb2->next = NULL; - - xo = xfrm_offload(skb2); - xo->flags |= XFRM_DEV_RESUME; - - x->outer_mode->xmit(x, skb2); - err = x->type_offload->xmit(x, skb2, esp_features); - if (!err) { - skb2->next = nskb; - } else if (err != -EINPROGRESS) { - XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); - skb2->next = nskb; - kfree_skb_list(skb2); - return NULL; - } else { - if (skb == skb2) - skb = nskb; - - if (!skb) - return NULL; - - goto skip_push; - } - - skb_push(skb2, skb2->data - skb_mac_header(skb2)); -skip_push: - skb2 = nskb; - } while (skb2); - return skb; + return 0; } EXPORT_SYMBOL_GPL(validate_xmit_xfrm); @@ -220,55 +145,6 @@ ok: return true; } EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok); - -void xfrm_dev_resume(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - int ret = NETDEV_TX_BUSY; - struct netdev_queue *txq; - struct softnet_data *sd; - unsigned long flags; - - rcu_read_lock(); - txq = netdev_pick_tx(dev, skb, NULL); - HARD_TX_LOCK(dev, txq, smp_processor_id()); - - if (!netif_xmit_frozen_or_stopped(txq)) - skb = dev_hard_start_xmit(skb, dev, txq, &ret); - - HARD_TX_UNLOCK(dev, txq); - if (!dev_xmit_complete(ret)) { - local_irq_save(flags); - sd = this_cpu_ptr(&softnet_data); - skb_queue_tail(&sd->xfrm_backlog, skb); - raise_softirq_irqoff(NET_TX_SOFTIRQ); - local_irq_restore(flags); - } - - rcu_read_unlock(); -} - -EXPORT_SYMBOL_GPL(xfrm_dev_resume); -void xfrm_dev_backlog(struct softnet_data *sd) -{ - struct sk_buff_head *xfrm_backlog = &sd->xfrm_backlog; - struct sk_buff_head list; - struct sk_buff *skb; - - if (skb_queue_empty(xfrm_backlog)) - return; - - __skb_queue_head_init(&list); - spin_lock(&xfrm_backlog->lock); - skb_queue_splice_init(xfrm_backlog, &list); - spin_unlock(&xfrm_backlog->lock); - - while (!skb_queue_empty(&list)) { 
- skb = __skb_dequeue(&list); - xfrm_dev_resume(skb); - } -} - #endif static int xfrm_dev_register(struct net_device *dev) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a5030c61347f..000f99f85f9c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -112,7 +112,6 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, - BPF_MAP_TYPE_CPUMAP, }; enum bpf_prog_type { @@ -131,7 +130,6 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, - BPF_PROG_TYPE_CGROUP_DEVICE, }; enum bpf_attach_type { @@ -141,7 +139,6 @@ enum bpf_attach_type { BPF_CGROUP_SOCK_OPS, BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, - BPF_CGROUP_DEVICE, __MAX_BPF_ATTACH_TYPE }; @@ -885,9 +882,6 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; - __u32 ifindex; - __u64 netns_dev; - __u64 netns_ino; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops @@ -942,17 +936,4 @@ enum { #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ #define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ -#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) -#define BPF_DEVCG_ACC_READ (1ULL << 1) -#define BPF_DEVCG_ACC_WRITE (1ULL << 2) - -#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) -#define BPF_DEVCG_DEV_CHAR (1ULL << 1) - -struct bpf_cgroup_dev_ctx { - __u32 access_type; /* (access << 16) | type */ - __u32 major; - __u32 minor; -}; - #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c new file mode 100644 index 000000000000..f93a333cbf2c --- /dev/null +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Randomized tests for eBPF longest-prefix-match maps + * + * This program runs randomized tests against the lpm-bpf-map. It implements a + * "Trivial Longest Prefix Match" (tlpm) based on simple, linear, singly linked + * lists. The implementation should be pretty straightforward. + * + * Based on tlpm, this inserts randomized data into bpf-lpm-maps and verifies + * the trie-based bpf-map implementation behaves the same way as tlpm. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "bpf_util.h" + +struct tlpm_node { + struct tlpm_node *next; + size_t n_bits; + uint8_t key[]; +}; + +static struct tlpm_node *tlpm_add(struct tlpm_node *list, + const uint8_t *key, + size_t n_bits) +{ + struct tlpm_node *node; + size_t n; + + /* add new entry with @key/@n_bits to @list and return new head */ + + n = (n_bits + 7) / 8; + node = malloc(sizeof(*node) + n); + assert(node); + + node->next = list; + node->n_bits = n_bits; + memcpy(node->key, key, n); + + return node; +} + +static void tlpm_clear(struct tlpm_node *list) +{ + struct tlpm_node *node; + + /* free all entries in @list */ + + while ((node = list)) { + list = list->next; + free(node); + } +} + +static struct tlpm_node *tlpm_match(struct tlpm_node *list, + const uint8_t *key, + size_t n_bits) +{ + struct tlpm_node *best = NULL; + size_t i; + + /* Perform longest prefix-match on @key/@n_bits. That is, iterate all + * entries and match each prefix against @key. Remember the "best" + * entry we find (i.e., the longest prefix that matches) and return it + * to the caller when done. 
+ */ + + for ( ; list; list = list->next) { + for (i = 0; i < n_bits && i < list->n_bits; ++i) { + if ((key[i / 8] & (1 << (7 - i % 8))) != + (list->key[i / 8] & (1 << (7 - i % 8)))) + break; + } + + if (i >= list->n_bits) { + if (!best || i > best->n_bits) + best = list; + } + } + + return best; +} + +static void test_lpm_basic(void) +{ + struct tlpm_node *list = NULL, *t1, *t2; + + /* very basic, static tests to verify tlpm works as expected */ + + assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 8)); + + t1 = list = tlpm_add(list, (uint8_t[]){ 0xff }, 8); + assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8)); + assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16)); + assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0x00 }, 16)); + assert(!tlpm_match(list, (uint8_t[]){ 0x7f }, 8)); + assert(!tlpm_match(list, (uint8_t[]){ 0xfe }, 8)); + assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 7)); + + t2 = list = tlpm_add(list, (uint8_t[]){ 0xff, 0xff }, 16); + assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8)); + assert(t2 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16)); + assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 15)); + assert(!tlpm_match(list, (uint8_t[]){ 0x7f, 0xff }, 16)); + + tlpm_clear(list); +} + +static void test_lpm_order(void) +{ + struct tlpm_node *t1, *t2, *l1 = NULL, *l2 = NULL; + size_t i, j; + + /* Verify the tlpm implementation works correctly regardless of the + * order of entries. Insert a random set of entries into @l1, and copy + * the same data in reverse order into @l2. Then verify a lookup of + * random keys will yield the same result in both sets. + */ + + for (i = 0; i < (1 << 12); ++i) + l1 = tlpm_add(l1, (uint8_t[]){ + rand() % 0xff, + rand() % 0xff, + }, rand() % 16 + 1); + + for (t1 = l1; t1; t1 = t1->next) + l2 = tlpm_add(l2, t1->key, t1->n_bits); + + for (i = 0; i < (1 << 8); ++i) { + uint8_t key[] = { rand() % 0xff, rand() % 0xff }; + + t1 = tlpm_match(l1, key, 16); + t2 = tlpm_match(l2, key, 16); + + assert(!t1 == !t2); + if (t1) { + assert(t1->n_bits == t2->n_bits); + for (j = 0; j < t1->n_bits; ++j) + assert((t1->key[j / 8] & (1 << (7 - j % 8))) == + (t2->key[j / 8] & (1 << (7 - j % 8)))); + } + } + + tlpm_clear(l1); + tlpm_clear(l2); +} + +static void test_lpm_map(int keysize) +{ + size_t i, j, n_matches, n_nodes, n_lookups; + struct tlpm_node *t, *list = NULL; + struct bpf_lpm_trie_key *key; + uint8_t *data, *value; + int r, map; + + /* Compare behavior of tlpm vs. bpf-lpm. Create a randomized set of + * prefixes and insert it into both tlpm and bpf-lpm. Then run some + * randomized lookups and verify both maps return the same result. 
+	 */
+
+	n_matches = 0;
+	n_nodes = 1 << 8;
+	n_lookups = 1 << 16;
+
+	data = alloca(keysize);
+	memset(data, 0, keysize);
+
+	value = alloca(keysize + 1);
+	memset(value, 0, keysize + 1);
+
+	key = alloca(sizeof(*key) + keysize);
+	memset(key, 0, sizeof(*key) + keysize);
+
+	map = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
+			     sizeof(*key) + keysize,
+			     keysize + 1,
+			     4096,
+			     BPF_F_NO_PREALLOC);
+	assert(map >= 0);
+
+	for (i = 0; i < n_nodes; ++i) {
+		for (j = 0; j < keysize; ++j)
+			value[j] = rand() & 0xff;
+		value[keysize] = rand() % (8 * keysize + 1);
+
+		list = tlpm_add(list, value, value[keysize]);
+
+		key->prefixlen = value[keysize];
+		memcpy(key->data, value, keysize);
+		r = bpf_map_update_elem(map, key, value, 0);
+		assert(!r);
+	}
+
+	for (i = 0; i < n_lookups; ++i) {
+		for (j = 0; j < keysize; ++j)
+			data[j] = rand() & 0xff;
+
+		t = tlpm_match(list, data, 8 * keysize);
+
+		key->prefixlen = 8 * keysize;
+		memcpy(key->data, data, keysize);
+		r = bpf_map_lookup_elem(map, key, value);
+		assert(!r || errno == ENOENT);
+		assert(!t == !!r);
+
+		if (t) {
+			++n_matches;
+			assert(t->n_bits == value[keysize]);
+			for (j = 0; j < t->n_bits; ++j)
+				assert((t->key[j / 8] & (1 << (7 - j % 8))) ==
+				       (value[j / 8] & (1 << (7 - j % 8))));
+		}
+	}
+
+	close(map);
+	tlpm_clear(list);
+
+	/* With 255 random nodes in the map, we are pretty likely to match
+	 * something on every lookup. For statistics, use this:
+	 *
+	 * printf("  nodes: %zu\n"
+	 *        "lookups: %zu\n"
+	 *        "matches: %zu\n", n_nodes, n_lookups, n_matches);
+	 */
+}
+
+/* Test the implementation with some 'real world' examples */
+
+static void test_lpm_ipaddr(void)
+{
+	struct bpf_lpm_trie_key *key_ipv4;
+	struct bpf_lpm_trie_key *key_ipv6;
+	size_t key_size_ipv4;
+	size_t key_size_ipv6;
+	int map_fd_ipv4;
+	int map_fd_ipv6;
+	__u64 value;
+
+	key_size_ipv4 = sizeof(*key_ipv4) + sizeof(__u32);
+	key_size_ipv6 = sizeof(*key_ipv6) + sizeof(__u32) * 4;
+	key_ipv4 = alloca(key_size_ipv4);
+	key_ipv6 = alloca(key_size_ipv6);
+
+	map_fd_ipv4 = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
+				     key_size_ipv4, sizeof(value),
+				     100, BPF_F_NO_PREALLOC);
+	assert(map_fd_ipv4 >= 0);
+
+	map_fd_ipv6 = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
+				     key_size_ipv6, sizeof(value),
+				     100, BPF_F_NO_PREALLOC);
+	assert(map_fd_ipv6 >= 0);
+
+	/* Fill in some IPv4 and IPv6 address ranges */
+	value = 1;
+	key_ipv4->prefixlen = 16;
+	inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+	assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 2;
+	key_ipv4->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+	assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 3;
+	key_ipv4->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.128.0", key_ipv4->data);
+	assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 5;
+	key_ipv4->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.1.0", key_ipv4->data);
+	assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 4;
+	key_ipv4->prefixlen = 23;
+	inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+	assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 0xdeadbeef;
+	key_ipv6->prefixlen = 64;
+	inet_pton(AF_INET6, "2a00:1450:4001:814::200e", key_ipv6->data);
+	assert(bpf_map_update_elem(map_fd_ipv6, key_ipv6, &value, 0) == 0);
+
+	/* Set the prefixlen to maximum for lookups */
+	key_ipv4->prefixlen = 32;
+	key_ipv6->prefixlen = 128;
+
+	/* Test some lookups that should come back with a value */
+	inet_pton(AF_INET,
"192.168.128.23", key_ipv4->data); + assert(bpf_map_lookup_elem(map_fd_ipv4, key_ipv4, &value) == 0); + assert(value == 3); + + inet_pton(AF_INET, "192.168.0.1", key_ipv4->data); + assert(bpf_map_lookup_elem(map_fd_ipv4, key_ipv4, &value) == 0); + assert(value == 2); + + inet_pton(AF_INET6, "2a00:1450:4001:814::", key_ipv6->data); + assert(bpf_map_lookup_elem(map_fd_ipv6, key_ipv6, &value) == 0); + assert(value == 0xdeadbeef); + + inet_pton(AF_INET6, "2a00:1450:4001:814::1", key_ipv6->data); + assert(bpf_map_lookup_elem(map_fd_ipv6, key_ipv6, &value) == 0); + assert(value == 0xdeadbeef); + + /* Test some lookups that should not match any entry */ + inet_pton(AF_INET, "10.0.0.1", key_ipv4->data); + assert(bpf_map_lookup_elem(map_fd_ipv4, key_ipv4, &value) == -1 && + errno == ENOENT); + + inet_pton(AF_INET, "11.11.11.11", key_ipv4->data); + assert(bpf_map_lookup_elem(map_fd_ipv4, key_ipv4, &value) == -1 && + errno == ENOENT); + + inet_pton(AF_INET6, "2a00:ffff::", key_ipv6->data); + assert(bpf_map_lookup_elem(map_fd_ipv6, key_ipv6, &value) == -1 && + errno == ENOENT); + + close(map_fd_ipv4); + close(map_fd_ipv6); +} + +int main(void) +{ + struct rlimit limit = { RLIM_INFINITY, RLIM_INFINITY }; + int i, ret; + + /* we want predictable, pseudo random tests */ + srand(0xf00ba1); + + /* allow unlimited locked memory */ + ret = setrlimit(RLIMIT_MEMLOCK, &limit); + if (ret < 0) + perror("Unable to lift memlock rlimit"); + + test_lpm_basic(); + test_lpm_order(); + + /* Test with 8, 16, 24, 32, ... 128 bit prefix length */ + for (i = 1; i <= 16; ++i) + test_lpm_map(i); + + test_lpm_ipaddr(); + + printf("test_lpm: OK\n"); + return 0; +} diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index f2e9b37a4463..f7757f7f6d2b 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8046,78 +8046,6 @@ static struct bpf_test tests[] = { .result = REJECT, .errstr = "variable ctx access var_off=(0x0; 0x4)", }, - { - "bpf_exit with invalid return code. test1", - .insns = { - BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), - BPF_EXIT_INSN(), - }, - .errstr = "R0 has value (0x0; 0xffffffff)", - .result = REJECT, - .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, - }, - { - "bpf_exit with invalid return code. test2", - .insns = { - BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), - BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), - BPF_EXIT_INSN(), - }, - .result = ACCEPT, - .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, - }, - { - "bpf_exit with invalid return code. test3", - .insns = { - BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), - BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 3), - BPF_EXIT_INSN(), - }, - .errstr = "R0 has value (0x0; 0x3)", - .result = REJECT, - .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, - }, - { - "bpf_exit with invalid return code. test4", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }, - .result = ACCEPT, - .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, - }, - { - "bpf_exit with invalid return code. test5", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 2), - BPF_EXIT_INSN(), - }, - .errstr = "R0 has value (0x2; 0x0)", - .result = REJECT, - .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, - }, - { - "bpf_exit with invalid return code. test6", - .insns = { - BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), - BPF_EXIT_INSN(), - }, - .errstr = "R0 is not a known value (ctx)", - .result = REJECT, - .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, - }, - { - "bpf_exit with invalid return code. 
test7", - .insns = { - BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), - BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4), - BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_2), - BPF_EXIT_INSN(), - }, - .errstr = "R0 has unknown scalar value", - .result = REJECT, - .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, - }, }; static int probe_filter_length(const struct bpf_insn *fp) From 685fa51869c6d893978c3325b12777e5b472438c Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 18 Sep 2017 15:30:55 -0400 Subject: [PATCH 0016/1640] UPSTREAM: bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE This is a simple non-recursive delete operation. It prunes paths of empty nodes in the tree, but it does not try to further compress the tree as nodes are removed. Signed-off-by: Craig Gallek Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/lpm_trie.c | 80 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 3925794d8188..324ce250acd8 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -392,10 +392,84 @@ out: return ret; } -static int trie_delete_elem(struct bpf_map *map, void *key) +/* Called from syscall or from eBPF program */ +static int trie_delete_elem(struct bpf_map *map, void *_key) { - /* TODO */ - return -ENOSYS; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key; + struct lpm_trie_node __rcu **trim; + struct lpm_trie_node *node; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Walk the tree looking for an exact key/length match and keeping + * track of where we could begin trimming the tree. The trim-point + * is the sub-tree along the walk consisting of only single-child + * intermediate nodes and ending at a leaf node that we want to + * remove. + */ + trim = &trie->root; + node = rcu_dereference_protected( + trie->root, lockdep_is_held(&trie->lock)); + while (node) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + /* If we hit a node that has more than one child or is a valid + * prefix itself, do not remove it. Reset the root of the trim + * path to its descendant on our path. + */ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM) || + (node->child[0] && node->child[1])) + trim = &node->child[next_bit]; + node = rcu_dereference_protected( + node->child[next_bit], lockdep_is_held(&trie->lock)); + } + + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) { + ret = -ENOENT; + goto out; + } + + trie->n_entries--; + + /* If the node we are removing is not a leaf node, simply mark it + * as intermediate and we are done. + */ + if (rcu_access_pointer(node->child[0]) || + rcu_access_pointer(node->child[1])) { + node->flags |= LPM_TREE_NODE_FLAG_IM; + goto out; + } + + /* trim should now point to the slot holding the start of a path from + * zero or more intermediate nodes to our leaf node for deletion. + */ + while ((node = rcu_dereference_protected( + *trim, lockdep_is_held(&trie->lock)))) { + RCU_INIT_POINTER(*trim, NULL); + trim = rcu_access_pointer(node->child[0]) ? 
+				&node->child[0] :
+				&node->child[1];
+		kfree_rcu(node, rcu);
+	}
+
+out:
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+	return ret;
 }
 
 #define LPM_DATA_SIZE_MAX	256

From 939019e63e964b576e91106c71ea97088f501dc5 Mon Sep 17 00:00:00 2001
From: Craig Gallek
Date: Thu, 21 Sep 2017 18:43:29 -0400
Subject: [PATCH 0017/1640] UPSTREAM: bpf: Optimize lpm trie delete

Before the delete operator was added, this data structure maintained
an invariant that intermediate nodes were only present when necessary
to build the tree. This patch updates the delete operation to
reinstate that invariant by removing unnecessary intermediate nodes
after a node is removed and thus keeping the tree structure at a
minimal size.

Suggested-by: Daniel Mack
Signed-off-by: Craig Gallek
Signed-off-by: David S. Miller
---
 kernel/bpf/lpm_trie.c | 71 ++++++++++++++++++++++++++-----------------
 1 file changed, 43 insertions(+), 28 deletions(-)

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 324ce250acd8..ad7cbb6cbabd 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -397,8 +397,8 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
 	struct bpf_lpm_trie_key *key = _key;
-	struct lpm_trie_node __rcu **trim;
-	struct lpm_trie_node *node;
+	struct lpm_trie_node __rcu **trim, **trim2;
+	struct lpm_trie_node *node, *parent;
 	unsigned long irq_flags;
 	unsigned int next_bit;
 	size_t matchlen = 0;
@@ -410,31 +410,26 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 	raw_spin_lock_irqsave(&trie->lock, irq_flags);
 
 	/* Walk the tree looking for an exact key/length match and keeping
-	 * track of where we could begin trimming the tree. The trim-point
-	 * is the sub-tree along the walk consisting of only single-child
-	 * intermediate nodes and ending at a leaf node that we want to
-	 * remove.
+	 * track of the path we traverse. We will need to know the node
+	 * we wish to delete, and the slot that points to the node we want
+	 * to delete. We may also need to know the node's parent and the
+	 * slot that contains it.
 	 */
 	trim = &trie->root;
-	node = rcu_dereference_protected(
-		trie->root, lockdep_is_held(&trie->lock));
-	while (node) {
+	trim2 = trim;
+	parent = NULL;
+	while ((node = rcu_dereference_protected(
+		       *trim, lockdep_is_held(&trie->lock)))) {
 		matchlen = longest_prefix_match(trie, node, key);
 
 		if (node->prefixlen != matchlen ||
 		    node->prefixlen == key->prefixlen)
 			break;
 
+		parent = node;
+		trim2 = trim;
 		next_bit = extract_bit(key->data, node->prefixlen);
-		/* If we hit a node that has more than one child or is a valid
-		 * prefix itself, do not remove it. Reset the root of the trim
-		 * path to its descendant on our path.
-		 */
-		if (!(node->flags & LPM_TREE_NODE_FLAG_IM) ||
-		    (node->child[0] && node->child[1]))
-			trim = &node->child[next_bit];
-		node = rcu_dereference_protected(
-			node->child[next_bit], lockdep_is_held(&trie->lock));
+		trim = &node->child[next_bit];
 	}
 
 	if (!node || node->prefixlen != key->prefixlen ||
@@ -445,27 +440,47 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 
 	trie->n_entries--;
 
-	/* If the node we are removing is not a leaf node, simply mark it
+	/* If the node we are removing has two children, simply mark it
 	 * as intermediate and we are done.
	 */
-	if (rcu_access_pointer(node->child[0]) ||
+	if (rcu_access_pointer(node->child[0]) &&
 	    rcu_access_pointer(node->child[1])) {
 		node->flags |= LPM_TREE_NODE_FLAG_IM;
 		goto out;
 	}
 
-	/* trim should now point to the slot holding the start of a path from
-	 * zero or more intermediate nodes to our leaf node for deletion.
+	/* If the parent of the node we are about to delete is an intermediate
+	 * node, and the deleted node doesn't have any children, we can delete
+	 * the intermediate parent as well and promote its other child
+	 * up the tree. Doing this maintains the invariant that all
+	 * intermediate nodes have exactly 2 children and that there are no
+	 * unnecessary intermediate nodes in the tree.
 	 */
-	while ((node = rcu_dereference_protected(
-			*trim, lockdep_is_held(&trie->lock)))) {
-		RCU_INIT_POINTER(*trim, NULL);
-		trim = rcu_access_pointer(node->child[0]) ?
-			&node->child[0] :
-			&node->child[1];
+	if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
+	    !node->child[0] && !node->child[1]) {
+		if (node == rcu_access_pointer(parent->child[0]))
+			rcu_assign_pointer(
+				*trim2, rcu_access_pointer(parent->child[1]));
+		else
+			rcu_assign_pointer(
+				*trim2, rcu_access_pointer(parent->child[0]));
+		kfree_rcu(parent, rcu);
 		kfree_rcu(node, rcu);
+		goto out;
 	}
 
+	/* The node we are removing has either zero or one child. If there
+	 * is a child, move it into the removed node's slot then delete
+	 * the node. Otherwise just clear the slot and delete the node.
+	 */
+	if (node->child[0])
+		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
+	else if (node->child[1])
+		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
+	else
+		RCU_INIT_POINTER(*trim, NULL);
+	kfree_rcu(node, rcu);
+
 out:
 	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);

From a7605ec4d01b1c4430f97b66a1b35b7f1eaa53ee Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Mon, 25 Sep 2017 02:25:51 +0200
Subject: [PATCH 0018/1640] BACKPORT: bpf: add meta pointer for direct access

This work enables generic transfer of metadata from XDP into skb. The
basic idea is that we can make use of the fact that the resulting skb
must be linear and already comes with a larger headroom for supporting
bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work
on a similar principle and introduce a small helper bpf_xdp_adjust_meta()
for adjusting a new pointer called xdp->data_meta. Thus, the packet has
a flexible and programmable room for meta data, followed by the actual
packet data. struct xdp_buff is therefore laid out such that we first
point to data_hard_start, then data_meta directly prepended to data
followed by data_end marking the end of the packet. bpf_xdp_adjust_head()
takes into account whether we have meta data already prepended and if so,
memmove()s this along with the given offset provided there's enough room.

xdp->data_meta is optional and programs are not required to use it. The
rationale is that when we process the packet in XDP (e.g. as DoS filter),
we can push further meta data along with it for the XDP_PASS case, and
give the guarantee that a clsact ingress BPF program on the same device
can pick this up for further post-processing. Since we work with skb
there, we can also set skb->mark, skb->priority or other skb meta data
out of BPF, thus having this scratch space generic and programmable
allows for more flexibility than defining a direct 1:1 transfer of
potentially new XDP members into skb (it's also more efficient as we
don't need to initialize/handle each of such new members).
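As a rough usage sketch (an illustration only, not part of this patch:
struct meta_info, the program names and the SEC()/bpf_helpers.h
conventions below are assumptions in the style of the kernel BPF
samples), an XDP program and a clsact ingress program on the same
device could cooperate like this:

  #include <linux/bpf.h>
  #include <linux/pkt_cls.h>
  #include "bpf_helpers.h"

  struct meta_info {
  	__u32 mark;
  };

  SEC("xdp")
  int xdp_store_meta(struct xdp_md *ctx)
  {
  	struct meta_info *meta;
  	void *data;

  	/* Grow the metadata area by sizeof(*meta) bytes in front of
  	 * xdp->data; this fails on drivers without data_meta support.
  	 */
  	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
  		return XDP_PASS;

  	/* Reload the pointers and bounds-check data_meta against data,
  	 * the same way data is checked against data_end.
  	 */
  	data = (void *)(unsigned long)ctx->data;
  	meta = (void *)(unsigned long)ctx->data_meta;
  	if ((void *)(meta + 1) > data)
  		return XDP_PASS;

  	meta->mark = 42;
  	return XDP_PASS;
  }

  SEC("classifier")
  int tc_read_meta(struct __sk_buff *skb)
  {
  	void *data = (void *)(unsigned long)skb->data;
  	struct meta_info *meta;

  	meta = (void *)(unsigned long)skb->data_meta;
  	if ((void *)(meta + 1) > data)
  		return TC_ACT_OK;

  	/* Post-process what XDP stored, e.g. transfer it to skb->mark. */
  	skb->mark = meta->mark;
  	return TC_ACT_OK;
  }

Loaded at XDP and as a clsact ingress classifier on the same device,
the second program would see exactly the bytes the first one prepended.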
The facility also works together with GRO aggregation. The scratch
space at the head of the packet can be a multiple of 4 bytes, up to
32 bytes in size. Drivers not yet supporting xdp->data_meta can simply
be set up with xdp->data_meta as xdp->data + 1, since
bpf_xdp_adjust_meta() will detect this and bail out, such that the
subsequent match against xdp->data for later access is guaranteed to
fail.

The verifier treats xdp->data_meta/xdp->data the same way as we treat
xdp->data/xdp->data_end pointer comparisons. The requirement for doing
the compare against xdp->data is that it hasn't been modified from its
original address we got from ctx access. It may have a range marking
already from prior successful xdp->data/xdp->data_end pointer
comparisons though.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Acked-by: John Fastabend
Signed-off-by: David S. Miller
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c |   1 +
 .../net/ethernet/cavium/thunder/nicvf_main.c  |   1 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |   1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   1 +
 drivers/net/ethernet/mellanox/mlx4/en_rx.c    |   1 +
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   |   1 +
 .../ethernet/netronome/nfp/nfp_net_common.c   |   1 +
 drivers/net/ethernet/qlogic/qede/qede_fp.c    |   1 +
 drivers/net/tun.c                             |   1 +
 drivers/net/virtio_net.c                      |   2 +
 include/linux/bpf.h                           |   1 +
 include/linux/filter.h                        |  21 ++-
 include/linux/skbuff.h                        |  68 ++++++++-
 include/uapi/linux/bpf.h                      |  12 +-
 kernel/bpf/verifier.c                         | 132 +++++++++++++-----
 net/bpf/test_run.c                            |   1 +
 net/core/dev.c                                |  33 ++++-
 net/core/filter.c                             |  76 +++++++++-
 net/core/skbuff.c                             |   2 +
 19 files changed, 310 insertions(+), 47 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index d8f0c837b72c..06ce63c00821 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 	xdp.data_hard_start = *data_ptr - offset;
 	xdp.data = *data_ptr;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = *data_ptr + *len;
 	orig_data = xdp.data;
 	mapping = rx_buf->mapping - bp->rx_dma_offset;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index df1c4ba7e0c9..204736468687 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -541,6 +541,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 	xdp.data_hard_start = page_address(page);
 	xdp.data = (void *)cpu_addr;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + len;
 	orig_data = xdp.data;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 02871e0e2024..22fb4afb137a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2117,6 +2117,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_hard_start = xdp.data -
 					      i40e_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index ca9bdaed31c5..2557fffcb87b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2339,6 +2339,7 @@ static int
ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, if (!skb) { xdp.data = page_address(rx_buffer->page) + rx_buffer->page_offset; + xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index bb0063e851c3..0fbed144c597 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -780,6 +780,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud xdp.data_hard_start = va - frags[0].page_offset; xdp.data = va; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + length; orig_data = xdp.data; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index bf311a3c3e02..ce96230b98ed 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -845,6 +845,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq, return false; xdp.data = va + *rx_headroom; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; xdp.data_hard_start = va; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index bffa25d6dc29..43b1cba913ed 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1614,6 +1614,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start, xdp.data_hard_start = hard_start; xdp.data = data + *off; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = data + *off + *len; orig_data = xdp.data; diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 8a8e1616b79a..c044156b9ba6 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1002,6 +1002,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, xdp.data_hard_start = page_address(bd->data); xdp.data = xdp.data_hard_start + *data_offset; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; /* Queues always have a full reset currently, so for the time diff --git a/drivers/net/tun.c b/drivers/net/tun.c index c125b06f4298..10870ac05157 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1369,6 +1369,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp.data_hard_start = buf; xdp.data = buf + pad; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index f90b95b0c2e8..d4313df996fa 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -551,6 +551,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; xdp.data = xdp.data_hard_start + xdp_headroom; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); @@ -673,6 +674,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, data = page_address(xdp_page) + offset; xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; xdp.data = data + vi->hdr_len; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + (len - vi->hdr_len); act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/include/linux/bpf.h 
b/include/linux/bpf.h
index b9712fb3a320..50012eace318 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -155,6 +155,7 @@ enum bpf_reg_type {
 	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
 	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
 	PTR_TO_STACK,		 /* reg == frame_pointer + offset */
+	PTR_TO_PACKET_META,	 /* skb->data - meta_len */
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 };
diff --git a/include/linux/filter.h b/include/linux/filter.h
index dada9e36521e..c3038118a4c0 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -566,12 +566,14 @@ static inline void bpf_jit_set_header_magic(struct bpf_binary_header *hdr)
 
 struct bpf_skb_data_end {
 	struct qdisc_skb_cb qdisc_cb;
+	void *data_meta;
 	void *data_end;
 };
 
 struct xdp_buff {
 	void *data;
 	void *data_end;
+	void *data_meta;
 	void *data_hard_start;
 };
 
@@ -583,7 +585,8 @@ static inline void bpf_compute_data_end(struct sk_buff *skb)
 	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
 
 	BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
-	cb->data_end = skb->data + skb_headlen(skb);
+	cb->data_meta = skb->data - skb_metadata_len(skb);
+	cb->data_end = skb->data + skb_headlen(skb);
 }
 
 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
@@ -804,8 +807,22 @@ int xdp_do_redirect(struct net_device *dev,
 		    struct bpf_prog *prog);
 void xdp_do_flush_map(void);
 
+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+	xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+	return unlikely(xdp->data_meta > xdp->data);
+}
+
 void bpf_warn_invalid_xdp_action(u32 act);
-void bpf_warn_invalid_xdp_redirect(u32 ifindex);
 
 struct sock *do_sk_redirect_map(struct sk_buff *skb);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3abb6361a173..1dabd92fc251 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -488,8 +488,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
  * the end of the header data, ie. at skb->end.
  */
 struct skb_shared_info {
-	unsigned short	_unused;
-	unsigned char	nr_frags;
+	__u8		__unused;
+	__u8		meta_len;
+	__u8		nr_frags;
 	__u8		tx_flags;
 	unsigned short	gso_size;
 	/* Warning: this field is not always filled in (UFO)! */
@@ -3532,6 +3533,69 @@ static inline ktime_t net_invalid_timestamp(void)
 	return 0;
 }
 
+static inline u8 skb_metadata_len(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->meta_len;
+}
+
+static inline void *skb_metadata_end(const struct sk_buff *skb)
+{
+	return skb_mac_header(skb);
+}
+
+static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
+					  const struct sk_buff *skb_b,
+					  u8 meta_len)
+{
+	const void *a = skb_metadata_end(skb_a);
+	const void *b = skb_metadata_end(skb_b);
+	/* Using more efficient variant than plain call to memcmp().
	 */
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	u64 diffs = 0;
+
+	switch (meta_len) {
+#define __it(x, op) (x -= sizeof(u##op))
+#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
+	case 32: diffs |= __it_diff(a, b, 64);
+	case 24: diffs |= __it_diff(a, b, 64);
+	case 16: diffs |= __it_diff(a, b, 64);
+	case 8: diffs |= __it_diff(a, b, 64);
+		break;
+	case 28: diffs |= __it_diff(a, b, 64);
+	case 20: diffs |= __it_diff(a, b, 64);
+	case 12: diffs |= __it_diff(a, b, 64);
+	case 4: diffs |= __it_diff(a, b, 32);
+		break;
+	}
+	return diffs;
+#else
+	return memcmp(a - meta_len, b - meta_len, meta_len);
+#endif
+}
+
+static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
+					const struct sk_buff *skb_b)
+{
+	u8 len_a = skb_metadata_len(skb_a);
+	u8 len_b = skb_metadata_len(skb_b);
+
+	if (!(len_a | len_b))
+		return false;
+
+	return len_a != len_b ?
+	       true : __skb_metadata_differs(skb_a, skb_b, len_a);
+}
+
+static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
+{
+	skb_shinfo(skb)->meta_len = meta_len;
+}
+
+static inline void skb_metadata_clear(struct sk_buff *skb)
+{
+	skb_metadata_set(skb, 0);
+}
+
 struct sk_buff *skb_clone_sk(struct sk_buff *skb);
 
 #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3064575d843d..5baff234265f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -626,6 +626,12 @@ union bpf_attr {
  *     @key: key to insert/update sock in map
  *     @flags: same flags as map update elem
  *
+ * int bpf_xdp_adjust_meta(xdp_md, delta)
+ *     Adjust the xdp_md.data_meta by delta
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: A positive/negative integer to be added to xdp_md.data_meta
+ *     Return: 0 on success or negative on error
+ *
  * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
  *	Description
  *		This helper is similar to **bpf_skb_load_bytes**\ () in that
@@ -860,7 +866,7 @@ struct __sk_buff {
 	__u32 data_end;
 	__u32 napi_id;
 
-	/* accessed by BPF_PROG_TYPE_sk_skb types */
+	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
 	__u32 local_ip4;	/* Stored in network byte order */
@@ -868,6 +874,9 @@ struct __sk_buff {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	/* ... here. */
+
+	__u32 data_meta;
 };
 
 struct bpf_tunnel_key {
@@ -928,6 +937,7 @@ enum xdp_action {
 struct xdp_md {
 	__u32 data;
 	__u32 data_end;
+	__u32 data_meta;
 };
 
 enum sk_action {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a55e264cdb54..4dff5b5e0dda 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
va_end(args); } +static bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -187,6 +193,7 @@ static const char * const reg_type_str[] = { [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", [PTR_TO_STACK] = "fp", [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", }; @@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state) verbose("(id=%d", reg->id); if (t != SCALAR_VALUE) verbose(",off=%d", reg->off); - if (t == PTR_TO_PACKET) + if (type_is_pkt_pointer(t)) verbose(",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || @@ -613,6 +620,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) __mark_reg_known_zero(regs + regno); } +static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) +{ + return type_is_pkt_pointer(reg->type); +} + +static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) +{ + return reg_is_pkt_pointer(reg) || + reg->type == PTR_TO_PACKET_END; +} + +/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ +static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, + enum bpf_reg_type which) +{ + /* The register can already have a range from prior markings. + * This is fine as long as it hasn't been advanced from its + * origin. + */ + return reg->type == which && + reg->id == 0 && + reg->off == 0 && + tnum_equals_const(reg->var_off, 0); +} + /* Attempts to improve min/max values based on var_off information */ static void __update_reg_bounds(struct bpf_reg_state *reg) { @@ -800,6 +832,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: + case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: case CONST_PTR_TO_MAP: return true; @@ -1161,7 +1194,7 @@ static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = cur_regs(env) + regno; - return reg->type == PTR_TO_PACKET; + return type_is_pkt_pointer(reg->type); } static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, @@ -1229,7 +1262,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, switch (reg->type) { case PTR_TO_PACKET: - /* special case, because of NET_IP_ALIGN */ + case PTR_TO_PACKET_META: + /* Special case, because of NET_IP_ALIGN. Given metadata sits + * right in front, treat it the very same way. + */ return check_pkt_ptr_alignment(reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; @@ -1351,8 +1387,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a - * PTR_TO_PACKET[_END]. In the latter case, we know - * the offset is zero. + * PTR_TO_PACKET[_META,_END]. In the latter + * case, we know the offset is zero. 
*/ if (reg_type == SCALAR_VALUE) mark_reg_unknown(regs, value_regno); @@ -1378,7 +1414,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno, insn_idx); else err = check_stack_read(state, off, size, value_regno); - } else if (reg->type == PTR_TO_PACKET) { + } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; @@ -1529,6 +1565,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (reg->type) { case PTR_TO_PACKET: + case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size); @@ -1561,7 +1598,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && + if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; @@ -1570,7 +1607,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (type != PTR_TO_PACKET && type != expected_type) + if (!type_is_pkt_pointer(type) && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -1597,7 +1635,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (register_is_null(*reg)) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; @@ -1623,7 +1662,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->key_size); else @@ -1639,7 +1678,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->value_size); else @@ -1812,8 +1851,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } -/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, - * so turn them into unknown SCALAR_VALUE. +/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] + * are now invalid, so turn them into unknown SCALAR_VALUE. 
*/ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { @@ -1822,18 +1861,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) int i; for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET || - regs[i].type == PTR_TO_PACKET_END) + if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(regs, i); for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { if (state->stack[i].slot_type[0] != STACK_SPILL) continue; reg = &state->stack[i].spilled_ptr; - if (reg->type != PTR_TO_PACKET && - reg->type != PTR_TO_PACKET_END) - continue; - __mark_reg_unknown(reg); + if (reg_is_pkt_pointer_any(reg)) + __mark_reg_unknown(reg); } } @@ -2387,7 +2423,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; dst_reg->raw = ptr_reg->raw; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ dst_reg->raw = 0; @@ -2446,7 +2482,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; dst_reg->raw = ptr_reg->raw; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) @@ -2970,6 +3006,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) static void find_good_pkt_pointers(struct bpf_verifier_state *state, struct bpf_reg_state *dst_reg, + enum bpf_reg_type type, bool range_right_open) { struct bpf_reg_state *regs = state->regs, *reg; @@ -3040,7 +3077,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
*/ for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + if (regs[i].type == type && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); @@ -3048,8 +3085,8 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, if (state->stack[i].slot_type[0] != STACK_SPILL) continue; reg = &state->stack[i].spilled_ptr; - if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); + if (reg->type == type && reg->id == dst_reg->id) + reg->range = max_t(u16, reg->range, new_range); } } @@ -3415,42 +3452,70 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { /* pkt_data' > pkt_end */ - find_good_pkt_pointers(this_branch, dst_reg, false); + find_good_pkt_pointers(this_branch, dst_reg, + PTR_TO_PACKET, false); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { /* pkt_end > pkt_data' */ - find_good_pkt_pointers(other_branch, &regs[insn->src_reg], true); + find_good_pkt_pointers(other_branch, &regs[insn->src_reg], + PTR_TO_PACKET, true); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { /* pkt_data' < pkt_end */ - find_good_pkt_pointers(other_branch, dst_reg, true); + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET, + true); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { /* pkt_end < pkt_data' */ - find_good_pkt_pointers(this_branch, &regs[insn->src_reg], false); + find_good_pkt_pointers(this_branch, &regs[insn->src_reg], + PTR_TO_PACKET, false); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { /* pkt_data' >= pkt_end */ - find_good_pkt_pointers(this_branch, dst_reg, true); + find_good_pkt_pointers(this_branch, dst_reg, + PTR_TO_PACKET, true); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { /* pkt_end >= pkt_data' */ - find_good_pkt_pointers(other_branch, &regs[insn->src_reg], false); + find_good_pkt_pointers(other_branch, &regs[insn->src_reg], + PTR_TO_PACKET, false); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { /* pkt_data' <= pkt_end */ - find_good_pkt_pointers(other_branch, dst_reg, false); + find_good_pkt_pointers(other_branch, dst_reg, + PTR_TO_PACKET, false); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { /* pkt_end <= pkt_data' */ - find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true); + find_good_pkt_pointers(this_branch, &regs[insn->src_reg], + PTR_TO_PACKET, true); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(this_branch, dst_reg, + PTR_TO_PACKET_META, false); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(&regs[insn->src_reg],
PTR_TO_PACKET)) { + find_good_pkt_pointers(other_branch, dst_reg, + PTR_TO_PACKET_META, false); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(other_branch, &regs[insn->src_reg], + PTR_TO_PACKET_META, false); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(this_branch, &regs[insn->src_reg], + PTR_TO_PACKET_META, false); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; @@ -3884,8 +3949,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; /* Check our ids match any regs they're supposed to */ return check_ids(rold->id, rcur->id, idmap); + case PTR_TO_PACKET_META: case PTR_TO_PACKET: - if (rcur->type != PTR_TO_PACKET) + if (rcur->type != rold->type) return false; /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index a8736c68fa14..0e314d5feef4 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -174,6 +174,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, xdp.data_hard_start = data; xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; retval = bpf_test_run(prog, &xdp, repeat, &duration); diff --git a/net/core/dev.c b/net/core/dev.c index 9a0f4c34c632..ee44c889e03d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3945,8 +3945,8 @@ drop: static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + u32 metalen, act = XDP_DROP; struct xdp_buff xdp; - u32 act = XDP_DROP; void *orig_data; int hlen, off; u32 mac_len; @@ -3957,8 +3957,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_cloned(skb)) return XDP_PASS; - if (skb_linearize(skb)) - goto do_drop; + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (troom > 0 && __skb_linearize(skb)) + goto do_drop; + } /* The XDP program wants to see the packet starting at the MAC * header.
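To make the new metadata area concrete: an XDP program grows it in front of the packet with bpf_xdp_adjust_meta() (added below in net/core/filter.c) and must bounds-check data_meta against data, which is exactly the PTR_TO_PACKET_META comparison support added above. A sketch under libbpf conventions; the stored value and names are illustrative:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_store_meta(struct xdp_md *ctx)
{
	__u32 *meta;

	/* grow the metadata area by 4 bytes; the helper requires the
	 * result to stay word-aligned, within 32 bytes, and between
	 * data_hard_start and data
	 */
	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
		return XDP_PASS;

	meta = (void *)(long)ctx->data_meta;
	if ((void *)(meta + 1) > (void *)(long)ctx->data)
		return XDP_PASS;	/* verifier-mandated bounds check */

	*meta = 0xcafe;	/* e.g. a flow mark, read later from skb data_meta */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";

A tc classifier can then read the same bytes between __sk_buff->data_meta and __sk_buff->data, which the skb_metadata_set() call below preserves across XDP_PASS.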
@@ -3966,6 +3983,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; xdp.data = skb->data - mac_len; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + hlen; xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; @@ -3983,10 +4001,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); - /* fall through */ - case XDP_PASS: break; - + case XDP_PASS: + metalen = xdp.data - xdp.data_meta; + if (metalen) + skb_metadata_set(skb, metalen); + break; default: bpf_warn_invalid_xdp_action(act); /* fall through */ @@ -4812,6 +4832,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); diff --git a/net/core/filter.c b/net/core/filter.c index 9ac44ebb694f..3e147cdd8dcf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2536,14 +2536,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) +{ + return xdp_data_meta_unsupported(xdp) ? 0 : + xdp->data - xdp->data_meta; +} + BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { + unsigned long metalen = xdp_get_metalen(xdp); + void *data_start = xdp->data_hard_start + metalen; void *data = xdp->data + offset; - if (unlikely(data < xdp->data_hard_start || + if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; + if (metalen) + memmove(xdp->data_meta + offset, + xdp->data_meta, metalen); + xdp->data_meta += offset; xdp->data = data; return 0; @@ -2557,6 +2569,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) +{ + void *meta = xdp->data_meta + offset; + unsigned long metalen = xdp->data - meta; + + if (xdp_data_meta_unsupported(xdp)) + return -ENOTSUPP; + if (unlikely(meta < xdp->data_hard_start || + meta > xdp->data)) + return -EINVAL; + if (unlikely((metalen & (sizeof(__u32) - 1)) || + (metalen > 32))) + return -EACCES; + + xdp->data_meta = meta; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { + .func = bpf_xdp_adjust_meta, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static int __bpf_tx_xdp(struct net_device *dev, struct bpf_map *map, struct xdp_buff *xdp, @@ -2781,7 +2820,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head) + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta) return true; return false; @@ -3391,6 +3431,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; + case BPF_FUNC_xdp_adjust_meta: + return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: @@ -3521,6 +3563,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type case bpf_ctx_range_till(struct __sk_buff, remote_ip4, 
remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; @@ -3547,6 +3590,7 @@ static bool sk_filter_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; @@ -3571,6 +3615,7 @@ static bool lwt_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, data_meta): return false; } @@ -3689,6 +3734,9 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; + case bpf_ctx_range(struct __sk_buff, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3722,6 +3770,9 @@ static bool xdp_is_valid_access(int off, int size, case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; + case offsetof(struct xdp_md, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3780,6 +3831,12 @@ static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + return false; + } + if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_index): @@ -3792,7 +3849,6 @@ static bool sk_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, mark): - case bpf_ctx_range(struct __sk_buff, tc_classid): return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; @@ -3950,6 +4006,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, offsetof(struct sk_buff, data)); break; + case offsetof(struct __sk_buff, data_meta): + off = si->off; + off -= offsetof(struct __sk_buff, data_meta); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_meta); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); @@ -4198,6 +4263,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; + case offsetof(struct xdp_md, data_meta): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data_meta)); + break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cebd33caaa00..4baf28959bd1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1535,6 +1535,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + skb_metadata_clear(skb); + /* It is not generally safe to 
change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). From 638b1bb26720e8d96cc81c6fff365dd3e82302ae Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 26 Sep 2017 16:35:13 +0100 Subject: [PATCH 0019/1640] UPSTREAM: bpf/verifier: improve disassembly of BPF_END instructions print_bpf_insn() was treating all BPF_ALU[64] the same, but BPF_END has a different structure: it has a size in insn->imm (even if it's BPF_X) and uses the BPF_SRC (X or K) to indicate which endianness to use. So it needs different code to print it. Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4dff5b5e0dda..aaa128427960 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -333,26 +333,40 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; +static void print_bpf_end_insn(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", + insn->imm, insn->dst_reg); +} + static void print_bpf_insn(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { - if (BPF_SRC(insn->code) == BPF_X) + if (BPF_OP(insn->code) == BPF_END) { + if (class == BPF_ALU64) + verbose("BUG_alu64_%02x\n", insn->code); + else + print_bpf_end_insn(env, insn); + } else if (BPF_SRC(insn->code) == BPF_X) { verbose("(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); - else + } else { verbose("(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->imm); + } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", From f7a990998ff438ba10afef17aa4682bacd69fca8 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 26 Sep 2017 16:35:29 +0100 Subject: [PATCH 0020/1640] UPSTREAM: bpf/verifier: improve disassembly of BPF_NEG instructions BPF_NEG takes only one operand, unlike the bulk of BPF_ALU[64] which are compound-assignments. So give it its own format in print_bpf_insn(). Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aaa128427960..7352e5b398e5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -352,6 +352,11 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, verbose("BUG_alu64_%02x\n", insn->code); else print_bpf_end_insn(env, insn); + } else if (BPF_OP(insn->code) == BPF_NEG) { + verbose("(%02x) r%d = %s-r%d\n", + insn->code, insn->dst_reg, + class == BPF_ALU ? "(u32) " : "", + insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose("(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ?
"(u32) " : "", From fb27e98a1a76f222440c6ba6cf5aa0d11ecf9ad1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:52 -0700 Subject: [PATCH 0021/1640] BACKPORT: bpf: Add name, load_time, uid and map_ids to bpf_prog_info The patch adds name and load_time to struct bpf_prog_aux. They are also exported to bpf_prog_info. The bpf_prog's name is passed by userspace during BPF_PROG_LOAD. The kernel only stores the first (BPF_OBJ_NAME_LEN - 1) bytes and the name stored in the kernel is always \0 terminated. The kernel will reject a name that contains characters other than isalnum() and '_'. It will also reject a name that is not null terminated. The existing 'user->uid' of the bpf_prog_aux is also exported to the bpf_prog_info as created_by_uid. The existing 'used_maps' of the bpf_prog_aux is exported to the newly added members 'nr_map_ids' and 'map_ids' of the bpf_prog_info. On the input, nr_map_ids tells how big the userspace's map_ids buffer is. On the output, nr_map_ids tells the exact user_map_cnt and it will only copy as many ids as the userspace's map_ids buffer allows. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 8 +++++++ kernel/bpf/syscall.c | 51 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50012eace318..6b5c1da3c0cb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -205,6 +205,8 @@ struct bpf_prog_aux { struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; + u64 load_time; /* ns since boottime */ + u8 name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY void *security; #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5baff234265f..6535b91a8cbb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -212,6 +212,8 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +#define BPF_OBJ_NAME_LEN 16U + /* Flags for accessing BPF object */ #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) @@ -251,6 +253,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; + __u8 prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -955,6 +958,11 @@ struct bpf_prog_info { __u32 xlated_prog_len; __aligned_u64 jited_prog_insns; __aligned_u64 xlated_prog_insns; + __u64 load_time; /* ns since boottime */ + __u32 created_by_uid; + __u32 nr_map_ids; + __aligned_u64 map_ids; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3ae2ea263613..a7788a58c444 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -353,6 +356,30 @@ int bpf_get_file_flag(int flags) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL +/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. + * Return 0 on success and < 0 on error.
+ */ +static int bpf_obj_name_cpy(char *dst, const char *src) +{ + const char *end = src + BPF_OBJ_NAME_LEN; + + /* Copy all isalnum() and '_' char */ + while (src < end && *src) { + if (!isalnum(*src) && *src != '_') + return -EINVAL; + *dst++ = *src++; + } + + /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + if (src == end) + return -EINVAL; + + /* '\0' terminates dst */ + *dst = 0; + + return 0; +} + #define BPF_MAP_CREATE_LAST_FIELD numa_node /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -1070,7 +1097,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_flags +#define BPF_PROG_LOAD_LAST_FIELD prog_name static int bpf_prog_load(union bpf_attr *attr) { @@ -1138,6 +1165,11 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_prog; + prog->aux->load_time = ktime_get_boot_ns(); + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); + if (err) + goto free_prog; + /* run eBPF verifier */ err = bpf_check(&prog, attr); if (err < 0) @@ -1491,8 +1523,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.type = prog->type; info.id = prog->aux->id; + info.load_time = prog->aux->load_time; + info.created_by_uid = from_kuid_munged(current_user_ns(), + prog->aux->user->uid); memcpy(info.tag, prog->tag, sizeof(prog->tag)); + memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + + ulen = info.nr_map_ids; + info.nr_map_ids = prog->aux->used_map_cnt; + ulen = min_t(u32, info.nr_map_ids, ulen); + if (ulen) { + u32 *user_map_ids = (u32 *)info.map_ids; + u32 i; + + for (i = 0; i < ulen; i++) + if (put_user(prog->aux->used_maps[i]->id, + &user_map_ids[i])) + return -EFAULT; + } if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; From 881ef7b0a04e2ff2529a6853928487d3470e472c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:53 -0700 Subject: [PATCH 0022/1640] BACKPORT: bpf: Add map_name to bpf_map_info This patch allows userspace to specify a name for a map during BPF_MAP_CREATE. The map's name can later be exported to user space via BPF_OBJ_GET_INFO_BY_FD. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 7 ++++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6b5c1da3c0cb..0bdfa47a7c3b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -74,6 +74,7 @@ struct bpf_map { atomic_t refcnt; atomic_t usercnt; struct work_struct work; + u8 name[BPF_OBJ_NAME_LEN]; }; /* function argument constraints */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6535b91a8cbb..f6940631d443 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -231,6 +231,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). 
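The same bpf_obj_name_cpy() check now governs both prog and map names. A hypothetical userspace mirror of the rule, handy for validating a name before BPF_PROG_LOAD or BPF_MAP_CREATE (this is a sketch, not kernel code):

#include <ctype.h>

#define BPF_OBJ_NAME_LEN 16U	/* as in the uapi header above */

/* mirrors bpf_obj_name_cpy(): only [A-Za-z0-9_] characters, and a
 * '\0' must occur within BPF_OBJ_NAME_LEN bytes
 */
static int obj_name_ok(const char *name)
{
	const char *end = name + BPF_OBJ_NAME_LEN;

	while (name < end && *name) {
		if (!isalnum((unsigned char)*name) && *name != '_')
			return 0;	/* e.g. "my-prog" is rejected */
		name++;
	}
	return name < end;	/* a full 16 chars with no '\0' is rejected */
}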
*/ + __u8 map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -972,6 +973,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a7788a58c444..878da36e8426 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -380,7 +380,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD numa_node +#define BPF_MAP_CREATE_LAST_FIELD map_name /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -407,6 +407,10 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = bpf_obj_name_cpy(map->name, attr->map_name); + if (err) + goto free_map_nouncharge; + atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); @@ -1596,6 +1600,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; + memcpy(info.name, map->name, sizeof(map->name)); if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) From c2adb1f2475dd9938899d273be7ffdb751c55616 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 29 Sep 2017 10:52:17 -0700 Subject: [PATCH 0023/1640] UPSTREAM: bpf: Fix compiler warning on info.map_ids for 32bit platform This patch uses u64_to_user_ptr() to cast info.map_ids to a userspace ptr. It also tags the user_map_ids with '__user' for sparse checking. Fixes: cb4d2b3f03d8 ("bpf: Add name, load_time, uid and map_ids to bpf_prog_info") Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 878da36e8426..5fb7b7c11cde 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1538,7 +1538,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.nr_map_ids = prog->aux->used_map_cnt; ulen = min_t(u32, info.nr_map_ids, ulen); if (ulen) { - u32 *user_map_ids = (u32 *)info.map_ids; + u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); u32 i; for (i = 0; i < ulen; i++) From a3535598eb4adc7c2452fe861ffdc1c8ae9a2eae Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:22 -0700 Subject: [PATCH 0024/1640] BACKPORT: bpf: introduce BPF_PROG_QUERY command Introduce the BPF_PROG_QUERY command to retrieve either the set of programs attached to a given cgroup, or the set of effective programs that will execute for events within the cgroup. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S.
Miller --- include/linux/bpf-cgroup.h | 4 ++++ include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 13 +++++++++++ kernel/bpf/cgroup.c | 46 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 38 +++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 10 +++++++++ 7 files changed, 148 insertions(+) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 540c44fab023..87a7db9feb38 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -45,12 +45,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr); /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0bdfa47a7c3b..6ac21b352609 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -282,6 +282,9 @@ struct bpf_prog_array { struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt); void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, struct bpf_prog *old_prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f6940631d443..d7a52acc7ea8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -93,6 +93,7 @@ enum bpf_cmd { BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, }; enum bpf_map_type { @@ -212,6 +213,9 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +/* flags for BPF_PROG_QUERY */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + #define BPF_OBJ_NAME_LEN 16U /* Flags for accessing BPF object */ @@ -296,6 +300,15 @@ union bpf_attr { __u32 info_len; __aligned_u64 info; } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + } query; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 3a7964a703aa..09ef184bc46c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -383,6 +383,52 @@ cleanup: return err; } +/* Must be called with cgroup_mutex held to avoid races. 
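For context, the expected userspace calling convention for this query is two-step: learn the count first, then retry with a large enough buffer. A hypothetical sketch using a raw bpf(2) call; the attach type, wrapper and error handling are illustrative:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int query_cgroup_progs(int cgroup_fd, __u32 *ids, __u32 *cnt)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.query.target_fd = cgroup_fd;
	attr.query.attach_type = BPF_CGROUP_INET_INGRESS;
	attr.query.prog_ids = (__u64)(unsigned long)ids;
	attr.query.prog_cnt = *cnt;	/* capacity of ids[] on input */

	/* the kernel writes attach_flags and the real program count back
	 * into attr; -ENOSPC (errno ENOSPC) means ids[] was too small
	 */
	if (syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr)))
		return -1;
	*cnt = attr.query.prog_cnt;
	return 0;
}

Passing prog_cnt == 0 (or a NULL prog_ids) performs the count-and-flags-only first step; setting BPF_F_QUERY_EFFECTIVE in query_flags selects the effective program array instead of the attached list.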
*/ +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + enum bpf_attach_type type = attr->query.attach_type; + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + int cnt, ret = 0, i; + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) + cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + else + cnt = prog_list_length(progs); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + /* return early if user requested only program count + flags */ + return 0; + if (attr->query.prog_cnt < cnt) { + cnt = attr->query.prog_cnt; + ret = -ENOSPC; + } + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], + prog_ids, cnt); + } else { + struct bpf_prog_list *pl; + u32 id; + + i = 0; + list_for_each_entry(pl, progs, node) { + id = pl->prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + } + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4d34ab12d0df..1e202f403c56 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1568,6 +1568,44 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +{ + struct bpf_prog **prog; + u32 cnt = 0; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) + cnt++; + rcu_read_unlock(); + return cnt; +} + +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt) +{ + struct bpf_prog **prog; + u32 i = 0, id; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) { + id = (*prog)->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) { + rcu_read_unlock(); + return -EFAULT; + } + if (++i == cnt) { + prog++; + break; + } + } + rcu_read_unlock(); + if (*prog) + return -ENOSPC; + return 0; +} + void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, struct bpf_prog *old_prog) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5fb7b7c11cde..2bbf932bcd84 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1383,6 +1383,37 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } +#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt + +static int bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (CHECK_ATTR(BPF_PROG_QUERY)) + return -EINVAL; + if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) + return -EINVAL; + + switch (attr->query.attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: + break; + default: + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + ret = cgroup_bpf_query(cgrp, attr, uattr); + cgroup_put(cgrp); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -1692,6 +1723,9 @@ 
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_DETACH: err = bpf_prog_detach(&attr); break; + case BPF_PROG_QUERY: + err = bpf_prog_query(&attr, uattr); + break; #endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 37784813af85..604773a53d8b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6125,4 +6125,14 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, mutex_unlock(&cgroup_mutex); return ret; } +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_query(cgrp, attr, uattr); + mutex_unlock(&cgroup_mutex); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ From 2fc4c258ab3c1680ff1e8102b899c591a6389532 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:23 -0700 Subject: [PATCH 0025/1640] BACKPORT: bpf: enforce return code for cgroup-bpf programs With the addition of tnum logic, the verifier got smart enough that we can enforce return codes at program load time. For now, do so for cgroup-bpf program types. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 40 ++++++++++++ tools/testing/selftests/bpf/test_verifier.c | 72 +++++++++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7352e5b398e5..56680eac48fb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3668,6 +3668,43 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } +static int check_return_code(struct bpf_verifier_env *env) +{ + struct bpf_reg_state *reg; + struct tnum range = tnum_range(0, 1); + + switch (env->prog->type) { + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_SOCK_OPS: + break; + default: + return 0; + } + + reg = &env->cur_state->regs[BPF_REG_0]; + if (reg->type != SCALAR_VALUE) { + verbose("At program exit the register R0 is not a known value (%s)\n", + reg_type_str[reg->type]); + return -EINVAL; + } + + if (!tnum_in(range, reg->var_off)) { + verbose("At program exit the register R0 "); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("has value %s", tn_buf); + } else { + verbose("has unknown scalar value"); + } + verbose(" should have been 0 or 1\n"); + return -EINVAL; + } + return 0; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -4507,6 +4544,9 @@ static int do_check(struct bpf_verifier_env *env) return -EACCES; } + err = check_return_code(env); + if (err) + return err; process_bpf_exit: err = pop_stack(env, &env->prev_insn_idx, &env->insn_idx); if (err < 0) { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index f7757f7f6d2b..f2e9b37a4463 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8046,6 +8046,78 @@ static struct bpf_test tests[] = { .result = REJECT, .errstr = "variable ctx access var_off=(0x0; 0x4)", }, + { + "bpf_exit with invalid return code.
test1", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .errstr = "R0 has value (0x0; 0xffffffff)", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "bpf_exit with invalid return code. test2", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "bpf_exit with invalid return code. test3", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 3), + BPF_EXIT_INSN(), + }, + .errstr = "R0 has value (0x0; 0x3)", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "bpf_exit with invalid return code. test4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "bpf_exit with invalid return code. test5", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .errstr = "R0 has value (0x2; 0x0)", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "bpf_exit with invalid return code. test6", + .insns = { + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .errstr = "R0 is not a known value (ctx)", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "bpf_exit with invalid return code. test7", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4), + BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .errstr = "R0 has unknown scalar value", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, }; static int probe_filter_length(const struct bpf_insn *fp) From 456de77985fcfc0bdd4c2c5c555d16fee560bbad Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:19 -0700 Subject: [PATCH 0026/1640] BACKPORT: bpf: perf event change needed for subsequent bpf helpers This patch does not impact existing functionality. It contains the changes in the perf event area needed for the subsequent bpf_perf_event_read_value and bpf_perf_prog_read_value helpers. Change-Id: I066312fce9ebb0185b02ce6904e057d728473f90 Signed-off-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Signed-off-by: David S.
Miller --- include/linux/perf_event.h | 7 +++++-- kernel/bpf/arraymap.c | 2 +- kernel/events/core.c | 16 ++++++++++++++-- kernel/trace/bpf_trace.c | 2 +- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c132338b59cb..be4bdfea9c05 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -841,6 +841,7 @@ struct perf_output_handle { struct bpf_perf_event_data_kern { struct pt_regs *regs; struct perf_sample_data *data; + struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF @@ -919,7 +920,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); -int perf_event_read_local(struct perf_event *event, u64 *value); +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -1366,7 +1368,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event * { return ERR_PTR(-EINVAL); } -static inline int perf_event_read_local(struct perf_event *event, u64 *value) +static inline int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { return -EINVAL; } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index aede91834b7c..fc0193e150cd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -536,7 +536,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - if (perf_event_read_local(event, &value) == -EOPNOTSUPP) + if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); diff --git a/kernel/events/core.c b/kernel/events/core.c index d3497bdd3413..08ed6a2d3a23 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3841,12 +3841,15 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; int local_cpu = smp_processor_id(); bool readable = cpumask_test_cpu(local_cpu, &event->readable_on_cpus); + u64 now; + /* * Disabling interrupts avoids all counter scheduling (context * switches, timer based rotation and IPIs). @@ -3883,13 +3886,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + now = event->shadow_ctx_time + perf_clock(); + if (enabled) + *enabled = now - event->tstamp_enabled; /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). 
*/ - if (event->oncpu == smp_processor_id() || readable) + if (event->oncpu == smp_processor_id() || readable) { event->pmu->read(event); + if (running) + *running = now - event->tstamp_running; + } else if (running) { + *running = event->total_time_running; + } *value = local64_read(&event->count); out: @@ -8458,6 +8469,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7ddfb056693b..59a38d582d5d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -290,7 +290,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value); + err = perf_event_read_local(ee->event, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi From 5cd5c6f4b2c1b478374c8e1e29953b81acdbc9c7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:20 -0700 Subject: [PATCH 0027/1640] BACKPORT: bpf: add helper bpf_perf_event_read_value for perf event array map Hardware pmu counters are limited resources. When more pmu-based perf events are opened than there are available counters, the kernel will multiplex these events so each event gets a certain percentage (but not 100%) of the pmu time. When multiplexing happens, the number of samples or the counter value will not match what a run without multiplexing would produce, which makes comparisons between different runs difficult. Typically, the number of samples or the counter value should be normalized before comparing to other experiments. The typical normalization is done like: normalized_num_samples = num_samples * time_enabled / time_running normalized_counter_value = counter_value * time_enabled / time_running where time_enabled is the time enabled for the event and time_running is the time running for the event since the last normalization. This patch adds the helper bpf_perf_event_read_value for kprobe-based programs using a perf event array map, to read the perf counter and the enabled/running time. The enabled/running time is accumulated since the perf event open. To obtain the scaling factor between two bpf invocations, users can use cpu_id as the key (which is typical for the perf array usage model) to remember the previous value and do the calculation inside the bpf program. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller
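A sketch of that per-cpu normalization from a kprobe program, assuming classic samples/bpf-style map definitions and libbpf helpers; the map names, kprobe target and bpf_printk() output are illustrative:

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>

struct bpf_map_def SEC("maps") counters = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,	/* perf fds set by userspace */
	.key_size = sizeof(int),
	.value_size = sizeof(__u32),
	.max_entries = 64,
};

struct bpf_map_def SEC("maps") prev = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(struct bpf_perf_event_value),
	.max_entries = 64,
};

SEC("kprobe/finish_task_switch")
int count(struct pt_regs *ctx)
{
	__u32 key = bpf_get_smp_processor_id();
	struct bpf_perf_event_value cur, *last;

	if (bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU,
				      &cur, sizeof(cur)))
		return 0;

	last = bpf_map_lookup_elem(&prev, &key);
	if (last && cur.running > last->running) {
		/* counter delta scaled by the enabled/running deltas, i.e.
		 * the normalization formula from the changelog above
		 */
		__u64 norm = (cur.counter - last->counter) *
			     (cur.enabled - last->enabled) /
			     (cur.running - last->running);
		bpf_printk("normalized delta: %llu\n", norm);
	}
	bpf_map_update_elem(&prev, &key, &cur, BPF_ANY);
	return 0;
}

char _license[] SEC("license") = "GPL";	/* the helper is gpl_only */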
--- include/uapi/linux/bpf.h | 18 +++++++++++++++- kernel/bpf/verifier.c | 4 +++- kernel/trace/bpf_trace.c | 45 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 61 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d7a52acc7ea8..5e0fcc1dcd27 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -649,6 +649,14 @@ union bpf_attr { * @delta: A positive/negative integer to be added to xdp_md.data_meta * Return: 0 on success or negative on error * + * int bpf_perf_event_read_value(map, flags, buf, buf_size) + * read perf event counter value and perf event enabled/running time + * @map: pointer to perf_event_array map + * @flags: index of event in the map or bitmask flags + * @buf: buf to fill + * @buf_size: size of the buf + * Return: 0 on success or negative error code + * * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that @@ -843,7 +851,9 @@ enum bpf_func_id { #define BPF_F_ZERO_CSUM_TX (1ULL << 1) #define BPF_F_DONT_FRAGMENT (1ULL << 2) -/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */ +/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and + * BPF_FUNC_perf_event_read_value flags. + */ #define BPF_F_INDEX_MASK 0xffffffffULL #define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK /* BPF_FUNC_perf_event_output for sk_buff input context. */ @@ -1041,4 +1051,10 @@ enum { #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ #define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +struct bpf_perf_event_value { + __u64 counter; + __u64 enabled; + __u64 running; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56680eac48fb..291dd4b3090a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1774,7 +1774,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read && - func_id != BPF_FUNC_perf_event_output) + func_id != BPF_FUNC_perf_event_output && + func_id != BPF_FUNC_perf_event_read_value) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -1817,6 +1818,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: + case BPF_FUNC_perf_event_read_value: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 59a38d582d5d..c12c910cdf06 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -270,14 +270,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +static __always_inline int +get_map_perf_counter(struct bpf_map *map, u64 flags, + u64 *value, u64 *enabled, u64 *running) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; - u64 value = 0; - int err; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -290,7 +290,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value, NULL, NULL); + return perf_event_read_local(ee->event, value, enabled, running); +}
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +{ + u64 value = 0; + int err; + + err = get_map_perf_counter(map, flags, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi @@ -308,6 +316,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_event_read_value_proto = { + .func = bpf_perf_event_read_value, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); static __always_inline u64 @@ -521,6 +556,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; + case BPF_FUNC_perf_event_read_value: + return &bpf_perf_event_read_value_proto; default: return tracing_func_proto(func_id); } From ed7bb028f010877562b75eaae3dbd9cf040334a9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:22 -0700 Subject: [PATCH 0028/1640] BACKPORT: bpf: add helper bpf_perf_prog_read_value This patch adds the helper bpf_perf_prog_read_value for perf-event-based bpf programs, to read the event counter and enabled/running time. The enabled/running time is accumulated since the perf event open. The typical use case for a perf-event-based bpf program is to attach itself to a single event. In such cases, if it is desirable to get the scaling factor between two bpf invocations, users can save the time values in a map, and use the value from the map and the current value to calculate the scaling factor. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller
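On the program side, a companion sketch of the new helper, again libbpf-style with illustrative names; userspace is assumed to have attached the program to a perf event:

#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <bpf/bpf_helpers.h>

SEC("perf_event")
int on_sample(struct bpf_perf_event_data *ctx)
{
	struct bpf_perf_event_value v;

	/* reads the counter of the very event this program is attached
	 * to, via the ctx->event pointer stashed by bpf_overflow_handler()
	 */
	if (bpf_perf_prog_read_value(ctx, &v, sizeof(v)))
		return 0;

	/* v.enabled and v.running accumulate since the perf event open;
	 * deltas between invocations give the scaling factor
	 */
	bpf_printk("counter %llu\n", v.counter);
	return 0;
}

char _license[] SEC("license") = "GPL";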
--- include/uapi/linux/bpf.h | 7 +++++++ kernel/trace/bpf_trace.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5e0fcc1dcd27..54ead9483c68 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -657,6 +657,13 @@ union bpf_attr { * @buf_size: size of the buf * Return: 0 on success or negative error code * + * int bpf_perf_prog_read_value(ctx, buf, buf_size) + * read perf prog attached perf event counter and enabled/running time + * @ctx: pointer to ctx + * @buf: buf to fill + * @buf_size: size of the buf + * Return: 0 on success or negative error code + * * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c12c910cdf06..82aba5a93269 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -635,6 +635,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { + .func = bpf_perf_prog_read_value_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -642,6 +668,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_perf_prog_read_value: + return &bpf_perf_prog_read_value_proto_tp; default: return tracing_func_proto(func_id); } From 76f65a6c93a6eefaff8c8f9036dace19a042f24d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 5 Oct 2017 21:52:11 -0700 Subject: [PATCH 0029/1640] UPSTREAM: bpf: Change bpf_obj_name_cpy() to better ensure map's name is init by 0 During get_info_by_fd, the prog/map name is memcpy-ed. This depends on prog->aux->name and map->name being zero initialized. For bpf_prog_aux it is easy to guarantee that aux->name is zero initialized. The name in bpf_map may be harder to guarantee in the future as new map types are added. Hence, this patch makes bpf_obj_name_cpy() always zero-initialize the prog/map name. Suggested-by: Daniel Borkmann Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- kernel/bpf/syscall.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2bbf932bcd84..9a78683b803d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -363,6 +363,8 @@ static int bpf_obj_name_cpy(char *dst, const char *src) { const char *end = src + BPF_OBJ_NAME_LEN; + memset(dst, 0, BPF_OBJ_NAME_LEN); + /* Copy all isalnum() and '_' char */ while (src < end && *src) { if (!isalnum(*src) && *src != '_') @@ -374,9 +376,6 @@ static int bpf_obj_name_cpy(char *dst, const char *src) if (src == end) return -EINVAL; - /* '\0' terminates dst */ - *dst = 0; - return 0; } From f3b5e7e17a8befd09384282e2d28c5a496ce1f55 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 5 Oct 2017 21:52:12 -0700 Subject: [PATCH 0030/1640] BACKPORT: bpf: Use char in prog and map name Instead of u8, use char for prog and map name. It avoids userspace tools getting compiler signedness warnings. The bpf_prog_aux, bpf_map, bpf_attr, bpf_prog_info and bpf_map_info are changed. Signed-off-by: Martin KaFai Lau Cc: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 ++-- include/uapi/linux/bpf.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6ac21b352609..2ae78e0af744 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -74,7 +74,7 @@ struct bpf_map { atomic_t refcnt; atomic_t usercnt; struct work_struct work; - u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; }; /* function argument constraints */ @@ -207,7 +207,7 @@ struct bpf_prog_aux { struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ - u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY void *security; #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54ead9483c68..9251902354eb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -235,7 +235,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ - __u8 map_name[BPF_OBJ_NAME_LEN]; + char map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -258,7 +258,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; - __u8 prog_name[BPF_OBJ_NAME_LEN]; + char prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -993,7 +993,7 @@ struct bpf_prog_info { __u32 created_by_uid; __u32 nr_map_ids; __aligned_u64 map_ids; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { @@ -1003,7 +1003,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops From 864098c95ec8b983dbe82fe9d181978820a954dc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 5 Oct 2017 21:52:13 -0700 Subject: [PATCH 0031/1640] UPSTREAM: bpf: Append prog->aux->name in bpf_get_prog_name() This patch makes the bpf_prog's name available in kallsyms. The new format is bpf_prog_tag[_name].
Sample kallsyms from running selftests/bpf/test_progs: [root@arch-fb-vm1 ~]# egrep ' bpf_prog_[0-9a-fA-F]{16}' /proc/kallsyms ffffffffa0048000 t bpf_prog_dabf0207d1992486_test_obj_id ffffffffa0038000 t bpf_prog_a04f5eef06a7f555__123456789ABCDE ffffffffa0050000 t bpf_prog_a04f5eef06a7f555 Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1e202f403c56..6b1ab380ff85 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -361,12 +361,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { + const char *end = sym + KSYM_NAME_LEN; + BUILD_BUG_ON(sizeof("bpf_prog_") + - sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); + sizeof(prog->tag) * 2 + + /* name has been null terminated. + * We should need +1 for the '_' preceding + * the name. However, the null character + * is double counted between the name and the + * sizeof("bpf_prog_") above, so we omit + * the +1 here. + */ + sizeof(prog->aux->name) > KSYM_NAME_LEN); sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); - *sym = 0; + if (prog->aux->name[0]) + snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); + else + *sym = 0; } static __always_inline unsigned long From 04c0f814e70f3187e27e9a17ccf131b46d4191b9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:10 -0700 Subject: [PATCH 0032/1640] BACKPORT: bpf: encapsulate verifier log state into a structure Put the loose log_* variables into a structure. This will make it simpler to remove the global verifier state in following patches. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 13 ++++++++ kernel/bpf/verifier.c | 57 +++++++++++++++++++----------------- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8509484cada4..478378fb3e78 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -136,6 +136,19 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +struct bpf_verifer_log { + u32 level; + char *kbuf; + char __user *ubuf; + u32 len_used; + u32 len_total; +}; + +static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log) +{ + return log->len_used >= log->len_total - 1; +} + struct bpf_verifier_env; struct bpf_ext_analyzer_ops { int (*insn_hook)(struct bpf_verifier_env *env, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 291dd4b3090a..b13cee524c61 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -156,8 +156,7 @@ struct bpf_call_arg_meta { /* verbose verifier prints what it's seeing * bpf_check() is called under lock, so no race to access these global vars */ -static u32 log_level, log_size, log_len; -static char *log_buf; +static struct bpf_verifer_log verifier_log; static DEFINE_MUTEX(bpf_verifier_lock); @@ -167,13 +166,15 @@ static DEFINE_MUTEX(bpf_verifier_lock); */ static __printf(1, 2) void verbose(const char *fmt, ...) 
{ + struct bpf_verifer_log *log = &verifier_log; va_list args; - if (log_level == 0 || log_len >= log_size - 1) + if (!log->level || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + log->len_used += vscnprintf(log->kbuf + log->len_used, + log->len_total - log->len_used, fmt, args); va_end(args); } @@ -1046,7 +1047,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (log_level) + if (verifier_log.level) print_verifier_state(state); /* The minimum value is only important with signed @@ -3541,7 +3542,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; } - if (log_level) + if (verifier_log.level) print_verifier_state(this_branch); return 0; } @@ -4339,7 +4340,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (log_level) { + if (verifier_log.level) { if (do_print_state) verbose("\nfrom %d to %d%s: safe\n", env->prev_insn_idx, env->insn_idx, @@ -4354,8 +4355,9 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (log_level > 1 || (log_level && do_print_state)) { - if (log_level > 1) + if (verifier_log.level > 1 || + (verifier_log.level && do_print_state)) { + if (verifier_log.level > 1) verbose("%d:", env->insn_idx); else verbose("\nfrom %d to %d%s:", @@ -4366,7 +4368,7 @@ static int do_check(struct bpf_verifier_env *env) do_print_state = false; } - if (log_level) { + if (verifier_log.level) { verbose("%d: ", env->insn_idx); print_bpf_insn(env, insn); } @@ -5206,7 +5208,7 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - char __user *log_ubuf = NULL; + struct bpf_verifer_log *log = &verifier_log; struct bpf_verifier_env *env; int ret = -EINVAL; @@ -5231,23 +5233,23 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* user requested verbose verifier output * and supplied buffer to store the verification trace */ - log_level = attr->log_level; - log_ubuf = (char __user *) (unsigned long) attr->log_buf; - log_size = attr->log_size; - log_len = 0; + log->level = attr->log_level; + log->ubuf = (char __user *) (unsigned long) attr->log_buf; + log->len_total = attr->log_size; + log->len_used = 0; ret = -EINVAL; - /* log_* values have to be sane */ - if (log_size < 128 || log_size > UINT_MAX >> 8 || - log_level == 0 || log_ubuf == NULL) + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) goto err_unlock; ret = -ENOMEM; - log_buf = vmalloc(log_size); - if (!log_buf) + log->kbuf = vmalloc(log->len_total); + if (!log->kbuf) goto err_unlock; } else { - log_level = 0; + log->level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -5291,15 +5293,16 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log_level && log_len >= log_size - 1) { - BUG_ON(log_len >= log_size); + if (log->level && bpf_verifier_log_full(log)) { + BUG_ON(log->len_used >= log->len_total); /* verifier log exceeded user supplied buffer */ ret = -ENOSPC; /* fall through to return what was recorded */ } /* copy verifier log back to user space including trailing zero */ - if (log_level && 
copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + if (log->level && copy_to_user(log->ubuf, log->kbuf, + log->len_used + 1) != 0) { ret = -EFAULT; goto free_log_buf; } @@ -5326,8 +5329,8 @@ skip_full_check: } free_log_buf: - if (log_level) - vfree(log_buf); + if (log->level) + vfree(log->kbuf); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_used_maps() will release them. @@ -5364,7 +5367,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); - log_level = 0; + verifier_log.level = 0; env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) From 9cb2b22b9afa3a6618d1aba1f67885104ef20def Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:11 -0700 Subject: [PATCH 0033/1640] BACKPORT: bpf: move global verifier log into verifier environment The biggest piece of global state protected by the verifier lock is the verifier_log. Move that log to struct bpf_verifier_env. struct bpf_verifier_env has to be passed now to all invocations of verbose(). Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 2 + kernel/bpf/verifier.c | 517 ++++++++++++++++++----------------- 2 files changed, 272 insertions(+), 247 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 478378fb3e78..0ad1580fea56 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -175,6 +175,8 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ + + struct bpf_verifer_log log; }; static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b13cee524c61..c2a543a1d57b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -153,20 +153,16 @@ struct bpf_call_arg_meta { int access_size; }; -/* verbose verifier prints what it's seeing - * bpf_check() is called under lock, so no race to access these global vars - */ -static struct bpf_verifer_log verifier_log; - static DEFINE_MUTEX(bpf_verifier_lock); /* log_level controls verbosity level of eBPF verifier. * verbose() is used to dump the verification trace to the log, so the user * can figure out what's wrong with the program */ -static __printf(1, 2) void verbose(const char *fmt, ...) +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, + const char *fmt, ...) 
{ - struct bpf_verifer_log *log = &verifier_log; + struct bpf_verifer_log *log = &env->log; va_list args; if (!log->level || bpf_verifier_log_full(log)) @@ -214,7 +210,8 @@ static const char *func_id_name(int id) return "unknown"; } -static void print_verifier_state(struct bpf_verifier_state *state) +static void print_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *state) { struct bpf_reg_state *reg; enum bpf_reg_type t; @@ -225,21 +222,21 @@ static void print_verifier_state(struct bpf_verifier_state *state) t = reg->type; if (t == NOT_INIT) continue; - verbose(" R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d=%s", i, reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ - verbose("%lld", reg->var_off.value + reg->off); + verbose(env, "%lld", reg->var_off.value + reg->off); } else { - verbose("(id=%d", reg->id); + verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) - verbose(",off=%d", reg->off); + verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) - verbose(",r=%d", reg->range); + verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) - verbose(",ks=%d,vs=%d", + verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { @@ -247,39 +244,39 @@ static void print_verifier_state(struct bpf_verifier_state *state) * could be a pointer whose offset is too big * for reg->off */ - verbose(",imm=%llx", reg->var_off.value); + verbose(env, ",imm=%llx", reg->var_off.value); } else { if (reg->smin_value != reg->umin_value && reg->smin_value != S64_MIN) - verbose(",smin_value=%lld", + verbose(env, ",smin_value=%lld", (long long)reg->smin_value); if (reg->smax_value != reg->umax_value && reg->smax_value != S64_MAX) - verbose(",smax_value=%lld", + verbose(env, ",smax_value=%lld", (long long)reg->smax_value); if (reg->umin_value != 0) - verbose(",umin_value=%llu", + verbose(env, ",umin_value=%llu", (unsigned long long)reg->umin_value); if (reg->umax_value != U64_MAX) - verbose(",umax_value=%llu", + verbose(env, ",umax_value=%llu", (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(",var_off=%s", tn_buf); + verbose(env, ",var_off=%s", tn_buf); } } - verbose(")"); + verbose(env, ")"); } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { if (state->stack[i].slot_type[0] == STACK_SPILL) - verbose(" fp%d=%s", + verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, reg_type_str[state->stack[i].spilled_ptr.type]); } - verbose("\n"); + verbose(env, "\n"); } static const char *const bpf_class_string[] = { @@ -334,15 +331,15 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_end_insn(const struct bpf_verifier_env *env, +static void print_bpf_end_insn(struct bpf_verifier_env *env, const struct bpf_insn *insn) { - verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? 
"be" : "le", insn->imm, insn->dst_reg); } -static void print_bpf_insn(const struct bpf_verifier_env *env, +static void print_bpf_insn(struct bpf_verifier_env *env, const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); @@ -350,23 +347,23 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) - verbose("BUG_alu64_%02x\n", insn->code); + verbose(env, "BUG_alu64_%02x\n", insn->code); else print_bpf_end_insn(env, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose("(%02x) r%d = %s-r%d\n", + verbose(env, "(%02x) r%d = %s-r%d\n", insn->code, insn->dst_reg, class == BPF_ALU ? "(u32) " : "", insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) %sr%d %s %sr%d\n", + verbose(env, "(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); } else { - verbose("(%02x) %sr%d %s %s%d\n", + verbose(env, "(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], @@ -375,46 +372,46 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) - verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_XADD) - verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else - verbose("BUG_%02x\n", insn->code); + verbose(env, "BUG_%02x\n", insn->code); } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_st_%02x\n", insn->code); + verbose(env, "BUG_st_%02x\n", insn->code); return; } - verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_ldx_%02x\n", insn->code); + verbose(env, "BUG_ldx_%02x\n", insn->code); return; } - verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { - verbose("(%02x) r0 = *(%s *)skb[%d]\n", + verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); @@ -429,36 +426,37 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, if (map_ptr && !env->allow_ptr_leaks) imm = 0; - verbose("(%02x) r%d = 0x%llx\n", insn->code, + verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, insn->dst_reg, (unsigned long long)imm); } else { - verbose("BUG_ld_%02x\n", insn->code); + verbose(env, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose("(%02x) call %s#%d\n", 
insn->code, + verbose(env, "(%02x) call %s#%d\n", insn->code, func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose("(%02x) goto pc%+d\n", + verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose("(%02x) exit\n", insn->code); + verbose(env, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) if r%d %s r%d goto pc%+d\n", + verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->src_reg, insn->off); } else { - verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } } else { - verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); + verbose(env, "(%02x) %s\n", + insn->code, bpf_class_string[class]); } } @@ -590,7 +588,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, if (err) goto err; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose("BPF program is too complex\n"); + verbose(env, "BPF program is too complex\n"); goto err; } return &elem->st; @@ -628,10 +626,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } -static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_known_zero(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_known_zero(regs, %u)\n", regno); + verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -741,10 +740,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) __mark_reg_unbounded(reg); } -static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_unknown(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_unknown(regs, %u)\n", regno); + verbose(env, "mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -759,10 +759,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg) reg->type = NOT_INIT; } -static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_not_init(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_not_init(regs, %u)\n", regno); + verbose(env, "mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -771,22 +772,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) __mark_reg_not_init(regs + regno); } -static void init_reg_state(struct bpf_reg_state *regs) +static void init_reg_state(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) { int i; for (i = 0; i < MAX_BPF_REG; i++) { - mark_reg_not_init(regs, i); + mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; - mark_reg_known_zero(regs, BPF_REG_FP); + mark_reg_known_zero(env, regs, BPF_REG_FP); /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; - 
mark_reg_known_zero(regs, BPF_REG_1); + mark_reg_known_zero(env, regs, BPF_REG_1); } enum reg_arg_type { @@ -820,26 +822,26 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *regs = env->cur_state->regs; if (regno >= MAX_BPF_REG) { - verbose("R%d is invalid\n", regno); + verbose(env, "R%d is invalid\n", regno); return -EINVAL; } if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (regs[regno].type == NOT_INIT) { - verbose("R%d !read_ok\n", regno); + verbose(env, "R%d !read_ok\n", regno); return -EACCES; } mark_reg_read(env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { - verbose("frame pointer is read only\n"); + verbose(env, "frame pointer is read only\n"); return -EACCES; } regs[regno].live |= REG_LIVE_WRITTEN; if (t == DST_OP) - mark_reg_unknown(regs, regno); + mark_reg_unknown(env, regs, regno); } return 0; } @@ -880,7 +882,7 @@ static int check_stack_write(struct bpf_verifier_env *env, if (!env->allow_ptr_leaks && state->stack[spi].slot_type[0] == STACK_SPILL && size != BPF_REG_SIZE) { - verbose("attempt to corrupt spilled pointer on stack\n"); + verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; } @@ -889,7 +891,7 @@ static int check_stack_write(struct bpf_verifier_env *env, /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } @@ -915,7 +917,8 @@ static int check_stack_write(struct bpf_verifier_env *env, * into two different stack slots, since verifier * cannot sanitize them */ - verbose("insn %d cannot access two stack slots fp%d and fp%d", + verbose(env, + "insn %d cannot access two stack slots fp%d and fp%d", insn_idx, *poff, soff); return -EINVAL; } @@ -949,14 +952,15 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo } } -static int check_stack_read(struct bpf_verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; u8 *stype; if (state->allocated_stack <= slot) { - verbose("invalid read from stack off %d+0 size %d\n", + verbose(env, "invalid read from stack off %d+0 size %d\n", off, size); return -EACCES; } @@ -964,12 +968,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { - verbose("corrupted spill memory\n"); + verbose(env, "corrupted spill memory\n"); return -EACCES; } } @@ -983,14 +987,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } else { for (i = 0; i < size; i++) { if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { - verbose("invalid read from stack off %d+%d size %d\n", + verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); return 0; } } @@ -1007,13 +1011,13 @@ static int check_stack_access(struct 
bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } if (off >= 0 || off < -MAX_BPF_STACK) { - verbose("invalid stack off=%d size=%d\n", off, size); + verbose(env, "invalid stack off=%d size=%d\n", off, size); return -EACCES; } @@ -1028,7 +1032,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, struct bpf_map *map = regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { - verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; } @@ -1047,8 +1051,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (verifier_log.level) - print_verifier_state(state); + if (env->log.level) + print_verifier_state(env, state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a @@ -1060,13 +1064,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, (reg->smin_value == S64_MIN || (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || reg->smin_value + off < 0)) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->smin_value + off, size); if (err) { - verbose("R%d min value is outside of the array range\n", regno); + verbose(env, "R%d min value is outside of the array range\n", + regno); return err; } @@ -1075,13 +1080,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->umax_value + off, size); if (err) - verbose("R%d max value is outside of the array range\n", regno); + verbose(env, "R%d max value is outside of the array range\n", + regno); return err; } @@ -1120,7 +1126,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *reg = &regs[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { - verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; } @@ -1143,13 +1149,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, * detail to prove they're safe. 
*/ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_packet_access(env, regno, off, size); if (err) { - verbose("R%d offset is outside of the packet\n", regno); + verbose(env, "R%d offset is outside of the packet\n", regno); return err; } return err; @@ -1185,7 +1191,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return 0; } - verbose("invalid bpf_context access off=%d size=%d\n", off, size); + verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; } @@ -1217,7 +1223,8 @@ static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) return type_is_pkt_pointer(reg->type); } -static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, +static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int off, int size, bool strict) { struct tnum reg_off; @@ -1242,7 +1249,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned packet access off %d+%s+%d+%d size %d\n", + verbose(env, + "misaligned packet access off %d+%s+%d+%d size %d\n", ip_align, tn_buf, reg->off, off, size); return -EACCES; } @@ -1250,7 +1258,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, return 0; } -static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, +static int check_generic_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, const char *pointer_desc, int off, int size, bool strict) { @@ -1265,7 +1274,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned %saccess off %s+%d+%d size %d\n", + verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", pointer_desc, tn_buf, reg->off, off, size); return -EACCES; } @@ -1286,7 +1295,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, /* Special case, because of NET_IP_ALIGN. Given metadata sits * right in front, treat it the very same way. 
*/ - return check_pkt_ptr_alignment(reg, off, size, strict); + return check_pkt_ptr_alignment(env, reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1304,7 +1313,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, default: break; } - return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); + return check_generic_ptr_alignment(env, reg, pointer_desc, off, size, + strict); } static int check_ctx_reg(struct bpf_verifier_env *env, @@ -1315,7 +1325,7 @@ static int check_ctx_reg(struct bpf_verifier_env *env, */ if (reg->off) { - verbose("dereference of modified ctx ptr R%d off=%d disallowed\n", + verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", regno, reg->off); return -EACCES; } @@ -1324,7 +1334,7 @@ static int check_ctx_reg(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable ctx access var_off=%s disallowed\n", tn_buf); + verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); return -EACCES; } @@ -1384,20 +1394,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into map\n", value_regno); + verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(regs, value_regno); + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into ctx\n", value_regno); + verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } err = check_ctx_reg(env, reg, regno); @@ -1411,9 +1421,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. 
*/ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(regs, value_regno); + mark_reg_unknown(env, regs, value_regno); else - mark_reg_known_zero(regs, value_regno); + mark_reg_known_zero(env, regs, + value_regno); regs[value_regno].id = 0; regs[value_regno].off = 0; regs[value_regno].range = 0; @@ -1433,23 +1444,25 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_stack_write(env, state, off, size, value_regno, insn_idx); else - err = check_stack_read(state, off, size, value_regno); + err = check_stack_read(env, state, off, size, + value_regno); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { - verbose("cannot write into packet\n"); + verbose(env, "cannot write into packet\n"); return -EACCES; } if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into packet\n", value_regno); + verbose(env, "R%d leaks addr into packet\n", + value_regno); return -EACCES; } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(regs, value_regno); + mark_reg_unknown(env, regs, value_regno); } else { - verbose("R%d invalid mem access '%s'\n", - regno, reg_type_str[reg->type]); + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str[reg->type]); return -EACCES; } @@ -1467,7 +1480,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || insn->imm != 0) { - verbose("BPF_XADD uses reserved fields\n"); + verbose(env, "BPF_XADD uses reserved fields\n"); return -EINVAL; } @@ -1482,13 +1495,13 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d leaks addr into mem\n", insn->src_reg); + verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; } if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg)) { - verbose("BPF_XADD stores into R%d %s is not allowed\n", + verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ? 
"context" : "packet"); return -EACCES; @@ -1531,7 +1544,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, register_is_null(regs[regno])) return 0; - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[regs[regno].type], reg_type_str[PTR_TO_STACK]); return -EACCES; @@ -1542,14 +1555,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); - verbose("invalid variable stack read R%d var_off=%s\n", + verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); return -EACCES; } off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size <= 0) { - verbose("invalid stack type R%d off=%d access_size=%d\n", + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", regno, off, access_size); return -EACCES; } @@ -1569,7 +1582,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (state->allocated_stack <= slot || state->stack[spi].slot_type[slot % BPF_REG_SIZE] != STACK_MISC) { - verbose("invalid indirect read from stack off %d+%d size %d\n", + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; } @@ -1612,7 +1625,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) { - verbose("R%d leaks addr into helper function\n", regno); + verbose(env, "R%d leaks addr into helper function\n", + regno); return -EACCES; } return 0; @@ -1620,7 +1634,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { - verbose("helper access to the packet is not allowed\n"); + verbose(env, "helper access to the packet is not allowed\n"); return -EACCES; } @@ -1661,7 +1675,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { - verbose("unsupported arg_type %d\n", arg_type); + verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; } @@ -1679,7 +1693,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * we have to check map_key here. 
Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->key\n"); + verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } if (type_is_pkt_pointer(type)) @@ -1695,7 +1709,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->value\n"); + verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } if (type_is_pkt_pointer(type)) @@ -1715,7 +1729,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_SIZE cannot be first argument\n"); + verbose(env, + "ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } @@ -1732,7 +1747,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta = NULL; if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", regno); return -EACCES; } @@ -1746,7 +1761,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", regno); return -EACCES; } @@ -1757,12 +1772,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return err; err_type: - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[type], reg_type_str[expected_type]); return -EACCES; } -static int check_map_func_compatibility(struct bpf_map *map, int func_id) +static int check_map_func_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, int func_id) { if (!map) return 0; @@ -1850,7 +1866,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %s#%d\n", + verbose(env, "cannot pass map_type %d into func %s#%d\n", map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1884,7 +1900,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) for (i = 0; i < MAX_BPF_REG; i++) if (reg_is_pkt_pointer_any(®s[i])) - mark_reg_unknown(regs, i); + mark_reg_unknown(env, regs, i); for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { if (state->stack[i].slot_type[0] != STACK_SPILL) @@ -1905,7 +1921,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "invalid func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } @@ -1913,13 +1930,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "unknown func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { - verbose("cannot call GPL only function from proprietary program\n"); + verbose(env, "cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ 
-1933,7 +1951,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); return err; } @@ -1947,7 +1965,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return err; if (func_id == BPF_FUNC_tail_call) { if (meta.map_ptr == NULL) { - verbose("verifier bug\n"); + verbose(env, "verifier bug\n"); return -EINVAL; } env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; @@ -1975,14 +1993,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) regs = cur_regs(env); /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { @@ -1990,14 +2008,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ - mark_reg_known_zero(regs, BPF_REG_0); + mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ if (meta.map_ptr == NULL) { - verbose("kernel subsystem misconfigured verifier\n"); + verbose(env, + "kernel subsystem misconfigured verifier\n"); return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; @@ -2008,12 +2027,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) else if (insn_aux->map_ptr != meta.map_ptr) insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { - verbose("unknown return type %d of func %s#%d\n", + verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } - err = check_map_func_compatibility(meta.map_ptr, func_id); + err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; @@ -2051,25 +2070,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, s64 smin = reg->smin_value; if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { - verbose("math between %s pointer and %lld is not allowed\n", + verbose(env, "math between %s pointer and %lld is not allowed\n", reg_type_str[type], val); return false; } if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { - verbose("%s pointer offset %d is not allowed\n", + verbose(env, "%s pointer offset %d is not allowed\n", reg_type_str[type], reg->off); return false; } if (smin == S64_MIN) { - verbose("math between %s pointer and register with unbounded min value is not allowed\n", + verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", reg_type_str[type]); return false; } if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { - verbose("value %lld makes %s pointer be out of bounds\n", + verbose(env, "value %lld makes %s pointer be out of bounds\n", smin, reg_type_str[type]); return false; } @@ 
-2265,27 +2284,27 @@ static int sanitize_err(struct bpf_verifier_env *env, switch (reason) { case REASON_BOUNDS: - verbose("R%d has unknown scalar with mixed signed bounds, %s\n", + verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n", off_reg == dst_reg ? dst : src, err); break; case REASON_TYPE: - verbose("R%d has pointer with unsupported alu operation, %s\n", + verbose(env, "R%d has pointer with unsupported alu operation, %s\n", off_reg == dst_reg ? src : dst, err); break; case REASON_PATHS: - verbose("R%d tried to %s from different maps, paths or scalars, %s\n", + verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n", dst, op, err); break; case REASON_LIMIT: - verbose("R%d tried to %s beyond pointer bounds, %s\n", + verbose(env, "R%d tried to %s beyond pointer bounds, %s\n", dst, op, err); break; case REASON_STACK: - verbose("R%d could not be pushed for speculative verification, %s\n", + verbose(env, "R%d could not be pushed for speculative verification, %s\n", dst, err); break; default: - verbose("verifier internal error: unknown reason (%d)\n", + verbose(env, "verifier internal error: unknown reason (%d)\n", reason); break; } @@ -2309,14 +2328,14 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, case PTR_TO_STACK: if (check_stack_access(env, dst_reg, dst_reg->off + dst_reg->var_off.value, 1)) { - verbose("R%d stack pointer arithmetic goes out of range, " + verbose(env, "R%d stack pointer arithmetic goes out of range, " "prohibited for !root\n", dst); return -EACCES; } break; case PTR_TO_MAP_VALUE: if (check_map_access(env, dst, dst_reg->off, 1)) { - verbose("R%d pointer arithmetic of map value goes out of range, " + verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; } @@ -2362,23 +2381,24 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ - verbose("R%d 32-bit pointer arithmetic prohibited\n", + verbose(env, + "R%d 32-bit pointer arithmetic prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", dst); return -EACCES; } if (ptr_reg->type == CONST_PTR_TO_MAP) { - verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_PACKET_END) { - verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", dst); return -EACCES; } @@ -2454,7 +2474,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_SUB: if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ - verbose("R%d tried to subtract pointer from scalar\n", + verbose(env, "R%d tried to subtract pointer from scalar\n", dst); return -EACCES; } @@ -2463,7 +2483,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * be able to deal with it. 
*/ if (ptr_reg->type == PTR_TO_STACK) { - verbose("R%d subtraction from stack pointer prohibited\n", + verbose(env, "R%d subtraction from stack pointer prohibited\n", dst); return -EACCES; } @@ -2515,12 +2535,12 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_OR: case BPF_XOR: /* bitwise ops on pointers are troublesome. */ - verbose("R%d bitwise operator %s on pointer prohibited\n", + verbose(env, "R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ - verbose("R%d pointer arithmetic with %s operator prohibited\n", + verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; } @@ -2730,7 +2750,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* We lose all sign bit information (except what we can pick @@ -2758,7 +2778,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* BPF_RSH is an unsigned shift. If the value in dst_reg might @@ -2788,7 +2808,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; default: - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } @@ -2826,10 +2846,10 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * pointer subtraction */ if (opcode == BPF_SUB && env->allow_ptr_leaks) { - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); return 0; } - verbose("R%d pointer %s pointer prohibited\n", + verbose(env, "R%d pointer %s pointer prohibited\n", insn->dst_reg, bpf_alu_string[opcode >> 4]); return -EACCES; @@ -2860,13 +2880,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env->cur_state); - verbose("verifier internal error: unexpected ptr_reg\n"); + print_verifier_state(env, env->cur_state); + verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env->cur_state); - verbose("verifier internal error: no src_reg\n"); + print_verifier_state(env, env->cur_state); + verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); @@ -2884,14 +2904,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) != 0 || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { - verbose("BPF_NEG uses reserved fields\n"); + verbose(env, "BPF_NEG uses reserved fields\n"); return -EINVAL; } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || BPF_CLASS(insn->code) == BPF_ALU64) { - verbose("BPF_END uses reserved fields\n"); + verbose(env, "BPF_END uses reserved fields\n"); return -EINVAL; } } @@ -2902,7 +2922,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if 
(is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer arithmetic prohibited\n", + verbose(env, "R%d pointer arithmetic prohibited\n", insn->dst_reg); return -EACCES; } @@ -2916,7 +2936,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -2926,7 +2946,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } } @@ -2946,11 +2966,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d partial copy of pointer\n", + verbose(env, + "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); coerce_reg_to_size(&regs[insn->dst_reg], 4); } } else { @@ -2968,14 +2989,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else if (opcode > BPF_END) { - verbose("invalid BPF_ALU opcode %x\n", opcode); + verbose(env, "invalid BPF_ALU opcode %x\n", opcode); return -EINVAL; } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } /* check src1 operand */ @@ -2984,7 +3005,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } } @@ -2996,12 +3017,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if ((opcode == BPF_MOD || opcode == BPF_DIV) && BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { - verbose("div by zero\n"); + verbose(env, "div by zero\n"); return -EINVAL; } if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { - verbose("BPF_ARSH not supported for 32 bit ALU\n"); + verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); return -EINVAL; } @@ -3010,7 +3031,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 
64 : 32; if (insn->imm < 0 || insn->imm >= size) { - verbose("invalid shift %d\n", insn->imm); + verbose(env, "invalid shift %d\n", insn->imm); return -EINVAL; } } @@ -3372,13 +3393,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, int err; if (opcode > BPF_JSLE) { - verbose("invalid BPF_JMP opcode %x\n", opcode); + verbose(env, "invalid BPF_JMP opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } @@ -3388,13 +3409,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d pointer comparison prohibited\n", + verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg); return -EACCES; } } else { if (insn->src_reg != BPF_REG_0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } } @@ -3539,11 +3560,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, find_good_pkt_pointers(this_branch, &regs[insn->src_reg], PTR_TO_PACKET_META, false); } else if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer comparison prohibited\n", insn->dst_reg); + verbose(env, "R%d pointer comparison prohibited\n", + insn->dst_reg); return -EACCES; } - if (verifier_log.level) - print_verifier_state(this_branch); + if (env->log.level) + print_verifier_state(env, this_branch); return 0; } @@ -3562,11 +3584,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) int err; if (BPF_SIZE(insn->code) != BPF_DW) { - verbose("invalid BPF_LD_IMM insn\n"); + verbose(env, "invalid BPF_LD_IMM insn\n"); return -EINVAL; } if (insn->off != 0) { - verbose("BPF_LD_IMM64 uses reserved fields\n"); + verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); return -EINVAL; } @@ -3625,14 +3647,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) int i, err; if (!may_access_skb(env->prog->type)) { - verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); + verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); + verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -3642,7 +3664,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (regs[ctx_reg].type != PTR_TO_CTX) { - verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + verbose(env, + "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; } @@ -3659,7 +3682,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -3667,7 +3690,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * the value fetched from the packet. * Already marked as written above. 
*/ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); return 0; } @@ -3687,22 +3710,22 @@ static int check_return_code(struct bpf_verifier_env *env) reg = &env->cur_state->regs[BPF_REG_0]; if (reg->type != SCALAR_VALUE) { - verbose("At program exit the register R0 is not a known value (%s)\n", + verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); return -EINVAL; } if (!tnum_in(range, reg->var_off)) { - verbose("At program exit the register R0 "); + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("has value %s", tn_buf); + verbose(env, "has value %s", tn_buf); } else { - verbose("has unknown scalar value"); + verbose(env, "has unknown scalar value"); } - verbose(" should have been 0 or 1\n"); + verbose(env, " should have been 0 or 1\n"); return -EINVAL; } return 0; @@ -3768,7 +3791,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { - verbose("jump out of range from insn %d to %d\n", t, w); + verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -3785,13 +3808,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - verbose("back-edge from insn %d to %d\n", t, w); + verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { - verbose("insn state internal bug\n"); + verbose(env, "insn state internal bug\n"); return -EFAULT; } return 0; @@ -3885,7 +3908,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; if (cur_stack-- <= 0) { - verbose("pop stack internal bug\n"); + verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; } @@ -3894,7 +3917,7 @@ mark_explored: check_state: for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { - verbose("unreachable insn %d\n", i); + verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } @@ -4313,7 +4336,7 @@ static int do_check(struct bpf_verifier_env *env) if (!state) return -ENOMEM; env->cur_state = state; - init_reg_state(state->regs); + init_reg_state(env, state->regs); state->parent = NULL; for (;;) { struct bpf_insn *insn; @@ -4321,7 +4344,7 @@ static int do_check(struct bpf_verifier_env *env) int err; if (env->insn_idx >= insn_cnt) { - verbose("invalid insn idx %d insn_cnt %d\n", + verbose(env, "invalid insn idx %d insn_cnt %d\n", env->insn_idx, insn_cnt); return -EFAULT; } @@ -4330,7 +4353,8 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose("BPF program is too large. Processed %d insn\n", + verbose(env, + "BPF program is too large. Processed %d insn\n", insn_processed); return -E2BIG; } @@ -4340,14 +4364,14 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (verifier_log.level) { + if (env->log.level) { if (do_print_state) - verbose("\nfrom %d to %d%s: safe\n", + verbose(env, "\nfrom %d to %d%s: safe\n", env->prev_insn_idx, env->insn_idx, env->cur_state->speculative ? 
" (speculative execution)" : ""); else - verbose("%d: safe\n", env->insn_idx); + verbose(env, "%d: safe\n", env->insn_idx); } goto process_bpf_exit; } @@ -4355,21 +4379,20 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (verifier_log.level > 1 || - (verifier_log.level && do_print_state)) { - if (verifier_log.level > 1) - verbose("%d:", env->insn_idx); + if (env->log.level > 1 || (env->log.level && do_print_state)) { + if (env->log.level > 1) + verbose(env, "%d:", env->insn_idx); else - verbose("\nfrom %d to %d%s:", + verbose(env, "\nfrom %d to %d%s:", env->prev_insn_idx, env->insn_idx, env->cur_state->speculative ? " (speculative execution)" : ""); - print_verifier_state(env->cur_state); + print_verifier_state(env, env->cur_state); do_print_state = false; } - if (verifier_log.level) { - verbose("%d: ", env->insn_idx); + if (env->log.level) { + verbose(env, "%d: ", env->insn_idx); print_bpf_insn(env, insn); } @@ -4428,7 +4451,7 @@ static int do_check(struct bpf_verifier_env *env) * src_reg == stack|map in some other branch. * Reject it. */ - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -4468,14 +4491,14 @@ static int do_check(struct bpf_verifier_env *env) } else if (dst_reg_type != *prev_dst_type && (dst_reg_type == PTR_TO_CTX || *prev_dst_type == PTR_TO_CTX)) { - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { - verbose("BPF_ST uses reserved fields\n"); + verbose(env, "BPF_ST uses reserved fields\n"); return -EINVAL; } /* check src operand */ @@ -4484,7 +4507,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_ctx_reg(env, insn->dst_reg)) { - verbose("BPF_ST stores into R%d context is not allowed\n", + verbose(env, "BPF_ST stores into R%d context is not allowed\n", insn->dst_reg); return -EACCES; } @@ -4504,7 +4527,7 @@ static int do_check(struct bpf_verifier_env *env) insn->off != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_CALL uses reserved fields\n"); + verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } @@ -4517,7 +4540,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_JA uses reserved fields\n"); + verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } @@ -4529,7 +4552,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_EXIT uses reserved fields\n"); + verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } @@ -4544,7 +4567,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_pointer_value(env, BPF_REG_0)) { - verbose("R0 leaks addr as return value\n"); + verbose(env, "R0 leaks addr as return value\n"); return -EACCES; } @@ -4552,7 +4575,8 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - err = pop_stack(env, &env->prev_insn_idx, &env->insn_idx); + err = pop_stack(env, &env->prev_insn_idx, + &env->insn_idx); if (err < 0) { if (err != -ENOENT) return err; @@ -4582,19 +4606,19 @@ process_bpf_exit: env->insn_idx++; env->insn_aux_data[env->insn_idx].seen = true; } else { - verbose("invalid 
BPF_LD mode\n"); + verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; } } else { - verbose("unknown insn class %d\n", class); + verbose(env, "unknown insn class %d\n", class); return -EINVAL; } env->insn_idx++; } - verbose("processed %d insns, stack depth %d\n", - insn_processed, env->prog->aux->stack_depth); + verbose(env, "processed %d insns, stack depth %d\n", insn_processed, + env->prog->aux->stack_depth); return 0; } @@ -4606,7 +4630,8 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } -static int check_map_prog_compatibility(struct bpf_map *map, +static int check_map_prog_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, struct bpf_prog *prog) { @@ -4617,12 +4642,12 @@ static int check_map_prog_compatibility(struct bpf_map *map, */ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { if (!check_map_prealloc(map)) { - verbose("perf_event programs can only use preallocated hash map\n"); + verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) { - verbose("perf_event programs can only use preallocated inner hash map\n"); + verbose(env, "perf_event programs can only use preallocated inner hash map\n"); return -EINVAL; } } @@ -4645,14 +4670,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { - verbose("BPF_LDX uses reserved fields\n"); + verbose(env, "BPF_LDX uses reserved fields\n"); return -EINVAL; } if (BPF_CLASS(insn->code) == BPF_STX && ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { - verbose("BPF_STX uses reserved fields\n"); + verbose(env, "BPF_STX uses reserved fields\n"); return -EINVAL; } @@ -4663,7 +4688,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { - verbose("invalid bpf_ld_imm64 insn\n"); + verbose(env, "invalid bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -4672,19 +4697,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) goto next_insn; if (insn->src_reg != BPF_PSEUDO_MAP_FD) { - verbose("unrecognized bpf_ld_imm64 insn\n"); + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } f = fdget(insn->imm); map = __bpf_map_get(f); if (IS_ERR(map)) { - verbose("fd %d is not pointing to valid bpf_map\n", + verbose(env, "fd %d is not pointing to valid bpf_map\n", insn->imm); return PTR_ERR(map); } - err = check_map_prog_compatibility(map, env->prog); + err = check_map_prog_compatibility(env, map, env->prog); if (err) { fdput(f); return err; @@ -4828,7 +4854,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); @@ -4904,7 +4930,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 size_code; if (type == BPF_WRITE) { - verbose("bpf verifier narrow ctx access misconfigured\n"); + verbose(env, "bpf verifier narrow ctx access misconfigured\n"); return -EINVAL; } @@ -4923,7 +4949,7 @@ static int convert_ctx_accesses(struct 
bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || (ctx_field_size && !target_size)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -5094,7 +5120,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ map_ptr = env->insn_aux_data[i + delta].map_ptr; if (map_ptr == BPF_MAP_PTR_POISON) { - verbose("tail_call abusing map_ptr\n"); + verbose(env, "tail_call abusing map_ptr\n"); return -EINVAL; } if (!map_ptr->unpriv_array) @@ -5129,7 +5155,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -5173,7 +5199,8 @@ patch_call_imm: * programs to call them, must be real in-kernel functions */ if (!fn->func) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, + "kernel subsystem misconfigured func %s#%d\n", func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -5208,8 +5235,8 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - struct bpf_verifer_log *log = &verifier_log; struct bpf_verifier_env *env; + struct bpf_verifer_log *log; int ret = -EINVAL; /* 'struct bpf_verifier_env' can be global, but since it's not small, @@ -5218,6 +5245,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; + log = &env->log; env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * (*prog)->len); @@ -5236,7 +5264,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log->level = attr->log_level; log->ubuf = (char __user *) (unsigned long) attr->log_buf; log->len_total = attr->log_size; - log->len_used = 0; ret = -EINVAL; /* log attributes have to be sane */ @@ -5248,8 +5275,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log->kbuf = vmalloc(log->len_total); if (!log->kbuf) goto err_unlock; - } else { - log->level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -5367,8 +5392,6 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); - verifier_log.level = 0; - env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; From 4b0123fae9100710263688885d02b6ae0c29d674 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:12 -0700 Subject: [PATCH 0034/1640] BACKPORT: bpf: move instruction printing into a separate file Separate the instruction printing into a standalone source file. This way sneaky code from tools/ can compile it in directly. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- kernel/bpf/Makefile | 1 + kernel/bpf/disasm.c | 214 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/disasm.h | 32 +++++++ kernel/bpf/verifier.c | 202 +-------------------------------------- 4 files changed, 251 insertions(+), 198 deletions(-) create mode 100644 kernel/bpf/disasm.c create mode 100644 kernel/bpf/disasm.h diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index be282c135a66..9df9521de2b1 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,6 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o ifeq ($(CONFIG_STREAM_PARSER),y) diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c new file mode 100644 index 000000000000..e682850c9715 --- /dev/null +++ b/kernel/bpf/disasm.c @@ -0,0 +1,214 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include + +#include "disasm.h" + +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +const char *func_id_name(int id) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + +const char *const bpf_class_string[8] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +const char *const bpf_alu_string[16] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[16] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JLT >> 4] = "<", + [BPF_JGE >> 4] = ">=", + [BPF_JLE >> 4] = "<=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSLT >> 4] = "s<", + [BPF_JSGE >> 4] = "s>=", + [BPF_JSLE >> 4] = "s<=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_end_insn(bpf_insn_print_cb verbose, + struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + BPF_SRC(insn->code) == BPF_TO_BE ? 
"be" : "le", + insn->imm, insn->dst_reg); +} + +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, + const struct bpf_insn *insn, bool allow_ptr_leaks) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_OP(insn->code) == BPF_END) { + if (class == BPF_ALU64) + verbose(env, "BUG_alu64_%02x\n", insn->code); + else + print_bpf_end_insn(verbose, env, insn); + } else if (BPF_OP(insn->code) == BPF_NEG) { + verbose(env, "(%02x) r%d = %s-r%d\n", + insn->code, insn->dst_reg, + class == BPF_ALU ? "(u32) " : "", + insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(env, "(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + } else { + verbose(env, "(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->imm); + } + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose(env, "BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_st_%02x\n", insn->code); + return; + } + verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_ldx_%02x\n", insn->code); + return; + } + verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. 
+ */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !allow_ptr_leaks) + imm = 0; + + verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); + } else { + verbose(env, "BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose(env, "(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose(env, "(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose(env, "(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose(env, "(%02x) %s\n", + insn->code, bpf_class_string[class]); + } +} diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h new file mode 100644 index 000000000000..8de977e420b6 --- /dev/null +++ b/kernel/bpf/disasm.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef __BPF_DISASM_H__ +#define __BPF_DISASM_H__ + +#include +#include +#include + +extern const char *const bpf_alu_string[16]; +extern const char *const bpf_class_string[8]; + +const char *func_id_name(int id); + +struct bpf_verifier_env; +typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, + const char *, ...); +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, + const struct bpf_insn *insn, bool allow_ptr_leaks); + +#endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c2a543a1d57b..f3e040e4412c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,8 @@ #include #include +#include "disasm.h" + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. 
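A side note on how the string tables moved into disasm.c are indexed: each eBPF opcode byte packs a 3-bit class, a 1-bit source flag, and a 4-bit operation, which is why the tables are indexed with expressions like BPF_OP(insn->code) >> 4. Below is a minimal userspace sketch of the same field extraction; it uses only the UAPI macros from linux/bpf.h and is purely illustrative, not part of the patch:

    #include <linux/bpf.h>
    #include <stdio.h>

    int main(void)
    {
        __u8 code = BPF_ALU64 | BPF_ADD | BPF_X;    /* "r0 += r1" */

        /* Same field extraction that drives the disasm tables. */
        printf("class=%#x alu-index=%d src=%s\n",
               BPF_CLASS(code),       /* low 3 bits: 0x7 == BPF_ALU64 */
               BPF_OP(code) >> 4,     /* index into bpf_alu_string[] */
               BPF_SRC(code) == BPF_X ? "reg" : "imm");
        return 0;
    }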
@@ -194,22 +196,6 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; -#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) -static const char * const func_id_str[] = { - __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) -}; -#undef __BPF_FUNC_STR_FN - -static const char *func_id_name(int id) -{ - BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); - - if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) - return func_id_str[id]; - else - return "unknown"; -} - static void print_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state *state) { @@ -279,187 +265,6 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "\n"); } -static const char *const bpf_class_string[] = { - [BPF_LD] = "ld", - [BPF_LDX] = "ldx", - [BPF_ST] = "st", - [BPF_STX] = "stx", - [BPF_ALU] = "alu", - [BPF_JMP] = "jmp", - [BPF_RET] = "BUG", - [BPF_ALU64] = "alu64", -}; - -static const char *const bpf_alu_string[16] = { - [BPF_ADD >> 4] = "+=", - [BPF_SUB >> 4] = "-=", - [BPF_MUL >> 4] = "*=", - [BPF_DIV >> 4] = "/=", - [BPF_OR >> 4] = "|=", - [BPF_AND >> 4] = "&=", - [BPF_LSH >> 4] = "<<=", - [BPF_RSH >> 4] = ">>=", - [BPF_NEG >> 4] = "neg", - [BPF_MOD >> 4] = "%=", - [BPF_XOR >> 4] = "^=", - [BPF_MOV >> 4] = "=", - [BPF_ARSH >> 4] = "s>>=", - [BPF_END >> 4] = "endian", -}; - -static const char *const bpf_ldst_string[] = { - [BPF_W >> 3] = "u32", - [BPF_H >> 3] = "u16", - [BPF_B >> 3] = "u8", - [BPF_DW >> 3] = "u64", -}; - -static const char *const bpf_jmp_string[16] = { - [BPF_JA >> 4] = "jmp", - [BPF_JEQ >> 4] = "==", - [BPF_JGT >> 4] = ">", - [BPF_JLT >> 4] = "<", - [BPF_JGE >> 4] = ">=", - [BPF_JLE >> 4] = "<=", - [BPF_JSET >> 4] = "&", - [BPF_JNE >> 4] = "!=", - [BPF_JSGT >> 4] = "s>", - [BPF_JSLT >> 4] = "s<", - [BPF_JSGE >> 4] = "s>=", - [BPF_JSLE >> 4] = "s<=", - [BPF_CALL >> 4] = "call", - [BPF_EXIT >> 4] = "exit", -}; - -static void print_bpf_end_insn(struct bpf_verifier_env *env, - const struct bpf_insn *insn) -{ - verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, - BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", - insn->imm, insn->dst_reg); -} - -static void print_bpf_insn(struct bpf_verifier_env *env, - const struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - - if (class == BPF_ALU || class == BPF_ALU64) { - if (BPF_OP(insn->code) == BPF_END) { - if (class == BPF_ALU64) - verbose(env, "BUG_alu64_%02x\n", insn->code); - else - print_bpf_end_insn(env, insn); - } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(env, "(%02x) r%d = %s-r%d\n", - insn->code, insn->dst_reg, - class == BPF_ALU ? "(u32) " : "", - insn->dst_reg); - } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) %sr%d %s %sr%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->src_reg); - } else { - verbose(env, "(%02x) %sr%d %s %s%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? 
"(u32) " : "", - insn->imm); - } - } else if (class == BPF_STX) { - if (BPF_MODE(insn->code) == BPF_MEM) - verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->src_reg); - else if (BPF_MODE(insn->code) == BPF_XADD) - verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, insn->off, - insn->src_reg); - else - verbose(env, "BUG_%02x\n", insn->code); - } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_st_%02x\n", insn->code); - return; - } - verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); - } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_ldx_%02x\n", insn->code); - return; - } - verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", - insn->code, insn->dst_reg, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->off); - } else if (class == BPF_LD) { - if (BPF_MODE(insn->code) == BPF_ABS) { - verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM && - BPF_SIZE(insn->code) == BPF_DW) { - /* At this point, we already made sure that the second - * part of the ldimm64 insn is accessible. - */ - u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; - - if (map_ptr && !env->allow_ptr_leaks) - imm = 0; - - verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, - insn->dst_reg, (unsigned long long)imm); - } else { - verbose(env, "BUG_ld_%02x\n", insn->code); - return; - } - } else if (class == BPF_JMP) { - u8 opcode = BPF_OP(insn->code); - - if (opcode == BPF_CALL) { - verbose(env, "(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); - } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose(env, "(%02x) goto pc%+d\n", - insn->code, insn->off); - } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose(env, "(%02x) exit\n", insn->code); - } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->src_reg, insn->off); - } else { - verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->imm, insn->off); - } - } else { - verbose(env, "(%02x) %s\n", - insn->code, bpf_class_string[class]); - } -} - static int copy_stack_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src) { @@ -4393,7 +4198,8 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level) { verbose(env, "%d: ", env->insn_idx); - print_bpf_insn(env, insn); + print_bpf_insn(verbose, env, insn, + env->allow_ptr_leaks); } err = ext_analyzer_insn_hook(env, env->insn_idx, env->prev_insn_idx); From df922f5704ba8f470f3716c2dcce1f8b3b195f2d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:14 -0700 Subject: [PATCH 0035/1640] UPSTREAM: bpf: don't rely on the verifier lock for metadata_dst allocation bpf_skb_set_tunnel_*() functions require allocation of per-cpu metadata_dst. 
The allocation happens upon verification of the first program using those helpers. In preparation for removing the verifier lock, use cmpxchg() to make sure we only allocate the metadata_dsts once. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 + net/core/dst.c | 16 ++++++++++++++++ net/core/filter.c | 16 +++++++++------- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 177b1aabf95d..55b1181aeba4 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -89,6 +89,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags); +void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags); diff --git a/net/core/dst.c b/net/core/dst.c index 2d121958d5b0..53a5f40b16ea 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -322,3 +322,19 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); + +void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) +{ + int cpu; + +#ifdef CONFIG_DST_CACHE + for_each_possible_cpu(cpu) { + struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); + + if (one_md_dst->type == METADATA_IP_TUNNEL) + dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); + } +#endif + free_percpu(md_dst); +} +EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); diff --git a/net/core/filter.c b/net/core/filter.c index 3e147cdd8dcf..82598731d98d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -3072,14 +3073,15 @@ static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* Race is not possible, since it's called from verifier - * that is holding verifier mutex. - */ - md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, - METADATA_IP_TUNNEL, - GFP_KERNEL); - if (!md_dst) + struct metadata_dst __percpu *tmp; + + tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, + METADATA_IP_TUNNEL, + GFP_KERNEL); + if (!tmp) return NULL; + if (cmpxchg(&md_dst, NULL, tmp)) + metadata_dst_free_percpu(tmp); } switch (which) { From 39d3d62ee41f2049f192b272888ea215f3d90544 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:15 -0700 Subject: [PATCH 0036/1640] UPSTREAM: bpf: write back the verifier log buffer as it gets filled Verifier log buffer can be quite large (up to 16MB currently). As Eric Dumazet points out if we allow multiple verification requests to proceed simultaneously, malicious user may use the verifier as a way of allocating large amounts of unswappable memory to OOM the host. Switch to a strategy of allocating a smaller buffer (1024B) and writing it out into the user buffer after every print. While at it remove the old BUG_ON(). This is in preparation of the global verifier lock removal. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf_verifier.h | 4 +++- kernel/bpf/verifier.c | 41 +++++++++++++++++------------------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0ad1580fea56..1fae78c057d8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -136,9 +136,11 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +#define BPF_VERIFIER_TMP_LOG_SIZE 1024 + struct bpf_verifer_log { u32 level; - char *kbuf; + char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; char __user *ubuf; u32 len_used; u32 len_total; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3e040e4412c..f7f0052c956f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -165,15 +165,26 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env, const char *fmt, ...) { struct bpf_verifer_log *log = &env->log; + unsigned int n; va_list args; - if (!log->level || bpf_verifier_log_full(log)) + if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log->len_used += vscnprintf(log->kbuf + log->len_used, - log->len_total - log->len_used, fmt, args); + n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); va_end(args); + + WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, + "verifier log line truncated - local buffer too short\n"); + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; + + if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) + log->len_used += n; + else + log->ubuf = NULL; } static bool type_is_pkt_pointer(enum bpf_reg_type type) @@ -5076,11 +5087,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || !log->level || !log->ubuf) goto err_unlock; - - ret = -ENOMEM; - log->kbuf = vmalloc(log->len_total); - if (!log->kbuf) - goto err_unlock; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -5124,18 +5130,11 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log->level && bpf_verifier_log_full(log)) { - BUG_ON(log->len_used >= log->len_total); - /* verifier log exceeded user supplied buffer */ + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; - /* fall through to return what was recorded */ - } - - /* copy verifier log back to user space including trailing zero */ - if (log->level && copy_to_user(log->ubuf, log->kbuf, - log->len_used + 1) != 0) { + if (log->level && !log->ubuf) { ret = -EFAULT; - goto free_log_buf; + goto err_release_maps; } if (ret == 0 && env->used_map_cnt) { @@ -5146,7 +5145,7 @@ skip_full_check: if (!env->prog->aux->used_maps) { ret = -ENOMEM; - goto free_log_buf; + goto err_release_maps; } memcpy(env->prog->aux->used_maps, env->used_maps, @@ -5159,9 +5158,7 @@ skip_full_check: convert_pseudo_ld_imm64(env); } -free_log_buf: - if (log->level) - vfree(log->kbuf); +err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_used_maps() will release them. From 64b89fcbcdce50745097c2a21a36df058f02a3d3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 11 Oct 2017 11:56:23 +0100 Subject: [PATCH 0037/1640] UPSTREAM: bpf: remove redundant variable old_flags Variable old_flags is being assigned but is never read; it is redundant and can be removed. 
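(Returning to the log write-back strategy of the previous patch: the idea is to format each line into a small bounce buffer and flush it onward immediately, so no large kernel-side buffer is ever needed. A userspace sketch under those assumptions follows; memcpy stands in for copy_to_user and all names are hypothetical.)

    #include <stdarg.h>
    #include <stdio.h>
    #include <string.h>

    #define TMP_LOG_SIZE 1024           /* mirrors BPF_VERIFIER_TMP_LOG_SIZE */

    struct vlog {
        char kbuf[TMP_LOG_SIZE];        /* small fixed bounce buffer */
        char *ubuf;                     /* large destination buffer */
        size_t len_used, len_total;
    };

    static void vlog_printf(struct vlog *log, const char *fmt, ...)
    {
        va_list args;
        size_t n;

        va_start(args, fmt);
        /* One log line at a time lands in the 1KB bounce buffer... */
        n = (size_t)vsnprintf(log->kbuf, sizeof(log->kbuf), fmt, args);
        va_end(args);

        if (n > sizeof(log->kbuf) - 1)
            n = sizeof(log->kbuf) - 1;  /* line truncated, as WARN_ONCE flags above */
        if (n > log->len_total - log->len_used - 1)
            n = log->len_total - log->len_used - 1;

        /* ...and is flushed to the destination right away. */
        memcpy(log->ubuf + log->len_used, log->kbuf, n + 1);
        log->len_used += n;
    }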
Cleans up clang warning: Value stored to 'old_flags' is never read Signed-off-by: Colin Ian King Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 09ef184bc46c..90e98284d9e4 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -191,7 +191,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; - u32 old_flags; int err; if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) @@ -238,7 +237,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = prog; } - old_flags = cgrp->bpf.flags[type]; cgrp->bpf.flags[type] = flags; /* allocate and recompute effective prog arrays */ From 3e5eca7f7ba6fd87d531659e3bcd4e6ffeb3c113 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 12 Oct 2017 10:34:07 -0700 Subject: [PATCH 0038/1640] UPSTREAM: bpf: verifier: set reg_type on context accesses in second pass Use a simplified is_valid_access() callback when verifier is used for program analysis by non-host JITs. This allows us to teach the verifier about packet start and packet end offsets for direct packet access. We can extend the callback as needed but for most packet processing needs there isn't much more the offloads may require. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f7f0052c956f..02fdf6a46437 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -977,6 +977,36 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } +static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off, + struct bpf_insn_access_aux *info) +{ + switch (env->prog->type) { + case BPF_PROG_TYPE_XDP: + switch (off) { + case offsetof(struct xdp_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct xdp_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; + case BPF_PROG_TYPE_SCHED_CLS: + switch (off) { + case offsetof(struct sk_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_data_end, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; + default: + return false; + } +} + /* check access to 'struct bpf_context' fields. 
Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) @@ -985,12 +1015,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, .reg_type = *reg_type, }; - /* for analyzer ctx accesses are already validated and converted */ - if (env->analyzer_ops) - return 0; - - if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + if (env->analyzer_ops) { + if (analyzer_is_valid_access(env, off, &info)) { + *reg_type = info.reg_type; + return 0; + } + } else if (env->prog->aux->ops->is_valid_access && + env->prog->aux->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower From 5a12f8e21a7895798c53a6a2ab906920330a09a2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 12 Oct 2017 18:40:02 -0400 Subject: [PATCH 0039/1640] UPSTREAM: tracing: bpf: Hide bpf trace events when they are not used All the trace events defined in include/trace/events/bpf.h are only used when CONFIG_BPF_SYSCALL is defined. But this file gets included by include/linux/bpf_trace.h which is included by the networking code with CREATE_TRACE_POINTS defined. If a trace event is created but not used it still has data structures and functions created for its use, even though nothing is using them. To not waste space, do not define the BPF trace events in bpf.h unless CONFIG_BPF_SYSCALL is defined. Signed-off-by: Steven Rostedt (VMware) Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/trace/events/bpf.h | 5 ++++- kernel/bpf/core.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h index cc749d7099fb..150185647e6b 100644 --- a/include/trace/events/bpf.h +++ b/include/trace/events/bpf.h @@ -5,6 +5,9 @@ #if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BPF_H +/* These are only used within the BPF_SYSCALL code */ +#ifdef CONFIG_BPF_SYSCALL + #include #include #include @@ -346,7 +349,7 @@ TRACE_EVENT(bpf_map_next_key, __print_hex(__get_dynamic_array(nxt), __entry->key_len), __entry->key_trunc ? " ..." : "") ); - +#endif /* CONFIG_BPF_SYSCALL */ #endif /* _TRACE_BPF_H */ #include diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6b1ab380ff85..3df6d8adcf59 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1804,5 +1804,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +/* These are only used within the BPF_SYSCALL code */ +#ifdef CONFIG_BPF_SYSCALL EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); +#endif From 3976ce41103702fdadc3d0daa99839930e158de5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:28 +0200 Subject: [PATCH 0040/1640] BACKPORT: bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP The 'cpumap' is primarily used as a backend map for XDP BPF helper call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. This patch implement the main part of the map. It is not connected to the XDP redirect system yet, and no SKB allocation are done yet. The main concern in this patch is to ensure the datapath can run without any locking. 
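(For intuition on the lock-free hand-off between the RX CPU and the remote kthread, a minimal single-producer/single-consumer pointer ring in C11 is sketched below. The ptr_ring this series actually uses is more general, with multiple producers serialized by a producer spinlock, so treat this only as a conceptual sketch.)

    #include <stdatomic.h>
    #include <stddef.h>

    #define RING_SIZE 256               /* power of two */

    struct spsc_ring {
        void *slot[RING_SIZE];
        _Atomic size_t head;            /* written by producer only */
        _Atomic size_t tail;            /* written by consumer only */
    };

    /* Producer side: one writer, no lock needed. */
    static int ring_produce(struct spsc_ring *r, void *ptr)
    {
        size_t head = atomic_load_explicit(&r->head, memory_order_relaxed);
        size_t tail = atomic_load_explicit(&r->tail, memory_order_acquire);

        if (head - tail == RING_SIZE)
            return -1;                  /* full */
        r->slot[head & (RING_SIZE - 1)] = ptr;
        /* publish the slot before moving head forward */
        atomic_store_explicit(&r->head, head + 1, memory_order_release);
        return 0;
    }

    /* Consumer side: one reader (the cpumap kthread in the real code). */
    static void *ring_consume(struct spsc_ring *r)
    {
        size_t tail = atomic_load_explicit(&r->tail, memory_order_relaxed);
        size_t head = atomic_load_explicit(&r->head, memory_order_acquire);
        void *ptr;

        if (tail == head)
            return NULL;                /* empty */
        ptr = r->slot[tail & (RING_SIZE - 1)];
        atomic_store_explicit(&r->tail, tail + 1, memory_order_release);
        return ptr;
    }

Each side publishes only its own index with a release store and reads the other's with an acquire load, which is what keeps the enqueue and dequeue fast paths free of locks.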
This adds complexity to the setup and tear-down procedure, which assumptions are extra carefully documented in the code comments. V2: - make sure array isn't larger than NR_CPUS - make sure CPUs added is a valid possible CPU V3: fix nitpicks from Jakub Kicinski V5: - Restrict map allocation to root / CAP_SYS_ADMIN - WARN_ON_ONCE if queue is not empty on tear-down - Return -EPERM on memlock limit instead of -ENOMEM - Error code in __cpu_map_entry_alloc() also handle ptr_ring_cleanup() - Moved cpu_map_enqueue() to next patch V6: all notice by Daniel Borkmann - Fix err return code in cpu_map_alloc() introduced in V5 - Move cpu_possible() check after max_entries boundary check - Forbid usage initially in check_map_func_compatibility() V7: - Fix alloc error path spotted by Daniel Borkmann - Did stress test adding+removing CPUs from the map concurrently - Fixed refcnt issue on cpu_map_entry, kthread started too soon - Make sure packets are flushed during tear-down, involved use of rcu_barrier() and kthread_run only exit after queue is empty - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring V8: - Nitpicking comments and gramma by Edward Cree - Fix missing semi-colon introduced in V7 due to rebasing - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch Signed-off-by: Jesper Dangaard Brouer Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 1 + kernel/bpf/cpumap.c | 560 +++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 8 +- kernel/bpf/verifier.c | 5 + tools/include/uapi/linux/bpf.h | 1 + 7 files changed, 576 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/cpumap.c diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index e1149327a0c0..a8fdf956b60e 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -42,4 +42,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) #ifdef CONFIG_STREAM_PARSER BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9251902354eb..19a360528239 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -113,6 +113,7 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, + BPF_MAP_TYPE_CPUMAP, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 9df9521de2b1..cdd3ce111f70 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o +obj-$(CONFIG_BPF_SYSCALL) += cpumap.o ifeq ($(CONFIG_STREAM_PARSER),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c new file mode 100644 index 000000000000..e1e25ddba038 --- /dev/null +++ b/kernel/bpf/cpumap.c @@ -0,0 +1,560 @@ +/* bpf/cpumap.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. + */ + +/* The 'cpumap' is primarily used as a backend map for XDP BPF helper + * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. + * + * Unlike devmap which redirects XDP frames out another NIC device, + * this map type redirects raw XDP frames to another CPU. The remote + * CPU will do SKB-allocation and call the normal network stack. 
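+ * For example, an XDP program running on the NIC RX CPU can
+ * redirect raw frames into a cpumap entry, and the entry's
+ * kthread, pinned to the chosen remote CPU, performs the SKB
+ * allocation and network stack processing there.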
+ * + * This is a scalability and isolation mechanism, that allow + * separating the early driver network XDP layer, from the rest of the + * netstack, and assigning dedicated CPUs for this stage. This + * basically allows for 10G wirespeed pre-filtering via bpf. + */ +#include +#include +#include + +#include +#include +#include +#include + +/* General idea: XDP packets getting XDP redirected to another CPU, + * will maximum be stored/queued for one driver ->poll() call. It is + * guaranteed that setting flush bit and flush operation happen on + * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() + * which queue in bpf_cpu_map_entry contains packets. + */ + +#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct xdp_bulk_queue { + void *q[CPU_MAP_BULK_SIZE]; + unsigned int count; +}; + +/* Struct for every remote "destination" CPU in map */ +struct bpf_cpu_map_entry { + u32 qsize; /* Queue size placeholder for map lookup */ + + /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ + struct xdp_bulk_queue __percpu *bulkq; + + /* Queue with potential multi-producers, and single-consumer kthread */ + struct ptr_ring *queue; + struct task_struct *kthread; + struct work_struct kthread_stop_wq; + + atomic_t refcnt; /* Control when this struct can be free'ed */ + struct rcu_head rcu; +}; + +struct bpf_cpu_map { + struct bpf_map map; + /* Below members specific for map type */ + struct bpf_cpu_map_entry **cpu_map; + unsigned long __percpu *flush_needed; +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, + struct xdp_bulk_queue *bq); + +static u64 cpu_map_bitmap_size(const union bpf_attr *attr) +{ + return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); +} + +static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) +{ + struct bpf_cpu_map *cmap; + int err = -ENOMEM; + u64 cost; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + return ERR_PTR(-EINVAL); + + cmap = kzalloc(sizeof(*cmap), GFP_USER); + if (!cmap) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + cmap->map.map_type = attr->map_type; + cmap->map.key_size = attr->key_size; + cmap->map.value_size = attr->value_size; + cmap->map.max_entries = attr->max_entries; + cmap->map.map_flags = attr->map_flags; + cmap->map.numa_node = bpf_map_attr_numa_node(attr); + + /* Pre-limit array size based on NR_CPUS, not final CPU check */ + if (cmap->map.max_entries > NR_CPUS) { + err = -E2BIG; + goto free_cmap; + } + + /* make sure page count doesn't overflow */ + cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); + cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_cmap; + cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* Notice returns -EPERM on if map size is larger than memlock limit */ + ret = bpf_map_precharge_memlock(cmap->map.pages); + if (ret) { + err = ret; + goto free_cmap; + } + + /* A per cpu bitfield with a bit per possible CPU in map */ + cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), + __alignof__(unsigned long)); + if (!cmap->flush_needed) + goto free_cmap; + + /* Alloc array for possible remote "destination" CPUs */ + cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * + sizeof(struct bpf_cpu_map_entry *), + cmap->map.numa_node); + if (!cmap->cpu_map) + 
goto free_percpu; + + return &cmap->map; +free_percpu: + free_percpu(cmap->flush_needed); +free_cmap: + kfree(cmap); + return ERR_PTR(err); +} + +void __cpu_map_queue_destructor(void *ptr) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once. + */ + if (WARN_ON_ONCE(ptr)) + page_frag_free(ptr); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + if (atomic_dec_and_test(&rcpu->refcnt)) { + /* The queue should be empty at this point */ + ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); + kfree(rcpu->queue); + kfree(rcpu); + } +} + +static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + atomic_inc(&rcpu->refcnt); +} + +/* called from workqueue, to workaround syscall using preempt_disable */ +static void cpu_map_kthread_stop(struct work_struct *work) +{ + struct bpf_cpu_map_entry *rcpu; + + rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); + + /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, + * as it waits until all in-flight call_rcu() callbacks complete. + */ + rcu_barrier(); + + /* kthread_stop will wake_up_process and wait for it to complete */ + kthread_stop(rcpu->kthread); +} + +static int cpu_map_kthread_run(void *data) +{ + struct bpf_cpu_map_entry *rcpu = data; + + set_current_state(TASK_INTERRUPTIBLE); + + /* When kthread gives stop order, then rcpu have been disconnected + * from map, thus no new packets can enter. Remaining in-flight + * per CPU stored packets are flushed to this queue. Wait honoring + * kthread_stop signal until queue is empty. + */ + while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + struct xdp_pkt *xdp_pkt; + + schedule(); + /* Do work */ + while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) { + /* For now just "refcnt-free" */ + page_frag_free(xdp_pkt); + } + __set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + put_cpu_map_entry(rcpu); + return 0; +} + +struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +{ + gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; + struct bpf_cpu_map_entry *rcpu; + int numa, err; + + /* Have map->numa_node, but choose node of redirect target CPU */ + numa = cpu_to_node(cpu); + + rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa); + if (!rcpu) + return NULL; + + /* Alloc percpu bulkq */ + rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq), + sizeof(void *), gfp); + if (!rcpu->bulkq) + goto free_rcu; + + /* Alloc queue */ + rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); + if (!rcpu->queue) + goto free_bulkq; + + err = ptr_ring_init(rcpu->queue, qsize, gfp); + if (err) + goto free_queue; + + rcpu->qsize = qsize; + + /* Setup kthread */ + rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, + "cpumap/%d/map:%d", cpu, map_id); + if (IS_ERR(rcpu->kthread)) + goto free_ptr_ring; + + get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ + get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + + /* Make sure kthread runs on a single CPU */ + kthread_bind(rcpu->kthread, cpu); + wake_up_process(rcpu->kthread); + + return rcpu; + +free_ptr_ring: + ptr_ring_cleanup(rcpu->queue, NULL); +free_queue: + kfree(rcpu->queue); +free_bulkq: + free_percpu(rcpu->bulkq); +free_rcu: + kfree(rcpu); + return NULL; +} + +void __cpu_map_entry_free(struct rcu_head *rcu) +{ + struct bpf_cpu_map_entry *rcpu; + int 
cpu; + + /* This cpu_map_entry have been disconnected from map and one + * RCU graze-period have elapsed. Thus, XDP cannot queue any + * new packets and cannot change/set flush_needed that can + * find this entry. + */ + rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); + + /* Flush remaining packets in percpu bulkq */ + for_each_online_cpu(cpu) { + struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); + + /* No concurrent bq_enqueue can run at this point */ + bq_flush_to_queue(rcpu, bq); + } + free_percpu(rcpu->bulkq); + /* Cannot kthread_stop() here, last put free rcpu resources */ + put_cpu_map_entry(rcpu); +} + +/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to + * ensure any driver rcu critical sections have completed, but this + * does not guarantee a flush has happened yet. Because driver side + * rcu_read_lock/unlock only protects the running XDP program. The + * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a + * pending flush op doesn't fail. + * + * The bpf_cpu_map_entry is still used by the kthread, and there can + * still be pending packets (in queue and percpu bulkq). A refcnt + * makes sure to last user (kthread_stop vs. call_rcu) free memory + * resources. + * + * The rcu callback __cpu_map_entry_free flush remaining packets in + * percpu bulkq to queue. Due to caller map_delete_elem() disable + * preemption, cannot call kthread_stop() to make sure queue is empty. + * Instead a work_queue is started for stopping kthread, + * cpu_map_kthread_stop, which waits for an RCU graze period before + * stopping kthread, emptying the queue. + */ +void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, + u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +{ + struct bpf_cpu_map_entry *old_rcpu; + + old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu); + if (old_rcpu) { + call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); + INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); + schedule_work(&old_rcpu->kthread_stop_wq); + } +} + +int cpu_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + u32 key_cpu = *(u32 *)key; + + if (key_cpu >= map->max_entries) + return -EINVAL; + + /* notice caller map_delete_elem() use preempt_disable() */ + __cpu_map_entry_replace(cmap, key_cpu, NULL); + return 0; +} + +int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpu_map_entry *rcpu; + + /* Array index key correspond to CPU number */ + u32 key_cpu = *(u32 *)key; + /* Value is the queue size */ + u32 qsize = *(u32 *)value; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(key_cpu >= cmap->map.max_entries)) + return -E2BIG; + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + return -EOVERFLOW; + + /* Make sure CPU is a valid possible cpu */ + if (!cpu_possible(key_cpu)) + return -ENODEV; + + if (qsize == 0) { + rcpu = NULL; /* Same as deleting */ + } else { + /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ + rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + if (!rcpu) + return -ENOMEM; + } + rcu_read_lock(); + __cpu_map_entry_replace(cmap, key_cpu, rcpu); + rcu_read_unlock(); + return 0; +} + +void cpu_map_free(struct bpf_map *map) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + int cpu; + u32 i; + + /* At this point 
bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the bpf programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete. The rcu critical section only guarantees + * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map. + * It does __not__ ensure pending flush operations (if any) are + * complete. + */ + synchronize_rcu(); + + /* To ensure all pending flush operations have completed wait for flush + * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * Because the above synchronize_rcu() ensures the map is disconnected + * from the program we can assume no new bits will be set. + */ + for_each_online_cpu(cpu) { + unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + + while (!bitmap_empty(bitmap, cmap->map.max_entries)) + cond_resched(); + } + + /* For cpu_map the remote CPUs can still be using the entries + * (struct bpf_cpu_map_entry). + */ + for (i = 0; i < cmap->map.max_entries; i++) { + struct bpf_cpu_map_entry *rcpu; + + rcpu = READ_ONCE(cmap->cpu_map[i]); + if (!rcpu) + continue; + + /* bq flush and cleanup happens after RCU graze-period */ + __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ + } + free_percpu(cmap->flush_needed); + bpf_map_area_free(cmap->cpu_map); + kfree(cmap); +} + +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpu_map_entry *rcpu; + + if (key >= map->max_entries) + return NULL; + + rcpu = READ_ONCE(cmap->cpu_map[key]); + return rcpu; +} + +static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_cpu_map_entry *rcpu = + __cpu_map_lookup_elem(map, *(u32 *)key); + + return rcpu ? &rcpu->qsize : NULL; +} + +static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= cmap->map.max_entries) { + *next = 0; + return 0; + } + + if (index == cmap->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +const struct bpf_map_ops cpu_map_ops = { + .map_alloc = cpu_map_alloc, + .map_free = cpu_map_free, + .map_delete_elem = cpu_map_delete_elem, + .map_update_elem = cpu_map_update_elem, + .map_lookup_elem = cpu_map_lookup_elem, + .map_get_next_key = cpu_map_get_next_key, +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, + struct xdp_bulk_queue *bq) +{ + struct ptr_ring *q; + int i; + + if (unlikely(!bq->count)) + return 0; + + q = rcpu->queue; + spin_lock(&q->producer_lock); + + for (i = 0; i < bq->count; i++) { + void *xdp_pkt = bq->q[i]; + int err; + + err = __ptr_ring_produce(q, xdp_pkt); + if (err) { + /* Free xdp_pkt */ + page_frag_free(xdp_pkt); + } + } + bq->count = 0; + spin_unlock(&q->producer_lock); + + return 0; +} + +/* Notice: Will change in later patch */ +struct xdp_pkt { + void *data; + u16 len; + u16 headroom; +}; + +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. 
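+ * For example, with CPU_MAP_BULK_SIZE == 8 the ptr_ring producer
+ * lock in bq_flush_to_queue() is taken roughly once per eight
+ * packets rather than once per packet, amortizing locking and
+ * cache-line traffic across the bulk.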
+ */ +int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +{ + struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); + + if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) + bq_flush_to_queue(rcpu, bq); + + /* Notice, xdp_buff/page MUST be queued here, long enough for + * driver to code invoking us to finished, due to driver + * (e.g. ixgbe) recycle tricks based on page-refcnt. + * + * Thus, incoming xdp_pkt is always queued here (else we race + * with another CPU on page-refcnt and remaining driver code). + * Queue time is very short, as driver will invoke flush + * operation, when completing napi->poll call. + */ + bq->q[bq->count++] = xdp_pkt; + return 0; +} + +void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + + __set_bit(bit, bitmap); +} + +void __cpu_map_flush(struct bpf_map *map) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + u32 bit; + + /* The napi->poll softirq makes sure __cpu_map_insert_ctx() + * and __cpu_map_flush() happen on same CPU. Thus, the percpu + * bitmap indicate which percpu bulkq have packets. + */ + for_each_set_bit(bit, bitmap, map->max_entries) { + struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); + struct xdp_bulk_queue *bq; + + /* This is possible if entry is removed by user space + * between xdp redirect and flush op. + */ + if (unlikely(!rcpu)) + continue; + + __clear_bit(bit, bitmap); + + /* Flush all frames in bulkq to real queue */ + bq = this_cpu_ptr(rcpu->bulkq); + bq_flush_to_queue(rcpu, bq); + + /* If already running, costs spin_lock_irqsave + smb_mb */ + wake_up_process(rcpu->kthread); + } +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9a78683b803d..7add72ff2b21 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -668,6 +668,12 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; + /* Need to create a kthread, thus must support schedule */ + if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + err = map->ops->map_update_elem(map, key, value, attr->flags); + goto out; + } + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from * inside bpf map update or delete otherwise deadlocks are possible */ @@ -699,7 +705,7 @@ static int map_update_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); maybe_wait_bpf_programs(map); - +out: if (!err) trace_bpf_map_update_elem(map, ufd, key, value); free_value: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 02fdf6a46437..3df608e96ab9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1659,6 +1659,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_redirect_map) goto error; break; + /* Restrict bpf side of cpumap, open when use-cases appear */ + case BPF_MAP_TYPE_CPUMAP: + if (func_id != BPF_FUNC_redirect_map) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 000f99f85f9c..b5957432a74d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -112,6 +112,7 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, + 
BPF_MAP_TYPE_CPUMAP, }; enum bpf_prog_type { From 4783bca8b1cd0e0b06e1eda94ee91666428c5ab8 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:34 +0200 Subject: [PATCH 0041/1640] BACKPORT: bpf: XDP_REDIRECT enable use of cpumap This patch connects cpumap to the xdp_do_redirect_map infrastructure. Still no SKB allocation are done yet. The XDP frames are transferred to the other CPU, but they are simply refcnt decremented on the remote CPU. This served as a good benchmark for measuring the overhead of remote refcnt decrement. If driver page recycle cache is not efficient then this, exposes a bottleneck in the page allocator. A shout-out to MST's ptr_ring, which is the secret behind is being so efficient to transfer memory pointers between CPUs, without constantly bouncing cache-lines between CPUs. V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot. V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet, as implementation require a separate upstream discussion. V5: - Fix a maybe-uninitialized pointed out by kbuild test robot. - Restrict bpf-prog side access to cpumap, open when use-cases appear - Implement cpu_map_enqueue() as a more simple void pointer enqueue V6: - Allow cpumap type for usage in helper bpf_redirect_map, general bpf-prog side restriction moved to earlier patch. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/bpf.h | 32 +++++++- include/trace/events/xdp.h | 10 ++- kernel/bpf/cpumap.c | 22 +++++- kernel/bpf/verifier.c | 3 +- net/core/filter.c | 148 ++++++++++++++++++++++++++++--------- 5 files changed, 177 insertions(+), 38 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2ae78e0af744..224dfa9370cb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -403,6 +403,13 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); +void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); +void __cpu_map_flush(struct bpf_map *map); +struct xdp_buff; +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, + struct net_device *dev_rx); + /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) { @@ -415,7 +422,7 @@ static inline bool unprivileged_ebpf_enabled(void) return !sysctl_unprivileged_bpf_disabled; } -#else +#else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { return ERR_PTR(-EOPNOTSUPP); @@ -485,6 +492,29 @@ static inline void __dev_map_flush(struct bpf_map *map) { } + +static inline +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +{ + return NULL; +} + +static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index) +{ +} + +static inline void __cpu_map_flush(struct bpf_map *map) +{ +} + +struct xdp_buff; +static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, + struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + return 0; +} + static inline bool unprivileged_ebpf_enabled(void) { return false; diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index f7c73ae62b7a..36163cb2aaa5 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -138,12 +138,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, __entry->map_id, __entry->map_index) 
); +#define devmap_ifindex(fwd, map) \ + (!fwd ? 0 : \ + (!map ? 0 : \ + ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((struct net_device *)fwd)->ifindex : 0))) + #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ - trace_xdp_redirect_map(dev, xdp, fwd ? fwd->ifindex : 0, \ + trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ 0, map, idx) #define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ - trace_xdp_redirect_map_err(dev, xdp, fwd ? fwd->ifindex : 0, \ + trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ err, map, idx) #endif /* _TRACE_XDP_H */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e1e25ddba038..768da6a2c265 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -500,7 +500,7 @@ struct xdp_pkt { /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) { struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -520,6 +520,26 @@ int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) return 0; } +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct xdp_pkt *xdp_pkt; + int headroom; + + /* For now this is just used as a void pointer to data_hard_start. + * Followup patch will generalize this. + */ + xdp_pkt = xdp->data_hard_start; + + /* Fake writing into xdp_pkt->data to measure overhead */ + headroom = xdp->data - xdp->data_hard_start; + if (headroom < sizeof(*xdp_pkt)) + xdp_pkt->data = xdp->data; + + bq_enqueue(rcpu, xdp_pkt); + return 0; +} + void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3df608e96ab9..87d9b4cdcc45 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1701,7 +1701,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_redirect_map: - if (map->map_type != BPF_MAP_TYPE_DEVMAP) + if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_CPUMAP) goto error; break; case BPF_FUNC_sk_redirect_map: diff --git a/net/core/filter.c b/net/core/filter.c index 82598731d98d..449a9d11e234 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2611,10 +2611,36 @@ static int __bpf_tx_xdp(struct net_device *dev, err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); if (err) return err; - if (map) + dev->netdev_ops->ndo_xdp_flush(dev); + return 0; +} + +static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, + struct bpf_map *map, + struct xdp_buff *xdp, + u32 index) +{ + int err; + + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + struct net_device *dev = fwd; + + if (!dev->netdev_ops->ndo_xdp_xmit) + return -EOPNOTSUPP; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + if (err) + return err; __dev_map_insert_ctx(map, index); - else - dev->netdev_ops->ndo_xdp_flush(dev); + + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + struct bpf_cpu_map_entry *rcpu = fwd; + + err = cpu_map_enqueue(rcpu, xdp, dev_rx); + if (err) + return err; + __cpu_map_insert_ctx(map, index); + } return 0; } @@ -2624,11 +2650,33 @@ void xdp_do_flush_map(void) struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; - if (map) - __dev_map_flush(map); + if (map) { + switch (map->map_type) { + case 
BPF_MAP_TYPE_DEVMAP: + __dev_map_flush(map); + break; + case BPF_MAP_TYPE_CPUMAP: + __cpu_map_flush(map); + break; + default: + break; + } + } } EXPORT_SYMBOL_GPL(xdp_do_flush_map); +static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) +{ + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: + return __dev_map_lookup_elem(map, index); + case BPF_MAP_TYPE_CPUMAP: + return __cpu_map_lookup_elem(map, index); + default: + return NULL; + } +} + static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, unsigned long aux) { @@ -2641,8 +2689,8 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; - struct net_device *fwd = NULL; u32 index = ri->ifindex; + void *fwd = NULL; int err; ri->ifindex = 0; @@ -2655,7 +2703,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, goto err; } - fwd = __dev_map_lookup_elem(map, index); + fwd = __xdp_map_lookup_elem(map, index); if (!fwd) { err = -EINVAL; goto err; @@ -2663,7 +2711,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, if (ri->map_to_flush && ri->map_to_flush != map) xdp_do_flush_map(); - err = __bpf_tx_xdp(fwd, map, xdp, index); + err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); if (unlikely(err)) goto err; @@ -2705,54 +2753,88 @@ err: } EXPORT_SYMBOL_GPL(xdp_do_redirect); -int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) +static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) +{ + unsigned int len; + + if (unlikely(!(fwd->flags & IFF_UP))) + return -ENETDOWN; + + len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; + if (skb->len > len) + return -EMSGSIZE; + + return 0; +} + +int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; - unsigned int len; int err = 0; ri->ifindex = 0; ri->map = NULL; ri->map_owner = 0; - if (map) { - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } - fwd = __dev_map_lookup_elem(map, index); - } else { - fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { + err = -EFAULT; + map = NULL; + goto err; } + fwd = __xdp_map_lookup_elem(map, index); if (unlikely(!fwd)) { err = -EINVAL; goto err; } - if (unlikely(!(fwd->flags & IFF_UP))) { - err = -ENETDOWN; + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + goto err; + skb->dev = fwd; + } else { + /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ + err = -EBADRQC; goto err; } - len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) { - err = -EMSGSIZE; - goto err; - } - - skb->dev = fwd; - map ? _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index) - : _trace_xdp_redirect(dev, xdp_prog, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); return 0; err: - map ? 
_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err) - : _trace_xdp_redirect_err(dev, xdp_prog, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + return err; +} + +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + u32 index = ri->ifindex; + struct net_device *fwd; + int err = 0; + + if (ri->map) + return xdp_do_generic_redirect_map(dev, skb, xdp_prog); + + ri->ifindex = 0; + fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + goto err; + + skb->dev = fwd; + _trace_xdp_redirect(dev, xdp_prog, index); + return 0; +err: + _trace_xdp_redirect_err(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); From d1a384f38e37939856b39c48e1b9ab9eadfb5c92 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:39 +0200 Subject: [PATCH 0042/1640] UPSTREAM: bpf: cpumap xdp_buff to skb conversion and allocation This patch makes cpumap functional, by adding SKB allocation and invoking the network stack on the dequeuing CPU. For constructing the SKB on the remote CPU, the xdp_buff is converted into a struct xdp_pkt, which is mapped into the top headroom of the packet, to avoid allocating separate mem. For now, struct xdp_pkt is just a cpumap internal data structure, with info carried from enqueue to dequeue. If a driver doesn't have enough headroom, the packet is simply dropped, with return code -EOVERFLOW. This will be picked up by the xdp tracepoint infrastructure, to allow users to catch this. V2: take into account xdp->data_meta V4: - Drop busypoll tricks, keeping it more simple. - Skip RPS and Generic-XDP-recursive-reinjection, suggested by Alexei V5: correct RCU read protection around __netif_receive_skb_core. V6: Setting TASK_RUNNING vs TASK_INTERRUPTIBLE based on talk with Rik van Riel Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + kernel/bpf/cpumap.c | 152 ++++++++++++++++++++++++++++++++------ net/core/dev.c | 27 +++++++ 3 files changed, 158 insertions(+), 22 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9b5ace9ce053..c209f64c4829 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3347,6 +3347,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); +int netif_receive_skb_core(struct sk_buff *skb); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 768da6a2c265..ee7adf4352dd 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -25,6 +25,9 @@ #include #include +#include /* netif_receive_skb_core */ +#include /* eth_type_trans */ + /* General idea: XDP packets getting XDP redirected to another CPU, * will at maximum be stored/queued for one driver ->poll() call.
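* The enqueue side (bq_enqueue) fills a small per-CPU bulk queue; the * flush operation run when the NAPI poll completes drains that bulkq into * the remote CPU's ptr_ring.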
It is * guaranteed that setting flush bit and flush operation happen on @@ -179,6 +182,92 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } +/* For now, xdp_pkt is a cpumap internal data structure, with info + * carried from enqueue to dequeue. It is mapped into the top + * headroom of the packet, to avoid allocating separate mem. + */ +struct xdp_pkt { + void *data; + u16 len; + u16 headroom; + u16 metasize; + struct net_device *dev_rx; +}; + +/* Convert xdp_buff to xdp_pkt */ +static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) +{ + struct xdp_pkt *xdp_pkt; + int metasize; + int headroom; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? metasize : 0; + if ((headroom - metasize) < sizeof(*xdp_pkt)) + return NULL; + + /* Store info in top of packet */ + xdp_pkt = xdp->data_hard_start; + + xdp_pkt->data = xdp->data; + xdp_pkt->len = xdp->data_end - xdp->data; + xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); + xdp_pkt->metasize = metasize; + + return xdp_pkt; +} + +struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, + struct xdp_pkt *xdp_pkt) +{ + unsigned int frame_size; + void *pkt_data_start; + struct sk_buff *skb; + + /* build_skb needs to place skb_shared_info after the SKB end, and + * also wants to know the memory "truesize". Thus, we need to + * know the memory frame size backing the xdp_buff. + * + * XDP was designed to have PAGE_SIZE frames, but this + * assumption is no longer true with ixgbe and i40e. It + * would be preferred to set frame_size to 2048 or 4096 + * depending on the driver. + * frame_size = 2048; + * frame_len = frame_size - sizeof(*xdp_pkt); + * + * Instead, with the info available, the skb_shared_info is placed + * after the packet len. This unfortunately fakes the truesize. + * Another disadvantage of this approach is that the skb_shared_info + * is not at a fixed memory location, with mixed length + * packets, which is bad for cache-line hotness. + */ + frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + skb = build_skb(pkt_data_start, frame_size); + if (!skb) + return NULL; + + skb_reserve(skb, xdp_pkt->headroom); + __skb_put(skb, xdp_pkt->len); + if (xdp_pkt->metasize) + skb_metadata_set(skb, xdp_pkt->metasize); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + return skb; +} + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -191,15 +280,45 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty.
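* Each loop iteration below drains the ptr_ring via __ptr_ring_consume(), * builds an SKB from each xdp_pkt with cpu_map_build_skb() and injects it * into the network stack with netif_receive_skb_core().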
*/ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + unsigned int processed = 0, drops = 0; struct xdp_pkt *xdp_pkt; - schedule(); - /* Do work */ - while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) { - /* For now just "refcnt-free" */ - page_frag_free(xdp_pkt); + /* Release CPU reschedule checks */ + if (__ptr_ring_empty(rcpu->queue)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } else { + cond_resched(); } - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_RUNNING); + + /* Process packets in rcpu->queue */ + local_bh_disable(); + /* + * The bpf_cpu_map_entry is single consumer, with this + * kthread CPU pinned. Lockless access to the ptr_ring + * consume side is valid, as no resize of the queue is allowed. + */ + while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + struct sk_buff *skb; + int ret; + + skb = cpu_map_build_skb(rcpu, xdp_pkt); + if (!skb) { + page_frag_free(xdp_pkt); + continue; + } + + /* Inject into network stack */ + ret = netif_receive_skb_core(skb); + if (ret == NET_RX_DROP) + drops++; + + /* Limit BH-disable period */ + if (++processed == 8) + break; + } + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -490,13 +609,6 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, return 0; } -/* Notice: Will change in later patch */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; -}; - /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ @@ -524,17 +636,13 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) { struct xdp_pkt *xdp_pkt; - int headroom; - /* For now this is just used as a void pointer to data_hard_start. - * Followup patch will generalize this. - */ - xdp_pkt = xdp->data_hard_start; + xdp_pkt = convert_to_xdp_pkt(xdp); + if (!xdp_pkt) + return -EOVERFLOW; - /* Fake writing into xdp_pkt->data to measure overhead */ - headroom = xdp->data - xdp->data_hard_start; - if (headroom < sizeof(*xdp_pkt)) - xdp_pkt->data = xdp->data; + /* Info needed when constructing SKB on remote CPU */ + xdp_pkt->dev_rx = dev_rx; bq_enqueue(rcpu, xdp_pkt); return 0; diff --git a/net/core/dev.c b/net/core/dev.c index ee44c889e03d..c981f082d780 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4578,6 +4578,33 @@ out: return ret; } +/** + * netif_receive_skb_core - special purpose version of netif_receive_skb + * @skb: buffer to process + * + * More direct receive version of netif_receive_skb(). It should + * only be used by callers that have a need to skip RPS and Generic XDP. + * Caller must also take care of handling if (page_is_)pfmemalloc. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb_core(struct sk_buff *skb) +{ + int ret; + + rcu_read_lock(); + ret = __netif_receive_skb_core(skb, false); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(netif_receive_skb_core); + static int __netif_receive_skb(struct sk_buff *skb) { int ret; From 385e77bccbb62f724515ec0f449e1e7cdfec4203 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:44 +0200 Subject: [PATCH 0043/1640] UPSTREAM: bpf: cpumap add tracepoints This adds two tracepoints to the cpumap.
One for the enqueue side trace_xdp_cpumap_enqueue() and one for the kthread dequeue side trace_xdp_cpumap_kthread(). To mitigate the tracepoint overhead, these are invoked during the enqueue/dequeue bulking phases, thus amortizing the cost. The obvious use-cases are for debugging and monitoring. The non-intuitive use-case is using these as a feedback loop to know the system load. One can imagine auto-scaling by reducing, adding or activating more worker CPUs on demand. V4: tracepoint remove time_limit info, instead add sched info V8: intro struct bpf_cpu_map_entry members cpu+map_id in this patch Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/trace/events/xdp.h | 70 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/cpumap.c | 24 ++++++++++--- 2 files changed, 89 insertions(+), 5 deletions(-) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 36163cb2aaa5..8989a92c571a 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -152,6 +152,76 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ err, map, idx) +TRACE_EVENT(xdp_cpumap_kthread, + + TP_PROTO(int map_id, unsigned int processed, unsigned int drops, + int sched), + + TP_ARGS(map_id, processed, drops, sched), + + TP_STRUCT__entry( + __field(int, map_id) + __field(u32, act) + __field(int, cpu) + __field(unsigned int, drops) + __field(unsigned int, processed) + __field(int, sched) + ), + + TP_fast_assign( + __entry->map_id = map_id; + __entry->act = XDP_REDIRECT; + __entry->cpu = smp_processor_id(); + __entry->drops = drops; + __entry->processed = processed; + __entry->sched = sched; + ), + + TP_printk("kthread" + " cpu=%d map_id=%d action=%s" + " processed=%u drops=%u" + " sched=%d", + __entry->cpu, __entry->map_id, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->processed, __entry->drops, + __entry->sched) +); + +TRACE_EVENT(xdp_cpumap_enqueue, + + TP_PROTO(int map_id, unsigned int processed, unsigned int drops, + int to_cpu), + + TP_ARGS(map_id, processed, drops, to_cpu), + + TP_STRUCT__entry( + __field(int, map_id) + __field(u32, act) + __field(int, cpu) + __field(unsigned int, drops) + __field(unsigned int, processed) + __field(int, to_cpu) + ), + + TP_fast_assign( + __entry->map_id = map_id; + __entry->act = XDP_REDIRECT; + __entry->cpu = smp_processor_id(); + __entry->drops = drops; + __entry->processed = processed; + __entry->to_cpu = to_cpu; + ), + + TP_printk("enqueue" + " cpu=%d map_id=%d action=%s" + " processed=%u drops=%u" + " to_cpu=%d", + __entry->cpu, __entry->map_id, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->processed, __entry->drops, + __entry->to_cpu) +); + #endif /* _TRACE_XDP_H */ #include diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ee7adf4352dd..b4358d84ddf1 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* netif_receive_skb_core */ #include /* eth_type_trans */ @@ -43,6 +44,8 @@ struct xdp_bulk_queue { /* Struct for every remote "destination" CPU in map */ struct bpf_cpu_map_entry { + u32 cpu; /* kthread CPU and map index */ + int map_id; /* Back reference to map */ u32 qsize; /* Queue size placeholder for map lookup */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ @@ -280,15 +283,16 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. 
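* The sched flag below records whether the kthread slept or merely * yielded the CPU; it is reported through the xdp_cpumap_kthread * tracepoint as scheduling feedback.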
*/ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { - unsigned int processed = 0, drops = 0; + unsigned int processed = 0, drops = 0, sched = 0; struct xdp_pkt *xdp_pkt; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); + sched = 1; } else { - cond_resched(); + sched = cond_resched(); } __set_current_state(TASK_RUNNING); @@ -318,6 +322,9 @@ static int cpu_map_kthread_run(void *data) if (++processed == 8) break; } + /* Feedback loop via tracepoint */ + trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -354,7 +361,9 @@ struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) if (err) goto free_queue; - rcpu->qsize = qsize; + rcpu->cpu = cpu; + rcpu->map_id = map_id; + rcpu->qsize = qsize; /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, @@ -584,6 +593,8 @@ const struct bpf_map_ops cpu_map_ops = { static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, struct xdp_bulk_queue *bq) { + unsigned int processed = 0, drops = 0; + const int to_cpu = rcpu->cpu; struct ptr_ring *q; int i; @@ -599,13 +610,16 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdp_pkt); if (err) { - /* Free xdp_pkt */ - page_frag_free(xdp_pkt); + drops++; + page_frag_free(xdp_pkt); /* Free xdp_pkt */ } + processed++; } bq->count = 0; spin_unlock(&q->producer_lock); + /* Feedback loop via tracepoints */ + trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); return 0; } From c02a12ef4dc62677bd16b0354a685de860d9088d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:53 -0700 Subject: [PATCH 0044/1640] BACKPORT: bpf: split verifier and program ops struct bpf_verifier_ops contains both verifier ops and operations used later during program's lifetime (test_run). Split the runtime ops into a different structure. BPF_PROG_TYPE() will now append ## _prog_ops or ## _verifier_ops to the names. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 15 ++++++++----- include/linux/bpf_types.h | 28 ++++++++++++------------ kernel/bpf/syscall.c | 16 +++++++++++--- kernel/bpf/verifier.c | 12 +++++------ kernel/trace/bpf_trace.c | 15 ++++++++++--- net/core/filter.c | 45 +++++++++++++++++++++++++++++++-------- 6 files changed, 91 insertions(+), 40 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 224dfa9370cb..053bab55a87b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -175,6 +175,11 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) aux->ctx_field_size = size; } +struct bpf_prog_ops { + int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +}; + struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); @@ -190,8 +195,6 @@ struct bpf_verifier_ops { const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); - int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); }; struct bpf_prog_aux { @@ -202,7 +205,8 @@ struct bpf_prog_aux { u32 id; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; - const struct bpf_verifier_ops *ops; + const struct bpf_prog_ops *ops; + const struct bpf_verifier_ops *vops; struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; @@ -324,8 +328,9 @@ DECLARE_PER_CPU(int, bpf_prog_active); extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; -#define BPF_PROG_TYPE(_id, _ops) \ - extern const struct bpf_verifier_ops _ops; +#define BPF_PROG_TYPE(_id, _name) \ + extern const struct bpf_prog_ops _name ## _prog_ops; \ + extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ extern const struct bpf_map_ops _ops; #include diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a8fdf956b60e..53c5b9ad7220 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -2,22 +2,22 @@ /* internal file - do not include directly */ #ifdef CONFIG_NET -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) +BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) #endif #ifdef CONFIG_BPF_EVENTS -BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint_prog_ops) 
-BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) +BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7add72ff2b21..3a8be9015c10 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -827,9 +827,18 @@ err_put: return err; } -static const struct bpf_verifier_ops * const bpf_prog_types[] = { -#define BPF_PROG_TYPE(_id, _ops) \ - [_id] = &_ops, +static const struct bpf_prog_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _prog_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; + +static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) #include #undef BPF_PROG_TYPE @@ -842,6 +851,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return -EINVAL; prog->aux->ops = bpf_prog_types[type]; + prog->aux->vops = bpf_verifier_ops[type]; prog->type = type; return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87d9b4cdcc45..4eaea6b724fb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1020,8 +1020,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, *reg_type = info.reg_type; return 0; } - } else if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + } else if (env->prog->aux->vops->is_valid_access && + env->prog->aux->vops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1779,8 +1779,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } - if (env->prog->aux->ops->get_func_proto) - fn = env->prog->aux->ops->get_func_proto(func_id); + if (env->prog->aux->vops->get_func_proto) + fn = env->prog->aux->vops->get_func_proto(func_id); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), @@ -4695,7 +4695,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->prog->aux->ops; + const struct bpf_verifier_ops *ops = env->prog->aux->vops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -5048,7 +5048,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = prog->aux->ops->get_func_proto(insn->imm); + fn = prog->aux->vops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 82aba5a93269..88a04a03de4a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -583,11 +583,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -const struct bpf_verifier_ops kprobe_prog_ops = { +const struct bpf_verifier_ops kprobe_verifier_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = 
kprobe_prog_is_valid_access, }; +const struct bpf_prog_ops kprobe_prog_ops = { +}; + BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@ -689,11 +692,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -const struct bpf_verifier_ops tracepoint_prog_ops = { +const struct bpf_verifier_ops tracepoint_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -749,12 +755,15 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops perf_event_prog_ops = { +const struct bpf_verifier_ops perf_event_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; +const struct bpf_prog_ops perf_event_prog_ops = { +}; + static DEFINE_MUTEX(bpf_event_mutex); int perf_event_attach_bpf_prog(struct perf_event *event, diff --git a/net/core/filter.c b/net/core/filter.c index 449a9d11e234..67a46989b41a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4520,68 +4520,95 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops sk_filter_prog_ops = { +const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -const struct bpf_verifier_ops tc_cls_act_prog_ops = { +const struct bpf_prog_ops sk_filter_prog_ops = { +}; + +const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, +}; + +const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops xdp_prog_ops = { +const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, +}; + +const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; -const struct bpf_verifier_ops cg_skb_prog_ops = { +const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_inout_prog_ops = { +const struct bpf_verifier_ops lwt_inout_verifier_ops = { .get_func_proto = lwt_inout_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_inout_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_xmit_prog_ops = { +const struct bpf_verifier_ops lwt_xmit_verifier_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, +}; + +const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops cg_sock_prog_ops = { +const struct 
bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = sock_filter_convert_ctx_access, }; -const struct bpf_verifier_ops sock_ops_prog_ops = { +const struct bpf_prog_ops cg_sock_prog_ops = { +}; + +const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, .convert_ctx_access = sock_ops_convert_ctx_access, }; -const struct bpf_verifier_ops sk_skb_prog_ops = { +const struct bpf_prog_ops sock_ops_prog_ops = { +}; + +const struct bpf_verifier_ops sk_skb_verifier_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = sk_skb_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; +const struct bpf_prog_ops sk_skb_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; From d7f01a1532e5dbddf0921931922b0a8dfa396f6d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:54 -0700 Subject: [PATCH 0045/1640] UPSTREAM: bpf: remove the verifier ops from program structure Since the verifier ops don't have to be associated with the program for its entire lifetime we can move it to verifier's struct bpf_verifier_env. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 - include/linux/bpf_verifier.h | 1 + kernel/bpf/syscall.c | 10 ---------- kernel/bpf/verifier.c | 23 +++++++++++++++++------ 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 053bab55a87b..903ee4dbef52 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -206,7 +206,6 @@ struct bpf_prog_aux { struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_prog_ops *ops; - const struct bpf_verifier_ops *vops; struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1fae78c057d8..4c961fb6e15d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -164,6 +164,7 @@ struct bpf_verifier_env { u32 insn_idx; u32 prev_insn_idx; struct bpf_prog *prog; /* eBPF program being verified */ + const struct bpf_verifier_ops *ops; struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3a8be9015c10..954defeb3e06 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -836,22 +836,12 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #undef BPF_MAP_TYPE }; -static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { -#define BPF_PROG_TYPE(_id, _name) \ - [_id] = & _name ## _verifier_ops, -#define BPF_MAP_TYPE(_id, _ops) -#include -#undef BPF_PROG_TYPE -#undef BPF_MAP_TYPE -}; - static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) return -EINVAL; prog->aux->ops = bpf_prog_types[type]; - prog->aux->vops = bpf_verifier_ops[type]; prog->type = type; return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4eaea6b724fb..b545262cd10d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,6 +23,15 @@ #include "disasm.h" +static const 
struct bpf_verifier_ops * const bpf_verifier_ops[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _verifier_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -1020,8 +1029,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, *reg_type = info.reg_type; return 0; } - } else if (env->prog->aux->vops->is_valid_access && - env->prog->aux->vops->is_valid_access(off, size, t, &info)) { + } else if (env->ops->is_valid_access && + env->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1779,8 +1788,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } - if (env->prog->aux->vops->get_func_proto) - fn = env->prog->aux->vops->get_func_proto(func_id); + if (env->ops->get_func_proto) + fn = env->ops->get_func_proto(func_id); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), @@ -4695,7 +4704,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->prog->aux->vops; + const struct bpf_verifier_ops *ops = env->ops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -5048,7 +5057,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = prog->aux->vops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ @@ -5107,6 +5116,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->insn_aux_data) goto err_free_env; env->prog = *prog; + env->ops = bpf_verifier_ops[env->prog->type]; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -5226,6 +5236,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, if (!env->insn_aux_data) goto err_free_env; env->prog = prog; + env->ops = bpf_verifier_ops[env->prog->type]; env->analyzer_ops = ops; env->analyzer_priv = priv; From 00279d05e31adefd239ffc8510fe1028dc64349e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:55 -0700 Subject: [PATCH 0046/1640] UPSTREAM: bpf: move knowledge about post-translation offsets out of verifier Use the fact that verifier ops are now separate from program ops to define a separate set of callbacks for verification of already translated programs. Since we expect the analyzer ops to be defined only for a small subset of all program types initialize their array by hand (don't use linux/bpf_types.h). Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 3 +++ kernel/bpf/verifier.c | 55 +++++++++++++------------------------------ net/core/filter.c | 40 +++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 39 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 903ee4dbef52..a1adcb0030e9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -336,6 +336,9 @@ extern const struct file_operations bpf_prog_fops; #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; +extern const struct bpf_verifier_ops xdp_analyzer_ops; + struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b545262cd10d..2a11482583bd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -986,36 +986,6 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } -static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off, - struct bpf_insn_access_aux *info) -{ - switch (env->prog->type) { - case BPF_PROG_TYPE_XDP: - switch (off) { - case offsetof(struct xdp_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct xdp_buff, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; - case BPF_PROG_TYPE_SCHED_CLS: - switch (off) { - case offsetof(struct sk_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct sk_buff, cb) + - offsetof(struct bpf_skb_data_end, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; - default: - return false; - } -} - /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) @@ -1024,13 +994,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, .reg_type = *reg_type, }; - if (env->analyzer_ops) { - if (analyzer_is_valid_access(env, off, &info)) { - *reg_type = info.reg_type; - return 0; - } - } else if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, &info)) { + if (env->ops->is_valid_access && + env->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1038,9 +1003,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * will only allow for whole field access and rejects any other * type of narrower access. 
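* The recorded ctx_field_size is consumed later by * convert_ctx_accesses() when the ctx load is rewritten.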
*/ - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; *reg_type = info.reg_type; + if (env->analyzer_ops) + return 0; + + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -5220,12 +5188,21 @@ err_free_env: return ret; } +static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { + [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, + [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, +}; + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv) { struct bpf_verifier_env *env; int ret; + if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) || + !bpf_analyzer_ops[prog->type]) + return -EOPNOTSUPP; + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -5236,7 +5213,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, if (!env->insn_aux_data) goto err_free_env; env->prog = prog; - env->ops = bpf_verifier_ops[env->prog->type]; + env->ops = bpf_analyzer_ops[env->prog->type]; env->analyzer_ops = ops; env->analyzer_priv = priv; diff --git a/net/core/filter.c b/net/core/filter.c index 67a46989b41a..c914a0ffbec4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3831,6 +3831,23 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } +static bool +tc_cls_act_is_valid_access_analyzer(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case offsetof(struct sk_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_data_end, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; +} + static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -3865,6 +3882,21 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } +static bool xdp_is_valid_access_analyzer(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case offsetof(struct xdp_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct xdp_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; +} + void bpf_warn_invalid_xdp_action(u32 act) { const u32 act_max = XDP_REDIRECT; @@ -4536,6 +4568,10 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .gen_prologue = tc_cls_act_prologue, }; +const struct bpf_verifier_ops tc_cls_act_analyzer_ops = { + .is_valid_access = tc_cls_act_is_valid_access_analyzer, +}; + const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; @@ -4546,6 +4582,10 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; +const struct bpf_verifier_ops xdp_analyzer_ops = { + .is_valid_access = xdp_is_valid_access_analyzer, +}; + const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; From 3bf96b6ee46fa2acb1d820f61677cc79bdb62fd7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:56 -0700 Subject: [PATCH 0047/1640] UPSTREAM: bpf: allow access to skb->len from offloads Since we are now doing strict checking of what offloads may access, make sure skb->len is on that list. 
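A minimal sketch of a program that stays within this access set (the SEC() definition, section name and length threshold are illustrative assumptions, not part of the patch):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>

#define SEC(name) __attribute__((section(name), used))

/* Reads only skb->len, which strict offload checking now permits. */
SEC("classifier")
int drop_oversized(struct __sk_buff *skb)
{
	/* Drop frames longer than the assumed threshold, pass the rest */
	return skb->len > 1500 ? TC_ACT_SHOT : TC_ACT_OK;
}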
Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index c914a0ffbec4..acdd079a1dac 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3837,6 +3837,8 @@ tc_cls_act_is_valid_access_analyzer(int off, int size, struct bpf_insn_access_aux *info) { switch (off) { + case offsetof(struct sk_buff, len): + return true; case offsetof(struct sk_buff, data): info->reg_type = PTR_TO_PACKET; return true; From 832352976ad9e937ce3f7c37306f4a5fc1123660 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:39 -0700 Subject: [PATCH 0048/1640] UPSTREAM: bpf: add support for BPF_SOCK_OPS_BASE_RTT A congestion control algorithm can make a call to the BPF socket_ops program to request the base RTT. The base RTT can be congestion control dependent and is meant to represent a congestion threshold such that RTTs above it indicate congestion. This is especially useful for flows within a DC where the base RTT is easy to obtain. Being provided a base RTT solves a basic problem in RTT based congestion avoidance algorithms (such as Vegas, NV and BBR). Although it is easy to get the base RTT when the network is not congested, it is very difficult to do when it is very congested. Newer connections get an inflated value of the base RTT leading to unfairness (newer flows with a larger base RTT get more bandwidth). As a result, RTT based congestion avoidance algorithms tend to update their base RTTs to improve fairness. In very congested networks this can lead to base RTT inflation, reducing the ability of these RTT based congestion control algorithms to prevent congestion. Note that in my experiments with TCP-NV, the base RTT provided can be much larger than the actual hardware RTT. For example, experimenting with hosts within a rack where the hardware RTT is 16-20us, I've used base RTTs up to 150us. The effect of using a larger base RTT is that the congestion avoidance algorithm will allow more queueing. When there are only a few flows the main effect is larger measured RTTs and RPC latencies due to the increased queueing. When there are a lot of flows, a larger base RTT can lead to more congestion and more packet drops. For this case, where the hardware RTT is 20us, a base RTT of 80us produces good results. This patch only introduces BPF_SOCK_OPS_BASE_RTT, a later patch in this set adds support for using it in TCP-NV. Further study and testing is needed before support can be added to other delay based congestion avoidance algorithms. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 19a360528239..e8e7a8d9f624 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1054,6 +1054,13 @@ enum { BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control * needs ECN */ + BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is + * based on the path and may be + * dependent on the congestion control + * algorithm. In general it indicates + * a congestion threshold. RTTs above
RTTs above + * this indicate congestion + */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ From 9a3b68db4d72ae06618ac55117ad1f63222c1db1 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:40 -0700 Subject: [PATCH 0049/1640] BACKPORT: bpf: Adding helper function bpf_getsockops Adding support for helper function bpf_getsockops to socket_ops BPF programs. This patch only supports TCP_CONGESTION. Signed-off-by: Vlad Vysotsky Acked-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 16 +++++++++++--- net/core/filter.c | 46 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e8e7a8d9f624..d2678e94dfa2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -614,12 +614,22 @@ union bpf_attr { * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) * Calls setsockopt. Not all opts are available, only those with * integer optvals plus TCP_CONGESTION. - * Supported levels: SOL_SOCKET and IPROTO_TCP + * Supported levels: SOL_SOCKET and IPPROTO_TCP * @bpf_socket: pointer to bpf_socket - * @level: SOL_SOCKET or IPROTO_TCP + * @level: SOL_SOCKET or IPPROTO_TCP * @optname: option name * @optval: pointer to option value - * @optlen: length of optval in byes + * @optlen: length of optval in bytes + * Return: 0 or negative error + * + * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen) + * Calls getsockopt. Not all opts are available. + * Supported levels: IPPROTO_TCP + * @bpf_socket: pointer to bpf_socket + * @level: IPPROTO_TCP + * @optname: option name + * @optval: pointer to option value + * @optlen: length of optval in bytes * Return: 0 or negative error * * int bpf_skb_adjust_room(skb, len_diff, mode, flags) diff --git a/net/core/filter.c b/net/core/filter.c index acdd079a1dac..a62018663d53 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3364,7 +3364,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, static const struct bpf_func_proto bpf_setsockopt_proto = { .func = bpf_setsockopt, - .gpl_only = true, + .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, @@ -3373,6 +3373,48 @@ static const struct bpf_func_proto bpf_setsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + struct sock *sk = bpf_sock->sk; + int ret = 0; + + if (!sk_fullsock(sk)) + goto err_clear; + +#ifdef CONFIG_INET + if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { + if (optname == TCP_CONGESTION) { + struct inet_connection_sock *icsk = inet_csk(sk); + + if (!icsk->icsk_ca_ops || optlen <= 1) + goto err_clear; + strncpy(optval, icsk->icsk_ca_ops->name, optlen); + optval[optlen - 1] = 0; + } else { + goto err_clear; + } + } else { + goto err_clear; + } + return ret; +#endif +err_clear: + memset(optval, 0, optlen); + return -EINVAL; +} + +static const struct bpf_func_proto bpf_getsockopt_proto = { + .func = bpf_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3559,6 +3601,8 @@ static 
@@ -3559,6 +3601,8 @@ static const struct bpf_func_proto * switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_getsockopt_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: From 9e63ecf093aa91342f767bea1c151076cd743403 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 23 Oct 2017 19:39:28 +0200 Subject: [PATCH 0050/1640] UPSTREAM: bpf: cpumap fix potential lost wake-up problem As pointed out by Michael, commit 1c601d829ab0 ("bpf: cpumap xdp_buff to skb conversion and allocation") contains a classical example of the potential lost wake-up problem. We need to recheck the condition __ptr_ring_empty() after changing current->state to TASK_INTERRUPTIBLE; this avoids a race between wake_up_process() and schedule(). After this, a race with wake_up_process() will simply change the state to TASK_RUNNING, and the schedule() call will not really put us to sleep. Fixes: 1c601d829ab0 ("bpf: cpumap xdp_buff to skb conversion and allocation") Reported-by: "Michael S. Tsirkin" Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b4358d84ddf1..86e29cbf7827 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -288,13 +288,17 @@ static int cpu_map_kthread_run(void *data) /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { - __set_current_state(TASK_INTERRUPTIBLE); - schedule(); - sched = 1; + set_current_state(TASK_INTERRUPTIBLE); + /* Recheck to avoid lost wake-up */ + if (__ptr_ring_empty(rcpu->queue)) { + schedule(); + sched = 1; + } else { + __set_current_state(TASK_RUNNING); + } } else { sched = cond_resched(); } - __set_current_state(TASK_RUNNING); /* Process packets in rcpu->queue */ local_bh_disable(); From 1a13936c7ee0c5d9b376ffb6bd469f47995586ca Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Thu, 26 Oct 2017 01:47:42 +0000 Subject: [PATCH 0051/1640] UPSTREAM: bpf: remove tail_call and get_stackid helper declarations from bpf.h commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object related syscall") included linux/bpf.h in linux/security.h. As a result, bpf programs including bpf_helpers.h and some other header that also ends up pulling in security.h, such as several examples under samples/bpf, fail to compile because bpf_tail_call and bpf_get_stackid are now "redefined as different kind of symbol". >From bpf.h: u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); Whereas in bpf_helpers.h they are: static void (*bpf_tail_call)(void *ctx, void *map, int index); static int (*bpf_get_stackid)(void *ctx, void *map, int flags); Fix this by removing the unused declaration of bpf_tail_call and moving the declaration of bpf_get_stackid into bpf_trace.c, which is the only place where it's needed. Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/linux/bpf.h | 3 --- kernel/trace/bpf_trace.c | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a1adcb0030e9..c47912d4ae96 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -247,9 +247,6 @@ struct bpf_event_entry { struct rcu_head rcu; }; -u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); -u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); - bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88a04a03de4a..28751d3dc8e1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -15,6 +15,8 @@ #include #include "trace.h" +u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + /** * trace_call_bpf - invoke BPF program * @call: tracepoint event From 1b6a28cce58faf8c13c625b7d7b28e0984fe627a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 30 Oct 2017 13:46:47 -0700 Subject: [PATCH 0052/1640] UPSTREAM: net: filter: remove unused variable and fix warning The bpf_getsockopt bpf call sets the ret variable to zero and never changes it. What's worse, in case CONFIG_INET is not selected, the variable is completely unused, generating a warning. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Lawrence Brakmo Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index a62018663d53..17b8e85185da 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3377,7 +3377,6 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { struct sock *sk = bpf_sock->sk; - int ret = 0; if (!sk_fullsock(sk)) goto err_clear; @@ -3397,7 +3396,7 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } - return ret; + return 0; #endif err_clear: memset(optval, 0, optlen); From 0b83655b6aef5350ac72b3638f6408c8e057e091 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 1 Nov 2017 12:44:45 +0100 Subject: [PATCH 0053/1640] UPSTREAM: bpf: cpumap micro-optimization in cpu_map_enqueue While studying the perf report during benchmarking of cpumap, I discovered that the compiler laid out the asm code in a suboptimal way. Help the compiler by marking the unlikely code paths. Signed-off-by: Jesper Dangaard Brouer Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 86e29cbf7827..ce5b669003b2 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -208,7 +208,7 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ?
From 26b91eb0ac7a4aac8febd0f20a1c311fc777ec07 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Wed, 1 Nov 2017 23:58:09 +0100
Subject: [PATCH 0054/1640] BACKPORT: bpf: minor cleanups after merge

Two minor cleanups after Dave's recent merge in f8ddadc4db6c ("Merge
git://git.kernel.org...") of net into net-next, in order to get the code
in line with what was done originally in the net tree: i) use max()
instead of max_t() since both ranges are u16, ii) don't split the direct
access test cases in the middle with bpf_exit test cases from
390ee7e29fc ("bpf: enforce return code for cgroup-bpf programs").

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Acked-by: John Fastabend
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2a11482583bd..416d834bb9e9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2959,7 +2959,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 			continue;
 		reg = &state->stack[i].spilled_ptr;
 		if (reg->type == type && reg->id == dst_reg->id)
-			reg->range = max_t(u16, reg->range, new_range);
+			reg->range = max(reg->range, new_range);
 	}
 }

From 95628c3a62400c089175b35533bb62d2649c1261 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Wed, 1 Nov 2017 23:58:10 +0100
Subject: [PATCH 0055/1640] UPSTREAM: bpf: also improve pattern matches for
 meta access

Follow-up to 0fd4759c5515 ("bpf: fix pattern matches for direct packet
access") to also cover the remaining data_meta/data matches in the
verifier. The matches are also refactored a bit to simplify handling of
all the cases.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Acked-by: John Fastabend
Signed-off-by: David S.
Miller --- kernel/bpf/verifier.c | 165 ++++++++++++++++++++++++------------------ 1 file changed, 96 insertions(+), 69 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 416d834bb9e9..0454333de5d6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3214,6 +3214,99 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, } } +static bool try_match_pkt_pointers(const struct bpf_insn *insn, + struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg, + struct bpf_verifier_state *this_branch, + struct bpf_verifier_state *other_branch) +{ + if (BPF_SRC(insn->code) != BPF_X) + return false; + + switch (BPF_OP(insn->code)) { + case BPF_JGT: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' > pkt_end, pkt_meta' > pkt_data */ + find_good_pkt_pointers(this_branch, dst_reg, + dst_reg->type, false); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end > pkt_data', pkt_data > pkt_meta' */ + find_good_pkt_pointers(other_branch, src_reg, + src_reg->type, true); + } else { + return false; + } + break; + case BPF_JLT: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' < pkt_end, pkt_meta' < pkt_data */ + find_good_pkt_pointers(other_branch, dst_reg, + dst_reg->type, true); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end < pkt_data', pkt_data > pkt_meta' */ + find_good_pkt_pointers(this_branch, src_reg, + src_reg->type, false); + } else { + return false; + } + break; + case BPF_JGE: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ + find_good_pkt_pointers(this_branch, dst_reg, + dst_reg->type, true); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ + find_good_pkt_pointers(other_branch, src_reg, + src_reg->type, false); + } else { + return false; + } + break; + case BPF_JLE: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ + find_good_pkt_pointers(other_branch, dst_reg, + dst_reg->type, false); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ + find_good_pkt_pointers(this_branch, src_reg, + src_reg->type, true); + } else { + return false; + } + break; + default: + return false; + } + + return true; +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -3321,75 +3414,9 @@ static int 
check_cond_jmp_op(struct bpf_verifier_env *env, */ mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' > pkt_end */ - find_good_pkt_pointers(this_branch, dst_reg, - PTR_TO_PACKET, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end > pkt_data' */ - find_good_pkt_pointers(other_branch, ®s[insn->src_reg], - PTR_TO_PACKET, true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' < pkt_end */ - find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET, - true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end < pkt_data' */ - find_good_pkt_pointers(this_branch, ®s[insn->src_reg], - PTR_TO_PACKET, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' >= pkt_end */ - find_good_pkt_pointers(this_branch, dst_reg, - PTR_TO_PACKET, true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end >= pkt_data' */ - find_good_pkt_pointers(other_branch, ®s[insn->src_reg], - PTR_TO_PACKET, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' <= pkt_end */ - find_good_pkt_pointers(other_branch, dst_reg, - PTR_TO_PACKET, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end <= pkt_data' */ - find_good_pkt_pointers(this_branch, ®s[insn->src_reg], - PTR_TO_PACKET, true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && - dst_reg->type == PTR_TO_PACKET_META && - reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { - find_good_pkt_pointers(this_branch, dst_reg, - PTR_TO_PACKET_META, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && - dst_reg->type == PTR_TO_PACKET_META && - reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { - find_good_pkt_pointers(other_branch, dst_reg, - PTR_TO_PACKET_META, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && - reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && - regs[insn->src_reg].type == PTR_TO_PACKET_META) { - find_good_pkt_pointers(other_branch, ®s[insn->src_reg], - PTR_TO_PACKET_META, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && - reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && - regs[insn->src_reg].type == PTR_TO_PACKET_META) { - find_good_pkt_pointers(this_branch, ®s[insn->src_reg], - PTR_TO_PACKET_META, false); - } else if (is_pointer_value(env, insn->dst_reg)) { + } else if (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg], + this_branch, other_branch) && + is_pointer_value(env, insn->dst_reg)) { verbose(env, "R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; From 
5e6ec7fe0f558057dbd0e32a4512d00e591f1884 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Thu, 2 Nov 2017 12:05:51 +0100
Subject: [PATCH 0056/1640] UPSTREAM: bpf: fix link error without CONFIG_NET

I ran into this link error with the latest net-next plus linux-next
trees when networking is disabled:

kernel/bpf/verifier.o:(.rodata+0x2958): undefined reference to `tc_cls_act_analyzer_ops'
kernel/bpf/verifier.o:(.rodata+0x2970): undefined reference to `xdp_analyzer_ops'

It seems that the code was written to deal with varying contents of the
array, but the actual #ifdef was missing. Both tc_cls_act_analyzer_ops
and xdp_analyzer_ops are defined in the core networking code, so adding
a check for CONFIG_NET seems appropriate here, and I've verified this
with many randconfig builds.

Fixes: 4f9218aaf8a4 ("bpf: move knowledge about post-translation offsets out of verifier")
Signed-off-by: Arnd Bergmann
Acked-by: Alexei Starovoitov
Acked-by: Daniel Borkmann
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0454333de5d6..e881b027c739 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5216,8 +5216,10 @@ err_free_env:
 }
 
 static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = {
+#ifdef CONFIG_NET
 	[BPF_PROG_TYPE_XDP]		= &xdp_analyzer_ops,
 	[BPF_PROG_TYPE_SCHED_CLS]	= &tc_cls_act_analyzer_ops,
+#endif
 };
 
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,

From c5aa02bab06dd8fd7679bfc00e85a627398e6858 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Thu, 2 Nov 2017 12:05:52 +0100
Subject: [PATCH 0057/1640] UPSTREAM: bpf: fix out-of-bounds access warning in
 bpf_check

The bpf_verifier_ops array is generated dynamically and may be empty
depending on configuration, which then causes an out-of-bounds access:

kernel/bpf/verifier.c: In function 'bpf_check':
kernel/bpf/verifier.c:4320:29: error: array subscript is above array bounds [-Werror=array-bounds]

This adds a check to the start of the function as a workaround. I would
assume that the function is never called in that configuration, so the
warning is probably harmless.

Fixes: 00176a34d9e2 ("bpf: remove the verifier ops from program structure")
Signed-off-by: Arnd Bergmann
Acked-by: Alexei Starovoitov
Acked-by: Daniel Borkmann
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e881b027c739..d645b1311949 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5097,6 +5097,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	struct bpf_verifer_log *log;
 	int ret = -EINVAL;
 
+	/* no program is valid */
+	if (ARRAY_SIZE(bpf_verifier_ops) == 0)
+		return -EINVAL;
+
 	/* 'struct bpf_verifier_env' can be global, but since it's not small,
 	 * allocate/free it every time bpf_check() is called
 	 */

From a15e36c7f5621ac64a775299b631b5789ca57d70 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Fri, 3 Nov 2017 13:56:16 -0700
Subject: [PATCH 0058/1640] BACKPORT: net: bpf: rename ndo_xdp to ndo_bpf

ndo_xdp is a control path callback for setting up XDP in the driver. We
can reuse it for other forms of communication between the eBPF stack and
the drivers. Rename the callback and associated structures and
definitions.

Signed-off-by: Jakub Kicinski
Reviewed-by: Simon Horman
Reviewed-by: Quentin Monnet
Signed-off-by: David S.
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 2 +- .../net/ethernet/cavium/thunder/nicvf_main.c | 4 +-- drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 +-- .../net/ethernet/mellanox/mlx4/en_netdev.c | 6 ++-- .../net/ethernet/mellanox/mlx5/core/en_main.c | 4 +-- .../ethernet/netronome/nfp/nfp_net_common.c | 4 +-- drivers/net/ethernet/qlogic/qede/qede.h | 2 +- .../net/ethernet/qlogic/qede/qede_filter.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 4 +-- drivers/net/tun.c | 4 +-- drivers/net/virtio_net.c | 4 +-- include/linux/netdevice.h | 23 +++++++------ net/core/dev.c | 34 +++++++++---------- net/core/rtnetlink.c | 4 +-- 17 files changed, 56 insertions(+), 55 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 180a7ef588cf..df6e76e5d414 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7800,7 +7800,7 @@ static const struct net_device_ops bnxt_netdev_ops = { #endif .ndo_udp_tunnel_add = bnxt_udp_tunnel_add, .ndo_udp_tunnel_del = bnxt_udp_tunnel_del, - .ndo_xdp = bnxt_xdp, + .ndo_bpf = bnxt_xdp, .ndo_bridge_getlink = bnxt_bridge_getlink, .ndo_bridge_setlink = bnxt_bridge_setlink, .ndo_get_phys_port_name = bnxt_get_phys_port_name diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 06ce63c00821..261e5847557a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -208,7 +208,7 @@ static int bnxt_xdp_set(struct bnxt *bp, struct bpf_prog *prog) return 0; } -int bnxt_xdp(struct net_device *dev, struct netdev_xdp *xdp) +int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct bnxt *bp = netdev_priv(dev); int rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h index 12a5ad66b564..414b748038ca 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h @@ -16,6 +16,6 @@ void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts); bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, struct page *page, u8 **data_ptr, unsigned int *len, u8 *event); -int bnxt_xdp(struct net_device *dev, struct netdev_xdp *xdp); +int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp); #endif diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 204736468687..8f952a92f172 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1773,7 +1773,7 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog) return ret; } -static int nicvf_xdp(struct net_device *netdev, struct netdev_xdp *xdp) +static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) { struct nicvf *nic = netdev_priv(netdev); @@ -1806,7 +1806,7 @@ static const struct net_device_ops nicvf_netdev_ops = { .ndo_tx_timeout = nicvf_tx_timeout, .ndo_fix_features = nicvf_fix_features, .ndo_set_features = nicvf_set_features, - .ndo_xdp = nicvf_xdp, + .ndo_bpf = nicvf_xdp, }; static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c index 0e0bc67a28bf..751c931fe184 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -9645,12 +9645,12 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, } /** - * i40e_xdp - implements ndo_xdp for i40e + * i40e_xdp - implements ndo_bpf for i40e * @dev: netdevice * @xdp: XDP command **/ static int i40e_xdp(struct net_device *dev, - struct netdev_xdp *xdp) + struct netdev_bpf *xdp) { struct i40e_netdev_priv *np = netdev_priv(dev); struct i40e_vsi *vsi = np->vsi; @@ -9702,7 +9702,7 @@ static const struct net_device_ops i40e_netdev_ops = { .ndo_features_check = i40e_features_check, .ndo_bridge_getlink = i40e_ndo_bridge_getlink, .ndo_bridge_setlink = i40e_ndo_bridge_setlink, - .ndo_xdp = i40e_xdp, + .ndo_bpf = i40e_xdp, }; /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 2557fffcb87b..feda171bf323 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9886,7 +9886,7 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) return 0; } -static int ixgbe_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct ixgbe_adapter *adapter = netdev_priv(dev); @@ -9995,7 +9995,7 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_udp_tunnel_add = ixgbe_add_udp_tunnel_port, .ndo_udp_tunnel_del = ixgbe_del_udp_tunnel_port, .ndo_features_check = ixgbe_features_check, - .ndo_xdp = ixgbe_xdp, + .ndo_bpf = ixgbe_xdp, .ndo_xdp_xmit = ixgbe_xdp_xmit, .ndo_xdp_flush = ixgbe_xdp_flush, }; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 70a80e43d833..1bc7e3497a1a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2930,7 +2930,7 @@ static u32 mlx4_xdp_query(struct net_device *dev) return prog_id; } -static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -2972,7 +2972,7 @@ static const struct net_device_ops mlx4_netdev_ops = { .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, - .ndo_xdp = mlx4_xdp, + .ndo_bpf = mlx4_xdp, }; static const struct net_device_ops mlx4_netdev_ops_master = { @@ -3009,7 +3009,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = { .ndo_udp_tunnel_del = mlx4_en_del_vxlan_port, .ndo_features_check = mlx4_en_features_check, .ndo_set_tx_maxrate = mlx4_en_set_tx_maxrate, - .ndo_xdp = mlx4_xdp, + .ndo_bpf = mlx4_xdp, }; struct mlx4_en_bond { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 75c491ab6127..707c87f9987c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3735,7 +3735,7 @@ static u32 mlx5e_xdp_query(struct net_device *dev) return prog_id; } -static int mlx5e_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -3787,7 +3787,7 @@ static const struct net_device_ops mlx5e_netdev_ops = { .ndo_rx_flow_steer = mlx5e_rx_flow_steer, #endif 
.ndo_tx_timeout = mlx5e_tx_timeout, - .ndo_xdp = mlx5e_xdp, + .ndo_bpf = mlx5e_xdp, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = mlx5e_netpoll, #endif diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 43b1cba913ed..fad9f9fbf2c1 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3423,7 +3423,7 @@ nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags, return 0; } -static int nfp_net_xdp(struct net_device *netdev, struct netdev_xdp *xdp) +static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) { struct nfp_net *nn = netdev_priv(netdev); @@ -3486,7 +3486,7 @@ const struct net_device_ops nfp_net_netdev_ops = { .ndo_get_phys_port_name = nfp_port_get_phys_port_name, .ndo_udp_tunnel_add = nfp_net_add_vxlan_port, .ndo_udp_tunnel_del = nfp_net_del_vxlan_port, - .ndo_xdp = nfp_net_xdp, + .ndo_bpf = nfp_net_xdp, }; /** diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index c132b08cefde..4cc9af175a76 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -505,7 +505,7 @@ void qede_fill_rss_params(struct qede_dev *edev, void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti); void qede_udp_tunnel_del(struct net_device *dev, struct udp_tunnel_info *ti); -int qede_xdp(struct net_device *dev, struct netdev_xdp *xdp); +int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp); #ifdef CONFIG_DCB void qede_set_dcbnl_ops(struct net_device *ndev); diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index e7ad95de3da8..924cb2ea664d 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1065,7 +1065,7 @@ static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog) return 0; } -int qede_xdp(struct net_device *dev, struct netdev_xdp *xdp) +int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct qede_dev *edev = netdev_priv(dev); diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 99de923728ec..a2da52362d09 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -557,7 +557,7 @@ static const struct net_device_ops qede_netdev_ops = { .ndo_udp_tunnel_add = qede_udp_tunnel_add, .ndo_udp_tunnel_del = qede_udp_tunnel_del, .ndo_features_check = qede_features_check, - .ndo_xdp = qede_xdp, + .ndo_bpf = qede_xdp, #ifdef CONFIG_RFS_ACCEL .ndo_rx_flow_steer = qede_rx_flow_steer, #endif @@ -595,7 +595,7 @@ static const struct net_device_ops qede_netdev_vf_xdp_ops = { .ndo_udp_tunnel_add = qede_udp_tunnel_add, .ndo_udp_tunnel_del = qede_udp_tunnel_del, .ndo_features_check = qede_features_check, - .ndo_xdp = qede_xdp, + .ndo_bpf = qede_xdp, }; /* ------------------------------------------------------------------------- diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 10870ac05157..757dff1c7216 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1077,7 +1077,7 @@ static u32 tun_xdp_query(struct net_device *dev) return 0; } -static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -1121,7 +1121,7 @@ static const struct 
net_device_ops tap_netdev_ops = { .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, - .ndo_xdp = tun_xdp, + .ndo_bpf = tun_xdp, }; static void tun_flow_init(struct tun_struct *tun) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d4313df996fa..339d6c0b162a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2097,7 +2097,7 @@ static u32 virtnet_xdp_query(struct net_device *dev) return 0; } -static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp) +static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: @@ -2124,7 +2124,7 @@ static const struct net_device_ops virtnet_netdev = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = virtnet_netpoll, #endif - .ndo_xdp = virtnet_xdp, + .ndo_bpf = virtnet_xdp, .ndo_features_check = passthru_features_check, }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c209f64c4829..fcfa3487f830 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -793,10 +793,10 @@ enum tc_setup_type { TC_SETUP_CLSBPF, }; -/* These structures hold the attributes of xdp state that are being passed - * to the netdevice through the xdp op. +/* These structures hold the attributes of bpf state that are being passed + * to the netdevice through the bpf op. */ -enum xdp_netdev_command { +enum bpf_netdev_command { /* Set or clear a bpf program used in the earliest stages of packet * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee * is responsible for calling bpf_prog_put on any old progs that are @@ -815,8 +815,8 @@ enum xdp_netdev_command { struct netlink_ext_ack; -struct netdev_xdp { - enum xdp_netdev_command command; +struct netdev_bpf { + enum bpf_netdev_command command; union { /* XDP_SETUP_PROG */ struct { @@ -1162,9 +1162,10 @@ struct macsec_ops { * appropriate rx headroom value allows avoiding skb head copy on * forward. Setting a negative value resets the rx headroom to the * default value. - * int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp); + * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); * This function is used to set or query state related to XDP on the - * netdevice. See definition of enum xdp_netdev_command for details. + * netdevice and manage BPF offload. See definition of + * enum bpf_netdev_command for details. * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. 
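
/* For illustration only: after this rename a driver exposes a single
 * ->ndo_bpf callback and dispatches on netdev_bpf.command, as the driver
 * hunks above do. A hypothetical minimal implementation (foo_xdp_set()
 * and foo_xdp_prog_id() are assumed helpers, not from this patch):
 *
 *	static int foo_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 *	{
 *		switch (bpf->command) {
 *		case XDP_SETUP_PROG:
 *			return foo_xdp_set(dev, bpf->prog);
 *		case XDP_QUERY_PROG:
 *			bpf->prog_id = foo_xdp_prog_id(dev);
 *			bpf->prog_attached = !!bpf->prog_id;
 *			return 0;
 *		default:
 *			return -EINVAL;
 *		}
 *	}
 */
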
@@ -1352,8 +1353,8 @@ struct net_device_ops { struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); - int (*ndo_xdp)(struct net_device *dev, - struct netdev_xdp *xdp); + int (*ndo_bpf)(struct net_device *dev, + struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); void (*ndo_xdp_flush)(struct net_device *dev); @@ -3394,10 +3395,10 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); -typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp); +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); -u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id); +u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t xdp_op, u32 *prog_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index c981f082d780..240d02e075c9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4630,7 +4630,7 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } -static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) +static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; @@ -7190,26 +7190,26 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id) +u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) { - struct netdev_xdp xdp; + struct netdev_bpf xdp; memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG; /* Query must always succeed. 
*/ - WARN_ON(xdp_op(dev, &xdp) < 0); + WARN_ON(bpf_op(dev, &xdp) < 0); if (prog_id) *prog_id = xdp.prog_id; return xdp.prog_attached; } -static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, +static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { - struct netdev_xdp xdp; + struct netdev_bpf xdp; memset(&xdp, 0, sizeof(xdp)); if (flags & XDP_FLAGS_HW_MODE) @@ -7220,7 +7220,7 @@ static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, xdp.flags = flags; xdp.prog = prog; - return xdp_op(dev, &xdp); + return bpf_op(dev, &xdp); } /** @@ -7237,24 +7237,24 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, { const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - xdp_op_t xdp_op, xdp_chk; + bpf_op_t bpf_op, bpf_chk; int err; ASSERT_RTNL(); - xdp_op = xdp_chk = ops->ndo_xdp; - if (!xdp_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) + bpf_op = bpf_chk = ops->ndo_bpf; + if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; - if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) - xdp_op = generic_xdp_install; - if (xdp_op == xdp_chk) - xdp_chk = generic_xdp_install; + if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE)) + bpf_op = generic_xdp_install; + if (bpf_op == bpf_chk) + bpf_chk = generic_xdp_install; if (fd >= 0) { - if (xdp_chk && __dev_xdp_attached(dev, xdp_chk, NULL)) + if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, xdp_op, NULL)) + __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); @@ -7262,7 +7262,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return PTR_ERR(prog); } - err = dev_xdp_install(dev, xdp_op, extack, flags, prog); + err = dev_xdp_install(dev, bpf_op, extack, flags, prog); if (err < 0 && prog) bpf_prog_put(prog); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1aca1f3f2120..ed3c304ab418 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1242,10 +1242,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) *prog_id = generic_xdp_prog->aux->id; return XDP_ATTACHED_SKB; } - if (!ops->ndo_xdp) + if (!ops->ndo_bpf) return XDP_ATTACHED_NONE; - return __dev_xdp_attached(dev, ops->ndo_xdp, prog_id); + return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id); } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) From aad1b911b35a8056860176f5dc088ab2b5ea485d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:17 -0700 Subject: [PATCH 0059/1640] BACKPORT: bpf: offload: add infrastructure for loading programs for a specific netdev The fact that we don't know which device the program is going to be used on is quite limiting in current eBPF infrastructure. We have to reverse or limit the changes which kernel makes to the loaded bytecode if we want it to be offloaded to a networking device. We also have to invent new APIs for debugging and troubleshooting support. Make it possible to load programs for a specific netdev. This helps us to bring the debug information closer to the core eBPF infrastructure (e.g. we will be able to reuse the verifer log in device JIT). It allows device JITs to perform translation on the original bytecode. 
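
As a minimal sketch of the resulting interface (illustrative only:
ptr_to_u64() is an assumed helper and the instruction/license setup is
reduced to stubs), userspace requests a device-bound load through the
new prog_target_ifindex attribute introduced below:

	union bpf_attr attr = {};

	attr.prog_type = BPF_PROG_TYPE_XDP;
	attr.insns     = ptr_to_u64(insns);
	attr.insn_cnt  = insn_cnt;
	attr.license   = ptr_to_u64("GPL");
	/* bind the program to one netdev at load time */
	attr.prog_target_ifindex = if_nametoindex("eth0");

	prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
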
__bpf_prog_get() when called to get a reference for an attachment point will now refuse to give it if program has a device assigned. Following patches will add a version of that function which passes the expected netdev in. @type argument in __bpf_prog_get() is renamed to attach_type to make it clearer that it's only set on attachment. All calls to ndo_bpf are protected by rtnl, only verifier callbacks are not. We need a wait queue to make sure netdev doesn't get destroyed while verifier is still running and calling its driver. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- include/linux/bpf.h | 36 +++++++ include/linux/bpf_verifier.h | 10 ++ include/linux/netdevice.h | 14 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 1 + kernel/bpf/core.c | 18 +++- kernel/bpf/offload.c | 182 +++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 17 +++- kernel/bpf/verifier.c | 15 ++- 9 files changed, 282 insertions(+), 12 deletions(-) create mode 100644 kernel/bpf/offload.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c47912d4ae96..56792f592c40 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -15,6 +15,7 @@ #include #include #include +#include struct perf_event; struct bpf_prog; @@ -197,6 +198,16 @@ struct bpf_verifier_ops { struct bpf_prog *prog, u32 *target_size); }; +struct bpf_dev_offload { + struct bpf_prog *prog; + struct net_device *netdev; + void *dev_priv; + struct list_head offloads; + bool dev_state; + bool verifier_running; + wait_queue_head_t verifier_done; +}; + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -214,6 +225,7 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif + struct bpf_dev_offload *offload; union { struct work_struct work; struct rcu_head rcu; @@ -333,6 +345,7 @@ extern const struct file_operations bpf_prog_fops; #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE +extern const struct bpf_prog_ops bpf_offload_prog_ops; extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; extern const struct bpf_verifier_ops xdp_analyzer_ops; @@ -526,6 +539,29 @@ static inline bool unprivileged_ebpf_enabled(void) #endif /* CONFIG_BPF_SYSCALL */ +int bpf_prog_offload_compile(struct bpf_prog *prog); +void bpf_prog_offload_destroy(struct bpf_prog *prog); + +#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) +int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); + +static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) +{ + return aux->offload; +} +#else +static inline int bpf_prog_offload_init(struct bpf_prog *prog, + union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + +static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) +{ + return false; +} +#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ + #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4c961fb6e15d..85475d2a8311 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -171,6 +171,7 @@ struct bpf_verifier_env { struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ + const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer 
ops */ void *analyzer_priv; /* pointer to external analyzer's private data */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ @@ -187,6 +188,15 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) return env->cur_state->regs; } +#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); +#else +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +{ + return -EOPNOTSUPP; +} +#endif + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fcfa3487f830..e9b4c5a53092 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -811,8 +811,13 @@ enum bpf_netdev_command { * is equivalent to XDP_ATTACHED_DRV. */ XDP_QUERY_PROG, + /* BPF program for offload callbacks, invoked at program load time. */ + BPF_OFFLOAD_VERIFIER_PREP, + BPF_OFFLOAD_TRANSLATE, + BPF_OFFLOAD_DESTROY, }; +struct bpf_ext_analyzer_ops; struct netlink_ext_ack; struct netdev_bpf { @@ -829,6 +834,15 @@ struct netdev_bpf { u8 prog_attached; u32 prog_id; }; + /* BPF_OFFLOAD_VERIFIER_PREP */ + struct { + struct bpf_prog *prog; + const struct bpf_ext_analyzer_ops *ops; /* callee set */ + } verifier; + /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */ + struct { + struct bpf_prog *prog; + } offload; }; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d2678e94dfa2..3c25b6d28e16 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -260,6 +260,7 @@ union bpf_attr { __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; + __u32 prog_target_ifindex; /* ifindex of netdev to prep for */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index cdd3ce111f70..c5c8ed85ecf3 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3df6d8adcf59..a771d313ba89 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1516,13 +1516,19 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. 
*/ - fp = bpf_int_jit_compile(fp); + if (!bpf_prog_is_dev_bound(fp->aux)) { + fp = bpf_int_jit_compile(fp); #ifdef CONFIG_BPF_JIT_ALWAYS_ON - if (!fp->jited) { - *err = -ENOTSUPP; - return fp; - } + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; + } #endif + } else { + *err = bpf_prog_offload_compile(fp); + if (*err) + return fp; + } bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -1691,6 +1697,8 @@ static void bpf_prog_free_deferred(struct work_struct *work) struct bpf_prog_aux *aux; aux = container_of(work, struct bpf_prog_aux, work); + if (bpf_prog_is_dev_bound(aux)) + bpf_prog_offload_destroy(aux->prog); bpf_jit_free(aux->prog); } diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c new file mode 100644 index 000000000000..5553e0e2f8b1 --- /dev/null +++ b/kernel/bpf/offload.c @@ -0,0 +1,182 @@ +#include +#include +#include +#include +#include +#include +#include + +/* protected by RTNL */ +static LIST_HEAD(bpf_prog_offload_devs); + +int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_dev_offload *offload; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->prog_flags) + return -EINVAL; + + offload = kzalloc(sizeof(*offload), GFP_USER); + if (!offload) + return -ENOMEM; + + offload->prog = prog; + init_waitqueue_head(&offload->verifier_done); + + rtnl_lock(); + offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex); + if (!offload->netdev) { + rtnl_unlock(); + kfree(offload); + return -EINVAL; + } + + prog->aux->offload = offload; + list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + rtnl_unlock(); + + return 0; +} + +static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, + struct netdev_bpf *data) +{ + struct net_device *netdev = prog->aux->offload->netdev; + + ASSERT_RTNL(); + + if (!netdev) + return -ENODEV; + if (!netdev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + data->command = cmd; + + return netdev->netdev_ops->ndo_bpf(netdev, data); +} + +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +{ + struct netdev_bpf data = {}; + int err; + + data.verifier.prog = env->prog; + + rtnl_lock(); + err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data); + if (err) + goto exit_unlock; + + env->dev_ops = data.verifier.ops; + + env->prog->aux->offload->dev_state = true; + env->prog->aux->offload->verifier_running = true; +exit_unlock: + rtnl_unlock(); + return err; +} + +static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + struct netdev_bpf data = {}; + + data.offload.prog = prog; + + if (offload->verifier_running) + wait_event(offload->verifier_done, !offload->verifier_running); + + if (offload->dev_state) + WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + + offload->dev_state = false; + list_del_init(&offload->offloads); + offload->netdev = NULL; +} + +void bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + + offload->verifier_running = false; + wake_up(&offload->verifier_done); + + rtnl_lock(); + __bpf_prog_offload_destroy(prog); + rtnl_unlock(); + + kfree(offload); +} + +static int bpf_prog_offload_translate(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + struct netdev_bpf data = {}; + int ret; + + data.offload.prog = prog; + + offload->verifier_running = false; + wake_up(&offload->verifier_done); + + 
rtnl_lock(); + ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); + rtnl_unlock(); + + return ret; +} + +static unsigned int bpf_prog_warn_on_exec(const void *ctx, + const struct bpf_insn *insn) +{ + WARN(1, "attempt to execute device eBPF program on the host!"); + return 0; +} + +int bpf_prog_offload_compile(struct bpf_prog *prog) +{ + prog->bpf_func = bpf_prog_warn_on_exec; + + return bpf_prog_offload_translate(prog); +} + +const struct bpf_prog_ops bpf_offload_prog_ops = { +}; + +static int bpf_offload_notification(struct notifier_block *notifier, + ulong event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_dev_offload *offload, *tmp; + + ASSERT_RTNL(); + + switch (event) { + case NETDEV_UNREGISTER: + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, + offloads) { + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block bpf_offload_notifier = { + .notifier_call = bpf_offload_notification, +}; + +static int __init bpf_offload_init(void) +{ + register_netdevice_notifier(&bpf_offload_notifier); + return 0; +} + +subsys_initcall(bpf_offload_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 954defeb3e06..1ab32a668c1b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -841,7 +841,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) return -EINVAL; - prog->aux->ops = bpf_prog_types[type]; + if (!bpf_prog_is_dev_bound(prog->aux)) + prog->aux->ops = bpf_prog_types[type]; + else + prog->aux->ops = &bpf_offload_prog_ops; prog->type = type; return 0; } @@ -1071,7 +1074,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1079,7 +1082,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (type && prog->type != *type) { + if (attach_type && (prog->type != *attach_type || prog->aux->offload)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1106,7 +1109,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_name +#define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex static int bpf_prog_load(union bpf_attr *attr) { @@ -1169,6 +1172,12 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 
1 : 0; + if (attr->prog_target_ifindex) { + err = bpf_prog_offload_init(prog, attr); + if (err) + goto free_prog; + } + /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d645b1311949..a91a8ae232c4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4174,10 +4174,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) - return 0; + if (env->analyzer_ops && env->analyzer_ops->insn_hook) + return env->analyzer_ops->insn_hook(env, insn_idx, + prev_insn_idx); + if (env->dev_ops && env->dev_ops->insn_hook) + return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); - return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); + return 0; } static int do_check(struct bpf_verifier_env *env) @@ -5139,6 +5142,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; + if (env->prog->aux->offload) { + ret = bpf_prog_offload_verifier_prep(env); + if (ret) + goto err_unlock; + } + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; From b605276d8163aaf15de0a714139080bb4692ba83 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:18 -0700 Subject: [PATCH 0060/1640] UPSTREAM: bpf: report offload info to user space Extend struct bpf_prog_info to contain information about program being bound to a device. Since the netdev may get destroyed while program still exists we need a flag to indicate the program is loaded for a device, even if the device is gone. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++++ kernel/bpf/offload.c | 12 ++++++++++++ kernel/bpf/syscall.c | 5 +++++ 4 files changed, 24 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 56792f592c40..957b11bc3c49 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -541,6 +541,7 @@ static inline bool unprivileged_ebpf_enabled(void) int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); +u32 bpf_prog_offload_ifindex(struct bpf_prog *prog); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3c25b6d28e16..ff375139cca0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -993,6 +993,10 @@ enum sk_action { #define BPF_TAG_SIZE 8 +enum bpf_prog_status { + BPF_PROG_STATUS_DEV_BOUND = (1 << 0), +}; + struct bpf_prog_info { __u32 type; __u32 id; @@ -1006,6 +1010,8 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 status; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 5553e0e2f8b1..2816feb38be1 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -144,6 +144,18 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } +u32 bpf_prog_offload_ifindex(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + u32 ifindex; + + rtnl_lock(); + ifindex = offload->netdev ? offload->netdev->ifindex : 0; + rtnl_unlock(); + + return ifindex; +} + const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1ab32a668c1b..eff47792bb98 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1616,6 +1616,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + info.status |= BPF_PROG_STATUS_DEV_BOUND; + info.ifindex = bpf_prog_offload_ifindex(prog); + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) From 57587884fb5f20dc45ac6bbb1388605b75b6137a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:20 -0700 Subject: [PATCH 0061/1640] UPSTREAM: xdp: allow attaching programs loaded for specific device Pass the netdev pointer to bpf_prog_get_type(). This way BPF code can decide whether the device matches what the code was loaded/translated for. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 10 ++++++++++ kernel/bpf/syscall.c | 33 +++++++++++++++++++++++++++++---- net/core/dev.c | 6 +++++- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 957b11bc3c49..0bdb3a00a732 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -351,6 +351,8 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); +struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, + struct net_device *netdev); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); @@ -450,6 +452,14 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, { return ERR_PTR(-EOPNOTSUPP); } + +static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, + enum bpf_prog_type type, + struct net_device *netdev) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index eff47792bb98..31ca9cd07f77 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1074,7 +1074,22 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) +static bool bpf_prog_can_attach(struct bpf_prog *prog, + enum bpf_prog_type *attach_type, + struct net_device *netdev) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + + if (prog->type != *attach_type) + return false; + if (offload && offload->netdev != netdev) + return false; + + return true; +} + +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, + struct net_device *netdev) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1082,7 +1097,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (attach_type && (prog->type != *attach_type || prog->aux->offload)) { + if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1095,12 +1110,12 @@ out: struct bpf_prog *bpf_prog_get(u32 ufd) { - return __bpf_prog_get(ufd, NULL); + return __bpf_prog_get(ufd, NULL, NULL); } struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); @@ -1108,6 +1123,16 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) } EXPORT_SYMBOL_GPL(bpf_prog_get_type); +struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, + struct net_device *netdev) +{ + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev); + + if (!IS_ERR(prog)) + trace_bpf_prog_get_type(prog); + return prog; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex diff --git a/net/core/dev.c b/net/core/dev.c index 240d02e075c9..b949e63da37d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7257,7 +7257,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; - prog = bpf_prog_get_type(fd, 
BPF_PROG_TYPE_XDP); + if (bpf_op == ops->ndo_bpf) + prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + dev); + else + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); } From d577385262d22f357c6677609160b5bcc5c2acfe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:21 -0700 Subject: [PATCH 0062/1640] UPSTREAM: cls_bpf: allow attaching programs loaded for specific device If TC program is loaded with skip_sw flag, we should allow the device-specific programs to be accepted. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 1 + net/sched/cls_bpf.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 31ca9cd07f77..69b89d8df277 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1132,6 +1132,7 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, trace_bpf_prog_get_type(prog); return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 3a499530f321..bb12bac6a714 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -367,7 +367,7 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog) } static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, - const struct tcf_proto *tp) + u32 gen_flags, const struct tcf_proto *tp) { struct bpf_prog *fp; char *name = NULL; @@ -375,7 +375,11 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); - fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); + if (gen_flags & TCA_CLS_FLAGS_SKIP_SW) + fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, + qdisc_dev(tp->q)); + else + fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); if (IS_ERR(fp)) return PTR_ERR(fp); @@ -433,7 +437,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, prog->gen_flags = gen_flags; ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : - cls_bpf_prog_from_efd(tb, prog, tp); + cls_bpf_prog_from_efd(tb, prog, gen_flags, tp); if (ret < 0) return ret; From c8d56f9967c051a7ff7508d83649142a2b5a231b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:30 -0700 Subject: [PATCH 0063/1640] UPSTREAM: bpf: remove old offload/analyzer Thanks to the ability to load a program for a specific device, running verifier twice is no longer needed. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David S. 
Miller --- include/linux/bpf_verifier.h | 5 --- kernel/bpf/verifier.c | 75 ------------------------------------ net/core/filter.c | 42 -------------------- 3 files changed, 122 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 85475d2a8311..afa5baf72a72 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -170,9 +170,7 @@ struct bpf_verifier_env { bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ - const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */ - void *analyzer_priv; /* pointer to external analyzer's private data */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ @@ -197,7 +195,4 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) } #endif -int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, - void *priv); - #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a91a8ae232c4..f40513baae8f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1005,9 +1005,6 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (env->analyzer_ops) - return 0; - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -4174,9 +4171,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - if (env->analyzer_ops && env->analyzer_ops->insn_hook) - return env->analyzer_ops->insn_hook(env, insn_idx, - prev_insn_idx); if (env->dev_ops && env->dev_ops->insn_hook) return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); @@ -5227,72 +5221,3 @@ err_free_env: kfree(env); return ret; } - -static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { -#ifdef CONFIG_NET - [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, - [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, -#endif -}; - -int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, - void *priv) -{ - struct bpf_verifier_env *env; - int ret; - - if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) || - !bpf_analyzer_ops[prog->type]) - return -EOPNOTSUPP; - - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); - if (!env) - return -ENOMEM; - - env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * - prog->len); - ret = -ENOMEM; - if (!env->insn_aux_data) - goto err_free_env; - env->prog = prog; - env->ops = bpf_analyzer_ops[env->prog->type]; - env->analyzer_ops = ops; - env->analyzer_priv = priv; - - /* grab the mutex to protect few globals used by verifier */ - mutex_lock(&bpf_verifier_lock); - - env->strict_alignment = false; - if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - env->strict_alignment = true; - - env->explored_states = kcalloc(env->prog->len, - sizeof(struct bpf_verifier_state_list *), - GFP_KERNEL); - ret = -ENOMEM; - if (!env->explored_states) - goto skip_full_check; - - ret = check_cfg(env); - if (ret < 0) - goto skip_full_check; - - 
env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - - ret = do_check(env); - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } - -skip_full_check: - while (!pop_stack(env, NULL, NULL)); - free_states(env); - - mutex_unlock(&bpf_verifier_lock); - vfree(env->insn_aux_data); -err_free_env: - kfree(env); - return ret; -} -EXPORT_SYMBOL_GPL(bpf_analyzer); diff --git a/net/core/filter.c b/net/core/filter.c index 17b8e85185da..f9e4a43c19c9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3874,25 +3874,6 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } -static bool -tc_cls_act_is_valid_access_analyzer(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - switch (off) { - case offsetof(struct sk_buff, len): - return true; - case offsetof(struct sk_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct sk_buff, cb) + - offsetof(struct bpf_skb_data_end, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; -} - static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -3927,21 +3908,6 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -static bool xdp_is_valid_access_analyzer(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - switch (off) { - case offsetof(struct xdp_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct xdp_buff, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; -} - void bpf_warn_invalid_xdp_action(u32 act) { const u32 act_max = XDP_REDIRECT; @@ -4613,10 +4579,6 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .gen_prologue = tc_cls_act_prologue, }; -const struct bpf_verifier_ops tc_cls_act_analyzer_ops = { - .is_valid_access = tc_cls_act_is_valid_access_analyzer, -}; - const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; @@ -4627,10 +4589,6 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; -const struct bpf_verifier_ops xdp_analyzer_ops = { - .is_valid_access = xdp_is_valid_access_analyzer, -}; - const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; From c4935183048248b8ab6d4e86a105fd29a7cddb6a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 5 Nov 2017 08:15:32 -0500 Subject: [PATCH 0064/1640] BACKPORT: bpf, cgroup: implement eBPF-based device controller for cgroup v2 Cgroup v2 lacks the device controller, provided by cgroup v1. This patch adds a new eBPF program type which, in combination with the previously added ability to attach multiple eBPF programs to a cgroup, provides similar functionality with some additional flexibility. This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type. A program takes major and minor device numbers, device type (block/character) and access type (mknod/read/write) as parameters and returns an integer defining whether the operation should be allowed or terminated with -EPERM. Signed-off-by: Roman Gushchin Acked-by: Alexei Starovoitov Acked-by: Tejun Heo Cc: Daniel Borkmann Signed-off-by: David S.
Miller --- include/linux/bpf-cgroup.h | 15 ++++++++ include/linux/bpf_types.h | 3 ++ include/linux/device_cgroup.h | 8 ++++- include/uapi/linux/bpf.h | 15 ++++++++ kernel/bpf/cgroup.c | 67 +++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 7 ++++ kernel/bpf/verifier.c | 1 + 7 files changed, 115 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 87a7db9feb38..a7f16e0f8d68 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -67,6 +67,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum bpf_attach_type type); +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, + short access, enum bpf_attach_type type); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -112,6 +115,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, } \ __ret; \ }) + +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ + access, \ + BPF_CGROUP_DEVICE); \ + \ + __ret; \ +}) #else struct cgroup_bpf {}; @@ -122,6 +136,7 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 53c5b9ad7220..978c1d9c9383 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -19,6 +19,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) #endif +#ifdef CONFIG_CGROUP_BPF +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 2d93d7ecd479..8557efe096dc 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include #define DEVCG_ACC_MKNOD 1 #define DEVCG_ACC_READ 2 @@ -19,10 +20,15 @@ static inline int __devcgroup_check_permission(short type, u32 major, u32 minor, { return 0; } #endif -#ifdef CONFIG_CGROUP_DEVICE +#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) static inline int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { + int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); + + if (rc) + return -EPERM; + return __devcgroup_check_permission(type, major, minor, access); } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ff375139cca0..bb2d571bc86e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -132,6 +132,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, }; enum bpf_attach_type { @@ -141,6 +142,7 @@ enum bpf_attach_type { BPF_CGROUP_SOCK_OPS, BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, __MAX_BPF_ATTACH_TYPE }; @@ -1089,4 +1091,17 @@ struct bpf_perf_event_value { __u64 running; }; +#define 
BPF_DEVCG_ACC_MKNOD (1ULL << 0) +#define BPF_DEVCG_ACC_READ (1ULL << 1) +#define BPF_DEVCG_ACC_WRITE (1ULL << 2) + +#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) +#define BPF_DEVCG_DEV_CHAR (1ULL << 1) + +struct bpf_cgroup_dev_ctx { + __u32 access_type; /* (access << 16) | type */ + __u32 major; + __u32 minor; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 90e98284d9e4..48f30aedf5b7 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -521,3 +521,70 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); + +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, + short access, enum bpf_attach_type type) +{ + struct cgroup *cgrp; + struct bpf_cgroup_dev_ctx ctx = { + .access_type = (access << 16) | dev_type, + .major = major, + .minor = minor, + }; + int allow = 1; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN); + rcu_read_unlock(); + + return !allow; +} +EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); + +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_trace_printk: + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); + default: + return NULL; + } +} + +static bool cgroup_dev_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) + return false; + /* The verifier guarantees that size > 0. 
*/ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +const struct bpf_prog_ops cg_dev_prog_ops = { +}; + +const struct bpf_verifier_ops cg_dev_verifier_ops = { + .get_func_proto = cgroup_dev_func_proto, + .is_valid_access = cgroup_dev_is_valid_access, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 69b89d8df277..5160bbd82766 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1349,6 +1349,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, true); @@ -1401,6 +1404,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, false); @@ -1443,6 +1449,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_DEVICE: break; default: return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f40513baae8f..e285f4f4c534 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3557,6 +3557,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: break; default: return 0; From 13ac06e89903c96e19b980f1ac851614655bd25d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Nov 2017 15:28:42 -0500 Subject: [PATCH 0065/1640] BACKPORT: bpf: add a bpf_override_function helper Error injection is sloppy and very ad-hoc. BPF could fill this niche perfectly with its kprobe functionality. We could make sure errors are only triggered in the specific call chains and situations that we care about. Accomplish this with the bpf_override_function helper. This will modify the probed function's return value to the specified value and set the PC to an override function that simply returns, bypassing the originally probed function. This gives us a nice clean way to implement systematic error injection for all of our code paths. Acked-by: Alexei Starovoitov Signed-off-by: Josef Bacik Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + arch/x86/include/asm/kprobes.h | 4 ++++ arch/x86/include/asm/ptrace.h | 5 ++++ arch/x86/kernel/kprobes/ftrace.c | 14 +++++++++++ include/linux/filter.h | 3 ++- include/linux/trace_events.h | 1 + include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/core.c | 3 +++ kernel/bpf/verifier.c | 2 ++ kernel/events/core.c | 7 ++++++ kernel/trace/Kconfig | 11 +++++++++ kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 40 ++++++++++++++++++++++++++------ kernel/trace/trace_probe.h | 6 +++++ 15 files changed, 131 insertions(+), 8 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 60b8d811afc9..113a6ea00699 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -199,6 +199,9 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool +config HAVE_KPROBE_OVERRIDE + bool + config HAVE_NMI bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bdcd2eef9201..afa13c1f28b1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -155,6 +155,7 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE + select HAVE_KPROBE_OVERRIDE select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 6cf65437b5e5..c6c3b1f4306a 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); asmlinkage void kretprobe_trampoline(void); +#ifdef CONFIG_KPROBES_ON_FTRACE +extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs); +#endif + /* Architecture specific copy of original instruction*/ struct arch_specific_insn { /* copy of the original instruction */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 8603d127f73c..ee696efec99f 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) return regs->ax; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->ax = rc; +} + /* * user_mode(regs) determines whether a register set came from user * mode. On x86_32, this is true if V8086 mode was enabled OR if the diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index bcfee4f69b0e..53deb0b23078 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -102,3 +102,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p) p->ainsn.boostable = false; return 0; } + +asmlinkage void override_func(void); +asm( + ".type override_func, @function\n" + "override_func:\n" + " ret\n" + ".size override_func, .-override_func\n" +); + +void arch_ftrace_kprobe_override_function(struct pt_regs *regs) +{ + regs->ip = (unsigned long)&override_func; +} +NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function); diff --git a/include/linux/filter.h b/include/linux/filter.h index c3038118a4c0..df27e84721f7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -483,7 +483,8 @@ struct bpf_prog { locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1; /* Do we need dst entry? */ + dst_needed:1, /* Do we need dst entry? */ + kprobe_override:1; /* Do we override a kprobe? 
*/ enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8c2ba9c2794a..a5de6e381a72 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -525,6 +525,7 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); +DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bb2d571bc86e..23fe09d3fbdb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -678,6 +678,10 @@ union bpf_attr { * @buf_size: size of the buf * Return : 0 on success or negative error code * + * int bpf_override_return(pt_regs, rc) + * @pt_regs: pointer to struct pt_regs + * @rc: the return value to set + * * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a771d313ba89..f062622d8163 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1458,6 +1458,9 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + if (fp->kprobe_override) + return false; + if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e285f4f4c534..c5b554a90877 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4951,6 +4951,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index 08ed6a2d3a23..8508ec42a054 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8558,6 +8558,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e79ef5aa6224..2ba9ffeefdcd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -561,6 +561,17 @@ config FUNCTION_PROFILER If in doubt, say N. +config BPF_KPROBE_OVERRIDE + bool "Enable BPF programs to override a kprobed function" + depends on BPF_EVENTS + depends on KPROBES_ON_FTRACE + depends on HAVE_KPROBE_OVERRIDE + depends on DYNAMIC_FTRACE_WITH_REGS + default n + help + Allows BPF to override the execution of a probed function and + set a different return value. This is used for error injection. 
+ config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 28751d3dc8e1..3403275ad178 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,10 @@ #include #include #include +#include +#include + +#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -76,6 +80,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + __this_cpu_write(bpf_kprobe_override, 1); + regs_set_return_value(regs, rc); + arch_ftrace_kprobe_override_function(regs); + return 0; +} +#else +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + return -EINVAL; +} +#endif + +static const struct bpf_func_proto bpf_override_return_proto = { + .func = bpf_override_return, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -560,6 +587,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; + case BPF_FUNC_override_return: + pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!", + current->comm, task_pid_nr(current)); + return &bpf_override_return_proto; default: return tracing_func_proto(func_id); } @@ -775,6 +806,10 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; + /* Kprobe override only works for ftrace based kprobes. 
*/ + if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event)) + return -EINVAL; + mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b0db2e4cefa3..52e4ae0f92ae 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,6 +42,7 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) +DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -87,6 +88,12 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +int trace_kprobe_ftrace(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + return kprobe_ftrace(&tk->rp.kp); +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1183,7 +1190,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static void +static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1192,12 +1199,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + int ret; + + ret = trace_call_bpf(call, regs); + + /* + * We need to check and see if we modified the pc of the + * pt_regs, and if so clear the kprobe and return 1 so that we + * don't do the instruction skipping. Also reset our state so + * we are clean the next pass through. + */ + if (__this_cpu_read(bpf_kprobe_override)) { + __this_cpu_write(bpf_kprobe_override, 0); + reset_current_kprobe(); + return 1; + } + if (!ret) + return 0; + } head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return; + return 0; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1206,13 +1230,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return; + return 0; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL, NULL); + return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1288,6 +1313,7 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1295,9 +1321,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - kprobe_perf_func(tk, regs); + ret = kprobe_perf_func(tk, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return ret; } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index dc39472ca9e4..673f0eb18a59 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -253,6 +253,7 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const 
char *sym, long offset); +int trace_kprobe_ftrace(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -278,6 +279,11 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } + +static inline int trace_kprobe_ftrace(struct trace_event_call *call) +{ + return 0; +} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { From d4147ff626bbcf17df2952608c4b3a6ac79f29b7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 11 Nov 2017 18:24:55 +0900 Subject: [PATCH 0066/1640] BACKPORT: bpf: Revert bpf_override_function() helper changes. NACK'd by x86 maintainer. Signed-off-by: David S. Miller --- arch/Kconfig | 3 --- arch/x86/Kconfig | 1 - arch/x86/include/asm/kprobes.h | 4 ---- arch/x86/include/asm/ptrace.h | 5 ---- arch/x86/kernel/kprobes/ftrace.c | 14 ----------- include/linux/filter.h | 3 +-- include/linux/trace_events.h | 1 - include/uapi/linux/bpf.h | 4 ---- kernel/bpf/core.c | 3 --- kernel/bpf/verifier.c | 2 -- kernel/events/core.c | 7 ------ kernel/trace/Kconfig | 11 --------- kernel/trace/bpf_trace.c | 35 ---------------------------- kernel/trace/trace_kprobe.c | 40 ++++++-------------------------- kernel/trace/trace_probe.h | 6 ----- 15 files changed, 8 insertions(+), 131 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 113a6ea00699..60b8d811afc9 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -199,9 +199,6 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool -config HAVE_KPROBE_OVERRIDE - bool - config HAVE_NMI bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index afa13c1f28b1..bdcd2eef9201 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -155,7 +155,6 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE - select HAVE_KPROBE_OVERRIDE select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index c6c3b1f4306a..6cf65437b5e5 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -67,10 +67,6 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); asmlinkage void kretprobe_trampoline(void); -#ifdef CONFIG_KPROBES_ON_FTRACE -extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs); -#endif - /* Architecture specific copy of original instruction*/ struct arch_specific_insn { /* copy of the original instruction */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index ee696efec99f..8603d127f73c 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -109,11 +109,6 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) return regs->ax; } -static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) -{ - regs->ax = rc; -} - /* * user_mode(regs) determines whether a register set came from user * mode.
On x86_32, this is true if V8086 mode was enabled OR if the diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 53deb0b23078..bcfee4f69b0e 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -102,17 +102,3 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p) p->ainsn.boostable = false; return 0; } - -asmlinkage void override_func(void); -asm( - ".type override_func, @function\n" - "override_func:\n" - " ret\n" - ".size override_func, .-override_func\n" -); - -void arch_ftrace_kprobe_override_function(struct pt_regs *regs) -{ - regs->ip = (unsigned long)&override_func; -} -NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function); diff --git a/include/linux/filter.h b/include/linux/filter.h index df27e84721f7..c3038118a4c0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -483,8 +483,7 @@ struct bpf_prog { locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1, /* Do we need dst entry? */ - kprobe_override:1; /* Do we override a kprobe? */ + dst_needed:1; /* Do we need dst entry? */ enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index a5de6e381a72..8c2ba9c2794a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -525,7 +525,6 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); -DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 23fe09d3fbdb..bb2d571bc86e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -678,10 +678,6 @@ union bpf_attr { * @buf_size: size of the buf * Return : 0 on success or negative error code * - * int bpf_override_return(pt_regs, rc) - * @pt_regs: pointer to struct pt_regs - * @rc: the return value to set - * * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f062622d8163..a771d313ba89 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1458,9 +1458,6 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { - if (fp->kprobe_override) - return false; - if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c5b554a90877..e285f4f4c534 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4951,8 +4951,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); - if (insn->imm == BPF_FUNC_override_return) - prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index 8508ec42a054..08ed6a2d3a23 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8558,13 +8558,6 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } - /* Kprobe override only works for kprobes, not uprobes. */ - if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { - bpf_prog_put(prog); - return -EINVAL; - } - if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2ba9ffeefdcd..e79ef5aa6224 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -561,17 +561,6 @@ config FUNCTION_PROFILER If in doubt, say N. -config BPF_KPROBE_OVERRIDE - bool "Enable BPF programs to override a kprobed function" - depends on BPF_EVENTS - depends on KPROBES_ON_FTRACE - depends on HAVE_KPROBE_OVERRIDE - depends on DYNAMIC_FTRACE_WITH_REGS - default n - help - Allows BPF to override the execution of a probed function and - set a different return value. This is used for error injection. - config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3403275ad178..28751d3dc8e1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,10 +13,6 @@ #include #include #include -#include -#include - -#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -80,29 +76,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) -{ - __this_cpu_write(bpf_kprobe_override, 1); - regs_set_return_value(regs, rc); - arch_ftrace_kprobe_override_function(regs); - return 0; -} -#else -BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) -{ - return -EINVAL; -} -#endif - -static const struct bpf_func_proto bpf_override_return_proto = { - .func = bpf_override_return, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, -}; - BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -587,10 +560,6 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; - case BPF_FUNC_override_return: - pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!", - current->comm, task_pid_nr(current)); - return &bpf_override_return_proto; default: return tracing_func_proto(func_id); } @@ -806,10 +775,6 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; - /* Kprobe override only works for ftrace based kprobes. 
*/ - if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event)) - return -EINVAL; - mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 52e4ae0f92ae..b0db2e4cefa3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,7 +42,6 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -88,12 +87,6 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } -int trace_kprobe_ftrace(struct trace_event_call *call) -{ - struct trace_kprobe *tk = (struct trace_kprobe *)call->data; - return kprobe_ftrace(&tk->rp.kp); -} - static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1190,7 +1183,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static int +static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1199,29 +1192,12 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call)) { - int ret; - - ret = trace_call_bpf(call, regs); - - /* - * We need to check and see if we modified the pc of the - * pt_regs, and if so clear the kprobe and return 1 so that we - * don't do the instruction skipping. Also reset our state so - * we are clean the next pass through. - */ - if (__this_cpu_read(bpf_kprobe_override)) { - __this_cpu_write(bpf_kprobe_override, 0); - reset_current_kprobe(); - return 1; - } - if (!ret) - return 0; - } + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) + return; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return 0; + return; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1230,14 +1206,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return 0; + return; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL, NULL); - return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1313,7 +1288,6 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1321,9 +1295,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - ret = kprobe_perf_func(tk, regs); + kprobe_perf_func(tk, regs); #endif - return ret; + return 0; /* We don't tweek kernel, so just return 0 */ } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 673f0eb18a59..dc39472ca9e4 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -253,7 +253,6 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const 
char *sym, long offset); -int trace_kprobe_ftrace(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -279,11 +278,6 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } - -static inline int trace_kprobe_ftrace(struct trace_event_call *call) -{ - return 0; -} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { From efe113e8e88191d24b3abfb9be3093e93ecc41f0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 12 Nov 2017 14:49:09 -0800 Subject: [PATCH 0067/1640] BACKPORT: bpf: improve verifier ARG_CONST_SIZE_OR_ZERO semantics For helpers, the argument type ARG_CONST_SIZE_OR_ZERO permits the access size to be 0 when accessing the previous argument (arg). Right now, it requires the arg to be NULL when the size passed is 0 or could be 0. It also requires a non-NULL arg when the size is proved to be non-0. This patch changes the verifier's ARG_CONST_SIZE_OR_ZERO behavior such that for size 0, or a possible size 0, the arg is no longer required to be NULL. There are a couple of reasons for this semantics change, and all of them intend to simplify user bpf programs, which may improve user experience and/or increase the chances of verifier acceptance. Together with the next patch which changes bpf_probe_read arg2 type from ARG_CONST_SIZE to ARG_CONST_SIZE_OR_ZERO, the following two examples, which fail the verifier currently, are able to get verifier acceptance. Example 1: unsigned long len = pend - pstart; len = len > MAX_PAYLOAD_LEN ? MAX_PAYLOAD_LEN : len; len &= MAX_PAYLOAD_LEN; bpf_probe_read(data->payload, len, pstart); It does not have a test for "len > 0", so it failed the verifier. Users may not be aware that they have to add this test. Converting the bpf_probe_read helper to have ARG_CONST_SIZE_OR_ZERO helps the above code get verifier acceptance. Example 2: Here is one example where llvm "messed up" the code and the verifier fails. ...... unsigned long len = pend - pstart; if (len > 0 && len <= MAX_PAYLOAD_LEN) bpf_probe_read(data->payload, len, pstart); ...... The compiler generates the following code and the verifier fails: ...... 39: (79) r2 = *(u64 *)(r10 -16) 40: (1f) r2 -= r8 41: (bf) r1 = r2 42: (07) r1 += -1 43: (25) if r1 > 0xffe goto pc+3 R0=inv(id=0) R1=inv(id=0,umax_value=4094,var_off=(0x0; 0xfff)) R2=inv(id=0) R6=map_value(id=0,off=0,ks=4,vs=4095,imm=0) R7=inv(id=0) R8=inv(id=0) R9=inv0 R10=fp0 44: (bf) r1 = r6 45: (bf) r3 = r8 46: (85) call bpf_probe_read#45 R2 min value is negative, either use unsigned or 'var &= const' ...... The compiler optimization is correct. If r1 = 0, r1 - 1 = 0xffffffffffffffff > 0xffe. If r1 != 0, r1 - 1 will not wrap. r1 > 0xffe at insn #43 can actually capture both "r1 > 0" and "len <= MAX_PAYLOAD_LEN". This however causes an issue in the verifier, as the value range of arg2 "r2" does not properly get refined, leading to verification failure. Relaxing bpf_probe_read arg2 from ARG_CONST_SIZE to ARG_CONST_SIZE_OR_ZERO allows the following simplified code: unsigned long len = pend - pstart; if (len <= MAX_PAYLOAD_LEN) bpf_probe_read(data->payload, len, pstart); The llvm compiler will generate less complex code and the verifier is able to verify that the program is okay. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- kernel/bpf/verifier.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e285f4f4c534..f79b43cb7a36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -851,12 +851,13 @@ static int check_stack_access(struct bpf_verifier_env *env, /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size) + int size, bool zero_size_allowed) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_map *map = regs[regno].map_ptr; - if (off < 0 || size <= 0 || off + size > map->value_size) { + if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || + off + size > map->value_size) { verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; @@ -866,7 +867,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size, bool zero_size_allowed) { struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; @@ -893,7 +894,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size); + err = __check_map_access(env, regno, reg->smin_value + off, size, + zero_size_allowed); if (err) { verbose(env, "R%d min value is outside of the array range\n", regno); @@ -909,7 +911,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size); + err = __check_map_access(env, regno, reg->umax_value + off, size, + zero_size_allowed); if (err) verbose(env, "R%d max value is outside of the array range\n", regno); @@ -945,12 +948,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size, bool zero_size_allowed) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = &regs[regno]; - if (off < 0 || size <= 0 || (u64)off + size > reg->range) { + if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || + (u64)off + size > reg->range) { verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; @@ -959,7 +963,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, } static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, - int size) + int size, bool zero_size_allowed) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = &regs[regno]; @@ -978,7 +982,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size); + err = __check_packet_access(env, regno, off, size, zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); return err; @@ -1219,7 +1223,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_map_access(env, regno, off, size); + err = check_map_access(env, regno, off, size,
false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); @@ -1278,7 +1282,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); return -EACCES; } - err = check_packet_access(env, regno, off, size); + err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { @@ -1382,7 +1386,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size <= 0) { + access_size < 0 || (access_size == 0 && !zero_size_allowed)) { verbose(env, "invalid stack type R%d off=%d access_size=%d\n", regno, off, access_size); return -EACCES; @@ -1420,9 +1424,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (reg->type) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, regno, reg->off, access_size); + return check_packet_access(env, regno, reg->off, access_size, + zero_size_allowed); case PTR_TO_MAP_VALUE: - return check_map_access(env, regno, reg->off, access_size); + return check_map_access(env, regno, reg->off, access_size, + zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -1519,7 +1525,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, - meta->map_ptr->key_size); + meta->map_ptr->key_size, + false); else err = check_stack_boundary(env, regno, meta->map_ptr->key_size, @@ -1535,7 +1542,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, - meta->map_ptr->value_size); + meta->map_ptr->value_size, + false); else err = check_stack_boundary(env, regno, meta->map_ptr->value_size, @@ -2161,7 +2169,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, } break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, dst_reg->off, 1)) { + if (check_map_access(env, dst, dst_reg->off, 1, false)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; From 95dc2768a88bd492309230d308a2e905a25877fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Nov 2017 17:15:50 -0800 Subject: [PATCH 0068/1640] UPSTREAM: bpf: fix lockdep splat pcpu_freelist_pop() needs the same lockdep awareness as pcpu_freelist_populate() to avoid a false positive. [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ] switchto-defaul/12508 [HC0[0]:SC0[6]:HE0:SE0] is trying to acquire: (&htab->buckets[i].lock){......}, at: [] __htab_percpu_map_update_elem+0x1cb/0x300 and this task is already holding: (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...}, at: [] __dev_queue_xmit+0x868/0x1240 which would create a new lock dependency: (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...} -> (&htab->buckets[i].lock){......} but this new dependency connects a SOFTIRQ-irq-safe lock: (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...} ...
which became SOFTIRQ-irq-safe at: [] __lock_acquire+0x42b/0x1f10 [] lock_acquire+0xbc/0x1b0 [] _raw_spin_lock+0x38/0x50 [] __dev_queue_xmit+0x868/0x1240 [] dev_queue_xmit+0x10/0x20 [] ip_finish_output2+0x439/0x590 [] ip_finish_output+0x150/0x2f0 [] ip_output+0x7d/0x260 [] ip_local_out+0x5e/0xe0 [] ip_queue_xmit+0x205/0x620 [] tcp_transmit_skb+0x5a8/0xcb0 [] tcp_write_xmit+0x242/0x1070 [] __tcp_push_pending_frames+0x3c/0xf0 [] tcp_rcv_established+0x312/0x700 [] tcp_v4_do_rcv+0x11c/0x200 [] tcp_v4_rcv+0xaa2/0xc30 [] ip_local_deliver_finish+0xa7/0x240 [] ip_local_deliver+0x66/0x200 [] ip_rcv_finish+0xdd/0x560 [] ip_rcv+0x295/0x510 [] __netif_receive_skb_core+0x988/0x1020 [] __netif_receive_skb+0x21/0x70 [] process_backlog+0x6f/0x230 [] net_rx_action+0x229/0x420 [] __do_softirq+0xd8/0x43d [] do_softirq_own_stack+0x1c/0x30 [] do_softirq+0x55/0x60 [] __local_bh_enable_ip+0xa8/0xb0 [] cpu_startup_entry+0x1c7/0x500 [] start_secondary+0x113/0x140 to a SOFTIRQ-irq-unsafe lock: (&head->lock){+.+...} ... which became SOFTIRQ-irq-unsafe at: ... [] __lock_acquire+0x82f/0x1f10 [] lock_acquire+0xbc/0x1b0 [] _raw_spin_lock+0x38/0x50 [] pcpu_freelist_pop+0x7a/0xb0 [] htab_map_alloc+0x50c/0x5f0 [] SyS_bpf+0x265/0x1200 [] entry_SYSCALL_64_fastpath+0x12/0x17 other info that might help us debug this: Chain exists of: dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2 --> &htab->buckets[i].lock --> &head->lock Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&head->lock); local_irq_disable(); lock(dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2); lock(&htab->buckets[i].lock); lock(dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2); *** DEADLOCK *** Fixes: e19494edab82 ("bpf: introduce percpu_freelist") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- kernel/bpf/percpu_freelist.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 0c1b4ba9e90e..7174ee2d84ee 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -88,8 +88,10 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; + unsigned long flags; int orig_cpu, cpu; + local_irq_save(flags); orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); @@ -97,15 +99,17 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) node = head->first; if (node) { head->first = node->next; - raw_spin_unlock(&head->lock); + raw_spin_unlock_irqrestore(&head->lock, flags); return node; } raw_spin_unlock(&head->lock); cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; - if (cpu == orig_cpu) + if (cpu == orig_cpu) { + local_irq_restore(flags); return NULL; + } } } From ca67d04307079303c6d84125b06ee978b60547fd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:51 -0800 Subject: [PATCH 0069/1640] UPSTREAM: bpf: offload: add comment warning developers about double destroy Offload state may get destroyed either because the device for which it was constructed is going away, or because the refcount of bpf program itself has reached 0. In both of those cases we will call __bpf_prog_offload_destroy() to unlink the offload from the device. We may in fact call it twice, which works just fine, but we should make clear this is intended and caution others trying to extend the function. 
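To make the twice-called scenario concrete, here is a minimal stand-alone model (a sketch only, not kernel code; the struct layout and helper name are invented for illustration):

/* Why __bpf_prog_offload_destroy() may run twice: once when the netdev is
 * unregistered, and once more when the program's refcount drops to zero.
 * The helper therefore has to tolerate an already-unlinked offload.
 */
#include <stdlib.h>

struct offload {
	void *netdev;			/* cleared once unlinked */
};

struct prog {
	struct offload *offload;	/* NULL after teardown */
};

static void offload_destroy(struct prog *p)
{
	if (!p->offload)		/* second caller: nothing left to do */
		return;
	free(p->offload);
	p->offload = NULL;
}

int main(void)
{
	struct prog p = { .offload = calloc(1, sizeof(struct offload)) };

	offload_destroy(&p);		/* netdev-unregister path */
	offload_destroy(&p);		/* refcount-zero path: harmless */
	return 0;
}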
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2816feb38be1..fd696d3dd429 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -85,6 +85,10 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) struct bpf_dev_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; + /* Caution - if netdev is destroyed before the program, this function + * will be called twice. + */ + data.offload.prog = prog; if (offload->verifier_running) From bd36bc36ccb036b1931025314207fe6732ef0284 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:52 -0800 Subject: [PATCH 0070/1640] UPSTREAM: bpf: offload: limit offload to cls_bpf and xdp programs only We are currently only allowing attachment of device-bound cls_bpf and XDP programs. Make this restriction explicit in the BPF offload code. This way we can potentially reuse the ifindex field in the future. Since XDP and cls_bpf programs can only be loaded by admin, we can drop the explicit capability check from offload code. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index fd696d3dd429..ac187f9ee182 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -14,8 +14,9 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) struct net *net = current->nsproxy->net_ns; struct bpf_dev_offload *offload; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && + attr->prog_type != BPF_PROG_TYPE_XDP) + return -EINVAL; if (attr->prog_flags) return -EINVAL; From 5239303433bf475632dc0980af9b8dccea0a8f21 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:53 -0800 Subject: [PATCH 0071/1640] BACKPORT: bpf: offload: rename the ifindex field prog_target_ifindex seems long and clunky, rename it to prog_ifindex. We don't want to call this field just ifindex, because maps may need a similar field in the future and bpf_attr members for programs and maps are unnamed.
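For illustration, a hedged user-space sketch of a device-bound load using the renamed field (assumes a uapi bpf.h that already carries prog_ifindex; the wrapper name is invented, and program setup and error handling are elided):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Load an XDP program for a specific netdev via BPF_PROG_LOAD. */
static int load_prog_for_dev(const struct bpf_insn *insns, __u32 insn_cnt,
			     __u32 ifindex)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_XDP;
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = insn_cnt;
	attr.license = (__u64)(unsigned long)"GPL";
	attr.prog_ifindex = ifindex;	/* was prog_target_ifindex */

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}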
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 +- kernel/bpf/offload.c | 2 +- kernel/bpf/syscall.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bb2d571bc86e..23b96f01dd3e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -262,7 +262,7 @@ union bpf_attr { __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; - __u32 prog_target_ifindex; /* ifindex of netdev to prep for */ + __u32 prog_ifindex; /* ifindex of netdev to prep for */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index ac187f9ee182..a778e5df7e26 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -29,7 +29,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) init_waitqueue_head(&offload->verifier_done); rtnl_lock(); - offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex); + offload->netdev = __dev_get_by_index(net, attr->prog_ifindex); if (!offload->netdev) { rtnl_unlock(); kfree(offload); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5160bbd82766..48bdc7b095b4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1135,7 +1135,7 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex +#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex static int bpf_prog_load(union bpf_attr *attr) { @@ -1198,7 +1198,7 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 1 : 0; - if (attr->prog_target_ifindex) { + if (attr->prog_ifindex) { err = bpf_prog_offload_init(prog, attr); if (err) goto free_prog; From 156da6c30c863f43833884b010240e8b61cc6a0d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:54 -0800 Subject: [PATCH 0072/1640] BACKPORT: bpf: offload: move offload device validation out to the drivers With TC shared block changes we can't depend on the correct netdev pointer being available in cls_bpf. Move the device validation to the driver. Core will only make sure that offloaded programs are always attached in the driver (or in HW by the driver). We trust that drivers which implement offload callbacks will perform necessary checks. Moving the checks to the driver is generally a useful thing; in practice, the check should be against a switchdev instance, not a netdev, given that most ASICs will probably allow using the same program on many ports.
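A stand-alone sketch of the kind of check a driver now owns (all names are invented for illustration; a real driver would compare against its own switchdev/netdev state in its ndo_bpf handler):

#include <errno.h>
#include <stdbool.h>

struct netdev { int ifindex; };

struct prog {
	bool dev_bound;			/* loaded with prog_ifindex set */
	struct netdev *bound_dev;	/* device it was verified against */
};

/* The validation the core no longer performs on the driver's behalf. */
static int drv_xdp_setup(struct netdev *dev, struct prog *prog)
{
	if (!prog)			/* detach request: nothing to check */
		return 0;
	if (prog->dev_bound && prog->bound_dev != dev)
		return -EINVAL;		/* program bound to another device */
	return 0;
}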
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Jiri Pirko Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 4 ++-- kernel/bpf/syscall.c | 23 ++++++++++++----------- net/core/dev.c | 7 ++----- net/sched/cls_bpf.c | 8 +++----- 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0bdb3a00a732..b5a01131a142 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -352,7 +352,7 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, - struct net_device *netdev); + bool attach_drv); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); @@ -455,7 +455,7 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, - struct net_device *netdev) + bool attach_drv) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 48bdc7b095b4..9052b47fc337 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1074,22 +1074,23 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_can_attach(struct bpf_prog *prog, - enum bpf_prog_type *attach_type, - struct net_device *netdev) +static bool bpf_prog_get_ok(struct bpf_prog *prog, + enum bpf_prog_type *attach_type, bool attach_drv) { - struct bpf_dev_offload *offload = prog->aux->offload; + /* not an attachment, just a refcount inc, always allow */ + if (!attach_type) + return true; if (prog->type != *attach_type) return false; - if (offload && offload->netdev != netdev) + if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) return false; return true; } static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, - struct net_device *netdev) + bool attach_drv) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1097,7 +1098,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) { + if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1110,12 +1111,12 @@ out: struct bpf_prog *bpf_prog_get(u32 ufd) { - return __bpf_prog_get(ufd, NULL, NULL); + return __bpf_prog_get(ufd, NULL, false); } struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, false); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); @@ -1124,9 +1125,9 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, - struct net_device *netdev) + bool attach_drv) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); diff --git a/net/core/dev.c b/net/core/dev.c index b949e63da37d..2b9e3e0a487d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ 
-7257,11 +7257,8 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; - if (bpf_op == ops->ndo_bpf) - prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, - dev); - else - prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + bpf_op == ops->ndo_bpf); if (IS_ERR(prog)) return PTR_ERR(prog); } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index bb12bac6a714..3096b2f8b0a9 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -371,15 +371,13 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, { struct bpf_prog *fp; char *name = NULL; + bool skip_sw; u32 bpf_fd; bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); + skip_sw = gen_flags & TCA_CLS_FLAGS_SKIP_SW; - if (gen_flags & TCA_CLS_FLAGS_SKIP_SW) - fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, - qdisc_dev(tp->q)); - else - fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); + fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, skip_sw); if (IS_ERR(fp)) return PTR_ERR(fp); From fe249d52048495e158ace438d22a0ec3d247d94f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:56 -0800 Subject: [PATCH 0073/1640] UPSTREAM: bpf: turn bpf_prog_get_type() into a wrapper bpf_prog_get_type() is identical to bpf_prog_get_type_dev(), with false passed as attach_drv. Instead of keeping it as an exported symbol turn it into static inline wrapper. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 13 ++++++------- kernel/bpf/syscall.c | 10 ---------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b5a01131a142..ec1a3b7bacc3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -350,7 +350,6 @@ extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); -struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); @@ -447,12 +446,6 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd) return ERR_PTR(-EOPNOTSUPP); } -static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, - enum bpf_prog_type type) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) @@ -549,6 +542,12 @@ static inline bool unprivileged_ebpf_enabled(void) #endif /* CONFIG_BPF_SYSCALL */ +static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, + enum bpf_prog_type type) +{ + return bpf_prog_get_type_dev(ufd, type, false); +} + int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); u32 bpf_prog_offload_ifindex(struct bpf_prog *prog); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9052b47fc337..12000e174b1e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1114,16 +1114,6 @@ struct bpf_prog *bpf_prog_get(u32 ufd) return __bpf_prog_get(ufd, NULL, false); } -struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) -{ - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, false); - - if (!IS_ERR(prog)) - trace_bpf_prog_get_type(prog); - return prog; -} 
-EXPORT_SYMBOL_GPL(bpf_prog_get_type); - struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { From 14d32bfa6a4cf60c3a65d6e38c2191a4fc6eb917 Mon Sep 17 00:00:00 2001 From: Tim Zimmermann Date: Sun, 17 Aug 2025 07:50:19 +0000 Subject: [PATCH 0074/1640] Revert "UPSTREAM: bpf: relax inode permission check for retrieving bpf program" This reverts commit 7453f2852ad2e4b283fae51a2047b7d02abe8070. --- kernel/bpf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 556d70b9e731..6e9e7f242140 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -372,7 +372,7 @@ out: static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) { struct bpf_prog *prog; - int ret = inode_permission(inode, MAY_READ); + int ret = inode_permission(inode, MAY_READ | MAY_WRITE); if (ret) return ERR_PTR(ret); From 86d0bab1094c72d0d949fc12168383e3331f98ed Mon Sep 17 00:00:00 2001 From: Tim Zimmermann Date: Sun, 17 Aug 2025 07:50:36 +0000 Subject: [PATCH 0075/1640] Revert "BACKPORT: fix "netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'"" This reverts commit 1c0ac5e9bf88a8986b9c32a9c6d273e748f291d6. --- include/linux/bpf.h | 8 -------- kernel/bpf/inode.c | 37 +------------------------------------ net/netfilter/xt_bpf.c | 13 +++++++++++-- 3 files changed, 12 insertions(+), 46 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ec1a3b7bacc3..99c055f19805 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -414,8 +414,6 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); -struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); - /* Map specifics */ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); @@ -492,12 +490,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } -static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, - enum bpf_prog_type type) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 6e9e7f242140..e2821e8834f2 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -368,42 +368,7 @@ out: putname(pname); return ret; } - -static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) -{ - struct bpf_prog *prog; - int ret = inode_permission(inode, MAY_READ | MAY_WRITE); - if (ret) - return ERR_PTR(ret); - - if (inode->i_op == &bpf_map_iops) - return ERR_PTR(-EINVAL); - if (inode->i_op != &bpf_prog_iops) - return ERR_PTR(-EACCES); - - prog = inode->i_private; - - ret = security_bpf_prog(prog); - if (ret < 0) - return ERR_PTR(ret); - - return bpf_prog_inc(prog); -} - -struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) -{ - struct bpf_prog *prog; - struct path path; - int ret = kern_path(name, LOOKUP_FOLLOW, &path); - if (ret) - return ERR_PTR(ret); - prog = __get_prog_inode(d_backing_inode(path.dentry), type); - if (!IS_ERR(prog)) - touch_atime(&path); - path_put(&path); - return prog; -} -EXPORT_SYMBOL(bpf_prog_get_type_path); +EXPORT_SYMBOL_GPL(bpf_obj_get_user); /* * Display the mount options in /proc/mounts. 
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 7908b67e4b6a..f14a58c29cd4 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -57,12 +57,21 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) { + mm_segment_t oldfs = get_fs(); + int retval, fd; + if (strnlen(path, XT_BPF_PATH_MAX) == XT_BPF_PATH_MAX) return -EINVAL; - *ret = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER); - return PTR_ERR_OR_ZERO(*ret); + set_fs(KERNEL_DS); + fd = bpf_obj_get_user(path, 0); + set_fs(oldfs); + if (fd < 0) + return fd; + retval = __bpf_mt_check_fd(fd, ret); + sys_close(fd); + return retval; } static int bpf_mt_check(const struct xt_mtchk_param *par) From 1458fca282c5e557acca72fd81f8665548db5f71 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 2 Dec 2017 20:20:38 -0500 Subject: [PATCH 0076/1640] BACKPORT: fix "netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'" Descriptor table is a shared object; it's not a place where you can stick temporary references to files, especially when we don't need an opened file at all. Cc: stable@vger.kernel.org # v4.14 Fixes: 98589a0998b8 ("netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'") Signed-off-by: Al Viro --- include/linux/bpf.h | 10 ++++++++++ kernel/bpf/inode.c | 40 +++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 2 +- net/netfilter/xt_bpf.c | 14 ++------------ 4 files changed, 52 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 99c055f19805..cf9bcb0882aa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -433,6 +433,8 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) attr->numa_node : NUMA_NO_NODE; } +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); + static inline bool unprivileged_ebpf_enabled(void) { return !sysctl_unprivileged_bpf_disabled; @@ -527,6 +529,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, return 0; } +static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, + enum bpf_prog_type type) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline bool unprivileged_ebpf_enabled(void) { return false; @@ -540,6 +548,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, return bpf_prog_get_type_dev(ufd, type, false); } +bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); + int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); u32 bpf_prog_offload_ifindex(struct bpf_prog *prog); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index e2821e8834f2..f66ae4704bdf 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -368,7 +368,45 @@ out: putname(pname); return ret; } -EXPORT_SYMBOL_GPL(bpf_obj_get_user); + +static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + if (ret) + return ERR_PTR(ret); + + if (inode->i_op == &bpf_map_iops) + return ERR_PTR(-EINVAL); + if (inode->i_op != &bpf_prog_iops) + return ERR_PTR(-EACCES); + + prog = inode->i_private; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ERR_PTR(ret); + + if (!bpf_prog_get_ok(prog, &type, false)) + return ERR_PTR(-EINVAL); + + return bpf_prog_inc(prog); +} + +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) +{ + struct bpf_prog 
*prog; + struct path path; + int ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + prog = __get_prog_inode(d_backing_inode(path.dentry), type); + if (!IS_ERR(prog)) + touch_atime(&path); + path_put(&path); + return prog; +} +EXPORT_SYMBOL(bpf_prog_get_type_path); /* * Display the mount options in /proc/mounts. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 12000e174b1e..537154d2d430 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1074,7 +1074,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_get_ok(struct bpf_prog *prog, +bool bpf_prog_get_ok(struct bpf_prog *prog, enum bpf_prog_type *attach_type, bool attach_drv) { /* not an attachment, just a refcount inc, always allow */ diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index f14a58c29cd4..a2cf8a6236d6 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -57,21 +57,11 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) { - mm_segment_t oldfs = get_fs(); - int retval, fd; - if (strnlen(path, XT_BPF_PATH_MAX) == XT_BPF_PATH_MAX) return -EINVAL; - set_fs(KERNEL_DS); - fd = bpf_obj_get_user(path, 0); - set_fs(oldfs); - if (fd < 0) - return fd; - - retval = __bpf_mt_check_fd(fd, ret); - sys_close(fd); - return retval; + *ret = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER); + return PTR_ERR_OR_ZERO(*ret); } static int bpf_mt_check(const struct xt_mtchk_param *par) From b8e48ee5654557d3471beb4505f25f6571d3e5db Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Tue, 14 May 2019 19:42:57 -0700 Subject: [PATCH 0077/1640] UPSTREAM: bpf: relax inode permission check for retrieving bpf program commit e547ff3f803e779a3898f1f48447b29f43c54085 upstream. For the iptables module to load a bpf program from a pinned location, it only retrieves a loaded program and cannot change the program content, so requiring write permission for it might not be necessary. Also, adding or removing an unrelated iptables rule might need to flush and reload the xt_bpf related rules as well, which triggers the inode permission check. It might be better to remove the write permission check for the inode so we won't need to grant write access to all the processes that flush and restore iptables rules. Signed-off-by: Chenbo Feng Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index f66ae4704bdf..1ec657e395c6 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -372,7 +372,7 @@ out: static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) { struct bpf_prog *prog; - int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + int ret = inode_permission(inode, MAY_READ); if (ret) return ERR_PTR(ret); From 24bbac91cde257ecedfa5ad6261967519edda515 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:57 -0800 Subject: [PATCH 0078/1640] UPSTREAM: bpf: offload: ignore namespace moves We are currently destroying the device offload state when the device moves to another net namespace. This doesn't break with current NFP code, because offload state is not used on program removal, but it's not correct behaviour. Ignore the device unregister notifications on namespace move.
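For reference, a minimal sketch of the pattern (the handler name is illustrative): a notifier can tell a real unregister apart from a namespace move because dev_change_net_namespace() also emits NETDEV_UNREGISTER but leaves reg_state at NETREG_REGISTERED rather than NETREG_UNREGISTERING:

static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);

	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;

	/* Namespace move: the device stays registered, keep our state. */
	if (netdev->reg_state != NETREG_UNREGISTERING)
		return NOTIFY_DONE;

	/* ... tear down per-device state for a genuine unregister ... */
	return NOTIFY_DONE;
}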
Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index a778e5df7e26..d4267c674fec 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -174,6 +174,10 @@ static int bpf_offload_notification(struct notifier_block *notifier, switch (event) { case NETDEV_UNREGISTER: + /* ignore namespace changes */ + if (netdev->reg_state != NETREG_UNREGISTERING) + break; + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) { if (offload->netdev == netdev) From 41dd4996082bb4b612217ef9ae10a528009d9931 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:59 -0800 Subject: [PATCH 0079/1640] UPSTREAM: bpf: revert report offload info to user space This reverts commit bd601b6ada11 ("bpf: report offload info to user space"). The ifindex by itself is not sufficient, we should provide information on which network namespace this ifindex belongs to. After considering some options we concluded that it's best to just remove this API for now, and rework it in -next. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 - include/uapi/linux/bpf.h | 6 ------ kernel/bpf/offload.c | 12 ------------ kernel/bpf/syscall.c | 5 ----- 4 files changed, 24 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cf9bcb0882aa..8de2225eda52 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -552,7 +552,6 @@ bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); -u32 bpf_prog_offload_ifindex(struct bpf_prog *prog); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 23b96f01dd3e..d5f3780bc2df 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -995,10 +995,6 @@ enum sk_action { #define BPF_TAG_SIZE 8 -enum bpf_prog_status { - BPF_PROG_STATUS_DEV_BOUND = (1 << 0), -}; - struct bpf_prog_info { __u32 type; __u32 id; @@ -1012,8 +1008,6 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; - __u32 ifindex; - __u32 status; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index d4267c674fec..68ec884440b7 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -149,18 +149,6 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } -u32 bpf_prog_offload_ifindex(struct bpf_prog *prog) -{ - struct bpf_dev_offload *offload = prog->aux->offload; - u32 ifindex; - - rtnl_lock(); - ifindex = offload->netdev ? 
offload->netdev->ifindex : 0; - rtnl_unlock(); - - return ifindex; -} - const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 537154d2d430..2f53519f753a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1640,11 +1640,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } - if (bpf_prog_is_dev_bound(prog->aux)) { - info.status |= BPF_PROG_STATUS_DEV_BOUND; - info.ifindex = bpf_prog_offload_ifindex(prog); - } - done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) From 17f9f69194dbfd66637667a0da8b9fd1e7b8688a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:22:00 -0800 Subject: [PATCH 0080/1640] UPSTREAM: bpf: make bpf_prog_offload_verifier_prep() static inline Header implementation of bpf_prog_offload_verifier_prep() which is used if CONFIG_NET=n should be a static inline. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index afa5baf72a72..2726c70730e8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -189,7 +189,7 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); #else -int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +static inline int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) { return -EOPNOTSUPP; } From 1497f23c8506882c1fc330d26963f051a3fba684 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Wed, 22 Nov 2017 18:32:53 +0000 Subject: [PATCH 0081/1640] BACKPORT: bpf: introduce ARG_PTR_TO_MEM_OR_NULL With the current ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM semantics, a helper argument can be NULL when the next argument type is ARG_CONST_SIZE_OR_ZERO and the verifier can prove the value of this next argument is 0. However, most helpers are just interested in handling <!NULL, size > 0>, so forcing them to deal with <NULL, size = 0> makes the implementation of those helpers more complicated for no apparent benefits, requiring them to explicitly handle those corner cases with checks that bpf programs could start relying upon, preventing the possibility of removing them later. Solve this by making ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM never accept NULL even when ARG_CONST_SIZE_OR_ZERO is set, and introduce a new argument type ARG_PTR_TO_MEM_OR_NULL to explicitly deal with the NULL case. Currently, the only helper that needs this is bpf_csum_diff_proto(), so change arg1 and arg3 to this new type as well. Also add a new battery of tests that explicitly test the !ARG_PTR_TO_MEM_OR_NULL combination: all the current ones testing the various <NULL, size> variations are focused on bpf_csum_diff, so cover also other helpers.
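To make the new contract concrete, a hedged sketch with an invented helper: under plain ARG_PTR_TO_MEM plus ARG_CONST_SIZE_OR_ZERO the verifier now guarantees a non-NULL pointer, so the body needs no NULL check even when the size may be zero:

BPF_CALL_2(bpf_example_sum, const void *, data, u32, size)
{
	const u8 *p = data;	/* never NULL under ARG_PTR_TO_MEM */
	u32 sum = 0;

	while (size--)
		sum += *p++;
	return sum;
}

static const struct bpf_func_proto bpf_example_sum_proto = {
	.func		= bpf_example_sum,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
};

A helper that genuinely wants the <NULL, size = 0> pair, like bpf_csum_diff(), opts in with ARG_PTR_TO_MEM_OR_NULL and handles NULL itself.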
Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 4 +++- net/core/filter.c | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8de2225eda52..dd2d6735f750 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -93,6 +93,7 @@ enum bpf_arg_type { * functions that access data on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ + ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, * helper function must fill all bytes or clear * them in error case. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f79b43cb7a36..1dfcaee271bb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1488,13 +1488,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (err < 0) return err; } else if (arg_type == ARG_PTR_TO_MEM || + arg_type == ARG_PTR_TO_MEM_OR_NULL || arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking. */ - if (register_is_null(*reg)) + if (register_is_null(*reg) && + arg_type == ARG_PTR_TO_MEM_OR_NULL) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && diff --git a/net/core/filter.c b/net/core/filter.c index f9e4a43c19c9..10e9507fa256 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1703,9 +1703,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM_OR_NULL, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; From c1d72290f5a68782f07067dc679c0fc953202ba7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Nov 2017 12:10:23 -0800 Subject: [PATCH 0082/1640] UPSTREAM: bpf: offload: add a license header I forgot to add a license on kernel/bpf/offload.c. Luckily I'm still the only author so make it explicitly GPLv2. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 68ec884440b7..8455b89d1bbf 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree. + * + * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE + * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME + * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 
+ */ + #include #include #include From cea10be5bbc34c9e772467b76f6707c47a3960ba Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 30 Nov 2017 13:47:54 -0800 Subject: [PATCH 0083/1640] BACKPORT: bpf: set maximum number of attached progs to 64 for a single perf tp cgropu+bpf prog array has a maximum number of 64 programs. Let us apply the same limit here. Fixes: e87c6bc3852b ("bpf: permit multiple bpf attachments for a single perf event") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 3 ++- kernel/trace/bpf_trace.c | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a771d313ba89..4968719dc6d3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1595,7 +1595,8 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) rcu_read_lock(); prog = rcu_dereference(progs)->progs; for (; *prog; prog++) - cnt++; + if (*prog != &dummy_bpf_prog.prog) + cnt++; rcu_read_unlock(); return cnt; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 28751d3dc8e1..2fdaef452d91 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -768,6 +768,8 @@ const struct bpf_prog_ops perf_event_prog_ops = { static DEFINE_MUTEX(bpf_event_mutex); +#define BPF_TRACE_MAX_PROGS 64 + int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { @@ -782,6 +784,12 @@ int perf_event_attach_bpf_prog(struct perf_event *event, old_array = rcu_dereference_protected(event->tp_event->prog_array, lockdep_is_held(&bpf_event_mutex)); + if (old_array && + bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { + ret = -E2BIG; + goto out; + } + ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); if (ret < 0) goto out; From c077ca0a600ee94ca62bd29401f0cee847df337c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:36 -0800 Subject: [PATCH 0084/1640] UPSTREAM: bpf: print liveness info to verifier log let verifier print register and stack liveness information into verifier log Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1dfcaee271bb..a783275ac25f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -216,6 +216,17 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +static void print_liveness(struct bpf_verifier_env *env, + enum bpf_reg_liveness live) +{ + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + verbose(env, "_"); + if (live & REG_LIVE_READ) + verbose(env, "r"); + if (live & REG_LIVE_WRITTEN) + verbose(env, "w"); +} + static void print_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state *state) { @@ -228,7 +239,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, t = reg->type; if (t == NOT_INIT) continue; - verbose(env, " R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d", i); + print_liveness(env, reg->live); + verbose(env, "=%s", reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ @@ -277,10 +290,13 @@ static void print_verifier_state(struct bpf_verifier_env *env, } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) - verbose(env, " fp%d=%s", - (-i - 1) 
* BPF_REG_SIZE, + if (state->stack[i].slot_type[0] == STACK_SPILL) { + verbose(env, " fp%d", + (-i - 1) * BPF_REG_SIZE); + print_liveness(env, state->stack[i].spilled_ptr.live); + verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); + } } verbose(env, "\n"); } From e51d78f97da319c6abb546271ebe84c4924ca577 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:37 -0800 Subject: [PATCH 0085/1640] UPSTREAM: bpf: don't mark FP reg as uninit when verifier hits an internal bug don't mark register R10==FP as uninit, since it's read only register and it's not technically correct to let verifier run further, since it may assume that R10 has valid auxiliary state. While developing subsequent patches this issue was discovered, though the code eventually changed that aux reg state doesn't have pointers any more it is still safer to avoid clearing readonly register. Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a783275ac25f..3c7109f7c3d7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -586,8 +586,8 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, { if (WARN_ON(regno >= MAX_BPF_REG)) { verbose(env, "mark_reg_unknown(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < MAX_BPF_REG; regno++) + /* Something bad happened, let's kill all regs except FP */ + for (regno = 0; regno < BPF_REG_FP; regno++) __mark_reg_not_init(regs + regno); return; } @@ -605,8 +605,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, { if (WARN_ON(regno >= MAX_BPF_REG)) { verbose(env, "mark_reg_not_init(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < MAX_BPF_REG; regno++) + /* Something bad happened, let's kill all regs except FP */ + for (regno = 0; regno < BPF_REG_FP; regno++) __mark_reg_not_init(regs + regno); return; } From e982cbc5832fe39127ed0dd8fd4738a8b8635588 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:38 -0800 Subject: [PATCH 0086/1640] UPSTREAM: bpf: improve verifier liveness marks registers with pointers filled from stack were missing live_written marks which caused liveness propagation to unnecessary mark more registers as live_read and miss state pruning opportunities later on. 
before after bpf_lb-DLB_L3.o 2285 2270 bpf_lb-DLB_L4.o 3723 3682 bpf_lb-DUNKNOWN.o 1110 1110 bpf_lxc-DDROP_ALL.o 27954 27876 bpf_lxc-DUNKNOWN.o 38954 38780 bpf_netdev.o 16943 16937 bpf_overlay.o 7929 7929 Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3c7109f7c3d7..e9d28c52bcce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -822,6 +822,11 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ state->regs[value_regno] = state->stack[spi].spilled_ptr; + /* mark reg as written since spilled pointer state likely + * has its liveness marks cleared by is_state_visited() + * which resets stack/reg liveness for state transitions + */ + state->regs[value_regno].live |= REG_LIVE_WRITTEN; mark_stack_slot_read(state, spi); } return 0; From 26d3359069b99b1fbd7e0592d1e85dc0d3d0e2e1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:39 -0800 Subject: [PATCH 0087/1640] UPSTREAM: bpf: improve JEQ/JNE path walking verifier knows how to trim paths that are known not to be taken at run-time when register containing run-time constant is compared with another constant. It was done only for JEQ comparison. Extend it to include JNE as well. More cases can be added in the future. before after bpf_lb-DLB_L3.o 2270 2051 bpf_lb-DLB_L4.o 3682 3287 bpf_lb-DUNKNOWN.o 1110 1080 bpf_lxc-DDROP_ALL.o 27876 24980 bpf_lxc-DUNKNOWN.o 38780 34308 bpf_netdev.o 16937 15404 bpf_overlay.o 7929 7191 Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e9d28c52bcce..7128ec81ee0a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3382,8 +3382,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == SCALAR_VALUE && - tnum_equals_const(dst_reg->var_off, insn->imm)) { - if (opcode == BPF_JEQ) { + tnum_is_const(dst_reg->var_off)) { + if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || + (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { /* if (imm == imm) goto pc+off; * only follow the goto, ignore fall-through */ From 4c74800c7a450b6977711a74c2eda650b8a695e3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:40 -0800 Subject: [PATCH 0088/1640] UPSTREAM: bpf: cleanup register_is_null() don't pass large struct bpf_reg_state by value. Instead pass it by pointer. Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7128ec81ee0a..5d298d687506 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1365,9 +1365,9 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins } /* Does this register contain a constant zero? 
*/ -static bool register_is_null(struct bpf_reg_state reg) +static bool register_is_null(struct bpf_reg_state *reg) { - return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0); + return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); } /* when register 'regno' is passed into function that will read 'access_size' @@ -1380,32 +1380,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { + struct bpf_reg_state *reg = cur_regs(env) + regno; struct bpf_verifier_state *state = env->cur_state; - struct bpf_reg_state *regs = state->regs; int off, i, slot, spi; - if (regs[regno].type != PTR_TO_STACK) { + if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && - register_is_null(regs[regno])) + register_is_null(reg)) return 0; verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[regs[regno].type], + reg_type_str[reg->type], reg_type_str[PTR_TO_STACK]); return -EACCES; } /* Only allow fixed-offset stack reads */ - if (!tnum_is_const(regs[regno].var_off)) { + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); return -EACCES; } - off = regs[regno].off + regs[regno].var_off.value; + off = reg->off + reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size < 0 || (access_size == 0 && !zero_size_allowed)) { verbose(env, "invalid stack type R%d off=%d access_size=%d\n", @@ -1516,7 +1516,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking. */ - if (register_is_null(*reg) && + if (register_is_null(reg) && arg_type == ARG_PTR_TO_MEM_OR_NULL) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && From 5f0bedd4a2a5bda5fb397cc4653888a40e712a5d Mon Sep 17 00:00:00 2001 From: William Tu Date: Fri, 1 Dec 2017 15:26:09 -0800 Subject: [PATCH 0089/1640] UPSTREAM: bpf: allow disabling tunnel csum for ipv6 Before the patch, BPF_F_ZERO_CSUM_TX can be used only for ipv4 tunnel. With introduction of ip6gretap collect_md mode, the flag should be also supported for ipv6. Signed-off-by: William Tu Cc: Daniel Borkmann Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 10e9507fa256..0d3c4bca55b5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3109,10 +3109,11 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); - if (flags & BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; } + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; + return 0; } From 7ac11766804279c0c71e0357c33cff1d998acae1 Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 4 Dec 2017 14:18:29 -0800 Subject: [PATCH 0090/1640] UPSTREAM: bpf: move bpf csum flag check trivial move the BPF_F_ZERO_CSUM_TX check right below the 'flags & BPF_F_DONT_FRAGMENT', so common tun_flags handling is logically together. 
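From the program side, the flag now behaves the same for both address families. A minimal restricted-C sketch, assuming the usual sample headers provide SEC(), bpf_htonl() and the TC_ACT_* constants; the tunnel id and prefix are made up:

SEC("classifier")
int example_set_tunnel(struct __sk_buff *skb)
{
	struct bpf_tunnel_key key = {};

	key.tunnel_id = 42;
	key.remote_ipv6[0] = bpf_htonl(0xfd000000);	/* fd00::/8 */

	/* BPF_F_ZERO_CSUM_TX now clears TUNNEL_CSUM for IPv6 tunnels too. */
	if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
				   BPF_F_TUNINFO_IPV6 |
				   BPF_F_DONT_FRAGMENT |
				   BPF_F_ZERO_CSUM_TX))
		return TC_ACT_SHOT;
	return TC_ACT_OK;
}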
Signed-off-by: William Tu Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- net/core/filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 0d3c4bca55b5..e4dd66b48cb3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3096,6 +3096,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; if (flags & BPF_F_DONT_FRAGMENT) info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -3111,9 +3113,6 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); } - if (flags & BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; - return 0; } From f8d6be7cc45b77916c67b200b5f51fde966b5711 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 1 Dec 2017 10:15:04 -0800 Subject: [PATCH 0091/1640] UPSTREAM: bpf: Add access to snd_cwnd and others in sock_ops Adds read access to snd_cwnd and srtt_us fields of tcp_sock. Since these fields are only valid if the socket associated with the sock_ops program call is a full socket, the field is_fullsock is also added to the bpf_sock_ops struct. If the socket is not a full socket, reading these fields returns 0. Note that in most cases it will not be necessary to check is_fullsock to know if there is a full socket. The context of the call, as specified by the 'op' field, can sometimes determine whether there is a full socket. The struct bpf_sock_ops has the following fields added: __u32 is_fullsock; /* Some TCP fields are only valid if * there is a full socket. If not, the * fields read as zero. */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ There is a new macro, SOCK_OPS_GET_TCP32(NAME), to make it easier to add read access to more 32 bit tcp_sock fields. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/net/tcp.h | 8 +++++--- include/uapi/linux/bpf.h | 6 ++++++ net/core/filter.c | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index c3038118a4c0..18ce37dfe7dd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1066,6 +1066,7 @@ struct bpf_sock_ops_kern { u32 reply; u32 replylong[4]; }; + u32 is_fullsock; }; #endif /* __LINUX_FILTER_H__ */ diff --git a/include/net/tcp.h b/include/net/tcp.h index be3f56ab10ad..de92c59012d0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2155,10 +2155,12 @@ static inline int tcp_call_bpf(struct sock *sk, int op) struct bpf_sock_ops_kern sock_ops; int ret; - if (sk_fullsock(sk)) - sock_owned_by_me(sk); - memset(&sock_ops, 0, sizeof(sock_ops)); + if (sk_fullsock(sk)) { + sock_ops.is_fullsock = 1; + sock_owned_by_me(sk); + } + sock_ops.sk = sk; sock_ops.op = op; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d5f3780bc2df..c74db97d94f5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1039,6 +1039,12 @@ struct bpf_sock_ops { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + __u32 is_fullsock; /* Some TCP fields are only valid if + * there is a full socket. 
If not, the + * fields read as zero. + */ + __u32 snd_cwnd; + __u32 srtt_us; /* Averaged RTT << 3 in usecs */ }; /* List of known BPF sock_ops operators. diff --git a/net/core/filter.c b/net/core/filter.c index e4dd66b48cb3..c543b258cfeb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4534,6 +4534,42 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct bpf_sock_ops, is_fullsock): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + break; + +/* Helper macro for adding read access to tcp_sock fields. */ +#define SOCK_OPS_GET_TCP32(FIELD_NAME) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \ + offsetof(struct tcp_sock, FIELD_NAME)); \ + } while (0) + + case offsetof(struct bpf_sock_ops, snd_cwnd): + SOCK_OPS_GET_TCP32(snd_cwnd); + break; + + case offsetof(struct bpf_sock_ops, srtt_us): + SOCK_OPS_GET_TCP32(srtt_us); + break; } return insn - insn_buf; } From 3b0c288f8488497e4128a4695fe8aa66814f60c3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 11 Dec 2017 11:39:02 -0800 Subject: [PATCH 0092/1640] UPSTREAM: bpf/tracing: allow user space to query prog array on the same tp Commit e87c6bc3852b ("bpf: permit multiple bpf attachments for a single perf event") added support to attach multiple bpf programs to a single perf event. Although this provides flexibility, users may want to know what other bpf programs attached to the same tp interface. Besides getting visibility for the underlying bpf system, such information may also help consolidate multiple bpf programs, understand potential performance issues due to a large array, and debug (e.g., one bpf program which overwrites return code may impact subsequent program results). Commit 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") utilized the existing perf ioctl interface and added the command PERF_EVENT_IOC_SET_BPF to attach a bpf program to a tracepoint. This patch adds a new ioctl command, given a perf event fd, to query the bpf program array attached to the same perf tracepoint event. The new uapi ioctl command: PERF_EVENT_IOC_QUERY_BPF The new uapi/linux/perf_event.h structure: struct perf_event_query_bpf { __u32 ids_len; __u32 prog_cnt; __u32 ids[0]; }; User space provides buffer "ids" for kernel to copy to. When returning from the kernel, the number of available programs in the array is set in "prog_cnt". The usage: struct perf_event_query_bpf *query = malloc(sizeof(*query) + sizeof(u32) * ids_len); query.ids_len = ids_len; err = ioctl(pmu_efd, PERF_EVENT_IOC_QUERY_BPF, query); if (err == 0) { /* query.prog_cnt is the number of available progs, * number of progs in ids: (ids_len == 0) ? 
0 : query.prog_cnt */ } else if (errno == ENOSPC) { /* query.ids_len number of progs copied, * query.prog_cnt is the number of available progs */ } else { /* other errors */ } Signed-off-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++++ include/uapi/linux/perf_event.h | 22 ++++++++++++++++++++++ kernel/bpf/core.c | 21 +++++++++++++++++++++ kernel/events/core.c | 3 +++ kernel/trace/bpf_trace.c | 23 +++++++++++++++++++++++ 5 files changed, 73 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dd2d6735f750..614284d9c2f7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -270,6 +270,7 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); +int bpf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); @@ -301,6 +302,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, struct bpf_prog *old_prog); +int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, + __u32 __user *prog_ids, u32 request_cnt, + __u32 __user *prog_cnt); int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index fc72a3839c9d..60460cc6d78f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -418,6 +418,27 @@ struct perf_event_attr { __u16 __reserved_2; /* align to __u64 */ }; +/* + * Structure used by below PERF_EVENT_IOC_QUERY_BPF command + * to query bpf programs attached to the same perf tracepoint + * as the given perf event. 
+ */ +struct perf_event_query_bpf { + /* + * The below ids array length + */ + __u32 ids_len; + /* + * Set by the kernel to indicate the number of + * available programs + */ + __u32 prog_cnt; + /* + * User provided buffer to store program ids + */ + __u32 ids[0]; +}; + #define perf_flags(attr) (*(&(attr)->read_format + 1)) /* @@ -433,6 +454,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) #define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4968719dc6d3..e36ae5a661f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1610,6 +1610,8 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, rcu_read_lock(); prog = rcu_dereference(progs)->progs; for (; *prog; prog++) { + if (*prog == &dummy_bpf_prog.prog) + continue; id = (*prog)->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) { rcu_read_unlock(); @@ -1693,6 +1695,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, return 0; } +int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, + __u32 __user *prog_ids, u32 request_cnt, + __u32 __user *prog_cnt) +{ + u32 cnt = 0; + + if (array) + cnt = bpf_prog_array_length(array); + + if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + + /* return early if user requested only program count or nothing to copy */ + if (!request_cnt || !cnt) + return 0; + + return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/events/core.c b/kernel/events/core.c index 08ed6a2d3a23..644b81ead72a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5115,6 +5115,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon rcu_read_unlock(); return 0; } + + case PERF_EVENT_IOC_QUERY_BPF: + return bpf_event_query_prog_array(event, (void __user *)arg); default: return -ENOTTY; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2fdaef452d91..e39165a2daa0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -832,3 +832,26 @@ void perf_event_detach_bpf_prog(struct perf_event *event) out: mutex_unlock(&bpf_event_mutex); } + +int bpf_event_query_prog_array(struct perf_event *event, void __user *info) +{ + struct perf_event_query_bpf __user *uquery = info; + struct perf_event_query_bpf query = {}; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + if (copy_from_user(&query, uquery, sizeof(query))) + return -EFAULT; + + mutex_lock(&bpf_event_mutex); + ret = bpf_prog_array_copy_info(event->tp_event->prog_array, + uquery->ids, + query.ids_len, + &uquery->prog_cnt); + mutex_unlock(&bpf_event_mutex); + + return ret; +} From b814a58783dbb16ce0da1fb0c2b221e2db59e96a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 11 Dec 2017 11:36:46 -0500 Subject: [PATCH 0093/1640] UPSTREAM: add infrastructure for tagging functions as error injectable Using BPF we can override kprob'ed functions and return arbitrary values. Obviously this can be a bit unsafe, so make this feature opt-in for functions. 
Simply tag a function with KPROBE_ERROR_INJECT_SYMBOL in order to give BPF access to that function for error injection purposes. Signed-off-by: Josef Bacik Acked-by: Ingo Molnar Signed-off-by: Alexei Starovoitov --- include/asm-generic/vmlinux.lds.h | 10 ++ include/linux/bpf.h | 11 ++ include/linux/kprobes.h | 1 + include/linux/module.h | 5 + kernel/kprobes.c | 163 ++++++++++++++++++++++++++++++ kernel/module.c | 6 +- 6 files changed, 195 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index dfe27f9dc2f6..8fc1d1730ec6 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -139,6 +139,15 @@ #define KPROBE_BLACKLIST() #endif +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +#define ERROR_INJECT_LIST() . = ALIGN(8); \ + VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .; \ + KEEP(*(_kprobe_error_inject_list)) \ + VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) = .; +#else +#define ERROR_INJECT_LIST() +#endif + #ifdef CONFIG_EVENT_TRACING #define FTRACE_EVENTS() . = ALIGN(8); \ VMLINUX_SYMBOL(__start_ftrace_events) = .; \ @@ -621,6 +630,7 @@ FTRACE_EVENTS() \ TRACE_SYSCALLS() \ KPROBE_BLACKLIST() \ + ERROR_INJECT_LIST() \ MEM_DISCARD(init.rodata) \ CLK_OF_TABLES() \ RESERVEDMEM_OF_TABLES() \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 614284d9c2f7..4c22590be898 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -618,4 +618,15 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +#define BPF_ALLOW_ERROR_INJECTION(fname) \ +static unsigned long __used \ + __attribute__((__section__("_kprobe_error_inject_list"))) \ + _eil_addr_##fname = (unsigned long)fname; +#else +#define BPF_ALLOW_ERROR_INJECTION(fname) +#endif +#endif + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index a3b380899e5d..9b6ece57cfff 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -274,6 +274,7 @@ extern bool arch_kprobe_on_func_entry(unsigned long offset); extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset); extern bool within_kprobe_blacklist(unsigned long addr); +extern bool within_kprobe_error_injection_list(unsigned long addr); struct kprobe_insn_cache { struct mutex mutex; diff --git a/include/linux/module.h b/include/linux/module.h index 6fd9449ca6b1..72c25f845fbe 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -481,6 +481,11 @@ struct module { ctor_fn_t *ctors; unsigned int num_ctors; #endif + +#ifdef CONFIG_BPF_KPROBE_OVERRIDE + unsigned int num_kprobe_ei_funcs; + unsigned long *kprobe_ei_funcs; +#endif } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e86bbcb849ac..6232ea3ce94f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -83,6 +83,16 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } +/* List of symbols that can be overriden for error injection. 
*/ +static LIST_HEAD(kprobe_error_injection_list); +static DEFINE_MUTEX(kprobe_ei_mutex); +struct kprobe_ei_entry { + struct list_head list; + unsigned long start_addr; + unsigned long end_addr; + void *priv; +}; + /* Blacklist -- list of struct kprobe_blacklist_entry */ static LIST_HEAD(kprobe_blacklist); @@ -1457,6 +1467,17 @@ bool within_kprobe_blacklist(unsigned long addr) return false; } +bool within_kprobe_error_injection_list(unsigned long addr) +{ + struct kprobe_ei_entry *ent; + + list_for_each_entry(ent, &kprobe_error_injection_list, list) { + if (addr >= ent->start_addr && addr < ent->end_addr) + return true; + } + return false; +} + /* * If we have a symbol_name argument, look it up and add the offset field * to it. This way, we can specify a relative address to a symbol. @@ -2258,6 +2279,86 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return 0; } +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +/* Markers of the _kprobe_error_inject_list section */ +extern unsigned long __start_kprobe_error_inject_list[]; +extern unsigned long __stop_kprobe_error_inject_list[]; + +/* + * Lookup and populate the kprobe_error_injection_list. + * + * For safety reasons we only allow certain functions to be overriden with + * bpf_error_injection, so we need to populate the list of the symbols that have + * been marked as safe for overriding. + */ +static void populate_kprobe_error_injection_list(unsigned long *start, + unsigned long *end, + void *priv) +{ + unsigned long *iter; + struct kprobe_ei_entry *ent; + unsigned long entry, offset = 0, size = 0; + + mutex_lock(&kprobe_ei_mutex); + for (iter = start; iter < end; iter++) { + entry = arch_deref_entry_point((void *)*iter); + + if (!kernel_text_address(entry) || + !kallsyms_lookup_size_offset(entry, &size, &offset)) { + pr_err("Failed to find error inject entry at %p\n", + (void *)entry); + continue; + } + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + break; + ent->start_addr = entry; + ent->end_addr = entry + size; + ent->priv = priv; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &kprobe_error_injection_list); + } + mutex_unlock(&kprobe_ei_mutex); +} + +static void __init populate_kernel_kprobe_ei_list(void) +{ + populate_kprobe_error_injection_list(__start_kprobe_error_inject_list, + __stop_kprobe_error_inject_list, + NULL); +} + +static void module_load_kprobe_ei_list(struct module *mod) +{ + if (!mod->num_kprobe_ei_funcs) + return; + populate_kprobe_error_injection_list(mod->kprobe_ei_funcs, + mod->kprobe_ei_funcs + + mod->num_kprobe_ei_funcs, mod); +} + +static void module_unload_kprobe_ei_list(struct module *mod) +{ + struct kprobe_ei_entry *ent, *n; + if (!mod->num_kprobe_ei_funcs) + return; + + mutex_lock(&kprobe_ei_mutex); + list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) { + if (ent->priv == mod) { + list_del_init(&ent->list); + kfree(ent); + } + } + mutex_unlock(&kprobe_ei_mutex); +} +#else +static inline void __init populate_kernel_kprobe_ei_list(void) {} +static inline void module_load_kprobe_ei_list(struct module *m) {} +static inline void module_unload_kprobe_ei_list(struct module *m) {} +#endif + /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2268,6 +2369,11 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); + if (val == MODULE_STATE_COMING) + module_load_kprobe_ei_list(mod); + else if (val 
== MODULE_STATE_GOING) + module_unload_kprobe_ei_list(mod); + if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2334,6 +2440,8 @@ static int __init init_kprobes(void) pr_err("Please take care of using kprobes.\n"); } + populate_kernel_kprobe_ei_list(); + if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { @@ -2501,6 +2609,56 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = { .release = seq_release, }; +/* + * kprobes/error_injection_list -- shows which functions can be overriden for + * error injection. + * */ +static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&kprobe_ei_mutex); + return seq_list_start(&kprobe_error_injection_list, *pos); +} + +static void kprobe_ei_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&kprobe_ei_mutex); +} + +static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &kprobe_error_injection_list, pos); +} + +static int kprobe_ei_seq_show(struct seq_file *m, void *v) +{ + char buffer[KSYM_SYMBOL_LEN]; + struct kprobe_ei_entry *ent = + list_entry(v, struct kprobe_ei_entry, list); + + sprint_symbol(buffer, ent->start_addr); + seq_printf(m, "%s\n", buffer); + return 0; +} + +static const struct seq_operations kprobe_ei_seq_ops = { + .start = kprobe_ei_seq_start, + .next = kprobe_ei_seq_next, + .stop = kprobe_ei_seq_stop, + .show = kprobe_ei_seq_show, +}; + +static int kprobe_ei_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &kprobe_ei_seq_ops); +} + +static const struct file_operations debugfs_kprobe_ei_ops = { + .open = kprobe_ei_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void arm_all_kprobes(void) { struct hlist_head *head; @@ -2642,6 +2800,11 @@ static int __init debugfs_kprobe_init(void) if (!file) goto error; + file = debugfs_create_file("error_injection_list", 0444, dir, NULL, + &debugfs_kprobe_ei_ops); + if (!file) + goto error; + return 0; error: diff --git a/kernel/module.c b/kernel/module.c index 275dc6ccc6f3..53bd38f8a572 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3265,7 +3265,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ftrace_callsites), &mod->num_ftrace_callsites); #endif - +#ifdef CONFIG_BPF_KPROBE_OVERRIDE + mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list", + sizeof(*mod->kprobe_ei_funcs), + &mod->num_kprobe_ei_funcs); +#endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); From 6ac6e790b6d4d425b0d20068f0f20d2fe6d0d5d6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 11 Dec 2017 11:36:48 -0500 Subject: [PATCH 0094/1640] BACKPORT: bpf: add a bpf_override_function helper Error injection is sloppy and very ad-hoc. BPF could fill this niche perfectly with its kprobe functionality. We could make sure errors are only triggered in specific call chains that we care about, in very specific situations. Accomplish this with the bpf_override_function helper. This will modify the probed caller's return value to the specified value and set the PC to an override function that simply returns, bypassing the originally probed function. This gives us a nice clean way to implement systematic error injection for all of our code paths.
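A hedged usage sketch of the helper from the BPF side: the probed function must be on the error-injection whitelist and the kernel built with CONFIG_BPF_KPROBE_OVERRIDE; open_ctree is only an example target here:

SEC("kprobe/open_ctree")
int override_open_ctree(struct pt_regs *ctx)
{
	unsigned long rc = -12;	/* -ENOMEM */

	/* Skip the real body of the probed function and force its
	 * return value.
	 */
	bpf_override_return(ctx, rc);
	return 0;
}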
Acked-by: Alexei Starovoitov Acked-by: Ingo Molnar Signed-off-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- arch/Kconfig | 3 ++ arch/x86/Kconfig | 1 + arch/x86/include/asm/kprobes.h | 4 +++ arch/x86/include/asm/ptrace.h | 5 +++ arch/x86/kernel/kprobes/ftrace.c | 14 ++++++++ include/linux/filter.h | 3 +- include/linux/trace_events.h | 1 + include/uapi/linux/bpf.h | 4 +++ kernel/bpf/core.c | 3 ++ kernel/bpf/verifier.c | 2 ++ kernel/events/core.c | 7 ++++ kernel/trace/Kconfig | 11 +++++++ kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++ kernel/trace/trace_kprobe.c | 55 ++++++++++++++++++++++++++++---- kernel/trace/trace_probe.h | 12 +++++++ 15 files changed, 152 insertions(+), 8 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 60b8d811afc9..113a6ea00699 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -199,6 +199,9 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool +config HAVE_KPROBE_OVERRIDE + bool + config HAVE_NMI bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bdcd2eef9201..afa13c1f28b1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -155,6 +155,7 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE + select HAVE_KPROBE_OVERRIDE select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 6cf65437b5e5..c6c3b1f4306a 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); asmlinkage void kretprobe_trampoline(void); +#ifdef CONFIG_KPROBES_ON_FTRACE +extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs); +#endif + /* Architecture specific copy of original instruction*/ struct arch_specific_insn { /* copy of the original instruction */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 8603d127f73c..ee696efec99f 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) return regs->ax; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->ax = rc; +} + /* * user_mode(regs) determines whether a register set came from user * mode. On x86_32, this is true if V8086 mode was enabled OR if the diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index bcfee4f69b0e..53deb0b23078 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -102,3 +102,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p) p->ainsn.boostable = false; return 0; } + +asmlinkage void override_func(void); +asm( + ".type override_func, @function\n" + "override_func:\n" + " ret\n" + ".size override_func, .-override_func\n" +); + +void arch_ftrace_kprobe_override_function(struct pt_regs *regs) +{ + regs->ip = (unsigned long)&override_func; +} +NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function); diff --git a/include/linux/filter.h b/include/linux/filter.h index 18ce37dfe7dd..23b7990fa502 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -483,7 +483,8 @@ struct bpf_prog { locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1; /* Do we need dst entry? */ + dst_needed:1, /* Do we need dst entry? */ + kprobe_override:1; /* Do we override a kprobe? 
*/ enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8c2ba9c2794a..a5de6e381a72 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -525,6 +525,7 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); +DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c74db97d94f5..05f9adc00fe2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -678,6 +678,10 @@ union bpf_attr { * @buf_size: size of the buf * Return : 0 on success or negative error code * + * int bpf_override_return(pt_regs, rc) + * @pt_regs: pointer to struct pt_regs + * @rc: the return value to set + * * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e36ae5a661f0..a04a1fb02634 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1458,6 +1458,9 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + if (fp->kprobe_override) + return false; + if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5d298d687506..7cba075a6a0c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4983,6 +4983,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index 644b81ead72a..878425a1b3f6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8561,6 +8561,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e79ef5aa6224..2ba9ffeefdcd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -561,6 +561,17 @@ config FUNCTION_PROFILER If in doubt, say N. +config BPF_KPROBE_OVERRIDE + bool "Enable BPF programs to override a kprobed function" + depends on BPF_EVENTS + depends on KPROBES_ON_FTRACE + depends on HAVE_KPROBE_OVERRIDE + depends on DYNAMIC_FTRACE_WITH_REGS + default n + help + Allows BPF to override the execution of a probed function and + set a different return value. This is used for error injection. 
+ config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e39165a2daa0..37b2203d6520 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,10 @@ #include #include #include +#include +#include + +#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -76,6 +80,24 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + __this_cpu_write(bpf_kprobe_override, 1); + regs_set_return_value(regs, rc); + arch_ftrace_kprobe_override_function(regs); + return 0; +} + +static const struct bpf_func_proto bpf_override_return_proto = { + .func = bpf_override_return, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; +#endif + BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -560,6 +582,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; +#ifdef CONFIG_BPF_KPROBE_OVERRIDE + case BPF_FUNC_override_return: + return &bpf_override_return_proto; +#endif default: return tracing_func_proto(func_id); } @@ -777,6 +803,15 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; + /* + * Kprobe override only works for ftrace based kprobes, and only if they + * are on the opt-in list. + */ + if (prog->kprobe_override && + (!trace_kprobe_ftrace(event->tp_event) || + !trace_kprobe_error_injectable(event->tp_event))) + return -EINVAL; + mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b0db2e4cefa3..63923abbca5c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,6 +42,7 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) +DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -87,6 +88,27 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +int trace_kprobe_ftrace(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + return kprobe_ftrace(&tk->rp.kp); +} + +int trace_kprobe_error_injectable(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + unsigned long addr; + + if (tk->symbol) { + addr = (unsigned long) + kallsyms_lookup_name(trace_kprobe_symbol(tk)); + addr += tk->rp.kp.offset; + } else { + addr = (unsigned long)tk->rp.kp.addr; + } + return within_kprobe_error_injection_list(addr); +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1183,7 +1205,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static void +static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1192,12 +1214,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if 
(bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + int ret; + + ret = trace_call_bpf(call, regs); + + /* + * We need to check and see if we modified the pc of the + * pt_regs, and if so clear the kprobe and return 1 so that we + * don't do the instruction skipping. Also reset our state so + * we are clean the next pass through. + */ + if (__this_cpu_read(bpf_kprobe_override)) { + __this_cpu_write(bpf_kprobe_override, 0); + reset_current_kprobe(); + return 1; + } + if (!ret) + return 0; + } head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return; + return 0; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1206,13 +1245,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return; + return 0; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL, NULL); + return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1288,6 +1328,7 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1295,9 +1336,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - kprobe_perf_func(tk, regs); + ret = kprobe_perf_func(tk, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return ret; } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index dc39472ca9e4..abfeb69a800b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -253,6 +253,8 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +int trace_kprobe_ftrace(struct trace_event_call *call); +int trace_kprobe_error_injectable(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -278,6 +280,16 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } + +static inline int trace_kprobe_ftrace(struct trace_event_call *call) +{ + return 0; +} + +static inline int trace_kprobe_error_injectable(struct trace_event_call *call) +{ + return 0; +} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { From aa021220442bd5589f942d15dc97d3a5f7cd920e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Dec 2017 14:22:39 -0800 Subject: [PATCH 0095/1640] UPSTREAM: bpf: add schedule points to map alloc/free While using large percpu maps, htab_map_alloc() can hold cpu for hundreds of ms. This patch adds cond_resched() calls to percpu alloc/free call sites, all running in process context. 
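The pattern generalizes to any long process-context loop over per-element work; a minimal sketch with generic names (init_elems() and alloc_one() are placeholders, not from this patch):

/* Without the cond_resched(), preparing a very large map could hold
 * the CPU for hundreds of ms on non-preemptible kernels.
 */
static int init_elems(void **elems, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		elems[i] = alloc_one();	/* potentially slow per-element work */
		if (!elems[i])
			return -ENOMEM;
		cond_resched();		/* safe: we run in process context */
	}
	return 0;
}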
Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 773d0805088f..6058dc49fc4f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab) pptr = htab_elem_get_ptr(get_htab_elem(htab, i), htab->map.key_size); free_percpu(pptr); + cond_resched(); } free_elems: bpf_map_area_free(htab->elems); @@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab) goto free_elems; htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, pptr); + cond_resched(); } skip_percpu_elems: From 6dac7045d5ee4502d410d0c23bbf190d6398ed4c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 Dec 2017 10:35:37 -0800 Subject: [PATCH 0096/1640] UPSTREAM: bpf/tracing: fix kernel/events/core.c compilation error Commit f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") introduced a perf ioctl command to query the prog array attached to the same perf tracepoint. The commit introduced a compilation error under certain config conditions, e.g., (1). CONFIG_BPF_SYSCALL is not defined, or (2). CONFIG_TRACING is defined but neither CONFIG_UPROBE_EVENTS nor CONFIG_KPROBE_EVENTS is defined. Error message: kernel/events/core.o: In function `perf_ioctl': core.c:(.text+0x98c4): undefined reference to `bpf_event_query_prog_array' This patch fixes the error by guarding the real definition under CONFIG_BPF_EVENTS and providing a static inline dummy function when CONFIG_BPF_EVENTS is not defined. It also renames the function from bpf_event_query_prog_array to perf_event_query_prog_array and moves the definition from linux/bpf.h to linux/trace_events.h so the definition is in proximity to other prog_array related functions.
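The fix uses the standard kernel idiom for optional subsystems, which in generic form looks like the sketch below (CONFIG_FOO and foo_query() are placeholders, not names from this patch):

#ifdef CONFIG_FOO
int foo_query(struct perf_event *event, void __user *info);
#else
static inline int
foo_query(struct perf_event *event, void __user *info)
{
	return -EOPNOTSUPP;
}
#endif

Callers such as perf_ioctl() can then invoke the function unconditionally: when the feature is compiled out, the static inline stub is used and no undefined reference can occur at link time.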
Fixes: f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") Reported-by: Stephen Rothwell Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 - include/linux/trace_events.h | 6 ++++++ kernel/events/core.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4c22590be898..4d08ced62113 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -270,7 +270,6 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); -int bpf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index a5de6e381a72..cbd6b89a18c0 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -464,6 +464,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); void perf_event_detach_bpf_prog(struct perf_event *event); +int perf_event_query_prog_array(struct perf_event *event, void __user *info); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -478,6 +479,11 @@ perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) static inline void perf_event_detach_bpf_prog(struct perf_event *event) { } +static inline int +perf_event_query_prog_array(struct perf_event *event, void __user *info) +{ + return -EOPNOTSUPP; +} #endif enum { diff --git a/kernel/events/core.c b/kernel/events/core.c index 878425a1b3f6..9a0af98caead 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5117,7 +5117,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon } case PERF_EVENT_IOC_QUERY_BPF: - return bpf_event_query_prog_array(event, (void __user *)arg); + return perf_event_query_prog_array(event, (void __user *)arg); default: return -ENOTTY; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 37b2203d6520..6d168738def2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -868,7 +868,7 @@ out: mutex_unlock(&bpf_event_mutex); } -int bpf_event_query_prog_array(struct perf_event *event, void __user *info) +int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; From 8b993f33fee6711a85e331ea578e756eab193789 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Dec 2017 21:07:25 +0100 Subject: [PATCH 0097/1640] UPSTREAM: bpf: guarantee r1 to be ctx in case of bpf_helper_changes_pkt_data Some JITs don't cache skb context on stack in prologue, so when LD_ABS/IND is used and helper calls yield bpf_helper_changes_pkt_data() as true, then they temporarily save/restore skb pointer. However, the assumption that skb always has to be in r1 is a bit of a gamble. Right now it turned out to be true for all helpers listed in bpf_helper_changes_pkt_data(), but let's enforce that from the verifier side, so that we make this a guarantee and bail out if the func proto is misconfigured in future helpers.
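Concretely, this means any helper for which bpf_helper_changes_pkt_data() returns true must declare the context as its first argument, roughly as in the sketch below (bpf_example_store is a made-up helper for illustration):

static const struct bpf_func_proto bpf_example_store_proto = {
	.func		= bpf_example_store,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,	/* skb guaranteed to be in r1 */
	.arg2_type	= ARG_ANYTHING,
};

A proto that changes packet data but declares anything other than ARG_PTR_TO_CTX for arg1_type is now rejected at verification time with "kernel subsystem misconfigured func".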
In case of BPF helper calls from cBPF, bpf_helper_changes_pkt_data() is completely irrelevant here (since cBPF is context read-only) and therefore always false. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7cba075a6a0c..183818acc135 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1799,7 +1799,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } + /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(fn->func); + if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { + verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", + func_id_name(func_id), func_id); + return -EINVAL; + } memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; From 286e7370e2396afedd129f84ffc843bf7b75f467 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:05 -0800 Subject: [PATCH 0098/1640] UPSTREAM: bpf: introduce function calls (function boundaries) Allow arbitrary function calls from one bpf function to another bpf function. Since the beginning of bpf all bpf programs were represented as a single function and program authors were forced to use always_inline for all functions in their C code. That was causing llvm to unnecessarily inflate the code size and forcing developers to move code to header files with little code reuse. With a bit of additional complexity teach the verifier to recognize arbitrary function calls from one bpf function to another as long as all of the functions are presented to the verifier as a single bpf program. New program layout: r6 = r1 // some code .. r1 = .. // arg1 r2 = .. // arg2 call pc+1 // function call pc-relative exit .. = r1 // access arg1 .. = r2 // access arg2 .. call pc+20 // second level of function call ... It allows for better optimized code and finally allows to introduce the core bpf libraries that can be reused in different projects, since programs are no longer limited to a single elf file. With function calls bpf can be compiled into multiple .o files. This patch is the first step. It detects programs that contain multiple functions and checks that calls between them are valid. It splits the sequence of bpf instructions (one program) into a set of bpf functions that call each other. Only calls to known functions are allowed. In the future the verifier may allow calls to unresolved functions and will do dynamic linking. This logic supports statically linked bpf functions only. Such function boundary detection could have been done as part of control flow graph building in check_cfg(), but it's cleaner to separate function boundary detection vs control flow checks within a subprogram (function) into logically independent steps. Follow-up patches may split check_cfg() further, but not check_subprogs(). Allow bpf-to-bpf calls for root only and for non-hw-offloaded programs. These restrictions can be relaxed in the future.
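From a program author's point of view, the change means helper logic no longer has to be __always_inline. A sketch of what this enables, assuming the samples/bpf build environment (bpf_helpers.h, struct bpf_map_def) and an llvm new enough to emit bpf-to-bpf calls; the map and attach point are arbitrary:

#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") counts = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u64),
	.max_entries = 1,
};

/* Compiled as a separate bpf function reached via "call pc+N"
 * rather than being force-inlined at every call site.
 */
static __attribute__ ((noinline)) int bump(__u32 key)
{
	__u64 *val = bpf_map_lookup_elem(&counts, &key);

	if (!val)
		return 0;
	__sync_fetch_and_add(val, 1);
	return 1;
}

SEC("kprobe/do_sys_open")
int trace_open(struct pt_regs *ctx)
{
	return bump(0);
}

char _license[] SEC("license") = "GPL";

Per the restrictions above (and in the next patch), such programs are root only, cannot be offloaded to hardware, and cannot mix bpf-to-bpf calls with tail_calls or LD_ABS/IND.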
Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 5 +- include/uapi/linux/bpf.h | 6 ++ kernel/bpf/disasm.c | 8 +- kernel/bpf/verifier.c | 141 ++++++++++++++++++++++++++++++++++- 4 files changed, 155 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2726c70730e8..c4ad6818887a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -157,6 +157,8 @@ struct bpf_ext_analyzer_ops { int insn_idx, int prev_insn_idx); }; +#define BPF_MAX_SUBPROGS 256 + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -177,8 +179,9 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ - struct bpf_verifer_log log; + u32 subprog_starts[BPF_MAX_SUBPROGS]; + u32 subprog_cnt; }; static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 05f9adc00fe2..c2dfd46a3b60 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -197,8 +197,14 @@ enum bpf_attach_type { */ #define BPF_F_STRICT_ALIGNMENT (1U << 0) +/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ #define BPF_PSEUDO_MAP_FD 1 +/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative + * offset to another bpf function + */ +#define BPF_PSEUDO_CALL 1 + /* flags for BPF_MAP_UPDATE_ELEM command */ #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index e682850c9715..883f88fa5bfc 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -189,8 +189,12 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose(env, "(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); + if (insn->src_reg == BPF_PSEUDO_CALL) + verbose(env, "(%02x) call pc%+d\n", insn->code, + insn->imm); + else + verbose(env, "(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 183818acc135..c8b3f59130cf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "disasm.h" @@ -638,6 +640,113 @@ enum reg_arg_type { DST_OP_NO_MARK /* same as above, check only, don't mark */ }; +static int cmp_subprogs(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +static int find_subprog(struct bpf_verifier_env *env, int off) +{ + u32 *p; + + p = bsearch(&off, env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs); + if (!p) + return -ENOENT; + return p - env->subprog_starts; + +} + +static int add_subprog(struct bpf_verifier_env *env, int off) +{ + int insn_cnt = env->prog->len; + int ret; + + if (off >= insn_cnt || off < 0) { + verbose(env, "call to invalid destination\n"); + return -EINVAL; + } + ret = find_subprog(env, off); + if (ret >= 0) + return 0; + if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { + verbose(env, "too many subprograms\n"); + return -E2BIG; + } + env->subprog_starts[env->subprog_cnt++] = off; + sort(env->subprog_starts, 
env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); + return 0; +} + +static int check_subprogs(struct bpf_verifier_env *env) +{ + int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + + /* determine subprog starts. The end is one before the next starts */ + for (i = 0; i < insn_cnt; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + if (!env->allow_ptr_leaks) { + verbose(env, "function calls to other bpf functions are allowed for root only\n"); + return -EPERM; + } + if (bpf_prog_is_dev_bound(env->prog->aux)) { + verbose(env, "funcation calls in offloaded programs are not supported yet\n"); + return -EINVAL; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + } + + if (env->log.level > 1) + for (i = 0; i < env->subprog_cnt; i++) + verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); + + /* now check that all jumps are within the same subprog */ + subprog_start = 0; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + for (i = 0; i < insn_cnt; i++) { + u8 code = insn[i].code; + + if (BPF_CLASS(code) != BPF_JMP) + goto next; + if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + goto next; + off = i + insn[i].off + 1; + if (off < subprog_start || off >= subprog_end) { + verbose(env, "jump out of range from insn %d to %d\n", i, off); + return -EINVAL; + } +next: + if (i == subprog_end - 1) { + /* to avoid fall-through from one subprog into another + * the last insn of the subprog should be either exit + * or unconditional jump back + */ + if (code != (BPF_JMP | BPF_EXIT) && + code != (BPF_JMP | BPF_JA)) { + verbose(env, "last insn is not an exit or jmp\n"); + return -EINVAL; + } + subprog_start = subprog_end; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + } + } + return 0; +} + static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) { struct bpf_verifier_state *parent = state->parent; @@ -3723,6 +3832,10 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; + ret = check_subprogs(env); + if (ret < 0) + return ret; + insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; @@ -3755,6 +3868,14 @@ peek_stack: goto err_free; if (t + 1 < insn_cnt) env->explored_states[t + 1] = STATE_LIST_MARK; + if (insns[t].src_reg == BPF_PSEUDO_CALL) { + env->explored_states[t] = STATE_LIST_MARK; + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } } else if (opcode == BPF_JA) { if (BPF_SRC(insns[t].code) != BPF_K) { ret = -EINVAL; @@ -4698,6 +4819,19 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, return 0; } +static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + for (i = 0; i < env->subprog_cnt; i++) { + if (env->subprog_starts[i] < off) + continue; + env->subprog_starts[i] += len - 1; + } +} + static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@ -4708,6 +4842,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return NULL; if (adjust_insn_aux_data(env, new_prog->len, off, len)) return 
NULL; + adjust_subprog_starts(env, off, len); return new_prog; } @@ -4984,6 +5119,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (insn->code != (BPF_JMP | BPF_CALL)) continue; + if (insn->src_reg == BPF_PSEUDO_CALL) + continue; if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; @@ -5194,12 +5331,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->explored_states) goto skip_full_check; + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + ret = check_cfg(env); if (ret < 0) goto skip_full_check; - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - ret = do_check(env); if (env->cur_state) { free_verifier_state(env->cur_state, true); From 387d125bd57aca994f84e796458ff4f54198cff3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:06 -0800 Subject: [PATCH 0099/1640] BACKPORT: bpf: introduce function calls (verification) Allow arbitrary function calls from one bpf function to another bpf function. To recognize such a set of bpf functions the verifier does: 1. runs control flow analysis to detect function boundaries 2. proceeds with verification of all functions starting from main(root) function It recognizes that the stack of the caller can be accessed by the callee (if the caller passed a pointer to its stack to the callee) and the callee can store map_value and other pointers into the stack of the caller. 3. keeps track of the stack_depth of each function to make sure that total stack depth is still less than 512 bytes 4. disallows pointers to the callee stack to be stored into the caller stack, since they will be invalid as soon as the callee returns 5. to reuse all of the existing state_pruning logic each function call is considered to be an independent call from the verifier's point of view. The verifier pretends to inline all function calls it sees. It stores the callsite instruction index as part of the state to make sure that two calls to the same callee from two different places in the caller will be different from the state pruning point of view 6. more safety checks are added to liveness analysis Implementation details: . struct bpf_verifier_state now consists of all stack frames that led to this function . struct bpf_func_state represents one stack frame. It consists of registers in the given frame and its stack . propagate_liveness() logic had a premature optimization where mark_reg_read() and mark_stack_slot_read() were manually inlined with a loop iterating over parents for each register or stack slot. Undo this optimization to reuse more complex mark_*_read() logic . skip_callee() logic is not necessary from a safety point of view, but without it mark_*_read() markings become too conservative, since after returning from the function call a read of r6-r9 will incorrectly propagate the read marks into the callee causing inefficient pruning later . mark_*_read() logic is now aware of control flow which makes it more complex. In the future the plan is to rewrite liveness to be hierarchical. So that liveness can be done within a basic block only and control flow will be responsible for propagation of liveness information along cfg and between calls. . tail_calls and ld_abs insns are not allowed in programs with bpf-to-bpf calls . returning stack pointers to the caller or storing them into the stack frame of the caller is not allowed Testing: . no difference in cilium processed_insn numbers .
large number of tests follows in next patches [Linux4: Replace &env->cur_state->regs[BPF_REG_0] with cur_regs(env) + BPF_REG_0, which was missing due to incomplete backport of https://github.com/torvalds/linux/commit/638f5b90d460] Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 36 +- kernel/bpf/verifier.c | 721 +++++++++++++++++++++++++++-------- 2 files changed, 596 insertions(+), 161 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c4ad6818887a..2c4245491cbf 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -79,6 +79,14 @@ struct bpf_reg_state { s64 smax_value; /* maximum possible (s64)value */ u64 umin_value; /* minimum possible (u64)value */ u64 umax_value; /* maximum possible (u64)value */ + /* Inside the callee two registers can be both PTR_TO_STACK like + * R1=fp-8 and R2=fp-8, but one of them points to this function stack + * while another to the caller's stack. To differentiate them 'frameno' + * is used which is an index in bpf_verifier_state->frame[] array + * pointing to bpf_func_state. + * This field must be second to last, for states_equal() reasons. + */ + u32 frameno; /* This field must be last, for states_equal() reasons. */ enum bpf_reg_liveness live; }; @@ -99,11 +107,32 @@ struct bpf_stack_state { /* state of the program: * type of all registers and stack info */ -struct bpf_verifier_state { +struct bpf_func_state { struct bpf_reg_state regs[MAX_BPF_REG]; struct bpf_verifier_state *parent; + /* index of call instruction that called into this func */ + int callsite; + /* stack frame number of this function state from pov of + * enclosing bpf_verifier_state. + * 0 = main function, 1 = first callee. + */ + u32 frameno; + /* subprog number == index within subprog_stack_depth + * zero == main subprog + */ + u32 subprogno; + + /* should be second to last. 
See copy_func_state() */ int allocated_stack; struct bpf_stack_state *stack; +}; + +#define MAX_CALL_FRAMES 8 +struct bpf_verifier_state { + /* call stack tracking */ + struct bpf_func_state *frame[MAX_CALL_FRAMES]; + struct bpf_verifier_state *parent; + u32 curframe; bool speculative; }; @@ -181,12 +210,15 @@ struct bpf_verifier_env { struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifer_log log; u32 subprog_starts[BPF_MAX_SUBPROGS]; + u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; }; static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) { - return env->cur_state->regs; + struct bpf_verifier_state *cur = env->cur_state; + + return cur->frame[cur->curframe]->regs; } #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c8b3f59130cf..afa2e34d99b3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -229,13 +229,23 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "w"); } -static void print_verifier_state(struct bpf_verifier_env *env, - struct bpf_verifier_state *state) +static struct bpf_func_state *func(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg) { - struct bpf_reg_state *reg; + struct bpf_verifier_state *cur = env->cur_state; + + return cur->frame[reg->frameno]; +} + +static void print_verifier_state(struct bpf_verifier_env *env, + const struct bpf_func_state *state) +{ + const struct bpf_reg_state *reg; enum bpf_reg_type t; int i; + if (state->frameno) + verbose(env, " frame%d:", state->frameno); for (i = 0; i < MAX_BPF_REG; i++) { reg = &state->regs[i]; t = reg->type; @@ -248,6 +258,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); + if (t == PTR_TO_STACK) + verbose(env, ",call_%d", func(env, reg)->callsite); } else { verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) @@ -303,8 +315,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "\n"); } -static int copy_stack_state(struct bpf_verifier_state *dst, - const struct bpf_verifier_state *src) +static int copy_stack_state(struct bpf_func_state *dst, + const struct bpf_func_state *src) { if (!src->stack) return 0; @@ -320,13 +332,13 @@ static int copy_stack_state(struct bpf_verifier_state *dst, /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from - * the program calls into realloc_verifier_state() to grow the stack size. + * the program calls into realloc_func_state() to grow the stack size. * Note there is a non-zero 'parent' pointer inside bpf_verifier_state * which this function copies over. 
It points to previous bpf_verifier_state * which is never reallocated */ -static int realloc_verifier_state(struct bpf_verifier_state *state, int size, - bool copy_old) +static int realloc_func_state(struct bpf_func_state *state, int size, + bool copy_old) { u32 old_size = state->allocated_stack; struct bpf_stack_state *new_stack; @@ -359,10 +371,21 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size, return 0; } +static void free_func_state(struct bpf_func_state *state) +{ + kfree(state->stack); + kfree(state); +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { - kfree(state->stack); + int i; + + for (i = 0; i <= state->curframe; i++) { + free_func_state(state->frame[i]); + state->frame[i] = NULL; + } if (free_self) kfree(state); } @@ -370,18 +393,47 @@ static void free_verifier_state(struct bpf_verifier_state *state, /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ -static int copy_verifier_state(struct bpf_verifier_state *dst, - const struct bpf_verifier_state *src) +static int copy_func_state(struct bpf_func_state *dst, + const struct bpf_func_state *src) { int err; - err = realloc_verifier_state(dst, src->allocated_stack, false); + err = realloc_func_state(dst, src->allocated_stack, false); if (err) return err; - memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack)); return copy_stack_state(dst, src); } +static int copy_verifier_state(struct bpf_verifier_state *dst_state, + const struct bpf_verifier_state *src) +{ + struct bpf_func_state *dst; + int i, err; + + /* if dst has more stack frames then src frame, free them */ + for (i = src->curframe + 1; i <= dst_state->curframe; i++) { + free_func_state(dst_state->frame[i]); + dst_state->frame[i] = NULL; + } + dst_state->speculative = src->speculative; + dst_state->curframe = src->curframe; + dst_state->parent = src->parent; + for (i = 0; i <= src->curframe; i++) { + dst = dst_state->frame[i]; + if (!dst) { + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return -ENOMEM; + dst_state->frame[i] = dst; + } + err = copy_func_state(dst, src->frame[i]); + if (err) + return err; + } + return 0; +} + static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx) { @@ -445,6 +497,10 @@ err: static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; +#define CALLEE_SAVED_REGS 5 +static const int callee_saved[CALLEE_SAVED_REGS] = { + BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 +}; static void __mark_reg_not_init(struct bpf_reg_state *reg); @@ -580,6 +636,7 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) reg->id = 0; reg->off = 0; reg->var_off = tnum_unknown; + reg->frameno = 0; __mark_reg_unbounded(reg); } @@ -616,8 +673,9 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, } static void init_reg_state(struct bpf_verifier_env *env, - struct bpf_reg_state *regs) + struct bpf_func_state *state) { + struct bpf_reg_state *regs = state->regs; int i; for (i = 0; i < MAX_BPF_REG; i++) { @@ -628,12 +686,24 @@ static void init_reg_state(struct bpf_verifier_env *env, /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; mark_reg_known_zero(env, regs, BPF_REG_FP); + regs[BPF_REG_FP].frameno = state->frameno; /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; mark_reg_known_zero(env, regs, BPF_REG_1); } +#define 
BPF_MAIN_FUNC (-1) +static void init_func_state(struct bpf_verifier_env *env, + struct bpf_func_state *state, + int callsite, int frameno, int subprogno) +{ + state->callsite = callsite; + state->frameno = frameno; + state->subprogno = subprogno; + init_reg_state(env, state); +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -747,29 +817,86 @@ next: return 0; } -static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) +struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + u32 regno) { - struct bpf_verifier_state *parent = state->parent; + struct bpf_verifier_state *tmp = NULL; + + /* 'parent' could be a state of caller and + * 'state' could be a state of callee. In such case + * parent->curframe < state->curframe + * and it's ok for r1 - r5 registers + * + * 'parent' could be a callee's state after it bpf_exit-ed. + * In such case parent->curframe > state->curframe + * and it's ok for r0 only + */ + if (parent->curframe == state->curframe || + (parent->curframe < state->curframe && + regno >= BPF_REG_1 && regno <= BPF_REG_5) || + (parent->curframe > state->curframe && + regno == BPF_REG_0)) + return parent; + + if (parent->curframe > state->curframe && + regno >= BPF_REG_6) { + /* for callee saved regs we have to skip the whole chain + * of states that belong to callee and mark as LIVE_READ + * the registers before the call + */ + tmp = parent; + while (tmp && tmp->curframe != state->curframe) { + tmp = tmp->parent; + } + if (!tmp) + goto bug; + parent = tmp; + } else { + goto bug; + } + return parent; +bug: + verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); + verbose(env, "regno %d parent frame %d current frame %d\n", + regno, parent->curframe, state->curframe); + return 0; +} + +static int mark_reg_read(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + u32 regno) +{ + bool writes = parent == state->parent; /* Observe write marks */ if (regno == BPF_REG_FP) /* We don't need to worry about FP liveness because it's read-only */ - return; + return 0; while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->regs[regno].live & REG_LIVE_WRITTEN) + if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) break; + parent = skip_callee(env, state, parent, regno); + if (!parent) + return -EFAULT; /* ... 
then we depend on parent's value */ - parent->regs[regno].live |= REG_LIVE_READ; + parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; state = parent; parent = state->parent; + writes = true; } + return 0; } static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state->regs; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -782,7 +909,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(env->cur_state, regno); + return mark_reg_read(env, vstate, vstate->parent, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -817,13 +944,15 @@ static bool is_spillable_regtype(enum bpf_reg_type type) * stack boundary and alignment are checked in check_mem_access() */ static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_verifier_state *state, int off, - int size, int value_regno, int insn_idx) + struct bpf_func_state *state, /* func where register points to */ + int off, int size, int value_regno, int insn_idx) { + struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + enum bpf_reg_type type; - err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), - true); + err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); if (err) return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -836,8 +965,9 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EACCES; } + cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0 && - is_spillable_regtype(state->regs[value_regno].type)) { + is_spillable_regtype((type = cur->regs[value_regno].type))) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -845,8 +975,13 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EACCES; } + if (state != cur && type == PTR_TO_STACK) { + verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); + return -EINVAL; + } + /* save register state */ - state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr = cur->regs[value_regno]; state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) { @@ -887,34 +1022,68 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot) +/* registers of every function are unique and mark_reg_read() propagates + * the liveness in the following cases: + * - from callee into caller for R1 - R5 that were used as arguments + * - from caller into callee for R0 that used as result of the call + * - from caller to the same caller skipping states of the callee for R6 - R9, + * since R6 - R9 are callee saved by implicit function prologue and + * caller's R6 != callee's R6, so when we propagate liveness up to + * parent states we need to skip callee states for R6 - R9. 
+ * + * stack slot marking is different, since stacks of caller and callee are + * accessible in both (since caller can pass a pointer to caller's stack to + * callee which can pass it to another function), hence mark_stack_slot_read() + * has to propagate the stack liveness to all parent states at given frame number. + * Consider code: + * f1() { + * ptr = fp - 8; + * *ptr = ctx; + * call f2 { + * .. = *ptr; + * } + * .. = *ptr; + * } + * First *ptr is reading from f1's stack and mark_stack_slot_read() has + * to mark liveness at the f1's frame and not f2's frame. + * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has + * to propagate liveness to f2 states at f1's frame level and further into + * f1 states at f1's frame level until write into that stack slot + */ +static void mark_stack_slot_read(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + int slot, int frameno) { - struct bpf_verifier_state *parent = state->parent; + bool writes = parent == state->parent; /* Observe write marks */ while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) + if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... then we depend on parent's value */ - parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; + parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; + writes = true; } } static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_verifier_state *state, int off, int size, - int value_regno) + struct bpf_func_state *reg_state /* func where register points to */, + int off, int size, int value_regno) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; u8 *stype; - if (state->allocated_stack <= slot) { + if (reg_state->allocated_stack <= slot) { verbose(env, "invalid read from stack off %d+0 size %d\n", off, size); return -EACCES; } - stype = state->stack[spi].slot_type; + stype = reg_state->stack[spi].slot_type; if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { @@ -930,13 +1099,14 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->stack[spi].spilled_ptr; + state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; - mark_stack_slot_read(state, spi); + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); } return 0; } else { @@ -999,7 +1169,8 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { - struct bpf_verifier_state *state = env->cur_state; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -1272,6 +1443,39 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +static int update_stack_depth(struct bpf_verifier_env *env, + const struct bpf_func_state 
*func, + int off) +{ + u16 stack = env->subprog_stack_depth[func->subprogno], total = 0; + struct bpf_verifier_state *cur = env->cur_state; + int i; + + if (stack >= -off) + return 0; + + /* update known max for given subprogram */ + env->subprog_stack_depth[func->subprogno] = -off; + + /* compute the total for current call chain */ + for (i = 0; i <= cur->curframe; i++) { + u32 depth = env->subprog_stack_depth[cur->frame[i]->subprogno]; + + /* round up to 32-bytes, since this is granularity + * of interpreter stack sizes + */ + depth = round_up(depth, 32); + total += depth; + } + + if (total > MAX_BPF_STACK) { + verbose(env, "combined stack size of %d calls is %d. Too large\n", + cur->curframe, total); + return -EACCES; + } + return 0; +} + static int check_ctx_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno) { @@ -1329,9 +1533,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -1392,8 +1596,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; - if (env->prog->aux->stack_depth < -off) - env->prog->aux->stack_depth = -off; + state = func(env, reg); + err = update_stack_depth(env, state, off); + if (err) + return err; if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, @@ -1490,7 +1696,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = cur_regs(env) + regno; - struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *state = func(env, reg); int off, i, slot, spi; if (reg->type != PTR_TO_STACK) { @@ -1522,9 +1728,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - if (env->prog->aux->stack_depth < -off) - env->prog->aux->stack_depth = -off; - if (meta && meta->raw_mode) { meta->access_size = access_size; meta->regno = regno; @@ -1542,7 +1745,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } } - return 0; + return update_stack_depth(env, state, off); } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, @@ -1798,6 +2001,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; + if (env->subprog_cnt) { + verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); + return -EINVAL; + } break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: @@ -1859,9 +2066,9 @@ static int check_raw_mode(const struct bpf_func_proto *fn) /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. 
*/ -static void clear_all_pkt_pointers(struct bpf_verifier_env *env) +static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, + struct bpf_func_state *state) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -1878,7 +2085,121 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) } } -static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *vstate = env->cur_state; + int i; + + for (i = 0; i <= vstate->curframe; i++) + __clear_all_pkt_pointers(env, vstate->frame[i]); +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + int i, subprog, target_insn; + + if (state->curframe >= MAX_CALL_FRAMES) { + verbose(env, "the call stack of %d frames is too deep\n", + state->curframe); + return -E2BIG; + } + + target_insn = *insn_idx + insn->imm; + subprog = find_subprog(env, target_insn + 1); + if (subprog < 0) { + verbose(env, "verifier bug. No program starts at insn %d\n", + target_insn + 1); + return -EFAULT; + } + + caller = state->frame[state->curframe]; + if (state->frame[state->curframe + 1]) { + verbose(env, "verifier bug. Frame %d already allocated\n", + state->curframe + 1); + return -EFAULT; + } + + callee = kzalloc(sizeof(*callee), GFP_KERNEL); + if (!callee) + return -ENOMEM; + state->frame[state->curframe + 1] = callee; + + /* callee cannot access r0, r6 - r9 for reading and has to write + * into its own stack before reading from it. + * callee can read/write into caller's stack + */ + init_func_state(env, callee, + /* remember the callsite, it will be used by bpf_exit */ + *insn_idx /* callsite */, + state->curframe + 1 /* frameno within this callchain */, + subprog + 1 /* subprog number within this prog */); + + /* copy r1 - r5 args that callee can access */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + callee->regs[i] = caller->regs[i]; + + /* after the call regsiters r0 - r5 were scratched */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + mark_reg_not_init(env, caller->regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } + + /* only increment it after check_reg_arg() finished */ + state->curframe++; + + /* and go analyze first insn of the callee */ + *insn_idx = target_insn; + + if (env->log.level) { + verbose(env, "caller:\n"); + print_verifier_state(env, caller); + verbose(env, "callee:\n"); + print_verifier_state(env, callee); + } + return 0; +} + +static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + struct bpf_reg_state *r0; + + callee = state->frame[state->curframe]; + r0 = &callee->regs[BPF_REG_0]; + if (r0->type == PTR_TO_STACK) { + /* technically it's ok to return caller's stack pointer + * (or caller's caller's pointer) back to the caller, + * since these pointers are valid. 
Only current stack + * pointer will be invalid as soon as function exits, + * but let's be conservative + */ + verbose(env, "cannot return stack pointer to the caller\n"); + return -EINVAL; + } + + state->curframe--; + caller = state->frame[state->curframe]; + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + + *insn_idx = callee->callsite + 1; + if (env->log.level) { + verbose(env, "returning from callee:\n"); + print_verifier_state(env, callee); + verbose(env, "to caller at %d:\n", *insn_idx); + print_verifier_state(env, caller); + } + /* clear everything in the callee */ + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return 0; +} + +static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; @@ -2330,7 +2651,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = cur_regs(env), *dst_reg; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@ -2802,7 +3125,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); @@ -2853,12 +3178,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -3020,14 +3345,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static void find_good_pkt_pointers(struct bpf_verifier_state *state, +static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg; u16 new_range; - int i; + int i, j; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -3097,12 +3423,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - reg = &state->stack[i].spilled_ptr; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); + for (j = 0; j <= vstate->curframe; j++) { + state = vstate->frame[j]; + for (i = 0; i < 
state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + reg = &state->stack[i].spilled_ptr; + if (reg->type == type && reg->id == dst_reg->id) + reg->range = max(reg->range, new_range); + } } } @@ -3340,20 +3669,24 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ -static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, +static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs; u32 id = regs[regno].id; - int i; + int i, j; for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + for (j = 0; j <= vstate->curframe; j++) { + state = vstate->frame[j]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + } } } @@ -3453,8 +3786,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; - struct bpf_reg_state *regs = this_branch->regs, *dst_reg; + struct bpf_verifier_state *this_branch = env->cur_state; + struct bpf_verifier_state *other_branch; + struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; + struct bpf_reg_state *dst_reg, *other_branch_regs; u8 opcode = BPF_OP(insn->code); int err; @@ -3518,6 +3853,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, false); if (!other_branch) return -EFAULT; + other_branch_regs = other_branch->frame[other_branch->curframe]->regs; /* detect if we are comparing against a constant value so we can adjust * our min/max values for our dst register. 
@@ -3530,22 +3866,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && regs[insn->src_reg].type == SCALAR_VALUE) { if (tnum_is_const(regs[insn->src_reg].var_off)) - reg_set_min_max(&other_branch->regs[insn->dst_reg], + reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, regs[insn->src_reg].var_off.value, opcode); else if (tnum_is_const(dst_reg->var_off)) - reg_set_min_max_inv(&other_branch->regs[insn->src_reg], + reg_set_min_max_inv(&other_branch_regs[insn->src_reg], ®s[insn->src_reg], dst_reg->var_off.value, opcode); else if (opcode == BPF_JEQ || opcode == BPF_JNE) /* Comparing for equality, we can combine knowledge */ - reg_combine_min_max(&other_branch->regs[insn->src_reg], - &other_branch->regs[insn->dst_reg], + reg_combine_min_max(&other_branch_regs[insn->src_reg], + &other_branch_regs[insn->dst_reg], ®s[insn->src_reg], ®s[insn->dst_reg], opcode); } } else if (dst_reg->type == SCALAR_VALUE) { - reg_set_min_max(&other_branch->regs[insn->dst_reg], + reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, insn->imm, opcode); } @@ -3566,7 +3902,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EACCES; } if (env->log.level) - print_verifier_state(env, this_branch); + print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -3652,6 +3988,18 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->subprog_cnt) { + /* when program has LD_ABS insn JITs and interpreter assume + * that r1 == ctx == skb which is not the case for callees + * that can have arbitrary arguments. It's problematic + * for main prog as well since JITs would need to analyze + * all functions in order to make proper register save/restore + * decisions in the main prog. 
Hence disallow LD_ABS with calls + */ + verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); + return -EINVAL; + } + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { @@ -3710,7 +4058,7 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; } - reg = &env->cur_state->regs[BPF_REG_0]; + reg = cur_regs(env) + BPF_REG_0; if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); @@ -3994,11 +4342,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct idpair *idmap) { + bool equal; + if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ return true; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0) + equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; + + if (rold->type == PTR_TO_STACK) + /* two stack pointers are equal only if they're pointing to + * the same stack frame, since fp-8 in foo != fp-8 in bar + */ + return equal && rold->frameno == rcur->frameno; + + if (equal) return true; if (rold->type == NOT_INIT) @@ -4070,7 +4428,6 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, tnum_in(rold->var_off, rcur->var_off); case PTR_TO_CTX: case CONST_PTR_TO_MAP: - case PTR_TO_STACK: case PTR_TO_PACKET_END: /* Only valid matches are exact, which memcmp() above * would have accepted @@ -4085,8 +4442,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } -static bool stacksafe(struct bpf_verifier_state *old, - struct bpf_verifier_state *cur, +static bool stacksafe(struct bpf_func_state *old, + struct bpf_func_state *cur, struct idpair *idmap) { int i, spi; @@ -4162,20 +4519,13 @@ static bool stacksafe(struct bpf_verifier_state *old, * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool states_equal(struct bpf_verifier_env *env, - struct bpf_verifier_state *old, - struct bpf_verifier_state *cur) +static bool func_states_equal(struct bpf_func_state *old, + struct bpf_func_state *cur) { struct idpair *idmap; bool ret = false; int i; - /* Verification state from speculative execution simulation - * must never prune a non-speculative execution one. - */ - if (old->speculative && !cur->speculative) - return false; - idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); /* If we failed to allocate the idmap, just say it's not safe */ if (!idmap) @@ -4194,71 +4544,82 @@ out_free: return ret; } -/* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at a - * jump target (in the first iteration of the propagate_liveness() loop), - * we didn't arrive by the straight-line code, so read marks in state must - * propagate to parent regardless of state's write marks. - */ -static bool do_propagate_liveness(const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent) +static bool states_equal(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) { - bool writes = parent == state->parent; /* Observe write marks */ - bool touched = false; /* any changes made? 
*/ int i; - if (!parent) - return touched; + if (old->curframe != cur->curframe) + return false; + + /* Verification state from speculative execution simulation + * must never prune a non-speculative execution one. + */ + if (old->speculative && !cur->speculative) + return false; + + /* for states to be equal callsites have to be the same + * and all frame states need to be equivalent + */ + for (i = 0; i <= old->curframe; i++) { + if (old->frame[i]->callsite != cur->frame[i]->callsite) + return false; + if (!func_states_equal(old->frame[i], cur->frame[i])) + return false; + } + return true; +} + +/* A write screens off any subsequent reads; but write marks come from the + * straight-line code between a state and its parent. When we arrive at an + * equivalent state (jump target or such) we didn't arrive by the straight-line + * code, so read marks in the state must propagate to the parent regardless + * of the state's write marks. That's what 'parent == state->parent' comparison + * in mark_reg_read() and mark_stack_slot_read() is for. + */ +static int propagate_liveness(struct bpf_verifier_env *env, + const struct bpf_verifier_state *vstate, + struct bpf_verifier_state *vparent) +{ + int i, frame, err = 0; + struct bpf_func_state *state, *parent; + + if (vparent->curframe != vstate->curframe) { + WARN(1, "propagate_live: parent frame %d current frame %d\n", + vparent->curframe, vstate->curframe); + return -EFAULT; + } /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); /* We don't need to worry about FP liveness because it's read-only */ for (i = 0; i < BPF_REG_FP; i++) { - if (parent->regs[i].live & REG_LIVE_READ) + if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) continue; - if (writes && (state->regs[i].live & REG_LIVE_WRITTEN)) - continue; - if (state->regs[i].live & REG_LIVE_READ) { - parent->regs[i].live |= REG_LIVE_READ; - touched = true; + if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { + err = mark_reg_read(env, vstate, vparent, i); + if (err) + return err; } } - /* ... and stack slots */ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) - continue; - if (writes && - (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) - continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { - parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; - touched = true; - } - } - return touched; -} -/* "parent" is "a state from which we reach the current state", but initially - * it is not the state->parent (i.e. "the state whose straight-line code leads - * to the current state"), instead it is the state that happened to arrive at - * a (prunable) equivalent of the current state. See comment above - * do_propagate_liveness() for consequences of this. - * This function is just a more efficient way of calling mark_reg_read() or - * mark_stack_slot_read() on each reg in "parent" that is read in "state", - * though it requires that parent != state->parent in the call arguments. - */ -static void propagate_liveness(const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent) -{ - while (do_propagate_liveness(state, parent)) { - /* Something changed, so we need to feed those changes onward */ - state = parent; - parent = state->parent; + /* ... 
and stack slots */ + for (frame = 0; frame <= vstate->curframe; frame++) { + state = vstate->frame[frame]; + parent = vparent->frame[frame]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].slot_type[0] != STACK_SPILL) + continue; + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) + continue; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) + mark_stack_slot_read(env, vstate, vparent, i, frame); + } } + return err; } static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) @@ -4266,7 +4627,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state; - int i, err; + int i, j, err; sl = env->explored_states[insn_idx]; if (!sl) @@ -4287,7 +4648,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, cur); + err = propagate_liveness(env, &sl->state, cur); + if (err) + return err; return 1; } sl = sl->next; @@ -4295,9 +4658,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* there were no equivalent states, remember current one. * technically the current state is not proven to be safe yet, - * but it will either reach bpf_exit (which means it's safe) or - * it will be rejected. Since there are no loops, we won't be - * seeing this 'insn_idx' instruction again on the way to bpf_exit + * but it will either reach outer most bpf_exit (which means it's safe) + * or it will be rejected. Since there are no loops, we won't be + * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) + * again on the way to bpf_exit */ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) @@ -4321,10 +4685,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) 
 	 */
 	for (i = 0; i < BPF_REG_FP; i++)
-		cur->regs[i].live = REG_LIVE_NONE;
-	for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++)
-		if (cur->stack[i].slot_type[0] == STACK_SPILL)
-			cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;
+		cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE;
+
+	/* all stack frames are accessible from callee, clear them all */
+	for (j = 0; j <= cur->curframe; j++) {
+		struct bpf_func_state *frame = cur->frame[j];
+
+		for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++)
+			if (frame->stack[i].slot_type[0] == STACK_SPILL)
+				frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
+	}
 	return 0;
 }

@@ -4342,16 +4712,27 @@ static int do_check(struct bpf_verifier_env *env)
 	struct bpf_verifier_state *state;
 	struct bpf_insn *insns = env->prog->insnsi;
 	struct bpf_reg_state *regs;
-	int insn_cnt = env->prog->len;
+	int insn_cnt = env->prog->len, i;
 	int insn_processed = 0;
 	bool do_print_state = false;

 	state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
 	if (!state)
 		return -ENOMEM;
-	env->cur_state = state;
-	init_reg_state(env, state->regs);
+	state->curframe = 0;
+	state->speculative = false;
 	state->parent = NULL;
+	state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
+	if (!state->frame[0]) {
+		kfree(state);
+		return -ENOMEM;
+	}
+	env->cur_state = state;
+	init_func_state(env, state->frame[0],
+			BPF_MAIN_FUNC /* callsite */,
+			0 /* frameno */,
+			0 /* subprogno, zero == main subprog */);
+
 	for (;;) {
 		struct bpf_insn *insn;
 		u8 class;
@@ -4401,7 +4782,7 @@ static int do_check(struct bpf_verifier_env *env)
 				env->prev_insn_idx, env->insn_idx,
 				env->cur_state->speculative ?
 				" (speculative execution)" : "");
-			print_verifier_state(env, env->cur_state);
+			print_verifier_state(env, state->frame[state->curframe]);
 			do_print_state = false;
 		}

@@ -4540,13 +4921,17 @@ static int do_check(struct bpf_verifier_env *env)
 		if (opcode == BPF_CALL) {
 			if (BPF_SRC(insn->code) != BPF_K ||
 			    insn->off != 0 ||
-			    insn->src_reg != BPF_REG_0 ||
+			    (insn->src_reg != BPF_REG_0 &&
+			     insn->src_reg != BPF_PSEUDO_CALL) ||
 			    insn->dst_reg != BPF_REG_0) {
 				verbose(env, "BPF_CALL uses reserved fields\n");
 				return -EINVAL;
 			}

-			err = check_call(env, insn->imm, env->insn_idx);
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				err = check_func_call(env, insn, &env->insn_idx);
+			else
+				err = check_helper_call(env, insn->imm, env->insn_idx);
 			if (err)
 				return err;

@@ -4571,6 +4956,16 @@ static int do_check(struct bpf_verifier_env *env)
 				return -EINVAL;
 			}

+			if (state->curframe) {
+				/* exit from nested function */
+				env->prev_insn_idx = env->insn_idx;
+				err = prepare_func_exit(env, &env->insn_idx);
+				if (err)
+					return err;
+				do_print_state = true;
+				continue;
+			}
+
 			/* eBPF calling convention is such that R0 is used
 			 * to return the value from eBPF program.
			 * Make sure that it's readable at this time
@@ -4632,8 +5027,16 @@ process_bpf_exit:
 		env->insn_idx++;
 	}

-	verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
-		env->prog->aux->stack_depth);
+	verbose(env, "processed %d insns, stack depth ", insn_processed);
+	for (i = 0; i < env->subprog_cnt + 1; i++) {
+		u32 depth = env->subprog_stack_depth[i];
+
+		verbose(env, "%d", depth);
+		if (i + 1 < env->subprog_cnt + 1)
+			verbose(env, "+");
+	}
+	verbose(env, "\n");
+	env->prog->aux->stack_depth = env->subprog_stack_depth[0];
 	return 0;
 }

From e3fab7684a811561b5349d89c978c64456d73ee6 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Thu, 14 Dec 2017 17:55:08 -0800
Subject: [PATCH 0100/1640] UPSTREAM: bpf: teach verifier to recognize zero
 initialized stack

Programs with function calls are often passing various pointers via the
stack. When all calls are inlined, llvm flattens stack accesses and
optimizes away extra branches. When functions are not inlined, it becomes
the job of the verifier to recognize zero initialized stack to avoid
exploring paths that the program will not take. The following program
would fail otherwise:

ptr = &buffer_on_stack;
*ptr = 0;
...
func_call(.., ptr, ...) {
  if (..)
    *ptr = bpf_map_lookup();
}
...
if (*ptr != 0) {
  // Access (*ptr)->field is valid.
  // Without stack_zero tracking such (*ptr)->field access
  // will be rejected
}

Since stack slots are no longer uniformly invalid | spill | misc, add
liveness marking to all slots, but do it in 8-byte chunks. So if nothing
was read or written in the [fp-16, fp-9] range, it will be marked as
LIVE_NONE. If any byte in that range was read, it will be marked LIVE_READ
and the stacksafe() check will perform byte-by-byte verification. If all
bytes in the range were written, the slot will be marked as LIVE_WRITTEN.
This significantly speeds up state equality comparison and reduces the
total number of states processed.
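To make the accepted pattern concrete, here is a rough C sketch of a
program shape that the verifier can now handle. This is an editor's
illustration, not code from the patch: bpf_helpers.h, SEC(), my_map and
the skb->len condition are assumed samples/bpf-style details, while
bpf_map_lookup_elem() is the regular helper.

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* assumed samples/bpf-style helper header */

struct bpf_map_def SEC("maps") my_map = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(int),
	.value_size	= sizeof(long),
	.max_entries	= 1,
};

/* only some paths write *ptr; without STACK_ZERO tracking the
 * not-written path would leave the slot as opaque STACK_MISC
 */
static __attribute__((noinline))
void func_call(void *map, long **ptr, int cond)
{
	int key = 0;

	if (cond)
		*ptr = bpf_map_lookup_elem(map, &key);
}

SEC("socket")
int prog(struct __sk_buff *skb)
{
	long *val = NULL;	/* zeroes the slot: it becomes STACK_ZERO */

	func_call(&my_map, &val, skb->len > 64);
	if (val != 0)		/* the verifier still knows about the zero */
		return *val;	/* ... so this access is provably safe */
	return 0;
}

The before/after numbers below are processed-insn counts for the quoted
object files.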
                     before  after
bpf_lb-DLB_L3.o        2051   2003
bpf_lb-DLB_L4.o        3287   3164
bpf_lb-DUNKNOWN.o      1080   1080
bpf_lxc-DDROP_ALL.o   24980  12361
bpf_lxc-DUNKNOWN.o    34308  16605
bpf_netdev.o          15404  10962
bpf_overlay.o          7191   6679

Signed-off-by: Alexei Starovoitov
Acked-by: Daniel Borkmann
Signed-off-by: Daniel Borkmann
---
 include/linux/bpf_verifier.h |   3 +-
 kernel/bpf/verifier.c        | 129 +++++++++++++++++++++++++++--------
 2 files changed, 103 insertions(+), 29 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 2c4245491cbf..e90f99214243 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -94,7 +94,8 @@ struct bpf_reg_state {
 enum bpf_stack_slot_type {
 	STACK_INVALID,    /* nothing was stored in this stack slot */
 	STACK_SPILL,      /* register spilled into stack */
-	STACK_MISC	  /* BPF program wrote some data into this slot */
+	STACK_MISC,	  /* BPF program wrote some data into this slot */
+	STACK_ZERO,	  /* BPF program wrote constant zero */
 };

 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index afa2e34d99b3..3226a6c91ff9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -311,6 +311,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 				verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]);
 		}
+		if (state->stack[i].slot_type[0] == STACK_ZERO)
+			verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE);
 	}
 	verbose(env, "\n");
 }
@@ -525,6 +527,13 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
 	__mark_reg_known(reg, 0);
 }

+static void __mark_reg_const_zero(struct bpf_reg_state *reg)
+{
+	__mark_reg_known(reg, 0);
+	reg->off = 0;
+	reg->type = SCALAR_VALUE;
+}
+
 static void mark_reg_known_zero(struct bpf_verifier_env *env,
 				struct bpf_reg_state *regs, u32 regno)
 {
@@ -940,6 +949,12 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	}
 }

+/* Does this register contain a constant zero? */
+static bool register_is_null(struct bpf_reg_state *reg)
+{
+	return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
+}
+
 /* check_stack_read/write functions track spill/fill of registers,
 * stack boundary and alignment are checked in check_mem_access()
 */
@@ -1012,12 +1027,30 @@ static int check_stack_write(struct bpf_verifier_env *env,
 			state->stack[spi].slot_type[i] = STACK_SPILL;
 		}
 	} else {
+		u8 type = STACK_MISC;
+
 		/* regular write of data into stack */
 		state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};

+		/* only mark the slot as written if all 8 bytes were written
+		 * otherwise read propagation may incorrectly stop too soon
+		 * when stack slots are partially written.
+		 * This heuristic means that read propagation will be
+		 * conservative, since it will add reg_live_read marks
+		 * to stack slots all the way to the first state when a
+		 * program writes+reads less than 8 bytes
+		 */
+		if (size == BPF_REG_SIZE)
+			state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+		/* when we zero initialize stack slots mark them as such */
+		if (value_regno >= 0 &&
+		    register_is_null(&cur->regs[value_regno]))
+			type = STACK_ZERO;
+
 		for (i = 0; i < size; i++)
 			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
-				STACK_MISC;
+				type;
 	}
 	return 0;
 }
@@ -1058,6 +1091,14 @@ static void mark_stack_slot_read(struct bpf_verifier_env *env,
 	bool writes = parent == state->parent; /* Observe write marks */

 	while (parent) {
+		if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE)
+			/* since LIVE_WRITTEN mark is only done for full 8-byte
+			 * write the read marks are conservative and parent
+			 * state may not even have the stack allocated. In such
+			 * a case, end the propagation, since the loop reached
+			 * the beginning of the function
+			 */
+			break;
 		/* if read wasn't screened by an earlier write ... */
 		if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
 			break;
@@ -1105,21 +1146,38 @@ static int check_stack_read(struct bpf_verifier_env *env,
 			 * which resets stack/reg liveness for state transitions
 			 */
 			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
-			mark_stack_slot_read(env, vstate, vstate->parent, spi,
-					     reg_state->frameno);
 		}
+		mark_stack_slot_read(env, vstate, vstate->parent, spi,
+				     reg_state->frameno);
 		return 0;
 	} else {
+		int zeros = 0;
+
 		for (i = 0; i < size; i++) {
-			if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) {
-				verbose(env, "invalid read from stack off %d+%d size %d\n",
-					off, i, size);
-				return -EACCES;
+			if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
+				continue;
+			if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
+				zeros++;
+				continue;
 			}
+			verbose(env, "invalid read from stack off %d+%d size %d\n",
+				off, i, size);
+			return -EACCES;
+		}
+		mark_stack_slot_read(env, vstate, vstate->parent, spi,
+				     reg_state->frameno);
+		if (value_regno >= 0) {
+			if (zeros == size) {
+				/* any size read into register is zero extended,
+				 * so the whole register == const_zero
+				 */
+				__mark_reg_const_zero(&state->regs[value_regno]);
+			} else {
+				/* have read misc data from the stack */
+				mark_reg_unknown(env, state->regs, value_regno);
+			}
+			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
 		}
-		if (value_regno >= 0)
-			/* have read misc data from the stack */
-			mark_reg_unknown(env, state->regs, value_regno);
 		return 0;
 	}
 }
@@ -1679,12 +1737,6 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 				BPF_SIZE(insn->code), BPF_WRITE, -1, true);
 }

-/* Does this register contain a constant zero? */
-static bool register_is_null(struct bpf_reg_state *reg)
-{
-	return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
-}
-
 /* when register 'regno' is passed into function that will read 'access_size'
 * bytes from that pointer, make sure that it's within stack boundary
 * and all elements of stack are initialized.
@@ -1735,15 +1787,30 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 	}

 	for (i = 0; i < access_size; i++) {
+		u8 *stype;
+
 		slot = -(off + i) - 1;
 		spi = slot / BPF_REG_SIZE;
-		if (state->allocated_stack <= slot ||
-		    state->stack[spi].slot_type[slot % BPF_REG_SIZE] !=
-			STACK_MISC) {
-			verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
-				off, i, access_size);
-			return -EACCES;
+		if (state->allocated_stack <= slot)
+			goto err;
+		stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+		if (*stype == STACK_MISC)
+			goto mark;
+		if (*stype == STACK_ZERO) {
+			/* helper can write anything into the stack */
+			*stype = STACK_MISC;
+			goto mark;
 		}
+err:
+		verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
+			off, i, access_size);
+		return -EACCES;
+mark:
+		/* reading any byte out of 8-byte 'spill_slot' will cause
+		 * the whole slot to be marked as 'read'
+		 */
+		mark_stack_slot_read(env, env->cur_state, env->cur_state->parent,
+				     spi, state->frameno);
 	}
 	return update_stack_depth(env, state, off);
 }
@@ -4461,8 +4528,19 @@ static bool stacksafe(struct bpf_func_state *old,
 	for (i = 0; i < old->allocated_stack; i++) {
 		spi = i / BPF_REG_SIZE;

+		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ))
+			/* explored state didn't use this */
+			return true;
+
 		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
 			continue;
+		/* if old state was safe with misc data in the stack
+		 * it will be safe with zero-initialized stack.
+		 * The opposite is not true
+		 */
+		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
+		    cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
+			continue;
 		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
 		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
 			/* Ex: old explored (safe) state has STACK_SPILL in
@@ -4609,10 +4687,6 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 		parent = vparent->frame[frame];
 		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
 			    i < parent->allocated_stack / BPF_REG_SIZE; i++) {
-			if (parent->stack[i].slot_type[0] != STACK_SPILL)
-				continue;
-			if (state->stack[i].slot_type[0] != STACK_SPILL)
-				continue;
 			if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
 				continue;
 			if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
@@ -4692,8 +4766,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		struct bpf_func_state *frame = cur->frame[j];

 		for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++)
-			if (frame->stack[i].slot_type[0] == STACK_SPILL)
-				frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
+			frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
 	}
 	return 0;
 }

From 7f80bcb7447ba5c31dc852094873e7bdf5512672 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Thu, 14 Dec 2017 17:55:13 -0800
Subject: [PATCH 0101/1640] BACKPORT: bpf: add support for bpf_call to
 interpreter

Though bpf_call is still the same call instruction and the calling
convention for 'bpf to bpf' and 'bpf to helper' is the same, the
interpreter has to operate on 'struct bpf_insn *'. To distinguish these
two cases, add a kernel internal opcode and mark call insns with it. This
opcode is seen by the interpreter only. JITs will never see it. Also add
a tiny bit of debug code to aid interpreter debugging.
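For orientation, an editor's sketch (not part of the patch) of how a
post-verifier call insn can now be told apart, using only fields and
constants that appear in this series; the function name itself is made
up for illustration:

/* Sketch: BPF_PSEUDO_CALL in insn->src_reg marks bpf-to-bpf calls;
 * BPF_CALL_ARGS is the interpreter-only opcode introduced here.
 */
static const char *classify_call(const struct bpf_insn *insn)
{
	if (insn->code == (BPF_JMP | BPF_CALL_ARGS))
		return "bpf-to-bpf call, already patched for the interpreter";
	if (insn->code != (BPF_JMP | BPF_CALL))
		return "not a call insn";
	if (insn->src_reg == BPF_PSEUDO_CALL)
		return "bpf-to-bpf call, imm = relative offset to callee";
	return "call to kernel helper, imm = helper id";
}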
Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + include/linux/filter.h | 6 +++ kernel/bpf/core.c | 89 ++++++++++++++++++++++++++++++++++++------ kernel/bpf/verifier.c | 42 ++++++++++++++++++++ 4 files changed, 125 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4d08ced62113..7c8d48317ba5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -417,6 +417,7 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); +void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); /* Map specifics */ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); diff --git a/include/linux/filter.h b/include/linux/filter.h index 23b7990fa502..10ef8ee25dc0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -54,6 +54,9 @@ struct bpf_prog_aux; /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 +/* unused opcode to mark call to interpreter with arguments */ +#define BPF_CALL_ARGS 0xe0 + /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. @@ -787,6 +790,9 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +#define __bpf_call_base_args \ + ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ + __bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a04a1fb02634..ed2b7b77be2a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -223,14 +223,21 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } -static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) +static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta, + u32 curr, const bool probe_pass) { - return BPF_CLASS(insn->code) == BPF_JMP && - /* Call and Exit are both special jumps with no - * target inside the BPF instruction image. - */ - BPF_OP(insn->code) != BPF_CALL && - BPF_OP(insn->code) != BPF_EXIT; + const s64 imm_min = S32_MIN, imm_max = S32_MAX; + s64 imm = insn->imm; + + if (curr < pos && curr + imm + 1 > pos) + imm += delta; + else if (curr > pos + delta && curr + imm + 1 <= pos + delta) + imm -= delta; + if (imm < imm_min || imm > imm_max) + return -ERANGE; + if (!probe_pass) + insn->imm = imm; + return 0; } static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, @@ -258,6 +265,8 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, int ret = 0; for (i = 0; i < insn_cnt; i++, insn++) { + u8 code; + /* In the probing pass we still operate on the original, * unpatched image in order to check overflows before we * do any other adjustments. Therefore skip the patchlet. @@ -266,12 +275,20 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, i += delta + 1; insn++; } - - if (!bpf_is_jmp_and_has_target(insn)) + code = insn->code; + if (BPF_CLASS(code) != BPF_JMP || + BPF_OP(code) == BPF_EXIT) continue; - /* Adjust offset of jmps if we cross patch boundaries. 
*/ - ret = bpf_adj_delta_to_off(insn, pos, delta, i, probe_pass); + if (BPF_OP(code) == BPF_CALL) { + if (insn->src_reg != BPF_PSEUDO_CALL) + continue; + ret = bpf_adj_delta_to_imm(insn, pos, delta, i, + probe_pass); + } else { + ret = bpf_adj_delta_to_off(insn, pos, delta, i, + probe_pass); + } if (ret) break; } @@ -908,8 +925,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); * * Decode and execute eBPF instructions. */ -static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, - u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { u64 tmp; static const void *jumptable[256] = { @@ -969,6 +985,7 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, /* Call instruction */ [BPF_JMP | BPF_CALL] = &&JMP_CALL, + [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, /* Jumps */ [BPF_JMP | BPF_JA] = &&JMP_JA, @@ -1151,6 +1168,13 @@ select_insn: BPF_R4, BPF_R5); CONT; + JMP_CALL_ARGS: + BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, + BPF_R3, BPF_R4, + BPF_R5, + insn + insn->off + 1); + CONT; + JMP_TAIL_CALL: { struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1423,6 +1447,23 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn return ___bpf_prog_run(regs, insn, stack); \ } +#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size +#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ +static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ + const struct bpf_insn *insn) \ +{ \ + u64 stack[stack_size / sizeof(u64)]; \ + u64 regs[MAX_BPF_REG]; \ +\ + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ + BPF_R1 = r1; \ + BPF_R2 = r2; \ + BPF_R3 = r3; \ + BPF_R4 = r4; \ + BPF_R5 = r5; \ + return ___bpf_prog_run(regs, insn, stack); \ +} + #define EVAL1(FN, X) FN(X) #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) #define EVAL3(FN, X, Y...) 
FN(X) EVAL2(FN, Y) @@ -1434,6 +1475,10 @@ EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); +EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); +EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); +EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); + #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), static unsigned int (*interpreters[])(const void *ctx, @@ -1442,6 +1487,24 @@ EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#undef PROG_NAME_LIST +#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), +static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, + const struct bpf_insn *insn) = { +EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) +EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) +EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) +}; +#undef PROG_NAME_LIST + +void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) +{ + stack_depth = max_t(u32, stack_depth, 1); + insn->off = (s16) insn->imm; + insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - + __bpf_call_base_args; + insn->code = BPF_JMP | BPF_CALL_ARGS; +} #else static unsigned int __bpf_prog_ret0_warn(const void *ctx, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3226a6c91ff9..ef77e11fee0c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1534,6 +1534,23 @@ static int update_stack_depth(struct bpf_verifier_env *env, return 0; } +#ifndef CONFIG_BPF_JIT_ALWAYS_ON +static int get_callee_stack_depth(struct bpf_verifier_env *env, + const struct bpf_insn *insn, int idx) +{ + int start = idx + insn->imm + 1, subprog; + + subprog = find_subprog(env, start); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + start); + return -EFAULT; + } + subprog++; + return env->subprog_stack_depth[subprog]; +} +#endif + static int check_ctx_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno) { @@ -5481,6 +5498,28 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return 0; } +static int fixup_call_args(struct bpf_verifier_env *env) +{ +#ifndef CONFIG_BPF_JIT_ALWAYS_ON + struct bpf_prog *prog = env->prog; + struct bpf_insn *insn = prog->insnsi; + int i, depth; +#endif + +#ifndef CONFIG_BPF_JIT_ALWAYS_ON + for (i = 0; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + depth = get_callee_stack_depth(env, insn, i); + if (depth < 0) + return depth; + bpf_patch_call_args(insn, depth); + } +#endif + return 0; +} + /* fixup insn->imm field of bpf_call instructions * and inline eligible helpers as explicit sequence of BPF instructions * @@ -5833,6 +5872,9 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); + if (ret == 0) + ret = fixup_call_args(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { From 99bb9594d43e0fc6615936469b8035956abaf2c1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:14 -0800 Subject: [PATCH 0102/1640] UPSTREAM: bpf: fix net.core.bpf_jit_enable race global bpf_jit_enable variable is tested multiple times in JITs, blinding and verifier core. The malicious root can try to toggle it while loading the programs. 
This race condition was accounted for and there should be no issues, but it's safer to avoid this race condition. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- arch/arm/net/bpf_jit_32.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/mips/net/ebpf_jit.c | 2 +- arch/powerpc/net/bpf_jit_comp64.c | 2 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp_64.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- include/linux/filter.h | 5 +++-- kernel/bpf/core.c | 3 ++- kernel/bpf/verifier.c | 2 +- 10 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index e13aca6e6d4b..68aa2f6d9f83 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1827,7 +1827,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* If BPF JIT was not enabled then we must fall back to * the interpreter. */ - if (!bpf_jit_enable) + if (!prog->jit_requested) return orig_prog; /* If constant blinding was enabled and we failed during blinding diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 0b8ab4b12538..7b1b649da4b9 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -856,7 +856,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) int image_size; u8 *image_ptr; - if (!bpf_jit_enable) + if (!prog->jit_requested) return orig_prog; tmp = bpf_jit_blind_constants(prog); diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 57a7a9d68475..1186e481a083 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1871,7 +1871,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) unsigned int image_size; u8 *image_ptr; - if (!bpf_jit_enable || !cpu_has_mips64r2) + if (!prog->jit_requested || !cpu_has_mips64r2) return prog; tmp = bpf_jit_blind_constants(prog); diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index c504d5bc7d43..4c58af3d300c 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -1003,7 +1003,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_prog *tmp_fp; bool bpf_blinded = false; - if (!bpf_jit_enable) + if (!fp->jit_requested) return org_fp; tmp_fp = bpf_jit_blind_constants(org_fp); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 60029baaa72a..6b1003fdd05d 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1363,7 +1363,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_jit jit; int pass; - if (!bpf_jit_enable) + if (!fp->jit_requested) return orig_fp; tmp = bpf_jit_blind_constants(fp); diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index dfb1a62abe93..85ae4b0d5fbc 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1529,7 +1529,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) u8 *image_ptr; int pass; - if (!bpf_jit_enable) + if (!prog->jit_requested) return orig_prog; tmp = bpf_jit_blind_constants(prog); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a114c319cac2..ab453df6e9f7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1141,7 +1141,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) int pass; int i; - if (!bpf_jit_enable) + if (!prog->jit_requested) return orig_prog; tmp = bpf_jit_blind_constants(prog); diff --git 
a/include/linux/filter.h b/include/linux/filter.h
index 10ef8ee25dc0..60948a23b191 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -483,6 +483,7 @@ struct bpf_binary_header {
 struct bpf_prog {
 	u16			pages;		/* Number of allocated pages */
 	u16			jited:1,	/* Is our filter JIT'ed? */
+				jit_requested:1,/* archs need to JIT the prog */
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
@@ -885,7 +886,7 @@ static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
 	return fp->jited && bpf_jit_is_ebpf();
 }

-static inline bool bpf_jit_blinding_enabled(void)
+static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
 {
 	/* These are the prerequisites, should someone ever have the
 	 * idea to call blinding outside of them, we make sure to
@@ -893,7 +894,7 @@ static inline bool bpf_jit_blinding_enabled(void)
 	 */
 	if (!bpf_jit_is_ebpf())
 		return false;
-	if (!bpf_jit_enable)
+	if (!prog->jit_requested)
 		return false;
 	if (!bpf_jit_harden)
 		return false;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ed2b7b77be2a..d2eaff7d411f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -100,6 +100,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	fp->pages = size / PAGE_SIZE;
 	fp->aux = aux;
 	fp->aux->prog = fp;
+	fp->jit_requested = ebpf_jit_enabled();

 	INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);

@@ -861,7 +862,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
 	struct bpf_insn *insn;
 	int i, rewritten;

-	if (!bpf_jit_blinding_enabled())
+	if (!bpf_jit_blinding_enabled(prog))
 		return prog;

 	clone = bpf_prog_clone_create(prog, GFP_USER);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ef77e11fee0c..652c0e820079 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5694,7 +5694,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 		 * handlers are currently limited to 64 bit only.
 		 */
-		if (ebpf_jit_enabled() && BITS_PER_LONG == 64 &&
+		if (prog->jit_requested && BITS_PER_LONG == 64 &&
 		    insn->imm == BPF_FUNC_map_lookup_elem) {
 			map_ptr = env->insn_aux_data[i + delta].map_ptr;
 			if (map_ptr == BPF_MAP_PTR_POISON ||

From 34a93175e1b37d4b0da9c06003b44d606867d385 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Thu, 14 Dec 2017 17:55:15 -0800
Subject: [PATCH 0103/1640] BACKPORT: bpf: x64: add JIT support for
 multi-function programs

Typical JIT does several passes over bpf instructions to compute total
size and relative offsets of jumps and calls. With multiple bpf functions
calling each other, all relative calls will have invalid offsets
initially, therefore we need an additional last pass over the program to
emit calls with correct offsets. For example, in case of three bpf
functions:

main:
    call foo
    call bpf_map_lookup
    exit
foo:
    call bar
    exit
bar:
    exit

We will call bpf_int_jit_compile() independently for main(), foo() and
bar(). x64 JIT typically does 4-5 passes to converge. After these initial
passes the image for these 3 functions will be good except call targets,
since start addresses of foo() and bar() are unknown when we were JITing
main() (note that call bpf_map_lookup will be resolved properly during
initial passes). Once start addresses of 3 functions are known we patch
call_insn->imm to point to the right functions and call
bpf_int_jit_compile() again which needs only one pass.
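To make the last-pass fix-up concrete, here is a simplified sketch
(an editor's illustration, not the patch's code) of the imm rewrite that
jit_subprogs() performs further down once every function's start address
is known; it isolates just the address arithmetic, and the function name
is made up:

/* Each bpf-to-bpf call imm is rewritten from a relative insn offset to
 * the callee's JITed address expressed as an offset from
 * __bpf_call_base, which is how JIT images encode BPF_CALL targets.
 * The subprog id stashed in insn->off mirrors the trick jit_subprogs()
 * uses below.
 */
static void patch_call_imms(struct bpf_prog **func, int func_cnt)
{
	struct bpf_insn *insn;
	int i, j, subprog;

	for (i = 0; i < func_cnt; i++) {
		for (j = 0, insn = func[i]->insnsi; j < func[i]->len; j++, insn++) {
			if (insn->code != (BPF_JMP | BPF_CALL) ||
			    insn->src_reg != BPF_PSEUDO_CALL)
				continue;
			subprog = insn->off;	/* stashed subprog id */
			insn->off = 0;
			insn->imm = (u8 *)func[subprog]->bpf_func -
				    (u8 *)__bpf_call_base;
		}
	}
}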
Additional safety checks are done to make sure this last pass doesn't produce image that is larger or smaller than previous pass. When constant blinding is on it's applied to all functions at the first pass, since doing it once again at the last pass can change size of the JITed code. Tested on x64 and arm64 hw with JIT on/off, blinding on/off. x64 jits bpf-to-bpf calls correctly while arm64 falls back to interpreter. All other JITs that support normal BPF_CALL will behave the same way since bpf-to-bpf call is equivalent to bpf-to-kernel call from JITs point of view. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- arch/x86/net/bpf_jit_comp.c | 47 ++++++++++++- include/linux/bpf.h | 3 + include/linux/bpf_verifier.h | 1 + include/linux/filter.h | 2 + kernel/bpf/core.c | 13 +++- kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 126 +++++++++++++++++++++++++++++++++++ 7 files changed, 189 insertions(+), 6 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index ab453df6e9f7..bcb23d13f6f5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1129,13 +1129,23 @@ common_load: return proglen; } +struct x64_jit_data { + struct bpf_binary_header *header; + int *addrs; + u8 *image; + int proglen; + struct jit_context ctx; +}; + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; struct bpf_prog *tmp, *orig_prog = prog; + struct x64_jit_data *jit_data; int proglen, oldproglen = 0; struct jit_context ctx = {}; bool tmp_blinded = false; + bool extra_pass = false; u8 *image = NULL; int *addrs; int pass; @@ -1155,10 +1165,28 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog = tmp; } + jit_data = prog->aux->jit_data; + if (!jit_data) { + jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); + if (!jit_data) { + prog = orig_prog; + goto out; + } + prog->aux->jit_data = jit_data; + } + addrs = jit_data->addrs; + if (addrs) { + ctx = jit_data->ctx; + oldproglen = jit_data->proglen; + image = jit_data->image; + header = jit_data->header; + extra_pass = true; + goto skip_init_addrs; + } addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; - goto out; + goto out_addrs; } /* Before first pass, make a rough estimation of addrs[] @@ -1169,6 +1197,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) addrs[i] = proglen; } ctx.cleanup_addr = proglen; +skip_init_addrs: /* JITed image shrinks with every pass and the loop iterates * until the image stops shrinking. Very large bpf programs @@ -1210,7 +1239,15 @@ out_image: if (image) { bpf_flush_icache(header, image + proglen); - bpf_jit_binary_lock_ro(header); + if (!prog->is_func || extra_pass) { + bpf_jit_binary_lock_ro(header); + } else { + jit_data->addrs = addrs; + jit_data->ctx = ctx; + jit_data->proglen = proglen; + jit_data->image = image; + jit_data->header = header; + } prog->bpf_func = (void *)image; prog->jited = 1; prog->jited_len = proglen; @@ -1218,8 +1255,12 @@ out_image: prog = orig_prog; } + if (!prog->is_func || extra_pass) { out_addrs: - kfree(addrs); + kfree(addrs); + kfree(jit_data); + prog->aux->jit_data = NULL; + } out: if (tmp_blinded) bpf_jit_prog_release_other(prog, prog == orig_prog ? 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7c8d48317ba5..8d08c7e1e6ad 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -215,6 +215,9 @@ struct bpf_prog_aux { u32 max_ctx_offset; u32 stack_depth; u32 id; + u32 func_cnt; + struct bpf_prog **func; + void *jit_data; /* JIT specific data. arch dependent */ struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_prog_ops *ops; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e90f99214243..ea59b967a8b6 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -156,6 +156,7 @@ struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ + s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 60948a23b191..4744367598f0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -488,6 +488,8 @@ struct bpf_prog { gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ + blinded:1, /* Was blinded */ + is_func:1, /* program is a bpf function */ kprobe_override:1; /* Do we override a kprobe? */ enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d2eaff7d411f..65aa319281c7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -862,7 +862,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled(prog)) + if (!bpf_jit_blinding_enabled(prog) || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); @@ -904,6 +904,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) i += insn_delta; } + clone->blinded = 1; return clone; } #endif /* CONFIG_BPF_JIT */ @@ -1784,11 +1785,19 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; + int i; aux = container_of(work, struct bpf_prog_aux, work); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); - bpf_jit_free(aux->prog); + for (i = 0; i < aux->func_cnt; i++) + bpf_jit_free(aux->func[i]); + if (aux->func_cnt) { + kfree(aux->func); + bpf_prog_unlock_free(aux->prog); + } else { + bpf_jit_free(aux->prog); + } } /* Free internal BPF program */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2f53519f753a..4a61bb59007e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1211,7 +1211,8 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_used_maps; /* eBPF program is ready to be JITed */ - prog = bpf_prog_select_runtime(prog, &err); + if (!prog->bpf_func) + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 652c0e820079..5b13474691a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5498,6 +5498,128 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return 0; } +static int jit_subprogs(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog, **func, *tmp; + int i, j, subprog_start, subprog_end = 0, len, subprog; + struct bpf_insn 
*insn = prog->insnsi; + void *old_bpf_func; + int err = -ENOMEM; + + if (env->subprog_cnt == 0) + return 0; + + for (i = 0; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + subprog = find_subprog(env, i + insn->imm + 1); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + i + insn->imm + 1); + return -EFAULT; + } + /* temporarily remember subprog id inside insn instead of + * aux_data, since next loop will split up all insns into funcs + */ + insn->off = subprog + 1; + /* remember original imm in case JIT fails and fallback + * to interpreter will be needed + */ + env->insn_aux_data[i].call_imm = insn->imm; + /* point imm to __bpf_call_base+1 from JITs point of view */ + insn->imm = 1; + } + + func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); + if (!func) + return -ENOMEM; + + for (i = 0; i <= env->subprog_cnt; i++) { + subprog_start = subprog_end; + if (env->subprog_cnt == i) + subprog_end = prog->len; + else + subprog_end = env->subprog_starts[i]; + + len = subprog_end - subprog_start; + func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + if (!func[i]) + goto out_free; + memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], + len * sizeof(struct bpf_insn)); + func[i]->len = len; + func[i]->is_func = 1; + /* Use bpf_prog_F_tag to indicate functions in stack traces. + * Long term would need debug info to populate names + */ + func[i]->aux->name[0] = 'F'; + func[i]->aux->stack_depth = env->subprog_stack_depth[i]; + func[i]->jit_requested = 1; + func[i] = bpf_int_jit_compile(func[i]); + if (!func[i]->jited) { + err = -ENOTSUPP; + goto out_free; + } + cond_resched(); + } + /* at this point all bpf functions were successfully JITed + * now populate all bpf_calls with correct addresses and + * run last pass of JIT + */ + for (i = 0; i <= env->subprog_cnt; i++) { + insn = func[i]->insnsi; + for (j = 0; j < func[i]->len; j++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + subprog = insn->off; + insn->off = 0; + insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) + func[subprog]->bpf_func - + __bpf_call_base; + } + } + for (i = 0; i <= env->subprog_cnt; i++) { + old_bpf_func = func[i]->bpf_func; + tmp = bpf_int_jit_compile(func[i]); + if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { + verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); + err = -EFAULT; + goto out_free; + } + cond_resched(); + } + + /* finally lock prog and jit images for all functions and + * populate kallsysm + */ + for (i = 0; i <= env->subprog_cnt; i++) { + bpf_prog_lock_ro(func[i]); + bpf_prog_kallsyms_add(func[i]); + } + prog->jited = 1; + prog->bpf_func = func[0]->bpf_func; + prog->aux->func = func; + prog->aux->func_cnt = env->subprog_cnt + 1; + return 0; +out_free: + for (i = 0; i <= env->subprog_cnt; i++) + if (func[i]) + bpf_jit_free(func[i]); + kfree(func); + /* cleanup main prog to be interpreted */ + prog->jit_requested = 0; + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + insn->off = 0; + insn->imm = env->insn_aux_data[i].call_imm; + } + return err; +} + static int fixup_call_args(struct bpf_verifier_env *env) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON @@ -5506,6 +5628,10 @@ static int fixup_call_args(struct bpf_verifier_env *env) int i, depth; #endif + if (env->prog->jit_requested) + if (jit_subprogs(env) == 0) + return 
0; + #ifndef CONFIG_BPF_JIT_ALWAYS_ON for (i = 0; i < prog->len; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || From 5da71df68d78307c8a2825dbbe110c96d5b4b385 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Dec 2017 14:03:12 +0000 Subject: [PATCH 0104/1640] UPSTREAM: bpf: fix spelling mistake: "funcation"-> "function" Trivial fix to spelling mistake in error message text. Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5b13474691a0..839b99749f5f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -775,7 +775,7 @@ static int check_subprogs(struct bpf_verifier_env *env) return -EPERM; } if (bpf_prog_is_dev_bound(env->prog->aux)) { - verbose(env, "funcation calls in offloaded programs are not supported yet\n"); + verbose(env, "function calls in offloaded programs are not supported yet\n"); return -EINVAL; } ret = add_subprog(env, i + insn[i].imm + 1); From 6ec2ac7bbc3d33e1d145ab09415d555fe25f02c4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Dec 2017 17:47:07 +0000 Subject: [PATCH 0105/1640] UPSTREAM: bpf: make function skip_callee static and return NULL rather than 0 Function skip_callee is local to the source and does not need to be in global scope, so make it static. Also return NULL rather than 0. Cleans up two sparse warnings: symbol 'skip_callee' was not declared. Should it be static? Using plain integer as NULL pointer Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 839b99749f5f..f21231c6c9de 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -826,6 +826,7 @@ next: return 0; } +static struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, const struct bpf_verifier_state *state, struct bpf_verifier_state *parent, @@ -870,7 +871,7 @@ bug: verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); verbose(env, "regno %d parent frame %d current frame %d\n", regno, parent->curframe, state->curframe); - return 0; + return NULL; } static int mark_reg_read(struct bpf_verifier_env *env, From 5b4d854f77ec91bbdb209a120dca8417f192715d Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Tue, 19 Dec 2017 07:17:15 +0800 Subject: [PATCH 0106/1640] UPSTREAM: bpf: make function xdp_do_generic_redirect_map() static The function xdp_do_generic_redirect_map() is only used in this file, so make it static. 
Clean up sparse warning: net/core/filter.c:2687:5: warning: no previous prototype for 'xdp_do_generic_redirect_map' [-Wmissing-prototypes] Signed-off-by: Xiongwei Song Signed-off-by: Daniel Borkmann --- net/core/filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index c543b258cfeb..daf4b4d33241 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2767,8 +2767,9 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) return 0; } -int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) +static int xdp_do_generic_redirect_map(struct net_device *dev, + struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; From 52f10a0fec055b9daeea7822e449fc347fb98a40 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 18 Dec 2017 10:13:44 -0800 Subject: [PATCH 0107/1640] UPSTREAM: bpf/cgroup: fix a verification error for a CGROUP_DEVICE type prog The tools/testing/selftests/bpf test program test_dev_cgroup fails with the following error when compiled with llvm 6.0. (I did not try with earlier versions.) libbpf: load bpf program failed: Permission denied libbpf: -- BEGIN DUMP LOG --- libbpf: 0: (61) r2 = *(u32 *)(r1 +4) 1: (b7) r0 = 0 2: (55) if r2 != 0x1 goto pc+8 R0=inv0 R1=ctx(id=0,off=0,imm=0) R2=inv1 R10=fp0 3: (69) r2 = *(u16 *)(r1 +0) invalid bpf_context access off=0 size=2 ... The culprit is the following statement in dev_cgroup.c: short type = ctx->access_type & 0xFFFF; This code is typical as the ctx->access_type is assigned as below in kernel/bpf/cgroup.c: struct bpf_cgroup_dev_ctx ctx = { .access_type = (access << 16) | dev_type, .major = major, .minor = minor, }; The compiler converts it to u16 access while the verifier cgroup_dev_is_valid_access rejects any non u32 access. This patch permits the field access_type to be accessible with type u16 and u8 as well. Signed-off-by: Yonghong Song Tested-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 ++- kernel/bpf/cgroup.c | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c2dfd46a3b60..231b08014cf4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1109,7 +1109,8 @@ struct bpf_perf_event_value { #define BPF_DEVCG_DEV_CHAR (1ULL << 1) struct bpf_cgroup_dev_ctx { - __u32 access_type; /* (access << 16) | type */ + /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ + __u32 access_type; __u32 major; __u32 minor; }; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 48f30aedf5b7..83baf471ec00 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -567,6 +567,8 @@ static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (type == BPF_WRITE) return false; @@ -575,8 +577,17 @@ static bool cgroup_dev_is_valid_access(int off, int size, /* The verifier guarantees that size > 0. 
*/ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; + + switch (off) { + case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + break; + default: + if (size != size_default) + return false; + } return true; } From 6c32a8a8e49b45b39497603254a7b48835552520 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Dec 2017 13:42:56 +0100 Subject: [PATCH 0108/1640] UPSTREAM: bpf: fix kallsyms handling for subprogs Right now kallsyms handling is not working with JITed subprogs. The reason is that when in 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") in jit_subprogs() they are passed to bpf_prog_kallsyms_add(), then their prog type is 0, which BPF core will think it's a cBPF program as only cBPF programs have a 0 type. Thus, they need to inherit the type from the main prog. Once that is fixed, they are indeed added to the BPF kallsyms infra, but their tag is 0. Therefore, since intention is to add them as bpf_prog_F_, we need to pass them to bpf_prog_calc_tag() first. And once this is resolved, there is a use-after-free on prog cleanup: we remove the kallsyms entry from the main prog, later walk all subprogs and call bpf_jit_free() on them. However, the kallsyms linkage was never released on them. Thus, do that for all subprogs right in __bpf_prog_put() when refcount hits 0. Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 ++++++ kernel/bpf/verifier.c | 3 +++ 2 files changed, 9 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4a61bb59007e..d6f8804509c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -954,10 +954,16 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + int i; + trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); + + for (i = 0; i < prog->aux->func_cnt; i++) + bpf_prog_kallsyms_del(prog->aux->func[i]); bpf_prog_kallsyms_del(prog); + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f21231c6c9de..3e30f24f87e6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5549,7 +5549,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) goto out_free; memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], len * sizeof(struct bpf_insn)); + func[i]->type = prog->type; func[i]->len = len; + if (bpf_prog_calc_tag(func[i])) + goto out_free; func[i]->is_func = 1; /* Use bpf_prog_F_tag to indicate functions in stack traces. * Long term would need debug info to populate names From f0c35c1e983d7c74c74d797d449bf4d5e3164388 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Dec 2017 13:42:57 +0100 Subject: [PATCH 0109/1640] BACKPORT: bpf: allow for correlation of maps and helpers in dump Currently a dump of an xlated prog (post verifier stage) doesn't correlate used helpers as well as maps. The prog info lists involved map ids, however there's no correlation of where in the program they are used as of today. Likewise, bpftool does not correlate helper calls with the target functions. 
The latter can be done w/o any kernel changes through kallsyms, and also has the advantage that this works with inlined helpers and BPF calls. Example, via interpreter: # tc filter show dev foo ingress filter protocol all pref 49152 bpf chain 0 filter protocol all pref 49152 bpf chain 0 handle 0x1 foo.o:[ingress] \ direct-action not_in_hw id 1 tag c74773051b364165 <-- prog id:1 * Output before patch (calls/maps remain unclear): # bpftool prog dump xlated id 1 <-- dump prog id:1 0: (b7) r1 = 2 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 3: (07) r2 += -4 4: (18) r1 = 0xffff95c47a8d4800 6: (85) call unknown#73040 7: (15) if r0 == 0x0 goto pc+18 8: (bf) r2 = r10 9: (07) r2 += -4 10: (bf) r1 = r0 11: (85) call unknown#73040 12: (15) if r0 == 0x0 goto pc+23 [...] * Output after patch: # bpftool prog dump xlated id 1 0: (b7) r1 = 2 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 3: (07) r2 += -4 4: (18) r1 = map[id:2] <-- map id:2 6: (85) call bpf_map_lookup_elem#73424 <-- helper call 7: (15) if r0 == 0x0 goto pc+18 8: (bf) r2 = r10 9: (07) r2 += -4 10: (bf) r1 = r0 11: (85) call bpf_map_lookup_elem#73424 12: (15) if r0 == 0x0 goto pc+23 [...] # bpftool map show id 2 <-- show/dump/etc map id:2 2: hash_of_maps flags 0x0 key 4B value 4B max_entries 3 memlock 4096B Example, JITed, same prog: # tc filter show dev foo ingress filter protocol all pref 49152 bpf chain 0 filter protocol all pref 49152 bpf chain 0 handle 0x1 foo.o:[ingress] \ direct-action not_in_hw id 3 tag c74773051b364165 jited # bpftool prog show id 3 3: sched_cls tag c74773051b364165 loaded_at Dec 19/13:48 uid 0 xlated 384B jited 257B memlock 4096B map_ids 2 # bpftool prog dump xlated id 3 0: (b7) r1 = 2 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 3: (07) r2 += -4 4: (18) r1 = map[id:2] <-- map id:2 6: (85) call __htab_map_lookup_elem#77408 <-+ inlined rewrite 7: (15) if r0 == 0x0 goto pc+2 | 8: (07) r0 += 56 | 9: (79) r0 = *(u64 *)(r0 +0) <-+ 10: (15) if r0 == 0x0 goto pc+24 11: (bf) r2 = r10 12: (07) r2 += -4 [...] Example, same prog, but kallsyms disabled (in that case we are also not allowed to pass any relative offsets, etc, so prog becomes pointer sanitized on dump): # sysctl kernel.kptr_restrict=2 kernel.kptr_restrict = 2 # bpftool prog dump xlated id 3 0: (b7) r1 = 2 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 3: (07) r2 += -4 4: (18) r1 = map[id:2] 6: (85) call bpf_unspec#0 7: (15) if r0 == 0x0 goto pc+2 [...] Example, BPF calls via interpreter: # bpftool prog dump xlated id 1 0: (85) call pc+2#__bpf_prog_run_args32 1: (b7) r0 = 1 2: (95) exit 3: (b7) r0 = 2 4: (95) exit Example, BPF calls via JIT: # sysctl net.core.bpf_jit_enable=1 net.core.bpf_jit_enable = 1 # sysctl net.core.bpf_jit_kallsyms=1 net.core.bpf_jit_kallsyms = 1 # bpftool prog dump xlated id 1 0: (85) call pc+2#bpf_prog_3b185187f1855c4c_F 1: (b7) r0 = 1 2: (95) exit 3: (b7) r0 = 2 4: (95) exit And finally, an example for tail calls that is now working as well wrt correlation: # bpftool prog dump xlated id 2 [...] 
10: (b7) r2 = 8 11: (85) call bpf_trace_printk#-41312 12: (bf) r1 = r6 13: (18) r2 = map[id:1] 15: (b7) r3 = 0 16: (85) call bpf_tail_call#12 17: (b7) r1 = 42 18: (6b) *(u16 *)(r6 +46) = r1 19: (b7) r0 = 0 20: (95) exit # bpftool map show id 1 1: prog_array flags 0x0 key 4B value 4B max_entries 1 memlock 4096B Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 9 +++++ kernel/bpf/core.c | 4 +- kernel/bpf/disasm.c | 65 +++++++++++++++++++++++++------ kernel/bpf/disasm.h | 29 +++++++++++--- kernel/bpf/syscall.c | 87 +++++++++++++++++++++++++++++++++++++++--- kernel/bpf/verifier.c | 30 +++++++++++++-- 6 files changed, 198 insertions(+), 26 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 4744367598f0..b73c62c50bd9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -801,6 +802,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_helper_changes_pkt_data(void *func); +static inline bool bpf_dump_raw_ok(void) +{ + /* Reconstruction of call-sites is dependent on kallsyms, + * thus make dump the same restriction. + */ + return kallsyms_show_value() == 1; +} + struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 65aa319281c7..5151fb294678 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -911,7 +911,9 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs - * anyway later on, so do not let the compiler omit it. + * anyway later on, so do not let the compiler omit it. This also needs + * to go into kallsyms for correlation from e.g. bpftool, so naming + * must not change. 
*/ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 883f88fa5bfc..8740406df2cd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -21,10 +21,39 @@ static const char * const func_id_str[] = { }; #undef __BPF_FUNC_STR_FN -const char *func_id_name(int id) +static const char *__func_get_name(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + char *buff, size_t len) { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + if (insn->src_reg != BPF_PSEUDO_CALL && + insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && + func_id_str[insn->imm]) + return func_id_str[insn->imm]; + + if (cbs && cbs->cb_call) + return cbs->cb_call(cbs->private_data, insn); + + if (insn->src_reg == BPF_PSEUDO_CALL) + snprintf(buff, len, "%+d", insn->imm); + + return buff; +} + +static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + u64 full_imm, char *buff, size_t len) +{ + if (cbs && cbs->cb_imm) + return cbs->cb_imm(cbs->private_data, insn, full_imm); + + snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); + return buff; +} + +const char *func_id_name(int id) +{ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) return func_id_str[id]; else @@ -83,7 +112,7 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_end_insn(bpf_insn_print_cb verbose, +static void print_bpf_end_insn(bpf_insn_print_t verbose, struct bpf_verifier_env *env, const struct bpf_insn *insn) { @@ -92,9 +121,12 @@ static void print_bpf_end_insn(bpf_insn_print_cb verbose, insn->imm, insn->dst_reg); } -void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, - const struct bpf_insn *insn, bool allow_ptr_leaks) +void print_bpf_insn(const struct bpf_insn_cbs *cbs, + struct bpf_verifier_env *env, + const struct bpf_insn *insn, + bool allow_ptr_leaks) { + const bpf_insn_print_t verbose = cbs->cb_print; u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { @@ -175,12 +207,15 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + char tmp[64]; if (map_ptr && !allow_ptr_leaks) imm = 0; - verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, - insn->dst_reg, (unsigned long long)imm); + verbose(env, "(%02x) r%d = %s\n", + insn->code, insn->dst_reg, + __func_imm_name(cbs, insn, imm, + tmp, sizeof(tmp))); } else { verbose(env, "BUG_ld_%02x\n", insn->code); return; @@ -189,12 +224,20 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - if (insn->src_reg == BPF_PSEUDO_CALL) - verbose(env, "(%02x) call pc%+d\n", insn->code, - insn->imm); - else + char tmp[64]; + + if (insn->src_reg == BPF_PSEUDO_CALL) { + verbose(env, "(%02x) call pc%s\n", + insn->code, + __func_get_name(cbs, insn, + tmp, sizeof(tmp))); + } else { + strcpy(tmp, "unknown"); verbose(env, "(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); + __func_get_name(cbs, insn, + tmp, sizeof(tmp)), + insn->imm); + } } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index 8de977e420b6..e0857d016f89 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -17,16 +17,35 @@ #include #include #include 
+#ifndef __KERNEL__ +#include +#include +#endif + +struct bpf_verifier_env; extern const char *const bpf_alu_string[16]; extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -struct bpf_verifier_env; -typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, - const char *, ...); -void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, - const struct bpf_insn *insn, bool allow_ptr_leaks); +typedef void (*bpf_insn_print_t)(struct bpf_verifier_env *env, + const char *, ...); +typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, + const struct bpf_insn *insn); +typedef const char *(*bpf_insn_print_imm_t)(void *private_data, + const struct bpf_insn *insn, + __u64 full_imm); +struct bpf_insn_cbs { + bpf_insn_print_t cb_print; + bpf_insn_revmap_call_t cb_call; + bpf_insn_print_imm_t cb_imm; + void *private_data; +}; + +void print_bpf_insn(const struct bpf_insn_cbs *cbs, + struct bpf_verifier_env *env, + const struct bpf_insn *insn, + bool allow_ptr_leaks); #endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d6f8804509c7..a185d34b1d8c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1581,6 +1581,67 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) return fd; } +static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, + unsigned long addr) +{ + int i; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (prog->aux->used_maps[i] == (void *)addr) + return prog->aux->used_maps[i]; + return NULL; +} + +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +{ + const struct bpf_map *map; + struct bpf_insn *insns; + u64 imm; + int i; + + insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), + GFP_USER); + if (!insns) + return insns; + + for (i = 0; i < prog->len; i++) { + if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { + insns[i].code = BPF_JMP | BPF_CALL; + insns[i].imm = BPF_FUNC_tail_call; + /* fall-through */ + } + if (insns[i].code == (BPF_JMP | BPF_CALL) || + insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { + if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) + insns[i].code = BPF_JMP | BPF_CALL; + if (!bpf_dump_raw_ok()) + insns[i].imm = 0; + continue; + } + + if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + + imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; + map = bpf_map_from_imm(prog, imm); + if (map) { + insns[i].src_reg = BPF_PSEUDO_MAP_FD; + insns[i].imm = map->id; + insns[i + 1].imm = 0; + continue; + } + + if (!bpf_dump_raw_ok() && + imm == (unsigned long)prog->aux) { + insns[i].imm = 0; + insns[i + 1].imm = 0; + continue; + } + } + + return insns; +} + static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1632,18 +1693,34 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.jited_prog_len; info.jited_prog_len = prog->jited_len; if (info.jited_prog_len && ulen) { - uinsns = u64_to_user_ptr(info.jited_prog_insns); - ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; + if (bpf_dump_raw_ok()) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } else { + info.jited_prog_insns = 0; + } } ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { + struct bpf_insn *insns_sanitized; + bool fault; + + if 
(prog->blinded && !bpf_dump_raw_ok()) { + info.xlated_prog_insns = 0; + goto done; + } + insns_sanitized = bpf_insn_prepare_dump(prog); + if (!insns_sanitized) + return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); - if (copy_to_user(uinsns, prog->insnsi, ulen)) + fault = copy_to_user(uinsns, insns_sanitized, ulen); + kfree(insns_sanitized); + if (fault) return -EFAULT; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3e30f24f87e6..a06945ff6aa3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4878,9 +4878,12 @@ static int do_check(struct bpf_verifier_env *env) } if (env->log.level) { + const struct bpf_insn_cbs cbs = { + .cb_print = verbose, + }; + verbose(env, "%d: ", env->insn_idx); - print_bpf_insn(verbose, env, insn, - env->allow_ptr_leaks); + print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); } err = ext_analyzer_insn_hook(env, env->insn_idx, env->prev_insn_idx); @@ -5503,14 +5506,14 @@ static int jit_subprogs(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog, **func, *tmp; int i, j, subprog_start, subprog_end = 0, len, subprog; - struct bpf_insn *insn = prog->insnsi; + struct bpf_insn *insn; void *old_bpf_func; int err = -ENOMEM; if (env->subprog_cnt == 0) return 0; - for (i = 0; i < prog->len; i++, insn++) { + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; @@ -5602,6 +5605,25 @@ static int jit_subprogs(struct bpf_verifier_env *env) bpf_prog_lock_ro(func[i]); bpf_prog_kallsyms_add(func[i]); } + + /* Last step: make now unused interpreter insns from main + * prog consistent for later dump requests, so they can + * later look the same as if they were interpreted only. + */ + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + unsigned long addr; + + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + insn->off = env->insn_aux_data[i].call_imm; + subprog = find_subprog(env, i + insn->off + 1); + addr = (unsigned long)func[subprog + 1]->bpf_func; + addr &= PAGE_MASK; + insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) + addr - __bpf_call_base; + } + prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->aux->func = func; From be3bcdc8397be6b61989664daa3a98000ab0e5bb Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Sat, 23 Dec 2017 10:09:55 +0000 Subject: [PATCH 0110/1640] BACKPORT: bpf: fix stacksafe exploration when comparing states Commit cc2b14d51053 ("bpf: teach verifier to recognize zero initialized stack") introduced a very relaxed check when comparing stacks of different states, effectively returning a positive result in many cases where it shouldn't. This can create problems in cases such as this following C pseudocode: long var; long *x = bpf_map_lookup(...); if (!x) return; if (*x != 0xbeef) var = 0; else var = 1; /* This is the key part, calling a helper causes an explored state * to be saved with the information that "var" is on the stack as * STACK_ZERO, since the helper is first met by the verifier after * the "var = 0" assignment. This state will however be wrongly used * also for the "var = 1" case, so the verifier assumes "var" is always * 0 and will replace the NULL assignment with nops, because the * search pruning prevents it from exploring the faulty branch. 
*/ bpf_ktime_get_ns(); if (var) *(long *)0 = 0xbeef; Fix the issue by making sure that the stack is fully explored before returning a positive comparison result. Also attach a couple tests that highlight the bad behavior. In the first test, without this fix instructions 16 and 17 are replaced with nops instead of being rejected by the verifier. The second test, instead, allows a program to make a potentially illegal read from the stack. Fixes: cc2b14d51053 ("bpf: teach verifier to recognize zero initialized stack") Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a06945ff6aa3..299eac44813c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4548,7 +4548,7 @@ static bool stacksafe(struct bpf_func_state *old, if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) /* explored state didn't use this */ - return true; + continue; if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; From 14b6295d650dcd8456f4ee919c7e68f9a870d491 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Dec 2017 13:15:40 -0800 Subject: [PATCH 0111/1640] UPSTREAM: bpf: fix maximum stack depth tracking logic Instead of computing max stack depth for current call chain during the main verifier pass track stack depth of each function independently and after do_check() is done do another pass over all instructions analyzing depth of all possible call stacks. Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Reported-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 82 +++++++++++++++++++++++++++++------- 2 files changed, 67 insertions(+), 16 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ea59b967a8b6..ae23e26fdd29 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -212,6 +212,7 @@ struct bpf_verifier_env { struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifer_log log; u32 subprog_starts[BPF_MAX_SUBPROGS]; + /* computes the stack depth of each bpf function */ u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 299eac44813c..4f23c247b48b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1506,33 +1506,80 @@ static int update_stack_depth(struct bpf_verifier_env *env, const struct bpf_func_state *func, int off) { - u16 stack = env->subprog_stack_depth[func->subprogno], total = 0; - struct bpf_verifier_state *cur = env->cur_state; - int i; + u16 stack = env->subprog_stack_depth[func->subprogno]; if (stack >= -off) return 0; /* update known max for given subprogram */ env->subprog_stack_depth[func->subprogno] = -off; + return 0; +} - /* compute the total for current call chain */ - for (i = 0; i <= cur->curframe; i++) { - u32 depth = env->subprog_stack_depth[cur->frame[i]->subprogno]; +/* starting from main bpf function walk all instructions of the function + * and recursively walk all callees that given function can call. + * Ignore jump and exit insns. 
+ * Since recursion is prevented by check_cfg() this algorithm + * only needs a local stack of MAX_CALL_FRAMES to remember callsites + */ +static int check_max_stack_depth(struct bpf_verifier_env *env) +{ + int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int ret_insn[MAX_CALL_FRAMES]; + int ret_prog[MAX_CALL_FRAMES]; - /* round up to 32-bytes, since this is granularity - * of interpreter stack sizes - */ - depth = round_up(depth, 32); - total += depth; - } - - if (total > MAX_BPF_STACK) { +process_func: + /* round up to 32-bytes, since this is granularity + * of interpreter stack size + */ + depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + if (depth > MAX_BPF_STACK) { verbose(env, "combined stack size of %d calls is %d. Too large\n", - cur->curframe, total); + frame + 1, depth); return -EACCES; } - return 0; +continue_func: + if (env->subprog_cnt == subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[subprog]; + for (; i < subprog_end; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + /* remember insn and function to return to */ + ret_insn[frame] = i + 1; + ret_prog[frame] = subprog; + + /* find the callee */ + i = i + insn[i].imm + 1; + subprog = find_subprog(env, i); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + i); + return -EFAULT; + } + subprog++; + frame++; + if (frame >= MAX_CALL_FRAMES) { + WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); + return -EFAULT; + } + goto process_func; + } + /* end of for() loop means the last insn of the 'subprog' + * was reached. Doesn't matter whether it was JA or EXIT + */ + if (frame == 0) + return 0; + depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + frame--; + i = ret_insn[frame]; + subprog = ret_prog[frame]; + goto continue_func; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON @@ -6017,6 +6064,9 @@ skip_full_check: if (ret == 0) sanitize_dead_code(env); + if (ret == 0) + ret = check_max_stack_depth(env); + if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); From 38f76af5756e7f5fe373fdfe4c48de24d66a5c31 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Dec 2017 13:15:42 -0800 Subject: [PATCH 0112/1640] BACKPORT: bpf: fix max call depth check fix off by one error in max call depth check and add a test Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4f23c247b48b..920221f23f0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2233,9 +2233,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_func_state *caller, *callee; int i, subprog, target_insn; - if (state->curframe >= MAX_CALL_FRAMES) { + if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", - state->curframe); + state->curframe + 2); return -E2BIG; } From 2e794479961711f7488fcc9fac9ab0dcf01c1d8b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:03 -0800 Subject: [PATCH 0113/1640] UPSTREAM: bpf: offload: don't require rtnl for dev list manipulation We don't need the RTNL lock for all operations on 
offload state. We only need to hold it around ndo calls. The device offload initialization doesn't require it. The soon-to-come querying of the offload info will only need it partially. We will also be able to remove the waitqueue in following patches. Use struct rw_semaphore because map offload will require sleeping with the semaphore held for read. Suggested-by: Kirill Tkhai Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 8455b89d1bbf..032079754d88 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -20,13 +20,16 @@ #include #include #include +#include -/* protected by RTNL */ +/* Protects bpf_prog_offload_devs and offload members of all progs. + * RTNL lock cannot be taken when holding this lock. + */ +static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { - struct net *net = current->nsproxy->net_ns; struct bpf_dev_offload *offload; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && @@ -43,19 +46,26 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) offload->prog = prog; init_waitqueue_head(&offload->verifier_done); - rtnl_lock(); - offload->netdev = __dev_get_by_index(net, attr->prog_ifindex); - if (!offload->netdev) { - rtnl_unlock(); - kfree(offload); - return -EINVAL; - } + offload->netdev = dev_get_by_index(current->nsproxy->net_ns, + attr->prog_ifindex); + if (!offload->netdev) + goto err_free; + down_write(&bpf_devs_lock); + if (offload->netdev->reg_state != NETREG_REGISTERED) + goto err_unlock; prog->aux->offload = offload; list_add_tail(&offload->offloads, &bpf_prog_offload_devs); - rtnl_unlock(); + dev_put(offload->netdev); + up_write(&bpf_devs_lock); return 0; +err_unlock: + up_write(&bpf_devs_lock); + dev_put(offload->netdev); +err_free: + kfree(offload); + return -EINVAL; } static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, @@ -126,7 +136,9 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) wake_up(&offload->verifier_done); rtnl_lock(); + down_write(&bpf_devs_lock); __bpf_prog_offload_destroy(prog); + up_write(&bpf_devs_lock); rtnl_unlock(); kfree(offload); @@ -181,11 +193,13 @@ static int bpf_offload_notification(struct notifier_block *notifier, if (netdev->reg_state != NETREG_UNREGISTERING) break; + down_write(&bpf_devs_lock); list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) { if (offload->netdev == netdev) __bpf_prog_offload_destroy(offload->prog); } + up_write(&bpf_devs_lock); break; default: break; From a5ab77d14c751e5e95ec5d3c722075d1e49fcb97 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:04 -0800 Subject: [PATCH 0114/1640] UPSTREAM: bpf: offload: don't use prog->aux->offload as boolean We currently use aux->offload to indicate that program is bound to a specific device. This forces us to keep the offload structure around even after the device is gone. Add a bool member to struct bpf_prog_aux to indicate if offload was requested. 
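For illustration only -- a hypothetical caller, not part of this patch --
the two fields answer different questions once later patches in this
series free the offload structure when the netdev goes away:

	static int query_offload_sketch(struct bpf_prog *prog)
	{
		if (!bpf_prog_is_dev_bound(prog->aux))	/* reads offload_requested */
			return 0;			/* ordinary host program */
		if (!prog->aux->offload)		/* requested, but device gone */
			return -ENODEV;
		return do_query(prog->aux->offload);	/* do_query() is made up */
	}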
Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 3 ++- kernel/bpf/syscall.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8d08c7e1e6ad..b636210e34fe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -216,6 +216,7 @@ struct bpf_prog_aux { u32 stack_depth; u32 id; u32 func_cnt; + bool offload_requested; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ struct latch_tree_node ksym_tnode; @@ -566,7 +567,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) { - return aux->offload; + return aux->offload_requested; } #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a185d34b1d8c..0d0d1c8cbe22 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1174,6 +1174,8 @@ static int bpf_prog_load(union bpf_attr *attr) if (!prog) return -ENOMEM; + prog->aux->offload_requested = !!attr->prog_ifindex; + err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; @@ -1195,7 +1197,7 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 1 : 0; - if (attr->prog_ifindex) { + if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_offload_init(prog, attr); if (err) goto free_prog; From 1aba993de0c413a6682a9c0a8fb92f94f001ced8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:05 -0800 Subject: [PATCH 0115/1640] BACKPORT: bpf: offload: allow netdev to disappear while verifier is running To allow verifier instruction callbacks without any extra locking NETDEV_UNREGISTER notification would wait on a waitqueue for verifier to finish. This design decision was made when rtnl lock was providing all the locking. Use the read/write lock instead and remove the workqueue. Verifier will now call into the offload code, so dev_ops are moved to offload structure. Since verifier calls are all under bpf_prog_is_dev_bound() we no longer need static inline implementations to please builds with CONFIG_NET=n. 
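Condensed sketch of the resulting locking pattern (the diff below is
the authoritative version; declarations omitted): verifier callbacks
take bpf_devs_lock for read, device teardown takes it for write:

	down_read(&bpf_devs_lock);		/* insn_hook called from verifier */
	offload = env->prog->aux->offload;
	if (offload->netdev)			/* cleared once device unregisters */
		ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
	up_read(&bpf_devs_lock);

	down_write(&bpf_devs_lock);		/* NETDEV_UNREGISTER / prog destroy */
	__bpf_prog_offload_destroy(prog);
	up_write(&bpf_devs_lock);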
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 9 +++++++-- include/linux/bpf_verifier.h | 16 ++-------------- include/linux/netdevice.h | 4 ++-- kernel/bpf/offload.c | 30 ++++++++++++++++-------------- kernel/bpf/verifier.c | 20 +++++++------------- 5 files changed, 34 insertions(+), 45 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b636210e34fe..5442cf90c46b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -17,6 +17,7 @@ #include #include +struct bpf_verifier_env; struct perf_event; struct bpf_prog; struct bpf_map; @@ -199,14 +200,18 @@ struct bpf_verifier_ops { struct bpf_prog *prog, u32 *target_size); }; +struct bpf_prog_offload_ops { + int (*insn_hook)(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx); +}; + struct bpf_dev_offload { struct bpf_prog *prog; struct net_device *netdev; void *dev_priv; struct list_head offloads; bool dev_state; - bool verifier_running; - wait_queue_head_t verifier_done; + const struct bpf_prog_offload_ops *dev_ops; }; struct bpf_prog_aux { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ae23e26fdd29..b19117ef4df0 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -182,12 +182,6 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log) return log->len_used >= log->len_total - 1; } -struct bpf_verifier_env; -struct bpf_ext_analyzer_ops { - int (*insn_hook)(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx); -}; - #define BPF_MAX_SUBPROGS 256 /* single container for all structs @@ -203,7 +197,6 @@ struct bpf_verifier_env { bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ - const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ @@ -224,13 +217,8 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) return cur->frame[cur->curframe]->regs; } -#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); -#else -static inline int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) -{ - return -EOPNOTSUPP; -} -#endif +int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx); #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e9b4c5a53092..7dd62a2578ec 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -817,7 +817,7 @@ enum bpf_netdev_command { BPF_OFFLOAD_DESTROY, }; -struct bpf_ext_analyzer_ops; +struct bpf_prog_offload_ops; struct netlink_ext_ack; struct netdev_bpf { @@ -837,7 +837,7 @@ struct netdev_bpf { /* BPF_OFFLOAD_VERIFIER_PREP */ struct { struct bpf_prog *prog; - const struct bpf_ext_analyzer_ops *ops; /* callee set */ + const struct bpf_prog_offload_ops *ops; /* callee set */ } verifier; /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */ struct { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 032079754d88..69ddc3899bab 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -44,7 +44,6 @@ int bpf_prog_offload_init(struct 
bpf_prog *prog, union bpf_attr *attr) return -ENOMEM; offload->prog = prog; - init_waitqueue_head(&offload->verifier_done); offload->netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); @@ -97,15 +96,28 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) if (err) goto exit_unlock; - env->dev_ops = data.verifier.ops; - + env->prog->aux->offload->dev_ops = data.verifier.ops; env->prog->aux->offload->dev_state = true; - env->prog->aux->offload->verifier_running = true; exit_unlock: rtnl_unlock(); return err; } +int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) +{ + struct bpf_dev_offload *offload; + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload->netdev) + ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); + up_read(&bpf_devs_lock); + + return ret; +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_dev_offload *offload = prog->aux->offload; @@ -117,9 +129,6 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) data.offload.prog = prog; - if (offload->verifier_running) - wait_event(offload->verifier_done, !offload->verifier_running); - if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); @@ -132,9 +141,6 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_dev_offload *offload = prog->aux->offload; - offload->verifier_running = false; - wake_up(&offload->verifier_done); - rtnl_lock(); down_write(&bpf_devs_lock); __bpf_prog_offload_destroy(prog); @@ -146,15 +152,11 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) static int bpf_prog_offload_translate(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; int ret; data.offload.prog = prog; - offload->verifier_running = false; - wake_up(&offload->verifier_done); - rtnl_lock(); ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); rtnl_unlock(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 920221f23f0e..f103511fffce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4836,15 +4836,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; } -static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) -{ - if (env->dev_ops && env->dev_ops->insn_hook) - return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); - - return 0; -} - static int do_check(struct bpf_verifier_env *env) { struct bpf_verifier_state *state; @@ -4933,9 +4924,12 @@ static int do_check(struct bpf_verifier_env *env) print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); } - err = ext_analyzer_insn_hook(env, env->insn_idx, env->prev_insn_idx); - if (err) - return err; + if (bpf_prog_is_dev_bound(env->prog->aux)) { + err = bpf_prog_offload_verify_insn(env, env->insn_idx, + env->prev_insn_idx); + if (err) + return err; + } regs = cur_regs(env); env->insn_aux_data[env->insn_idx].seen = true; @@ -6028,7 +6022,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - if (env->prog->aux->offload) { + if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env); if (ret) goto err_unlock; From 053b223a5f4ecd6a43158a1c4d0712bb8e985564 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:06 -0800 Subject: [PATCH 0116/1640] UPSTREAM: bpf: 
offload: free prog->aux->offload when device disappears All bpf offload operations should now be under bpf_devs_lock, it's safe to free and clear the entire offload structure, not only the netdev pointer. __bpf_prog_offload_destroy() will no longer be called multiple times. Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 69ddc3899bab..3126e1a842e6 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -70,12 +70,14 @@ err_free: static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, struct netdev_bpf *data) { - struct net_device *netdev = prog->aux->offload->netdev; + struct bpf_dev_offload *offload = prog->aux->offload; + struct net_device *netdev; ASSERT_RTNL(); - if (!netdev) + if (!offload) return -ENODEV; + netdev = offload->netdev; if (!netdev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; @@ -111,7 +113,7 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, down_read(&bpf_devs_lock); offload = env->prog->aux->offload; - if (offload->netdev) + if (offload) ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); up_read(&bpf_devs_lock); @@ -123,31 +125,24 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) struct bpf_dev_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; - /* Caution - if netdev is destroyed before the program, this function - * will be called twice. - */ - data.offload.prog = prog; if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); - offload->dev_state = false; list_del_init(&offload->offloads); - offload->netdev = NULL; + kfree(offload); + prog->aux->offload = NULL; } void bpf_prog_offload_destroy(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; - rtnl_lock(); down_write(&bpf_devs_lock); - __bpf_prog_offload_destroy(prog); + if (prog->aux->offload) + __bpf_prog_offload_destroy(prog); up_write(&bpf_devs_lock); rtnl_unlock(); - - kfree(offload); } static int bpf_prog_offload_translate(struct bpf_prog *prog) From f0928cf69cc69a6a97d1c23b98a1723e2e7a1765 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:07 -0800 Subject: [PATCH 0117/1640] UPSTREAM: bpf: offload: free program id when device disappears Bound programs are quite useless after their device disappears. They are simply waiting for reference count to go to zero, don't list them in BPF_PROG_GET_NEXT_ID by freeing their ID early. Note that orphaned offload programs will return -ENODEV on BPF_OBJ_GET_INFO_BY_FD so user will never see ID 0. 
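For illustration, a hypothetical user-space walker (not part of this
patch) shows why freeing the ID early is enough -- orphaned programs
simply never turn up; bpf_prog_get_next_id() and bpf_prog_get_fd_by_id()
are the usual libbpf wrappers:

	__u32 id = 0;

	while (!bpf_prog_get_next_id(id, &id)) {	/* -ENOENT ends the walk */
		int fd = bpf_prog_get_fd_by_id(id);

		if (fd < 0)
			continue;	/* prog unloaded between the two calls */
		/* ... BPF_OBJ_GET_INFO_BY_FD etc.; offload orphans are never listed ... */
		close(fd);
	}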
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/offload.c | 3 +++ kernel/bpf/syscall.c | 9 +++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5442cf90c46b..12caf635d327 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -373,6 +373,8 @@ void bpf_prog_put(struct bpf_prog *prog); int __bpf_prog_charge(struct user_struct *user, u32 pages); void __bpf_prog_uncharge(struct user_struct *user, u32 pages); +void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); + struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 3126e1a842e6..e4f1668a021c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -130,6 +130,9 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ + bpf_prog_free_id(prog, true); + list_del_init(&offload->offloads); kfree(offload); prog->aux->offload = NULL; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d0d1c8cbe22..8cce54482fe2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -922,9 +922,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) +void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { - /* cBPF to eBPF migrations are currently not in the idr store. */ + /* cBPF to eBPF migrations are currently not in the idr store. + * Offloaded programs are removed from the store when their device + * disappears - even if someone grabs an fd to them they are unusable, + * simply waiting for refcnt to drop to be freed. + */ if (!prog->aux->id) return; @@ -934,6 +938,7 @@ static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) __acquire(&prog_idr_lock); idr_remove(&prog_idr, prog->aux->id); + prog->aux->id = 0; if (do_idr_lock) spin_unlock_bh(&prog_idr_lock); From 7d2a2a4379a0d05ef8ab17b963078038e29505a4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:09 -0800 Subject: [PATCH 0118/1640] BACKPORT: bpf: offload: report device information for offloaded programs Report to the user ifindex and namespace information of offloaded programs. If device has disappeared return -ENODEV. Specify the namespace using dev/inode combination. CC: Eric W. 
Biederman Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 3 ++ kernel/bpf/offload.c | 59 ++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 ++++ 4 files changed, 70 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 12caf635d327..ff16143694f6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -568,6 +568,8 @@ bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); +int bpf_prog_offload_info_fill(struct bpf_prog_info *info, + struct bpf_prog *prog); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 231b08014cf4..4a41fb43c04e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1018,6 +1018,9 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index e4f1668a021c..040d4e0edf3f 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -16,9 +16,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -176,6 +178,63 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } +struct ns_get_path_bpf_prog_args { + struct bpf_prog *prog; + struct bpf_prog_info *info; +}; + +static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_prog_args *args = private_data; + struct bpf_prog_aux *aux = args->prog->aux; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (aux->offload) { + args->info->ifindex = aux->offload->netdev->ifindex; + net = dev_net(aux->offload->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_prog_offload_info_fill(struct bpf_prog_info *info, + struct bpf_prog *prog) +{ + struct ns_get_path_bpf_prog_args args = { + .prog = prog, + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8cce54482fe2..a251366afe64 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1731,6 +1731,12 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + err = bpf_prog_offload_info_fill(&info, prog); + if (err) + return err; + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) From 7f5bb4068f899b0fd40de7d774d33b3fad315071 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Jan 2018 17:57:39 -0800 Subject: [PATCH 0119/1640] UPSTREAM: bpf: sockmap remove unused function This was added for some work that 
was eventually factored out but the helper call was missed. Remove it now and add it back later if needed. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0fffca42dac9..e552589b3e7e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -96,14 +96,6 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } -/* compute the linear packet data range [data, data_end) for skb when - * sk_skb type programs are in use. - */ -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); -} - enum __sk_action { __SK_DROP = 0, __SK_PASS, From 49f834d5195e64ba546fe909acb65c3afe34d42a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Jan 2018 17:57:56 -0800 Subject: [PATCH 0120/1640] UPSTREAM: bpf: only build sockmap with CONFIG_INET The sockmap infrastructure is only aware of TCP sockets at the moment. In the future we plan to add UDP. In both cases CONFIG_NET should be built-in. So lets only build sockmap if CONFIG_INET is enabled. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- include/linux/bpf_types.h | 2 +- kernel/bpf/Makefile | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ff16143694f6..6442eb87a362 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -591,7 +591,7 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) } #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ -#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) +#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 978c1d9c9383..19b8349a3809 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -42,7 +42,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) -#ifdef CONFIG_STREAM_PARSER +#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index c5c8ed85ecf3..d7fe790c70b2 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -10,9 +10,11 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) +ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif endif +endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif From 63b9c14846c682686c22a4f0c661337aa4284a62 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 1 Dec 2017 17:22:19 -0500 Subject: [PATCH 0121/1640] UPSTREAM: bpf_obj_do_pin(): switch to vfs_mkobj(), quit abusing ->mknod() Signed-off-by: Al Viro --- kernel/bpf/inode.c | 50 ++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 1ec657e395c6..dc26065e79f7 100644 --- a/kernel/bpf/inode.c +++ 
b/kernel/bpf/inode.c
@@ -150,39 +150,29 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return 0;
 }

-static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
-			 umode_t mode, const struct inode_operations *iops)
+static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
+			 const struct inode_operations *iops)
 {
-	struct inode *inode;
-
-	inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
+	struct inode *dir = dentry->d_parent->d_inode;
+	struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);

 	inode->i_op = iops;
-	inode->i_private = dentry->d_fsdata;
+	inode->i_private = raw;

 	bpf_dentry_finalize(dentry, inode, dir);
 	return 0;
 }

-static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
-		     dev_t devt)
+static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
 {
-	enum bpf_type type = MINOR(devt);
+	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops);
+}

-	if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) ||
-	    dentry->d_fsdata == NULL)
-		return -EPERM;
-
-	switch (type) {
-	case BPF_TYPE_PROG:
-		return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops);
-	case BPF_TYPE_MAP:
-		return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops);
-	default:
-		return -EPERM;
-	}
+static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
+{
+	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops);
 }

 static struct dentry *
@@ -218,7 +208,6 @@ static int bpf_symlink(struct inode *dir, struct dentry *dentry,

 static const struct inode_operations bpf_dir_iops = {
 	.lookup		= bpf_lookup,
-	.mknod		= bpf_mkobj,
 	.mkdir		= bpf_mkdir,
 	.symlink	= bpf_symlink,
 	.rmdir		= simple_rmdir,
@@ -234,7 +223,6 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
 	struct inode *dir;
 	struct path path;
 	umode_t mode;
-	dev_t devt;
 	int ret;

 	dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
@@ -242,9 +230,8 @@
 		return PTR_ERR(dentry);

 	mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
-	devt = MKDEV(UNNAMED_MAJOR, type);

-	ret = security_path_mknod(&path, dentry, mode, devt);
+	ret = security_path_mknod(&path, dentry, mode, 0);
 	if (ret)
 		goto out;

@@ -254,9 +241,16 @@
 		goto out;
 	}

-	dentry->d_fsdata = raw;
-	ret = vfs_mknod(dir, dentry, mode, devt);
-	dentry->d_fsdata = NULL;
+	switch (type) {
+	case BPF_TYPE_PROG:
+		ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
+		break;
+	case BPF_TYPE_MAP:
+		ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
+		break;
+	default:
+		ret = -EPERM;
+	}
 out:
 	done_path_create(&path, dentry);
 	return ret;

From e2a770ed223d798e04fa0c6d81f6df6142b34d52 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Wed, 3 Jan 2018 11:25:13 +0100
Subject: [PATCH 0122/1640] UPSTREAM: xdp: base API for new XDP rx-queue info
 concept

This patch only introduces the core data structures and API functions.
All XDP-enabled drivers must use the API before this info can be used.

There is a need for XDP to know more about the RX-queue a given XDP
frame has arrived on. For both the XDP bpf-prog and kernel side.

Instead of extending xdp_buff each time new info is needed, the patch
creates a separate read-mostly struct xdp_rxq_info, that contains this
info. We stress this data/cache-line is for read-only info. This is
NOT for dynamic per packet info, use the data_meta for such use-cases.
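To make the register/unregister API concrete, a minimal driver-side
sketch follows (my_rx_ring and the function names are hypothetical,
not from this patch):

	struct my_rx_ring {
		struct xdp_rxq_info xdp_rxq;	/* read-mostly, one per RX-ring */
		/* ... remaining ring state ... */
	};

	static int my_ring_setup(struct my_rx_ring *ring,
				 struct net_device *dev, u32 queue_index)
	{
		/* mandatory during RX-ring setup */
		return xdp_rxq_info_reg(&ring->xdp_rxq, dev, queue_index);
	}

	static void my_ring_teardown(struct my_rx_ring *ring)
	{
		/* MUST be called before purging/reallocating ring memory */
		xdp_rxq_info_unreg(&ring->xdp_rxq);
	}

	/* per received frame (ideally assigned once per NAPI loop): */
	/*	xdp.rxq = &ring->xdp_rxq;	*/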
The performance advantage is this info can be set up at RX-ring init
time, instead of updating N-members in xdp_buff. A possible (driver
level) micro optimization is that xdp_buff->rxq assignment could be
done once per XDP/NAPI loop. The extra pointer deref only happens for
programs needing access to this info (thus, no slowdown to existing
use-cases).

Change-Id: I8092ce23273dc992fe4f2360dac0d6cfd0b9b897
Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: Alexei Starovoitov
---
 include/linux/filter.h |  2 ++
 include/net/xdp.h      | 47 +++++++++++++++++++++++++++++
 net/core/Makefile      |  2 +-
 net/core/xdp.c         | 67 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 include/net/xdp.h
 create mode 100644 net/core/xdp.c

diff --git a/include/linux/filter.h b/include/linux/filter.h
index b73c62c50bd9..4d6b1a36f7dc 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -20,6 +20,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -583,6 +584,7 @@ struct xdp_buff {
 	void *data_end;
 	void *data_meta;
 	void *data_hard_start;
+	struct xdp_rxq_info *rxq;
 };

 /* compute the linear packet data range [data, data_end) which

diff --git a/include/net/xdp.h b/include/net/xdp.h
new file mode 100644
index 000000000000..86c41631a908
--- /dev/null
+++ b/include/net/xdp.h
@@ -0,0 +1,47 @@
+/* include/net/xdp.h
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2. See COPYING.
+ */
+#ifndef __LINUX_NET_XDP_H__
+#define __LINUX_NET_XDP_H__
+
+/**
+ * DOC: XDP RX-queue information
+ *
+ * The XDP RX-queue info (xdp_rxq_info) is associated with the driver
+ * level RX-ring queues. It is information that is specific to how
+ * the driver has configured a given RX-ring queue.
+ *
+ * Each xdp_buff frame received in the driver carries a (pointer)
+ * reference to this xdp_rxq_info structure. This provides the XDP
+ * data-path read-access to RX-info for both kernel and bpf-side
+ * (limited subset).
+ *
+ * For now, direct access is only safe while running in NAPI/softirq
+ * context. Contents are read-mostly and must not be updated during
+ * driver NAPI/softirq poll.
+ *
+ * The driver usage API is a register and unregister API.
+ *
+ * The struct is not directly tied to the XDP prog. A new XDP prog
+ * can be attached as long as it doesn't change the underlying
+ * RX-ring. If the RX-ring does change significantly, the NIC driver
+ * naturally needs to stop the RX-ring before purging and reallocating
+ * memory. In that process the driver MUST call unregister (which
+ * also applies for driver shutdown and unload). The register API is
+ * also mandatory during RX-ring setup.
+ */
+
+struct xdp_rxq_info {
+	struct net_device *dev;
+	u32 queue_index;
+	u32 reg_state;
+} ____cacheline_aligned; /* perf critical, avoid false-sharing */
+
+int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+		     struct net_device *dev, u32 queue_index);
+void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
+void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
+
+#endif /* __LINUX_NET_XDP_H__ */

diff --git a/net/core/Makefile b/net/core/Makefile
index 475570161efb..4a98df0efc61 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
 	 neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
 	 sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
-	 fib_notifier.o net_ipc_log.o dev_monitor.o
+	 fib_notifier.o net_ipc_log.o dev_monitor.o xdp.o

 obj-y += net-sysfs.o
 obj-$(CONFIG_PROC_FS) += net-procfs.o

diff --git a/net/core/xdp.c b/net/core/xdp.c
new file mode 100644
index 000000000000..229bc5a0ee04
--- /dev/null
+++ b/net/core/xdp.c
@@ -0,0 +1,67 @@
+/* net/core/xdp.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2. See COPYING.
+ */
+#include
+#include
+
+#include
+
+#define REG_STATE_NEW		0x0
+#define REG_STATE_REGISTERED	0x1
+#define REG_STATE_UNREGISTERED	0x2
+#define REG_STATE_UNUSED	0x3
+
+void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
+{
+	/* Simplify driver cleanup code paths, allow unreg "unused" */
+	if (xdp_rxq->reg_state == REG_STATE_UNUSED)
+		return;
+
+	WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
+
+	xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
+	xdp_rxq->dev = NULL;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
+
+static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
+{
+	memset(xdp_rxq, 0, sizeof(*xdp_rxq));
+}
+
+/* Returns 0 on success, negative on failure */
+int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+		     struct net_device *dev, u32 queue_index)
+{
+	if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
+		WARN(1, "Driver promised not to register this");
+		return -EINVAL;
+	}
+
+	if (xdp_rxq->reg_state == REG_STATE_REGISTERED) {
+		WARN(1, "Missing unregister, handled but fix driver");
+		xdp_rxq_info_unreg(xdp_rxq);
+	}
+
+	if (!dev) {
+		WARN(1, "Missing net_device from driver");
+		return -ENODEV;
+	}
+
+	/* State either UNREGISTERED or NEW */
+	xdp_rxq_info_init(xdp_rxq);
+	xdp_rxq->dev = dev;
+	xdp_rxq->queue_index = queue_index;
+
+	xdp_rxq->reg_state = REG_STATE_REGISTERED;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
+
+void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
+{
+	xdp_rxq->reg_state = REG_STATE_UNUSED;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);

From ba11128d0d64310aefd42d9b7563f2e962815008 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Wed, 3 Jan 2018 11:26:14 +0100
Subject: [PATCH 0123/1640] UPSTREAM: bpf: finally expose xdp_rxq_info to XDP
 bpf-programs

Now all XDP drivers have been updated to setup xdp_rxq_info and assign
this to xdp_buff->rxq. Thus, it is now safe to enable access to some
of the xdp_rxq_info struct members.

This patch extends xdp_md and exposes UAPI to userspace for
ingress_ifindex and rx_queue_index. Access happens via bpf instruction
rewrite that loads data directly from struct xdp_rxq_info.
* ingress_ifindex map to xdp_rxq_info->dev->ifindex * rx_queue_index map to xdp_rxq_info->queue_index Signed-off-by: Jesper Dangaard Brouer Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 +++ net/core/filter.c | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4a41fb43c04e..094c4cc944b3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -996,6 +996,9 @@ struct xdp_md { __u32 data; __u32 data_end; __u32 data_meta; + /* Below access go though struct xdp_rxq_info */ + __u32 ingress_ifindex; /* rxq->dev->ifindex */ + __u32 rx_queue_index; /* rxq->queue_index */ }; enum sk_action { diff --git a/net/core/filter.c b/net/core/filter.c index daf4b4d33241..8ac0a711ad14 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4401,6 +4401,25 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; + case offsetof(struct xdp_md, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct net_device, + ifindex, 4, target_size)); + break; + case offsetof(struct xdp_md, rx_queue_index): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct xdp_rxq_info, + queue_index, 4, target_size)); + break; } return insn - insn_buf; From fd19f8c91145c471d819ab77e324231c3b2635f4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 4 Jan 2018 13:55:03 -0800 Subject: [PATCH 0124/1640] UPSTREAM: bpf: implement syscall command BPF_MAP_GET_NEXT_KEY for stacktrace map Currently, bpf syscall command BPF_MAP_GET_NEXT_KEY is not supported for stacktrace map. However, there are use cases where user space wants to enumerate all stacktrace map entries where BPF_MAP_GET_NEXT_KEY command will be really helpful. In addition, if user space wants to delete all map entries in order to save memory and does not want to close the map file descriptor, BPF_MAP_GET_NEXT_KEY may help improve performance if map entries are sparsely populated. The implementation has similar behavior for BPF_MAP_GET_NEXT_KEY implementation in hashtab. If user provides a NULL key pointer or an invalid key, the first key is returned. Otherwise, the first valid key after the input parameter "key" is returned, or -ENOENT if no valid key can be found. 
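As a rough userspace sketch of the enumeration this enables; the libbpf-style wrapper names and include paths are assumptions from the tools/lib/bpf of this era, not part of this patch:

    #include <stdbool.h>
    #include <linux/types.h>
    #include <linux/perf_event.h>   /* PERF_MAX_STACK_DEPTH */
    #include "bpf.h"                /* assumed tools/lib/bpf syscall wrappers */

    static void walk_stackmap(int map_fd)
    {
            __u32 key = 0, next_key;
            __u64 ips[PERF_MAX_STACK_DEPTH];
            bool first = true;

            /* A NULL (or invalid) key returns the first valid stack id */
            while (bpf_map_get_next_key(map_fd, first ? NULL : &key,
                                        &next_key) == 0) {
                    first = false;
                    if (bpf_map_lookup_elem(map_fd, &next_key, ips) == 0) {
                            /* ... consume the stack trace in ips[] ... */
                    }
                    key = next_key;
            }
            /* the loop ends with errno == ENOENT once all ids are seen */
    }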
Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/stackmap.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index f14d8865d75b..070a4b0d28a2 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -232,9 +232,33 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) return 0; } -static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +static int stack_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) { - return -EINVAL; + struct bpf_stack_map *smap = container_of(map, + struct bpf_stack_map, map); + u32 id; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!key) { + id = 0; + } else { + id = *(u32 *)key; + if (id >= smap->n_buckets || !smap->buckets[id]) + id = 0; + else + id++; + } + + while (id < smap->n_buckets && !smap->buckets[id]) + id++; + + if (id >= smap->n_buckets) + return -ENOENT; + + *(u32 *)next_key = id; + return 0; } static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, From c34769eb1e12c12c859969e194bbcfe9b6b46374 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Jan 2018 07:51:17 -0800 Subject: [PATCH 0125/1640] UPSTREAM: bpf: fix verifier GPF in kmalloc failure path syzbot reported the following panic in the verifier triggered by kmalloc error injection: kasan: GPF could be caused by NULL-ptr deref or user memory access RIP: 0010:copy_func_state kernel/bpf/verifier.c:403 [inline] RIP: 0010:copy_verifier_state+0x364/0x590 kernel/bpf/verifier.c:431 Call Trace: pop_stack+0x8c/0x270 kernel/bpf/verifier.c:449 push_stack kernel/bpf/verifier.c:491 [inline] check_cond_jmp_op kernel/bpf/verifier.c:3598 [inline] do_check+0x4b60/0xa050 kernel/bpf/verifier.c:4731 bpf_check+0x3296/0x58c0 kernel/bpf/verifier.c:5489 bpf_prog_load+0xa2a/0x1b00 kernel/bpf/syscall.c:1198 SYSC_bpf kernel/bpf/syscall.c:1807 [inline] SyS_bpf+0x1044/0x4420 kernel/bpf/syscall.c:1769 when copy_verifier_state() aborts in the middle due to kmalloc failure some of the frames could have been partially copied while current free_verifier_state() loop for (i = 0; i <= state->curframe; i++) assumed that all frames are non-null. Simply fix it by adding 'if (!state)' to free_func_state(). Also avoid stressing copy frame logic more if kzalloc fails in push_stack() free env->cur_state right away. 
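Schematically, the failure mode is the classic teardown of a partially constructed object; a sketch of the cleanup loop described above (simplified, not the exact verifier source):

    /* copy_verifier_state() can abort midway, leaving later frames
     * NULL, while the cleanup loop assumed all of them were populated:
     */
    for (i = 0; i <= state->curframe; i++) {
            free_func_state(state->frame[i]);   /* GPF if frame[i] == NULL */
            state->frame[i] = NULL;
    }
    /* Making free_func_state() return early on NULL closes the hole,
     * mirroring kfree()'s tolerance of NULL pointers.
     */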
Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Reported-by: syzbot+32ac5a3e473f2e01cfc7@syzkaller.appspotmail.com Reported-by: syzbot+fa99e24f3c29d269a7d5@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f103511fffce..16d9e9b40326 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -375,6 +375,8 @@ static int realloc_func_state(struct bpf_func_state *state, int size, static void free_func_state(struct bpf_func_state *state) { + if (!state) + return; kfree(state->stack); kfree(state); } @@ -490,6 +492,8 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, } return &elem->st; err: + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; /* pop all elements and return */ while (!pop_stack(env, NULL, NULL)); return NULL; From 146c5ffa619adc33ccaf5f293cd9cff76641c74f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 10 Jan 2018 12:26:06 +0000 Subject: [PATCH 0126/1640] UPSTREAM: bpf: export function to write into verifier log buffer Rename the BPF verifier `verbose()` to `bpf_verifier_log_write()` and export it, so that other components (in particular, drivers for BPF offload) can reuse the user buffer log to dump error messages at verification time. Renaming `verbose()` was necessary in order to avoid a name so generic to be exported to the global namespace. However to prevent too much pain for backports, the calls to `verbose()` in the kernel BPF verifier were not changed. Instead, use function aliasing to make `verbose` point to `bpf_verifier_log_write`. Another solution could consist in making a wrapper around `verbose()`, but since it is a variadic function, I don't see a clean way without creating two identical wrappers, one for the verifier and one to export. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 3 +++ kernel/bpf/verifier.c | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b19117ef4df0..166097378e22 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -210,6 +210,9 @@ struct bpf_verifier_env { u32 subprog_cnt; }; +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...); + static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 16d9e9b40326..8ad64d7dec8e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -169,11 +169,11 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); /* log_level controls verbosity level of eBPF verifier. - * verbose() is used to dump the verification trace to the log, so the user - * can figure out what's wrong with the program + * bpf_verifier_log_write() is used to dump the verification trace to the log, + * so the user can figure out what's wrong with the program */ -static __printf(2, 3) void verbose(struct bpf_verifier_env *env, - const char *fmt, ...) +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...) 
{ struct bpf_verifer_log *log = &env->log; unsigned int n; @@ -197,6 +197,14 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env, else log->ubuf = NULL; } +EXPORT_SYMBOL_GPL(bpf_verifier_log_write); +/* Historically bpf_verifier_log_write was called verbose, but the name was too + * generic for symbol export. The function was renamed, but not the calls in + * the verifier to avoid complicating backports. Hence the alias below. + */ +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, + const char *fmt, ...) + __attribute__((alias("bpf_verifier_log_write"))); static bool type_is_pkt_pointer(enum bpf_reg_type type) { From 725f678dbd442a9af048f3d1d85b1309f56cdb95 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 10 Jan 2018 09:20:54 +0000 Subject: [PATCH 0127/1640] UPSTREAM: bpf: fix spelling mistake: "obusing" -> "abusing" Trivial fix to spelling mistake in error message text. Signed-off-by: Colin Ian King Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8ad64d7dec8e..7c311f1218d2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5873,7 +5873,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ map_ptr = env->insn_aux_data[i + delta].map_ptr; if (map_ptr == BPF_MAP_PTR_POISON) { - verbose(env, "tail_call obusing map_ptr\n"); + verbose(env, "tail_call abusing map_ptr\n"); return -EINVAL; } if (!map_ptr->unpriv_array) From fd75776ab4baa86711aae72d8bdce9fc8a005023 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 11 Jan 2018 17:39:09 +0100 Subject: [PATCH 0128/1640] UPSTREAM: bpf: simplify xdp_convert_ctx_access for xdp_rxq_info As pointed out by Daniel Borkmann, using bpf_target_off() is not necessary for xdp_rxq_info when extracting queue_index and ifindex, as these members are u32 like BPF_W. Also fix trivial spelling mistake introduced in same commit. 
Fixes: 02dd3291b2f0 ("bpf: finally expose xdp_rxq_info to XDP bpf-programs") Reported-by: Daniel Borkmann Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 094c4cc944b3..43caed45205c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -996,7 +996,7 @@ struct xdp_md { __u32 data; __u32 data_end; __u32 data_meta; - /* Below access go though struct xdp_rxq_info */ + /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ }; diff --git a/net/core/filter.c b/net/core/filter.c index 8ac0a711ad14..3f8030e49ddc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4409,16 +4409,15 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - bpf_target_off(struct net_device, - ifindex, 4, target_size)); + offsetof(struct net_device, ifindex)); break; case offsetof(struct xdp_md, rx_queue_index): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - bpf_target_off(struct xdp_rxq_info, - queue_index, 4, target_size)); + offsetof(struct xdp_rxq_info, + queue_index)); break; } From 166acbeb9cb761f6306e1739913232093bb54ca1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:55:03 +0900 Subject: [PATCH 0129/1640] BACKPORT: error-injection: Separate error-injection from kprobe Since the error-injection framework is not limited to kprobes or bpf, other kernel subsystems can use it freely for checking the safety of error injection, e.g. livepatch, ftrace, etc. This patch therefore separates the error-injection framework from kprobes. Some differences have been made: - The "kprobe" word is removed from all APIs/structures. - BPF_ALLOW_ERROR_INJECTION() is renamed to ALLOW_ERROR_INJECTION() since it is not limited to BPF either. - CONFIG_FUNCTION_ERROR_INJECTION is the config item of this feature. It is automatically enabled if the arch supports the error injection feature for kprobes, ftrace, etc.
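For context, a short sketch of how a subsystem opts a function into the whitelist under the renamed macro; the function itself is hypothetical:

    /* The macro is defined in asm-generic/error-injection.h in this
     * patch; it records the function's address in the
     * _error_injection_whitelist section.
     */
    #include <asm-generic/error-injection.h>

    /* Hypothetical function deemed safe to override: error injection
     * may then force an early error return at its entry.
     */
    static int my_prepare_resource(void)
    {
            return 0;
    }
    ALLOW_ERROR_INJECTION(my_prepare_resource)  /* macro supplies the ';' */

Once the kernel boots, the collected whitelist is browsable in debugfs under error_injection/list, per the ei_debugfs_init() added below.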
Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- arch/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/include/asm/error-injection.h | 13 ++ arch/x86/lib/Makefile | 1 + arch/x86/lib/error-inject.c | 19 +++ include/asm-generic/error-injection.h | 20 +++ include/asm-generic/vmlinux.lds.h | 14 +- include/linux/bpf.h | 11 -- include/linux/error-injection.h | 21 +++ include/linux/kprobes.h | 1 - include/linux/module.h | 6 +- kernel/kprobes.c | 163 ------------------- kernel/module.c | 8 +- kernel/trace/Kconfig | 2 +- kernel/trace/bpf_trace.c | 4 +- kernel/trace/trace_kprobe.c | 3 +- lib/Kconfig.debug | 4 + lib/Makefile | 1 + lib/error-inject.c | 213 +++++++++++++++++++++++++ 19 files changed, 313 insertions(+), 195 deletions(-) create mode 100644 arch/x86/include/asm/error-injection.h create mode 100644 arch/x86/lib/error-inject.c create mode 100644 include/asm-generic/error-injection.h create mode 100644 include/linux/error-injection.h create mode 100644 lib/error-inject.c diff --git a/arch/Kconfig b/arch/Kconfig index 113a6ea00699..17ea90fa6808 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -199,7 +199,7 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool -config HAVE_KPROBE_OVERRIDE +config HAVE_FUNCTION_ERROR_INJECTION bool config HAVE_NMI diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index afa13c1f28b1..c2507a716059 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -155,7 +155,7 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE - select HAVE_KPROBE_OVERRIDE + select HAVE_FUNCTION_ERROR_INJECTION select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/error-injection.h b/arch/x86/include/asm/error-injection.h new file mode 100644 index 000000000000..47b7a1296245 --- /dev/null +++ b/arch/x86/include/asm/error-injection.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_ERROR_INJECTION_H +#define _ASM_ERROR_INJECTION_H + +#include +#include +#include +#include + +asmlinkage void just_return_func(void); +void override_function_with_return(struct pt_regs *regs); + +#endif /* _ASM_ERROR_INJECTION_H */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 60b410ff31e8..55f70309be9c 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -38,6 +38,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_RETPOLINE) += retpoline.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c new file mode 100644 index 000000000000..7b881d03d0dd --- /dev/null +++ b/arch/x86/lib/error-inject.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +asmlinkage void just_return_func(void); + +asm( + ".type just_return_func, @function\n" + "just_return_func:\n" + " ret\n" + ".size just_return_func, .-just_return_func\n" +); + +void override_function_with_return(struct pt_regs *regs) +{ + regs->ip = (unsigned long)&just_return_func; +} +NOKPROBE_SYMBOL(override_function_with_return); diff --git a/include/asm-generic/error-injection.h b/include/asm-generic/error-injection.h new file mode 100644 index 000000000000..08352c9d9f97 --- /dev/null +++ b/include/asm-generic/error-injection.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef 
_ASM_GENERIC_ERROR_INJECTION_H +#define _ASM_GENERIC_ERROR_INJECTION_H + +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) +#ifdef CONFIG_FUNCTION_ERROR_INJECTION +/* + * Whitelist ganerating macro. Specify functions which can be + * error-injectable using this macro. + */ +#define ALLOW_ERROR_INJECTION(fname) \ +static unsigned long __used \ + __attribute__((__section__("_error_injection_whitelist"))) \ + _eil_addr_##fname = (unsigned long)fname; +#else +#define ALLOW_ERROR_INJECTION(fname) +#endif +#endif + +#endif /* _ASM_GENERIC_ERROR_INJECTION_H */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8fc1d1730ec6..6fd37e99c07d 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -139,13 +139,13 @@ #define KPROBE_BLACKLIST() #endif -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -#define ERROR_INJECT_LIST() . = ALIGN(8); \ - VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .; \ - KEEP(*(_kprobe_error_inject_list)) \ - VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) = .; +#ifdef CONFIG_FUNCTION_ERROR_INJECTION +#define ERROR_INJECT_WHITELIST() . = ALIGN(8); \ + VMLINUX_SYMBOL(__start_error_injection_whitelist) = .;\ + KEEP(*(_error_injection_whitelist)) \ + VMLINUX_SYMBOL(__stop_error_injection_whitelist) = .; #else -#define ERROR_INJECT_LIST() +#define ERROR_INJECT_WHITELIST() #endif #ifdef CONFIG_EVENT_TRACING @@ -630,7 +630,7 @@ FTRACE_EVENTS() \ TRACE_SYSCALLS() \ KPROBE_BLACKLIST() \ - ERROR_INJECT_LIST() \ + ERROR_INJECT_WHITELIST() \ MEM_DISCARD(init.rodata) \ CLK_OF_TABLES() \ RESERVEDMEM_OF_TABLES() \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6442eb87a362..05305359723a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -631,15 +631,4 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -#if defined(__KERNEL__) && !defined(__ASSEMBLY__) -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -#define BPF_ALLOW_ERROR_INJECTION(fname) \ -static unsigned long __used \ - __attribute__((__section__("_kprobe_error_inject_list"))) \ - _eil_addr_##fname = (unsigned long)fname; -#else -#define BPF_ALLOW_ERROR_INJECTION(fname) -#endif -#endif - #endif /* _LINUX_BPF_H */ diff --git a/include/linux/error-injection.h b/include/linux/error-injection.h new file mode 100644 index 000000000000..130a67c50dac --- /dev/null +++ b/include/linux/error-injection.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ERROR_INJECTION_H +#define _LINUX_ERROR_INJECTION_H + +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + +#include + +extern bool within_error_injection_list(unsigned long addr); + +#else /* !CONFIG_FUNCTION_ERROR_INJECTION */ + +#include +static inline bool within_error_injection_list(unsigned long addr) +{ + return false; +} + +#endif + +#endif /* _LINUX_ERROR_INJECTION_H */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 9b6ece57cfff..a3b380899e5d 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -274,7 +274,6 @@ extern bool arch_kprobe_on_func_entry(unsigned long offset); extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset); extern bool within_kprobe_blacklist(unsigned long addr); -extern bool within_kprobe_error_injection_list(unsigned long addr); struct kprobe_insn_cache { struct mutex mutex; diff --git a/include/linux/module.h b/include/linux/module.h index 72c25f845fbe..49176730b851 100644 --- 
a/include/linux/module.h +++ b/include/linux/module.h @@ -482,9 +482,9 @@ struct module { unsigned int num_ctors; #endif -#ifdef CONFIG_BPF_KPROBE_OVERRIDE - unsigned int num_kprobe_ei_funcs; - unsigned long *kprobe_ei_funcs; +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + unsigned int num_ei_funcs; + unsigned long *ei_funcs; #endif } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6232ea3ce94f..e86bbcb849ac 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -83,16 +83,6 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } -/* List of symbols that can be overriden for error injection. */ -static LIST_HEAD(kprobe_error_injection_list); -static DEFINE_MUTEX(kprobe_ei_mutex); -struct kprobe_ei_entry { - struct list_head list; - unsigned long start_addr; - unsigned long end_addr; - void *priv; -}; - /* Blacklist -- list of struct kprobe_blacklist_entry */ static LIST_HEAD(kprobe_blacklist); @@ -1467,17 +1457,6 @@ bool within_kprobe_blacklist(unsigned long addr) return false; } -bool within_kprobe_error_injection_list(unsigned long addr) -{ - struct kprobe_ei_entry *ent; - - list_for_each_entry(ent, &kprobe_error_injection_list, list) { - if (addr >= ent->start_addr && addr < ent->end_addr) - return true; - } - return false; -} - /* * If we have a symbol_name argument, look it up and add the offset field * to it. This way, we can specify a relative address to a symbol. @@ -2279,86 +2258,6 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return 0; } -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -/* Markers of the _kprobe_error_inject_list section */ -extern unsigned long __start_kprobe_error_inject_list[]; -extern unsigned long __stop_kprobe_error_inject_list[]; - -/* - * Lookup and populate the kprobe_error_injection_list. - * - * For safety reasons we only allow certain functions to be overriden with - * bpf_error_injection, so we need to populate the list of the symbols that have - * been marked as safe for overriding. 
- */ -static void populate_kprobe_error_injection_list(unsigned long *start, - unsigned long *end, - void *priv) -{ - unsigned long *iter; - struct kprobe_ei_entry *ent; - unsigned long entry, offset = 0, size = 0; - - mutex_lock(&kprobe_ei_mutex); - for (iter = start; iter < end; iter++) { - entry = arch_deref_entry_point((void *)*iter); - - if (!kernel_text_address(entry) || - !kallsyms_lookup_size_offset(entry, &size, &offset)) { - pr_err("Failed to find error inject entry at %p\n", - (void *)entry); - continue; - } - - ent = kmalloc(sizeof(*ent), GFP_KERNEL); - if (!ent) - break; - ent->start_addr = entry; - ent->end_addr = entry + size; - ent->priv = priv; - INIT_LIST_HEAD(&ent->list); - list_add_tail(&ent->list, &kprobe_error_injection_list); - } - mutex_unlock(&kprobe_ei_mutex); -} - -static void __init populate_kernel_kprobe_ei_list(void) -{ - populate_kprobe_error_injection_list(__start_kprobe_error_inject_list, - __stop_kprobe_error_inject_list, - NULL); -} - -static void module_load_kprobe_ei_list(struct module *mod) -{ - if (!mod->num_kprobe_ei_funcs) - return; - populate_kprobe_error_injection_list(mod->kprobe_ei_funcs, - mod->kprobe_ei_funcs + - mod->num_kprobe_ei_funcs, mod); -} - -static void module_unload_kprobe_ei_list(struct module *mod) -{ - struct kprobe_ei_entry *ent, *n; - if (!mod->num_kprobe_ei_funcs) - return; - - mutex_lock(&kprobe_ei_mutex); - list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) { - if (ent->priv == mod) { - list_del_init(&ent->list); - kfree(ent); - } - } - mutex_unlock(&kprobe_ei_mutex); -} -#else -static inline void __init populate_kernel_kprobe_ei_list(void) {} -static inline void module_load_kprobe_ei_list(struct module *m) {} -static inline void module_unload_kprobe_ei_list(struct module *m) {} -#endif - /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2369,11 +2268,6 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); - if (val == MODULE_STATE_COMING) - module_load_kprobe_ei_list(mod); - else if (val == MODULE_STATE_GOING) - module_unload_kprobe_ei_list(mod); - if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2440,8 +2334,6 @@ static int __init init_kprobes(void) pr_err("Please take care of using kprobes.\n"); } - populate_kernel_kprobe_ei_list(); - if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { @@ -2609,56 +2501,6 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = { .release = seq_release, }; -/* - * kprobes/error_injection_list -- shows which functions can be overriden for - * error injection. 
- * */ -static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&kprobe_ei_mutex); - return seq_list_start(&kprobe_error_injection_list, *pos); -} - -static void kprobe_ei_seq_stop(struct seq_file *m, void *v) -{ - mutex_unlock(&kprobe_ei_mutex); -} - -static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos) -{ - return seq_list_next(v, &kprobe_error_injection_list, pos); -} - -static int kprobe_ei_seq_show(struct seq_file *m, void *v) -{ - char buffer[KSYM_SYMBOL_LEN]; - struct kprobe_ei_entry *ent = - list_entry(v, struct kprobe_ei_entry, list); - - sprint_symbol(buffer, ent->start_addr); - seq_printf(m, "%s\n", buffer); - return 0; -} - -static const struct seq_operations kprobe_ei_seq_ops = { - .start = kprobe_ei_seq_start, - .next = kprobe_ei_seq_next, - .stop = kprobe_ei_seq_stop, - .show = kprobe_ei_seq_show, -}; - -static int kprobe_ei_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobe_ei_seq_ops); -} - -static const struct file_operations debugfs_kprobe_ei_ops = { - .open = kprobe_ei_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static void arm_all_kprobes(void) { struct hlist_head *head; @@ -2800,11 +2642,6 @@ static int __init debugfs_kprobe_init(void) if (!file) goto error; - file = debugfs_create_file("error_injection_list", 0444, dir, NULL, - &debugfs_kprobe_ei_ops); - if (!file) - goto error; - return 0; error: diff --git a/kernel/module.c b/kernel/module.c index 53bd38f8a572..261c53757961 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3265,10 +3265,10 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ftrace_callsites), &mod->num_ftrace_callsites); #endif -#ifdef CONFIG_BPF_KPROBE_OVERRIDE - mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list", - sizeof(*mod->kprobe_ei_funcs), - &mod->num_kprobe_ei_funcs); +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + mod->ei_funcs = section_objs(info, "_error_injection_whitelist", + sizeof(*mod->ei_funcs), + &mod->num_ei_funcs); #endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2ba9ffeefdcd..aaffbfba4a33 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -565,7 +565,7 @@ config BPF_KPROBE_OVERRIDE bool "Enable BPF programs to override a kprobed function" depends on BPF_EVENTS depends on KPROBES_ON_FTRACE - depends on HAVE_KPROBE_OVERRIDE + depends on FUNCTION_ERROR_INJECTION depends on DYNAMIC_FTRACE_WITH_REGS default n help diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6d168738def2..0ed7f7fa7b39 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "trace_probe.h" #include "trace.h" @@ -85,7 +85,7 @@ BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { __this_cpu_write(bpf_kprobe_override, 1); regs_set_return_value(regs, rc); - arch_ftrace_kprobe_override_function(regs); + override_function_with_return(regs); return 0; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 63923abbca5c..e7e625cc49f8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "trace_probe.h" @@ -106,7 +107,7 @@ int trace_kprobe_error_injectable(struct trace_event_call *call) } else { addr = (unsigned long)tk->rp.kp.addr; } - 
return within_kprobe_error_injection_list(addr); + return within_error_injection_list(addr); } static int register_kprobe_event(struct trace_kprobe *tk); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 50c471e7282c..f00b214a9f6d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1614,6 +1614,10 @@ config FAULT_INJECTION Provide fault-injection framework. For more details, see Documentation/fault-injection/. +config FUNCTION_ERROR_INJECTION + def_bool y + depends on HAVE_FUNCTION_ERROR_INJECTION && KPROBES + config FAILSLAB bool "Fault-injection capability for kmalloc" depends on FAULT_INJECTION diff --git a/lib/Makefile b/lib/Makefile index d78b53461f86..fdafdf6663d0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -165,6 +165,7 @@ obj-$(CONFIG_NETDEV_NOTIFIER_ERROR_INJECT) += netdev-notifier-error-inject.o obj-$(CONFIG_MEMORY_NOTIFIER_ERROR_INJECT) += memory-notifier-error-inject.o obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ of-reconfig-notifier-error-inject.o +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_GENERIC_BUG) += bug.o diff --git a/lib/error-inject.c b/lib/error-inject.c new file mode 100644 index 000000000000..bccadcf3c981 --- /dev/null +++ b/lib/error-inject.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 +// error-inject.c: Function-level error injection table +#include +#include +#include +#include +#include +#include +#include +#include + +/* Whitelist of symbols that can be overridden for error injection. */ +static LIST_HEAD(error_injection_list); +static DEFINE_MUTEX(ei_mutex); +struct ei_entry { + struct list_head list; + unsigned long start_addr; + unsigned long end_addr; + void *priv; +}; + +bool within_error_injection_list(unsigned long addr) +{ + struct ei_entry *ent; + bool ret = false; + + mutex_lock(&ei_mutex); + list_for_each_entry(ent, &error_injection_list, list) { + if (addr >= ent->start_addr && addr < ent->end_addr) { + ret = true; + break; + } + } + mutex_unlock(&ei_mutex); + return ret; +} + +/* + * Lookup and populate the error_injection_list. + * + * For safety reasons we only allow certain functions to be overridden with + * bpf_error_injection, so we need to populate the list of the symbols that have + * been marked as safe for overriding. 
+ */ +static void populate_error_injection_list(unsigned long *start, + unsigned long *end, void *priv) +{ + unsigned long *iter; + struct ei_entry *ent; + unsigned long entry, offset = 0, size = 0; + + mutex_lock(&ei_mutex); + for (iter = start; iter < end; iter++) { + entry = arch_deref_entry_point((void *)*iter); + + if (!kernel_text_address(entry) || + !kallsyms_lookup_size_offset(entry, &size, &offset)) { + pr_err("Failed to find error inject entry at %p\n", + (void *)entry); + continue; + } + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + break; + ent->start_addr = entry; + ent->end_addr = entry + size; + ent->priv = priv; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &error_injection_list); + } + mutex_unlock(&ei_mutex); +} + +/* Markers of the _error_inject_whitelist section */ +extern unsigned long __start_error_injection_whitelist[]; +extern unsigned long __stop_error_injection_whitelist[]; + +static void __init populate_kernel_ei_list(void) +{ + populate_error_injection_list(__start_error_injection_whitelist, + __stop_error_injection_whitelist, + NULL); +} + +#ifdef CONFIG_MODULES +static void module_load_ei_list(struct module *mod) +{ + if (!mod->num_ei_funcs) + return; + + populate_error_injection_list(mod->ei_funcs, + mod->ei_funcs + mod->num_ei_funcs, mod); +} + +static void module_unload_ei_list(struct module *mod) +{ + struct ei_entry *ent, *n; + + if (!mod->num_ei_funcs) + return; + + mutex_lock(&ei_mutex); + list_for_each_entry_safe(ent, n, &error_injection_list, list) { + if (ent->priv == mod) { + list_del_init(&ent->list); + kfree(ent); + } + } + mutex_unlock(&ei_mutex); +} + +/* Module notifier call back, checking error injection table on the module */ +static int ei_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + + if (val == MODULE_STATE_COMING) + module_load_ei_list(mod); + else if (val == MODULE_STATE_GOING) + module_unload_ei_list(mod); + + return NOTIFY_DONE; +} + +static struct notifier_block ei_module_nb = { + .notifier_call = ei_module_callback, + .priority = 0 +}; + +static __init int module_ei_init(void) +{ + return register_module_notifier(&ei_module_nb); +} +#else /* !CONFIG_MODULES */ +#define module_ei_init() (0) +#endif + +/* + * error_injection/whitelist -- shows which functions can be overridden for + * error injection. 
+ */ +static void *ei_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&ei_mutex); + return seq_list_start(&error_injection_list, *pos); +} + +static void ei_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&ei_mutex); +} + +static void *ei_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &error_injection_list, pos); +} + +static int ei_seq_show(struct seq_file *m, void *v) +{ + struct ei_entry *ent = list_entry(v, struct ei_entry, list); + + seq_printf(m, "%pf\n", (void *)ent->start_addr); + return 0; +} + +static const struct seq_operations ei_seq_ops = { + .start = ei_seq_start, + .next = ei_seq_next, + .stop = ei_seq_stop, + .show = ei_seq_show, +}; + +static int ei_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &ei_seq_ops); +} + +static const struct file_operations debugfs_ei_ops = { + .open = ei_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init ei_debugfs_init(void) +{ + struct dentry *dir, *file; + + dir = debugfs_create_dir("error_injection", NULL); + if (!dir) + return -ENOMEM; + + file = debugfs_create_file("list", 0444, dir, NULL, &debugfs_ei_ops); + if (!file) { + debugfs_remove(dir); + return -ENOMEM; + } + + return 0; +} + +static int __init init_error_injection(void) +{ + populate_kernel_ei_list(); + + if (!module_ei_init()) + ei_debugfs_init(); + + return 0; +} +late_initcall(init_error_injection); From 13acda9209de5f3037ced50f2ef84af8612e9fa7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:03 -0800 Subject: [PATCH 0130/1640] UPSTREAM: bpf: add map_alloc_check callback .map_alloc callbacks contain a number of checks validating user- -provided map attributes against constraints of a particular map type. For offloaded maps we will need to check map attributes without actually allocating any memory on the host. Add a new callback for validating attributes before any memory is allocated. This callback can be selectively implemented by map types for sharing code with offloads, or simply to separate the logical steps of validation and allocation. 
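A compact sketch of what a map type gains from the split between checking and allocating; all my_* names are hypothetical:

    #include <linux/bpf.h>
    #include <linux/errno.h>

    /* Pure validation: called before any memory is allocated, so it is
     * reusable by offloads that never allocate host memory.
     */
    static int my_map_alloc_check(union bpf_attr *attr)
    {
            if (attr->max_entries == 0 || attr->key_size != sizeof(__u32))
                    return -EINVAL;
            return 0;
    }

    static struct bpf_map *my_map_alloc(union bpf_attr *attr)
    {
            /* attributes were already validated by my_map_alloc_check() */
            return ERR_PTR(-ENOMEM);        /* allocation elided in this sketch */
    }

    const struct bpf_map_ops my_map_ops = {
            .map_alloc_check = my_map_alloc_check, /* optional, runs first */
            .map_alloc = my_map_alloc,
            /* ... */
    };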
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 05305359723a..139ed666c282 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -25,6 +25,7 @@ struct bpf_map; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ + int (*map_alloc_check)(union bpf_attr *attr); struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_release)(struct bpf_map *map, struct file *map_file); void (*map_free)(struct bpf_map *map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a251366afe64..0c3228c7bf32 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -97,16 +97,25 @@ static int check_uarg_tail_zero(void __user *uaddr, static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { + const struct bpf_map_ops *ops; struct bpf_map *map; + int err; - if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || - !bpf_map_types[attr->map_type]) + if (attr->map_type >= ARRAY_SIZE(bpf_map_types)) + return ERR_PTR(-EINVAL); + ops = bpf_map_types[attr->map_type]; + if (!ops) return ERR_PTR(-EINVAL); - map = bpf_map_types[attr->map_type]->map_alloc(attr); + if (ops->map_alloc_check) { + err = ops->map_alloc_check(attr); + if (err) + return ERR_PTR(err); + } + map = ops->map_alloc(attr); if (IS_ERR(map)) return map; - map->ops = bpf_map_types[attr->map_type]; + map->ops = ops; map->map_type = attr->map_type; return map; } From 6b3fa8822ff8470664ddab5c10257961c64d7bd7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:04 -0800 Subject: [PATCH 0131/1640] BACKPORT: bpf: hashtab: move attribute validation before allocation Number of attribute checks are currently performed after hashtab is already allocated. Move them to be able to split them out to the check function later on. Checks have to now be performed on the attr union directly instead of the members of bpf_map, since bpf_map will be allocated later. No functional changes. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 46 +++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 6058dc49fc4f..87f7e06b897a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -269,6 +269,28 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) return ERR_PTR(-EINVAL); + /* check sanity of attributes. + * value_size == 0 may be allowed in the future to use map as a set + */ + if (attr->max_entries == 0 || attr->key_size == 0 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + if (attr->key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + return ERR_PTR(-E2BIG); + + if (attr->value_size >= KMALLOC_MAX_SIZE - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. 
This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + return ERR_PTR(-E2BIG); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -281,14 +303,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.map_flags = attr->map_flags; htab->map.numa_node = numa_node; - /* check sanity of attributes. - * value_size == 0 may be allowed in the future to use map as a set - */ - err = -EINVAL; - if (htab->map.max_entries == 0 || htab->map.key_size == 0 || - htab->map.value_size == 0) - goto free_htab; - if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. * since we are at it, make each lru list has the same @@ -310,22 +324,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); - err = -E2BIG; - if (htab->map.key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - goto free_htab; - - if (htab->map.value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct htab_elem)) - /* if value_size is bigger, the user space won't be able to - * access the elements via bpf syscall. This check also makes - * sure that the elem_size doesn't overflow and it's - * kmalloc-able later in htab_map_update_elem() - */ - goto free_htab; - htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); if (percpu) From 0d576596146e5831490bb544641ca6fe38ba53d0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:05 -0800 Subject: [PATCH 0132/1640] UPSTREAM: bpf: hashtab: move checks out of alloc function Use the new callback to perform allocation checks for hash maps. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 123 +++++++++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 50 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 87f7e06b897a..ab2c70e42472 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -227,6 +227,70 @@ static int alloc_extra_elems(struct bpf_htab *htab) } /* Called from syscall */ +static int htab_map_alloc_check(union bpf_attr *attr) +{ + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + /* percpu_lru means each cpu has its own LRU list. + * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + int numa_node = bpf_map_attr_numa_node(attr); + + BUILD_BUG_ON(offsetof(struct htab_elem, htab) != + offsetof(struct htab_elem, hash_node.pprev)); + BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != + offsetof(struct htab_elem, hash_node.pprev)); + + if (lru && !capable(CAP_SYS_ADMIN)) + /* LRU implementation is much complicated than other + * maps. Hence, limit to CAP_SYS_ADMIN for now. 
+ */ + return -EPERM; + + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) + /* reserved bits should not be used */ + return -EINVAL; + + if (!lru && percpu_lru) + return -EINVAL; + + if (lru && !prealloc) + return -ENOTSUPP; + + if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) + return -EINVAL; + + /* check sanity of attributes. + * value_size == 0 may be allowed in the future to use map as a set + */ + if (attr->max_entries == 0 || attr->key_size == 0 || + attr->value_size == 0) + return -EINVAL; + + if (attr->key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + return -E2BIG; + + if (attr->value_size >= KMALLOC_MAX_SIZE - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + return -E2BIG; + + return 0; +} + static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || @@ -245,52 +309,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) int err, i; u64 cost; - BUILD_BUG_ON(offsetof(struct htab_elem, htab) != - offsetof(struct htab_elem, hash_node.pprev)); - BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != - offsetof(struct htab_elem, hash_node.pprev)); - - if (lru && !capable(CAP_SYS_ADMIN)) - /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. - */ - return ERR_PTR(-EPERM); - - if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) - /* reserved bits should not be used */ - return ERR_PTR(-EINVAL); - - if (!lru && percpu_lru) - return ERR_PTR(-EINVAL); - - if (lru && !prealloc) - return ERR_PTR(-ENOTSUPP); - - if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) - return ERR_PTR(-EINVAL); - - /* check sanity of attributes. - * value_size == 0 may be allowed in the future to use map as a set - */ - if (attr->max_entries == 0 || attr->key_size == 0 || - attr->value_size == 0) - return ERR_PTR(-EINVAL); - - if (attr->key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - return ERR_PTR(-E2BIG); - - if (attr->value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct htab_elem)) - /* if value_size is bigger, the user space won't be able to - * access the elements via bpf syscall. 
This check also makes - * sure that the elem_size doesn't overflow and it's - * kmalloc-able later in htab_map_update_elem() - */ - return ERR_PTR(-E2BIG); - htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -1162,6 +1180,7 @@ static void htab_map_free(struct bpf_map *map) } const struct bpf_map_ops htab_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1172,6 +1191,7 @@ const struct bpf_map_ops htab_map_ops = { }; const struct bpf_map_ops htab_lru_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1256,6 +1276,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, } const struct bpf_map_ops htab_percpu_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1265,6 +1286,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { }; const struct bpf_map_ops htab_lru_percpu_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1273,11 +1295,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +static int fd_htab_map_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return htab_map_alloc(attr); + return -EINVAL; + return htab_map_alloc_check(attr); } static void fd_htab_map_free(struct bpf_map *map) @@ -1348,7 +1370,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return inner_map_meta; - map = fd_htab_map_alloc(attr); + map = htab_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -1392,6 +1414,7 @@ static void htab_of_map_free(struct bpf_map *map) } const struct bpf_map_ops htab_of_maps_map_ops = { + .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, .map_free = htab_of_map_free, .map_get_next_key = htab_map_get_next_key, From 2aea5b0ff93c68b1a79e2b78af97d7a7052fbc23 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:06 -0800 Subject: [PATCH 0133/1640] UPSTREAM: bpf: add helper for copying attrs to struct bpf_map All map types reimplement the field-by-field copy of union bpf_attr members into struct bpf_map. Add a helper to perform this operation. 
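The consumer side then becomes a one-liner in each ->map_alloc; a minimal sketch with a hypothetical map type:

    #include <linux/bpf.h>
    #include <linux/slab.h>

    struct my_map {
            struct bpf_map map;     /* embedded first, as in other map types */
            /* private state ... */
    };

    static struct bpf_map *my_map_alloc(union bpf_attr *attr)
    {
            struct my_map *m = kzalloc(sizeof(*m), GFP_USER);

            if (!m)
                    return ERR_PTR(-ENOMEM);

            /* replaces the old field-by-field copy of the mandatory
             * attributes: map_type, key/value size, max_entries,
             * map_flags and numa_node
             */
            bpf_map_init_from_attr(&m->map, attr);
            return &m->map;
    }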
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/cpumap.c | 8 +------- kernel/bpf/devmap.c | 8 +------- kernel/bpf/hashtab.c | 9 +-------- kernel/bpf/lpm_trie.c | 7 +------ kernel/bpf/sockmap.c | 8 +------- kernel/bpf/stackmap.c | 6 +----- kernel/bpf/syscall.c | 10 ++++++++++ 8 files changed, 17 insertions(+), 40 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 139ed666c282..fd8c56e0a66d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -384,6 +384,7 @@ void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); +void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); extern int sysctl_unprivileged_bpf_disabled; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ce5b669003b2..192151ec9d12 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) if (!cmap) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - cmap->map.map_type = attr->map_type; - cmap->map.key_size = attr->key_size; - cmap->map.value_size = attr->value_size; - cmap->map.max_entries = attr->max_entries; - cmap->map.map_flags = attr->map_flags; - cmap->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&cmap->map, attr); /* Pre-limit array size based on NR_CPUS, not final CPU check */ if (cmap->map.max_entries > NR_CPUS) { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1a846a636ae1..0d056dfe35cd 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - dtab->map.map_type = attr->map_type; - dtab->map.key_size = attr->key_size; - dtab->map.value_size = attr->value_size; - dtab->map.max_entries = attr->max_entries; - dtab->map.map_flags = attr->map_flags; - dtab->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&dtab->map, attr); /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ab2c70e42472..10658cdff4ad 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -304,7 +304,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); - int numa_node = bpf_map_attr_numa_node(attr); struct bpf_htab *htab; int err, i; u64 cost; @@ -313,13 +312,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - htab->map.map_type = attr->map_type; - htab->map.key_size = attr->key_size; - htab->map.value_size = attr->value_size; - htab->map.max_entries = attr->max_entries; - htab->map.map_flags = attr->map_flags; - htab->map.numa_node = numa_node; + bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index ad7cbb6cbabd..ec6e45ecec08 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -525,12 +525,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ - trie->map.map_type = attr->map_type; - trie->map.key_size = attr->key_size; - trie->map.value_size = attr->value_size; - trie->map.max_entries = attr->max_entries; - trie->map.map_flags = attr->map_flags; - trie->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&trie->map, attr); trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index e552589b3e7e..d061756bb814 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -516,13 +516,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (!stab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - stab->map.map_type = attr->map_type; - stab->map.key_size = attr->key_size; - stab->map.value_size = attr->value_size; - stab->map.max_entries = attr->max_entries; - stab->map.map_flags = attr->map_flags; - stab->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&stab->map, attr); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 070a4b0d28a2..43aa79610dee 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -94,14 +94,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_smap; - smap->map.map_type = attr->map_type; - smap->map.key_size = attr->key_size; + bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; - smap->map.max_entries = attr->max_entries; - smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - smap->map.numa_node = bpf_map_attr_numa_node(attr); err = bpf_map_precharge_memlock(smap->map.pages); if (err) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c3228c7bf32..0b9d8e6651e7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -144,6 +144,16 @@ void bpf_map_area_free(void *area) kvfree(area); } +void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) +{ + map->map_type = attr->map_type; + map->key_size = attr->key_size; + map->value_size = attr->value_size; + map->max_entries = attr->max_entries; + map->map_flags = attr->map_flags; + map->numa_node = bpf_map_attr_numa_node(attr); +} + int bpf_map_precharge_memlock(u32 pages) { struct user_struct *user = get_current_user(); From a4b13d733013f6724b38c88a472c3e9ab78de1e0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:07 -0800 Subject: [PATCH 0134/1640] BACKPORT: bpf: rename bpf_dev_offload -> bpf_prog_offload With map offload coming, we need to call program offload structure something less ambiguous. Pure rename, no functional changes. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 4 ++-- kernel/bpf/offload.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fd8c56e0a66d..6b8e91c5fb0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -206,7 +206,7 @@ struct bpf_prog_offload_ops { int insn_idx, int prev_insn_idx); }; -struct bpf_dev_offload { +struct bpf_prog_offload { struct bpf_prog *prog; struct net_device *netdev; void *dev_priv; @@ -236,7 +236,7 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; union { struct work_struct work; struct rcu_head rcu; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 040d4e0edf3f..001ddfde7874 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -32,7 +32,7 @@ static LIST_HEAD(bpf_prog_offload_devs); int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) @@ -72,7 +72,7 @@ err_free: static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, struct netdev_bpf *data) { - struct bpf_dev_offload *offload = prog->aux->offload; + struct bpf_prog_offload *offload = prog->aux->offload; struct net_device *netdev; ASSERT_RTNL(); @@ -110,7 +110,7 @@ exit_unlock: int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); @@ -124,7 +124,7 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; + struct bpf_prog_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; data.offload.prog = prog; @@ -242,7 +242,7 @@ static int bpf_offload_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct bpf_dev_offload *offload, *tmp; + struct bpf_prog_offload *offload, *tmp; ASSERT_RTNL(); From 27863828fb0271d3fc4921fbd8a078d8310520a6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:08 -0800 Subject: [PATCH 0135/1640] UPSTREAM: bpf: offload: factor out netdev checking at allocation time Add a helper to check if netdev could be found and whether it has .ndo_bpf callback. There is no need to check the callback every time it's invoked, ndos can't reasonably be swapped for a set without .ndp_bpf while program is loaded. bpf_dev_offload_check() will also be used by map offload. 
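The two conditions the helper encodes (a netdev was found, and it implements .ndo_bpf) correspond to a driver opting in roughly as follows; names are hypothetical and the command handling is abbreviated, assuming the usual ndo_bpf signature of this tree:

    #include <linux/netdevice.h>
    #include <linux/errno.h>

    static int my_ndo_bpf(struct net_device *dev, struct netdev_bpf *bpf)
    {
            switch (bpf->command) {
            /* verifier prep, translate, destroy, etc. handled here */
            default:
                    return -EINVAL;
            }
    }

    static const struct net_device_ops my_netdev_ops = {
            /* ... */
            .ndo_bpf = my_ndo_bpf,  /* presence of this hook is what
                                     * bpf_dev_offload_check() tests */
    };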
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 001ddfde7874..cdd1e19a668b 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -30,9 +30,19 @@ static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); +static int bpf_dev_offload_check(struct net_device *netdev) +{ + if (!netdev) + return -EINVAL; + if (!netdev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + return 0; +} + int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { struct bpf_prog_offload *offload; + int err; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) @@ -49,12 +59,15 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) offload->netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); - if (!offload->netdev) - goto err_free; + err = bpf_dev_offload_check(offload->netdev); + if (err) + goto err_maybe_put; down_write(&bpf_devs_lock); - if (offload->netdev->reg_state != NETREG_REGISTERED) + if (offload->netdev->reg_state != NETREG_REGISTERED) { + err = -EINVAL; goto err_unlock; + } prog->aux->offload = offload; list_add_tail(&offload->offloads, &bpf_prog_offload_devs); dev_put(offload->netdev); @@ -63,10 +76,11 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) return 0; err_unlock: up_write(&bpf_devs_lock); - dev_put(offload->netdev); -err_free: +err_maybe_put: + if (offload->netdev) + dev_put(offload->netdev); kfree(offload); - return -EINVAL; + return err; } static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, @@ -80,8 +94,6 @@ static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, if (!offload) return -ENODEV; netdev = offload->netdev; - if (!netdev->netdev_ops->ndo_bpf) - return -EOPNOTSUPP; data->command = cmd; From 884a6ff5dc22616827c5b65343e5c68da6c0f74f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:09 -0800 Subject: [PATCH 0136/1640] BACKPORT: bpf: offload: add map offload infrastructure BPF map offload follows a similar path to program offload. At creation time users may specify the ifindex of the device on which they want to create the map. The map will be validated by the kernel's .map_alloc_check callback, and the device driver will be called for the actual allocation. The map will have an empty set of operations associated with it (save for the alloc and free callbacks). The real device callbacks are kept in map->offload->dev_ops because they have slightly different signatures. Map operations are called in process context, so the driver may communicate with HW freely: msleep(), wait(), etc. Map alloc and free callbacks are muxed via the existing .ndo_bpf, and are always called with the rtnl lock held. Maps and programs are guaranteed to be destroyed before .ndo_uninit (i.e. before unregister_netdev() returns). Map callbacks are invoked with bpf_devs_lock *read* locked; drivers must take care of exclusive locking if necessary. All offload-specific branches are marked with unlikely() (through bpf_map_is_dev_bound()), given that the branch penalty will be negligible compared to the IO anyway, and we don't want to penalize the SW path unnecessarily.
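To make the muxing concrete, a minimal sketch of the driver side (all foo_* names hypothetical) could look like this:

static const struct bpf_map_dev_ops foo_map_dev_ops = {
	.map_get_next_key	= foo_map_get_next_key,
	.map_lookup_elem	= foo_map_lookup_elem,
	.map_update_elem	= foo_map_update_elem,
	.map_delete_elem	= foo_map_delete_elem,
};

static int foo_ndo_bpf(struct net_device *dev, struct netdev_bpf *bpf)
{
	switch (bpf->command) {
	case BPF_OFFLOAD_MAP_ALLOC:
		/* rtnl held, process context: the driver may sleep here */
		bpf->offmap->dev_ops = &foo_map_dev_ops;
		return foo_map_alloc_hw(bpf->offmap);
	case BPF_OFFLOAD_MAP_FREE:
		foo_map_free_hw(bpf->offmap);
		return 0;
	default:
		return -EINVAL;
	}
}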
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 59 ++++++++++++ include/linux/netdevice.h | 6 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/offload.c | 188 ++++++++++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 44 +++++++-- kernel/bpf/verifier.c | 7 ++ 6 files changed, 292 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6b8e91c5fb0a..297ec6df974a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -80,6 +80,33 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; }; +struct bpf_offloaded_map; + +struct bpf_map_dev_ops { + int (*map_get_next_key)(struct bpf_offloaded_map *map, + void *key, void *next_key); + int (*map_lookup_elem)(struct bpf_offloaded_map *map, + void *key, void *value); + int (*map_update_elem)(struct bpf_offloaded_map *map, + void *key, void *value, u64 flags); + int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key); +}; + +struct bpf_offloaded_map { + struct bpf_map map; + struct net_device *netdev; + const struct bpf_map_dev_ops *dev_ops; + void *dev_priv; + struct list_head offloads; +}; + +static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) +{ + return container_of(map, struct bpf_offloaded_map, map); +} + +extern const struct bpf_map_ops bpf_map_offload_ops; + /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ @@ -375,6 +402,7 @@ int __bpf_prog_charge(struct user_struct *user, u32 pages); void __bpf_prog_uncharge(struct user_struct *user, u32 pages); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); +void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); @@ -573,6 +601,15 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); +int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); +int bpf_map_offload_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags); +int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); +int bpf_map_offload_get_next_key(struct bpf_map *map, + void *key, void *next_key); + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map); + #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); @@ -580,6 +617,14 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) { return aux->offload_requested; } + +static inline bool bpf_map_is_dev_bound(struct bpf_map *map) +{ + return unlikely(map->ops == &bpf_map_offload_ops); +} + +struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); +void bpf_map_offload_map_free(struct bpf_map *map); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -591,6 +636,20 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) { return false; } + +static inline bool bpf_map_is_dev_bound(struct bpf_map *map) +{ + return false; +} + +static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void bpf_map_offload_map_free(struct bpf_map *map) +{ +} #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) diff --git a/include/linux/netdevice.h 
b/include/linux/netdevice.h index 7dd62a2578ec..d4d2998f7f6e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -815,6 +815,8 @@ enum bpf_netdev_command { BPF_OFFLOAD_VERIFIER_PREP, BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY, + BPF_OFFLOAD_MAP_ALLOC, + BPF_OFFLOAD_MAP_FREE, }; struct bpf_prog_offload_ops; @@ -843,6 +845,10 @@ struct netdev_bpf { struct { struct bpf_prog *prog; } offload; + /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */ + struct { + struct bpf_offloaded_map *offmap; + }; }; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43caed45205c..194f5b88f344 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -245,6 +245,7 @@ union bpf_attr { * BPF_F_NUMA_NODE is set). */ char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index cdd1e19a668b..453785fa1881 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -24,11 +24,13 @@ #include #include -/* Protects bpf_prog_offload_devs and offload members of all progs. +/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members + * of all progs. * RTNL lock cannot be taken when holding this lock. */ static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); +static LIST_HEAD(bpf_map_offload_devs); static int bpf_dev_offload_check(struct net_device *netdev) { @@ -250,11 +252,186 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, const struct bpf_prog_ops bpf_offload_prog_ops = { }; +static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, + enum bpf_netdev_command cmd) +{ + struct netdev_bpf data = {}; + struct net_device *netdev; + + ASSERT_RTNL(); + + data.command = cmd; + data.offmap = offmap; + /* Caller must make sure netdev is valid */ + netdev = offmap->netdev; + + return netdev->netdev_ops->ndo_bpf(netdev, &data); +} + +struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_offloaded_map *offmap; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->map_type != BPF_MAP_TYPE_HASH) + return ERR_PTR(-EINVAL); + + offmap = kzalloc(sizeof(*offmap), GFP_USER); + if (!offmap) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&offmap->map, attr); + + rtnl_lock(); + down_write(&bpf_devs_lock); + offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); + err = bpf_dev_offload_check(offmap->netdev); + if (err) + goto err_unlock; + + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); + if (err) + goto err_unlock; + + list_add_tail(&offmap->offloads, &bpf_map_offload_devs); + up_write(&bpf_devs_lock); + rtnl_unlock(); + + return &offmap->map; + +err_unlock: + up_write(&bpf_devs_lock); + rtnl_unlock(); + kfree(offmap); + return ERR_PTR(err); +} + +static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) +{ + WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); + /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ + bpf_map_free_id(&offmap->map, true); + list_del_init(&offmap->offloads); + offmap->netdev = NULL; +} + +void bpf_map_offload_map_free(struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + + rtnl_lock(); + down_write(&bpf_devs_lock); + if (offmap->netdev) + __bpf_map_offload_destroy(offmap); + up_write(&bpf_devs_lock); + rtnl_unlock(); + + kfree(offmap); +} + +int 
bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_lookup_elem(offmap, key, value); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_update_elem(offmap, key, value, + flags); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_delete_elem(offmap, key); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key); + up_read(&bpf_devs_lock); + + return ret; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap; + struct bpf_prog_offload *offload; + bool ret; + + if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map)) + return false; + if (!bpf_prog_is_dev_bound(prog->aux)) + return true; + + down_read(&bpf_devs_lock); + offload = prog->aux->offload; + offmap = map_to_offmap(map); + + ret = offload && offload->netdev == offmap->netdev; + up_read(&bpf_devs_lock); + + return ret; +} + +static void bpf_offload_orphan_all_progs(struct net_device *netdev) +{ + struct bpf_prog_offload *offload, *tmp; + + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); +} + +static void bpf_offload_orphan_all_maps(struct net_device *netdev) +{ + struct bpf_offloaded_map *offmap, *tmp; + + list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) + if (offmap->netdev == netdev) + __bpf_map_offload_destroy(offmap); +} + static int bpf_offload_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct bpf_prog_offload *offload, *tmp; ASSERT_RTNL(); @@ -265,11 +442,8 @@ static int bpf_offload_notification(struct notifier_block *notifier, break; down_write(&bpf_devs_lock); - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, - offloads) { - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); - } + bpf_offload_orphan_all_progs(netdev); + bpf_offload_orphan_all_maps(netdev); up_write(&bpf_devs_lock); break; default: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0b9d8e6651e7..7f437012feb4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -95,6 +95,11 @@ static int check_uarg_tail_zero(void __user *uaddr, return 0; } +const struct bpf_map_ops bpf_map_offload_ops = { + .map_alloc = bpf_map_offload_map_alloc, + .map_free = bpf_map_offload_map_free, +}; + static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { const struct bpf_map_ops *ops; @@ -112,6 +117,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr 
*attr) if (err) return ERR_PTR(err); } + if (attr->map_ifindex) + ops = &bpf_map_offload_ops; map = ops->map_alloc(attr); if (IS_ERR(map)) return map; @@ -209,16 +216,25 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ? 0 : id; } -static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) +void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { unsigned long flags; + /* Offloaded maps are removed from the IDR store when their device + * disappears - even if someone holds an fd to them they are unusable, + * the memory is gone, all ops will fail; they are simply waiting for + * refcnt to drop to be freed. + */ + if (!map->id) + return; + if (do_idr_lock) spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); + map->id = 0; if (do_idr_lock) spin_unlock_irqrestore(&map_idr_lock, flags); @@ -398,7 +414,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD map_name +#define BPF_MAP_CREATE_LAST_FIELD map_ifindex /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -586,8 +602,10 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -688,7 +706,10 @@ static int map_update_elem(union bpf_attr *attr) goto free_value; /* Need to create a kthread, thus must support schedule */ - if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_update_elem(map, key, value, attr->flags); + goto out; + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } @@ -766,6 +787,11 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_delete_elem(map, key); + goto out; + } + preempt_disable(); __this_cpu_inc(bpf_prog_active); rcu_read_lock(); @@ -774,7 +800,7 @@ static int map_delete_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); maybe_wait_bpf_programs(map); - +out: if (!err) trace_bpf_map_delete_elem(map, ufd, key); kfree(key); @@ -824,9 +850,15 @@ static int map_get_next_key(union bpf_attr *attr) if (!next_key) goto free_key; + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_get_next_key(map, key, next_key); + goto out; + } + rcu_read_lock(); err = map->ops->map_get_next_key(map, key, next_key); rcu_read_unlock(); +out: if (err) goto free_next_key; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7c311f1218d2..a7735febdc08 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5216,6 +5216,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } } + + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && + !bpf_offload_dev_match(prog, map)) { + verbose(env, "offload device mismatch between prog and map\n"); + return -EINVAL; + } + return 0; } From c9ef69bc3c44c03fdc87a0a21f7792d7c7f65e2f Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 16 Jan 2018 11:27:05 +0000 Subject: [PATCH 
0137/1640] UPSTREAM: bpf: cpumap: make some functions static Fixes the following sparse warnings: kernel/bpf/cpumap.c:146:6: warning: symbol '__cpu_map_queue_destructor' was not declared. Should it be static? kernel/bpf/cpumap.c:225:16: warning: symbol 'cpu_map_build_skb' was not declared. Should it be static? kernel/bpf/cpumap.c:340:26: warning: symbol '__cpu_map_entry_alloc' was not declared. Should it be static? kernel/bpf/cpumap.c:398:6: warning: symbol '__cpu_map_entry_free' was not declared. Should it be static? kernel/bpf/cpumap.c:441:6: warning: symbol '__cpu_map_entry_replace' was not declared. Should it be static? kernel/bpf/cpumap.c:454:5: warning: symbol 'cpu_map_delete_elem' was not declared. Should it be static? kernel/bpf/cpumap.c:467:5: warning: symbol 'cpu_map_update_elem' was not declared. Should it be static? kernel/bpf/cpumap.c:505:6: warning: symbol 'cpu_map_free' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 192151ec9d12..fbfdada6caee 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -137,7 +137,7 @@ free_cmap: return ERR_PTR(err); } -void __cpu_map_queue_destructor(void *ptr) +static void __cpu_map_queue_destructor(void *ptr) { /* The tear-down procedure should have made sure that queue is * empty. See __cpu_map_entry_replace() and work-queue @@ -216,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) return xdp_pkt; } -struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) +static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, + struct xdp_pkt *xdp_pkt) { unsigned int frame_size; void *pkt_data_start; @@ -331,7 +331,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, + int map_id) { gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; @@ -389,7 +390,7 @@ free_rcu: return NULL; } -void __cpu_map_entry_free(struct rcu_head *rcu) +static void __cpu_map_entry_free(struct rcu_head *rcu) { struct bpf_cpu_map_entry *rcpu; int cpu; @@ -432,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu) * cpu_map_kthread_stop, which waits for an RCU graze period before * stopping kthread, emptying the queue. 
*/ -void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, - u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, + u32 key_cpu, struct bpf_cpu_map_entry *rcpu) { struct bpf_cpu_map_entry *old_rcpu; @@ -445,7 +446,7 @@ void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, } } -int cpu_map_delete_elem(struct bpf_map *map, void *key) +static int cpu_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 key_cpu = *(u32 *)key; @@ -458,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key) return 0; } -int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; @@ -496,7 +497,7 @@ int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } -void cpu_map_free(struct bpf_map *map) +static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); int cpu; From 7fc1388856bf160cf535fd3bde7de36e75e01d52 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2018 15:51:45 -0800 Subject: [PATCH 0138/1640] UPSTREAM: bpf: offload: make bpf_offload_dev_match() reject host+host case Daniel suggests it would be more logical for bpf_offload_dev_match() to return false if either the program or the map is not offloaded, rather than treating the case where neither is offloaded as a "matching CPU/host device". This makes no functional difference today, since the verifier only calls bpf_offload_dev_match() when one of the objects is offloaded. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 453785fa1881..a88cebf368bf 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -395,10 +395,8 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) struct bpf_prog_offload *offload; bool ret; - if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map)) + if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) return false; - if (!bpf_prog_is_dev_bound(prog->aux)) - return true; down_read(&bpf_devs_lock); offload = prog->aux->offload; From 19fd23abbfa38685daeae873f5b6cd3d32c1c0a0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2018 15:51:46 -0800 Subject: [PATCH 0139/1640] UPSTREAM: bpf: annotate bpf_insn_print_t with __printf Functions of type bpf_insn_print_t take a printf-like format string; mark the type accordingly.
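As a small illustration (a sketch, not from the patch), the annotation lets the compiler type-check callers of the callback:

bpf_insn_print_t print = my_print;	/* my_print: hypothetical conforming callback */

print(env, "insn %d\n", insn_idx);	/* fine */
print(env, "insn %d\n", "oops");	/* now triggers a -Wformat warning */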
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/disasm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e0857d016f89..266fe8ee542b 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -29,8 +29,8 @@ extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -typedef void (*bpf_insn_print_t)(struct bpf_verifier_env *env, - const char *, ...); +typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env, + const char *, ...); typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, const struct bpf_insn *insn); typedef const char *(*bpf_insn_print_imm_t)(void *private_data, From f273ad3bad6154fac9253519af10ea059cfcc70b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 18 Jan 2018 01:15:21 +0100 Subject: [PATCH 0140/1640] UPSTREAM: bpf: mark dst unknown on inconsistent {s, u}bounds adjustments syzkaller generated a BPF proglet and triggered a warning with the following: 0: (b7) r0 = 0 1: (d5) if r0 s<= 0x0 goto pc+0 R0=inv0 R1=ctx(id=0,off=0,imm=0) R10=fp0 2: (1f) r0 -= r1 R0=inv0 R1=ctx(id=0,off=0,imm=0) R10=fp0 verifier internal error: known but bad sbounds What happens is that in the first insn, r0's min/max values are both 0 due to the immediate assignment; later, in the jsle test, the bounds are updated for the min value in the false path, meaning they yield smin_val = 1, smax_val = 0, and when the ctx pointer is subtracted from r0, the verifier bails out with the internal error, throwing a WARN since smin_val != smax_val for the known constant. For the min_val > max_val scenario it means that reg_set_min_max() and reg_set_min_max_inv() (which both refine existing bounds) demonstrated that such a branch cannot be taken at runtime. In the above scenario, for the case where it will be taken, the existing [0, 0] bounds are kept intact. Meaning, the rejection is not due to a verifier internal error, and therefore the WARN() is not necessary either. We could just reject such cases in adjust_{ptr,scalar}_min_max_vals() when either known scalars have smin_val != smax_val or umin_val != umax_val or any scalar reg with bounds smin_val > smax_val or umin_val > umax_val. However, there may be a small risk of breakage of buggy programs, so handle this more gracefully and in adjust_{ptr,scalar}_min_max_vals() just taint the dst reg as an unknown scalar when we see ops with such a src reg.
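Conceptually, the handling added to adjust_{ptr,scalar}_min_max_vals() amounts to the following (a simplified sketch, not the exact upstream hunk):

if (known && (smin_val != smax_val || umin_val != umax_val)) {
	/* Contradictory bounds mean reg_set_min_max() proved this
	 * path can never be taken at runtime, so do not WARN():
	 * just taint the destination and let verification continue.
	 */
	__mark_reg_unknown(dst_reg);
	return 0;
}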
Reported-by: syzbot+6d362cadd45dc0a12ba4@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 121 ++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index f2e9b37a4463..58fe3fe7b212 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8046,6 +8046,127 @@ static struct bpf_test tests[] = { .result = REJECT, .errstr = "variable ctx access var_off=(0x0; 0x4)", }, + { + "check deducing bounds from const, 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 1, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 1, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 1, 1), + BPF_EXIT_INSN(), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check deducing bounds from const, 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check deducing bounds from const, 5", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 6", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 7", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, ~0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "dereference of modified ctx ptr", + }, + { + "check deducing bounds from const, 8", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, ~0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "dereference of modified ctx ptr", + }, + { + "check deducing bounds from const, 9", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R0 tried to subtract pointer from scalar", + }, + { + "check deducing bounds from const, 10", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 0), + /* Marks reg as unknown. 
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "math between ctx pointer and register with unbounded min value is not allowed", + }, { "bpf_exit with invalid return code. test1", .insns = { From 1360b877606b22303cd7bf011bba72e0e5561994 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Tue, 16 Jan 2018 16:05:19 -0800 Subject: [PATCH 0141/1640] UPSTREAM: bpf: add new jited info fields in bpf_dev_offload and bpf_prog_info For host JIT, there are "jited_len"/"bpf_func" fields in struct bpf_prog used by all host JIT targets to get the jited image and its length. For offload, however, targets are likely to have different offload mechanisms, so this info is kept in device private data fields. Therefore, the BPF_OBJ_GET_INFO_BY_FD syscall needs a unified way to get JIT length and contents info for offload targets. One way is to introduce a new callback to parse device private data and then fill those fields in bpf_prog_info. This might be a little heavy; the other way is to add generic fields which will be initialized by all offload targets. This patch follows the second approach, introducing two new fields in struct bpf_dev_offload and teaching bpf_prog_get_info_by_fd about them to fill correct jited_prog_len and jited_prog_insns in bpf_prog_info. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/offload.c | 23 +++++++++++++++++++++ kernel/bpf/syscall.c | 31 ++++++++++++++++++------------- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 297ec6df974a..020ba636244e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -240,6 +240,8 @@ struct bpf_prog_offload { struct list_head offloads; bool dev_state; const struct bpf_prog_offload_ops *dev_ops; + void *jited_image; + u32 jited_len; }; struct bpf_prog_aux { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index a88cebf368bf..6c0baa1cf8f8 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -230,9 +230,12 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, .prog = prog, .info = info, }; + struct bpf_prog_aux *aux = prog->aux; struct inode *ns_inode; struct path ns_path; + char __user *uinsns; void *res; + u32 ulen; res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); if (IS_ERR(res)) { @@ -241,6 +244,26 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, return PTR_ERR(res); } + down_read(&bpf_devs_lock); + + if (!aux->offload) { + up_read(&bpf_devs_lock); + return -ENODEV; + } + + ulen = info->jited_prog_len; + info->jited_prog_len = aux->offload->jited_len; + if (info->jited_prog_len && ulen) { + uinsns = u64_to_user_ptr(info->jited_prog_insns); + ulen = min_t(u32, info->jited_prog_len, ulen); + if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { + up_read(&bpf_devs_lock); + return -EFAULT; + } + } + + up_read(&bpf_devs_lock); + ns_inode = ns_path.dentry->d_inode; info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); info->netns_ino = ns_inode->i_ino; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7f437012feb4..e959459dcee2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1748,19 +1748,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, goto done; } - ulen = info.jited_prog_len; - info.jited_prog_len = prog->jited_len; - if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) {
- uinsns = u64_to_user_ptr(info.jited_prog_insns); - ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; - } else { - info.jited_prog_insns = 0; - } - } - ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { @@ -1786,6 +1773,24 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, err = bpf_prog_offload_info_fill(&info, prog); if (err) return err; + goto done; + } + + /* NOTE: the following code is supposed to be skipped for offload. + * bpf_prog_offload_info_fill() is the place to fill similar fields + * for offload. + */ + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + if (bpf_dump_raw_ok()) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } else { + info.jited_prog_insns = 0; + } } done: From 75409a6d897c12f13f98014f10ea9ba5aec5e810 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 17 Jan 2018 12:05:36 +0100 Subject: [PATCH 0142/1640] UPSTREAM: bpf: add comments to BPF ld/ldx sizes Document the BPF ld/ldx size defines as comments in the code, as it makes it faster to look them up in a programming/review setting than looking up the sizes in Documentation/networking/filter.txt. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 +- include/uapi/linux/bpf_common.h | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 194f5b88f344..ad783be36fd6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -17,7 +17,7 @@ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ -#define BPF_DW 0x18 /* double word */ +#define BPF_DW 0x18 /* double word (64-bit) */ #define BPF_XADD 0xc0 /* exclusive add */ /* alu/jmp fields */ diff --git a/include/uapi/linux/bpf_common.h b/include/uapi/linux/bpf_common.h index 18be90725ab0..ee97668bdadb 100644 --- a/include/uapi/linux/bpf_common.h +++ b/include/uapi/linux/bpf_common.h @@ -15,9 +15,10 @@ /* ld/ldx fields */ #define BPF_SIZE(code) ((code) & 0x18) -#define BPF_W 0x00 -#define BPF_H 0x08 -#define BPF_B 0x10 +#define BPF_W 0x00 /* 32-bit */ +#define BPF_H 0x08 /* 16-bit */ +#define BPF_B 0x10 /* 8-bit */ +/* eBPF BPF_DW 0x18 64-bit */ #define BPF_MODE(code) ((code) & 0xe0) #define BPF_IMM 0x00 #define BPF_ABS 0x20 From 42f4d398d15e388b2b2b98fa8f6c4d43f76d5fcb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Jan 2018 16:52:02 -0800 Subject: [PATCH 0143/1640] UPSTREAM: bpf: allow socket_filter programs to use bpf_prog_test_run In order to improve test coverage, allow the socket_filter program type to be run via the bpf_prog_test_run command. Since such programs can be loaded by non-root, tighten permissions for bpf_prog_test_run to root only to avoid surprises.
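A sketch of exercising a loaded socket filter this way (assuming the bpf_prog_test_run() wrapper from tools/lib/bpf of this era; requires root after this change):

char pkt[64] = {};	/* dummy test frame */
__u32 retval, duration;
int err;

err = bpf_prog_test_run(prog_fd, 1 /* repeat */, pkt, sizeof(pkt),
			NULL, NULL, &retval, &duration);
if (!err)
	printf("filter returned %u in %u ns\n", retval, duration);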
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 2 ++ net/core/filter.c | 1 + 2 files changed, 3 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e959459dcee2..3a7624f720ac 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1527,6 +1527,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; diff --git a/net/core/filter.c b/net/core/filter.c index 3f8030e49ddc..87938530a0b7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4625,6 +4625,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = { }; const struct bpf_prog_ops sk_filter_prog_ops = { + .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { From 0daa92a890d0babafa1a4797476a3c2f7dbb95e5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:25 -0800 Subject: [PATCH 0144/1640] UPSTREAM: bpf: arraymap: move checks out of alloc function Use the new callback to perform allocation checks for array maps. The fd maps don't need a special allocation callback, they only need a special check callback. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 54 +++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index fc0193e150cd..7547fd29a9ca 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -52,6 +52,27 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ +static int array_map_alloc_check(union bpf_attr *attr) +{ + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + int numa_node = bpf_map_attr_numa_node(attr); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size == 0 || + attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || + (percpu && numa_node != NUMA_NO_NODE)) + return -EINVAL; + + if (attr->value_size > KMALLOC_MAX_SIZE) + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + return -E2BIG; + + return 0; +} + static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; @@ -61,19 +82,6 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) u64 cost, array_size, mask64; struct bpf_array *array; - /* check sanity of attributes */ - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0 || - attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || - (percpu && numa_node != NUMA_NO_NODE)) - return ERR_PTR(-EINVAL); - - if (attr->value_size > KMALLOC_MAX_SIZE) - /* if value_size is bigger, the user space won't be able to - * access the elements. 
- */ - return ERR_PTR(-E2BIG); - elem_size = round_up(attr->value_size, 8); max_entries = attr->max_entries; @@ -334,6 +342,7 @@ static void array_map_free(struct bpf_map *map) } const struct bpf_map_ops array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -344,6 +353,7 @@ const struct bpf_map_ops array_map_ops = { }; const struct bpf_map_ops percpu_array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -352,12 +362,12 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) +static int fd_array_map_alloc_check(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return array_map_alloc(attr); + return -EINVAL; + return array_map_alloc_check(attr); } static void fd_array_map_free(struct bpf_map *map) @@ -482,7 +492,8 @@ static void bpf_fd_array_map_clear(struct bpf_map *map) } const struct bpf_map_ops prog_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -571,7 +582,8 @@ static void perf_event_fd_array_release(struct bpf_map *map, } const struct bpf_map_ops perf_event_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -602,7 +614,8 @@ static void cgroup_fd_array_free(struct bpf_map *map) } const struct bpf_map_ops cgroup_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -620,7 +633,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return inner_map_meta; - map = fd_array_map_alloc(attr); + map = array_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -683,6 +696,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, } const struct bpf_map_ops array_of_maps_map_ops = { + .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = array_map_get_next_key, From 9dffd994d6d92410a8e3c3733dd9a1bbb0a8f841 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:26 -0800 Subject: [PATCH 0145/1640] BACKPORT: bpf: arraymap: use bpf_map_init_from_attr() Arraymap was not converted to use bpf_map_init_from_attr() to avoid merge conflicts with emergency fixes. Do it now. 
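After the conversion, the common shape of a map allocator reduces to the following sketch (foo_map is hypothetical):

static struct bpf_map *foo_map_alloc(union bpf_attr *attr)
{
	struct foo_map *fmap;

	fmap = kzalloc(sizeof(*fmap), GFP_USER);
	if (!fmap)
		return ERR_PTR(-ENOMEM);

	/* mandatory attrs copied in one place instead of per-map */
	bpf_map_init_from_attr(&fmap->map, attr);
	/* type-specific setup and cost accounting go here */
	return &fmap->map;
}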
Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7547fd29a9ca..d4b8ee947670 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -134,12 +134,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ - array->map.map_type = attr->map_type; - array->map.key_size = attr->key_size; - array->map.value_size = attr->value_size; - array->map.max_entries = attr->max_entries; - array->map.map_flags = attr->map_flags; - array->map.numa_node = numa_node; + bpf_map_init_from_attr(&array->map, attr); array->map.pages = cost; array->elem_size = elem_size; From 96d85ffbc7ac504ba4acf3b5a6a6d737a698cc9c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:27 -0800 Subject: [PATCH 0146/1640] BACKPORT: bpf: offload: allow array map offload The special handling of different map types is left to the driver. Allow offload of array maps by simply adding them to the accepted types. For nfp we have to make sure array elements are not deleted. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 6c0baa1cf8f8..2657976aec2a 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -299,7 +299,8 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_type != BPF_MAP_TYPE_HASH) + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_type != BPF_MAP_TYPE_HASH) return ERR_PTR(-EINVAL); offmap = kzalloc(sizeof(*offmap), GFP_USER); From 8c8b9d6ebdedb61774ffa8fda2930cb6da1c1f7a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:28 -0800 Subject: [PATCH 0147/1640] BACKPORT: bpf: offload: report device information about offloaded maps Tell user space about the device on which the map was created. The unfortunate reality of the user ABI makes sharing this code with program offload difficult, but the information is the same.
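From user space the new fields can be read back via BPF_OBJ_GET_INFO_BY_FD; a sketch (assuming the bpf_obj_get_info_by_fd() wrapper from tools/lib/bpf):

struct bpf_map_info info = {};
__u32 len = sizeof(info);

if (!bpf_obj_get_info_by_fd(map_fd, &info, &len) && info.ifindex)
	printf("map bound to ifindex %u (netns dev %llu, ino %llu)\n",
	       info.ifindex, (unsigned long long)info.netns_dev,
	       (unsigned long long)info.netns_ino);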
Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 3 +++ kernel/bpf/offload.c | 55 ++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 +++++ 4 files changed, 66 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 020ba636244e..4734a73db06f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -603,6 +603,8 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map); + int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_map_offload_update_elem(struct bpf_map *map, void *key, void *value, u64 flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ad783be36fd6..85294593ba41 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1035,6 +1035,9 @@ struct bpf_map_info { __u32 max_entries; __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2657976aec2a..c9401075b58c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -413,6 +413,61 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) return ret; } +struct ns_get_path_bpf_map_args { + struct bpf_offloaded_map *offmap; + struct bpf_map_info *info; +}; + +static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_map_args *args = private_data; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (args->offmap->netdev) { + args->info->ifindex = args->offmap->netdev->ifindex; + net = dev_net(args->offmap->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) +{ + struct ns_get_path_bpf_map_args args = { + .offmap = map_to_offmap(map), + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3a7624f720ac..3d95106061ae 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1826,6 +1826,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.map_flags = map->map_flags; memcpy(info.name, map->name, sizeof(map->name)); + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_info_fill(&info, map); + if (err) + return err; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; From e0f944001f1ec3f00bb713772aa091eb42bec057 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 18 Jan 2018 15:08:50 -0800 Subject: [PATCH 0148/1640] UPSTREAM: bpf: implement 
MAP_GET_NEXT_KEY command for LPM_TRIE map The current LPM_TRIE map type does not implement the MAP_GET_NEXT_KEY command. This command is handy when users want to enumerate keys. Otherwise, a different map which supports key enumeration may be required to store the keys. If the map data is sparse and all map data are to be deleted without closing the file descriptor, using MAP_GET_NEXT_KEY to find all keys is much faster than enumerating the whole key space. This patch implements the MAP_GET_NEXT_KEY command for the LPM_TRIE map. If the user-provided key pointer is NULL or the key does not have an exact match in the trie, the first key will be returned. Otherwise, the next key will be returned. In this implementation, key enumeration follows a postorder traversal of the internal trie. More specific keys will be returned before less specific ones, given a sequence of MAP_GET_NEXT_KEY syscalls. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 95 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index ec6e45ecec08..e307f552cbe9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -596,9 +596,100 @@ out: kfree(trie); } -static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) +static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { - return -ENOTSUPP; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; + struct lpm_trie_node *node, *next_node = NULL, *parent; + struct lpm_trie_node **node_stack = NULL; + struct lpm_trie_node __rcu **root; + int err = 0, stack_ptr = -1; + unsigned int next_bit; + size_t matchlen; + + /* The get_next_key follows postorder. For the 4 node example in + * the top of this file, the trie_get_next_key() returns the following + * one after another: + * 192.168.0.0/24 + * 192.168.1.0/24 + * 192.168.128.0/24 + * 192.168.0.0/16 + * + * The idea is to return more specific keys before less specific ones. + */ + + /* Empty trie */ + if (!rcu_dereference(trie->root)) + return -ENOENT; + + /* For invalid key, find the leftmost node in the trie */ + if (!key || key->prefixlen > trie->max_prefixlen) { + root = &trie->root; + goto find_leftmost; + } + + node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), + GFP_USER | __GFP_NOWARN); + if (!node_stack) + return -ENOMEM; + + /* Try to find the exact node for the given key */ + for (node = rcu_dereference(trie->root); node;) { + node_stack[++stack_ptr] = node; + matchlen = longest_prefix_match(trie, node, key); + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) { + root = &trie->root; + goto find_leftmost; + } + + /* The node with the exactly-matching key has been found, + * find the first node in postorder after the matched node.
+ */ + node = node_stack[stack_ptr]; + while (stack_ptr > 0) { + parent = node_stack[stack_ptr - 1]; + if (rcu_dereference(parent->child[0]) == node && + rcu_dereference(parent->child[1])) { + root = &parent->child[1]; + goto find_leftmost; + } + if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { + next_node = parent; + goto do_copy; + } + + node = parent; + stack_ptr--; + } + + /* did not find anything */ + err = -ENOENT; + goto free_stack; + +find_leftmost: + /* Find the leftmost non-intermediate node, all intermediate nodes + * have exact two children, so this function will never return NULL. + */ + for (node = rcu_dereference(*root); node;) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + next_node = node; + node = rcu_dereference(node->child[0]); + } +do_copy: + next_key->prefixlen = next_node->prefixlen; + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), + next_node->data, trie->data_size); +free_stack: + kfree(node_stack); + return err; } const struct bpf_map_ops trie_map_ops = { From 0e40d1982ffddcc51cbceedd3cfc0ce76214953c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:30 +0100 Subject: [PATCH 0149/1640] UPSTREAM: bpf: add csum_diff helper to xdp as well Useful for porting cls_bpf programs w/o increasing program complexity limits much at the same time, so add the helper to XDP as well. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 87938530a0b7..32d026bc24dd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3555,6 +3555,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: From 13d25362b0254e5f200350633560a6f264f17223 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:36 +0100 Subject: [PATCH 0150/1640] UPSTREAM: bpf: add upper complexity limit to verifier log Given the limit could potentially get further adjustments in the future, add it to the log so it becomes obvious what the current limit is w/o having to check the source first. This may also be helpful for debugging complexity related issues on kernels that backport from upstream. 
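With this change the tail of the verifier log reads along these lines (both numbers illustrative):

processed 84607 insns (limit 131072), stack depth 64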
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7735febdc08..c2589279087d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5174,7 +5174,8 @@ process_bpf_exit: env->insn_idx++; } - verbose(env, "processed %d insns, stack depth ", insn_processed); + verbose(env, "processed %d insns (limit %d), stack depth ", + insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); for (i = 0; i < env->subprog_cnt + 1; i++) { u32 depth = env->subprog_stack_depth[i]; From c8a7eeb1744445f3c1c481e46a167aa5353434a2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:37 +0100 Subject: [PATCH 0151/1640] UPSTREAM: bpf: move event_output to const_size_or_zero for xdp/skb as well Similar rationale as in a60dd35d2e39 ("bpf: change bpf_perf_event_output arg5 type to ARG_CONST_SIZE_OR_ZERO"), change the type to CONST_SIZE_OR_ZERO such that we can better deal with optimized code. No changes needed in bpf_event_output() as it can also deal with 0 size entirely (e.g. as only wake-up signal with empty frame in perf RB, or packet dumps w/o meta data as another such possibility). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 32d026bc24dd..4f9d5b6a8221 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2946,7 +2946,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -3235,7 +3235,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) From 9ac5b15cd4ca9c535ce170b60cc003bc6a1fd773 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 22 Jan 2018 22:53:51 -0800 Subject: [PATCH 0152/1640] UPSTREAM: bpf: fix incorrect kmalloc usage in lpm_trie MAP_GET_NEXT_KEY rcu region In commit b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map"), the implemented MAP_GET_NEXT_KEY callback function is guarded with rcu read lock. In the function body, "kmalloc(size, GFP_USER | __GFP_NOWARN)" is used which may sleep and violate rcu read lock region requirements. This patch fixed the issue by using GFP_ATOMIC instead to avoid blocking kmalloc. Tested with CONFIG_DEBUG_ATOMIC_SLEEP=y as suggested by Eric Dumazet. 
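The underlying rule the fix restores, as a generic sketch:

rcu_read_lock();
/* no sleeping here: GFP_KERNEL/GFP_USER allocations may block,
 * so only non-blocking GFP_ATOMIC is safe inside the section
 */
p = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
if (p)
	use(p);		/* use(): hypothetical consumer */
rcu_read_unlock();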
Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map") Signed-off-by: Yonghong Song Reported-by: syzbot Reviewed-by: Eric Dumazet Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e307f552cbe9..fc83516df724 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -629,7 +629,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) } node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), - GFP_USER | __GFP_NOWARN); + GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) return -ENOMEM; From 4a403edc9dc637c65975ab47729ef1feedd79cfa Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:05 -0800 Subject: [PATCH 0153/1640] UPSTREAM: bpf: Only reply field should be writeable Currently, a sock_ops BPF program can write the op field and all the reply fields (reply and replylong). This is a bug. The op field should not have been writeable, and there is currently no way to use the replylong field for indices >= 1. This patch enforces that only the reply field (which equals replylong[0]) is writeable. Fixes: 40304b2a1567 ("bpf: BPF support for sock_ops") Signed-off-by: Lawrence Brakmo Acked-by: Yuchung Cheng Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 4f9d5b6a8221..9994758ed582 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3940,8 +3940,7 @@ static bool sock_ops_is_valid_access(int off, int size, { if (type == BPF_WRITE) { switch (off) { - case offsetof(struct bpf_sock_ops, op) ... - offsetof(struct bpf_sock_ops, replylong[3]): + case offsetof(struct bpf_sock_ops, reply): break; default: return false; From 5f17e985a876469898566e91397f240e176cbb20 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:06 -0800 Subject: [PATCH 0154/1640] UPSTREAM: bpf: Make SOCK_OPS_GET_TCP size independent Make the SOCK_OPS_GET_TCP helper macro size independent (before, it only worked with 4-byte fields). Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 9994758ed582..6adfafa581e7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4565,9 +4565,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; /* Helper macro for adding read access to tcp_sock fields.
*/ -#define SOCK_OPS_GET_TCP32(FIELD_NAME) \ +#define SOCK_OPS_GET_TCP(FIELD_NAME) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) > \ + FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ @@ -4579,16 +4580,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \ + *insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock, \ + FIELD_NAME), si->dst_reg, \ + si->dst_reg, \ offsetof(struct tcp_sock, FIELD_NAME)); \ } while (0) case offsetof(struct bpf_sock_ops, snd_cwnd): - SOCK_OPS_GET_TCP32(snd_cwnd); + SOCK_OPS_GET_TCP(snd_cwnd); break; case offsetof(struct bpf_sock_ops, srtt_us): - SOCK_OPS_GET_TCP32(srtt_us); + SOCK_OPS_GET_TCP(srtt_us); break; } return insn - insn_buf;
From 0e8c03d84f6b0ce5c654756ebd7d9840adb884f4 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:07 -0800 Subject: [PATCH 0155/1640] UPSTREAM: bpf: Make SOCK_OPS_GET_TCP struct independent Changed SOCK_OPS_GET_TCP to SOCK_OPS_GET_FIELD and added 2 arguments so now it can also work with struct sock fields. The first argument is the name of the field in the bpf_sock_ops struct, the 2nd argument is the name of the field in the OBJ struct. Previous: SOCK_OPS_GET_TCP(FIELD_NAME) New: SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) Where OBJ is either "struct tcp_sock" or "struct sock" (without quotation marks). BPF_FIELD is the name of the field in the bpf_sock_ops struct and OBJ_FIELD is the name of the field in the OBJ struct. Although the field names are currently the same, the kernel struct names could change in the future and this change makes it easier to support that. Note that adding access to tcp_sock fields in sock_ops programs does not preclude the tcp_sock fields from being removed as long as we are willing to do one of the following: 1) Return a fixed value (e.g. 0 or 0xffffffff), or 2) Make the verifier fail if that field is accessed (i.e. the program fails to load) so the user will know that the field is no longer supported. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 6adfafa581e7..0e3961210247 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4564,11 +4564,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, is_fullsock)); break; -/* Helper macro for adding read access to tcp_sock fields. */ -#define SOCK_OPS_GET_TCP(FIELD_NAME) \ +/* Helper macro for adding read access to tcp_sock or sock fields.
*/ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) > \ - FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME)); \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ @@ -4580,18 +4580,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock, \ - FIELD_NAME), si->dst_reg, \ - si->dst_reg, \ - offsetof(struct tcp_sock, FIELD_NAME)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ } while (0) case offsetof(struct bpf_sock_ops, snd_cwnd): - SOCK_OPS_GET_TCP(snd_cwnd); + SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, srtt_us): - SOCK_OPS_GET_TCP(srtt_us); + SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); break; } return insn - insn_buf; From e7fcf8d3497c17b33bf8a269c34f5062a035e853 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:08 -0800 Subject: [PATCH 0156/1640] UPSTREAM: bpf: Add write access to tcp_sock and sock fields This patch adds a macro, SOCK_OPS_SET_FIELD, for writing to struct tcp_sock or struct sock fields. This required adding a new field "temp" to struct bpf_sock_ops_kern for temporary storage that is used by sock_ops_convert_ctx_access. It is used to store and recover the contents of a register, so the register can be used to store the address of the sk. Since we cannot overwrite the dst_reg because it contains the pointer to ctx, nor the src_reg since it contains the value we want to store, we need an extra register to contain the address of the sk. Also adds the macro SOCK_OPS_GET_OR_SET_FIELD that calls one of the GET or SET macros depending on the value of the TYPE field. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 9 ++++++++ include/net/tcp.h | 2 +- net/core/filter.c | 48 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 4d6b1a36f7dc..dc396d664e85 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1088,6 +1088,15 @@ struct bpf_sock_ops_kern { u32 replylong[4]; }; u32 is_fullsock; + u64 temp; /* temp and everything after is not + * initialized to 0 before calling + * the BPF program. New fields that + * should be initialized to 0 should + * be inserted before temp. + * temp is scratch storage used by + * sock_ops_convert_ctx_access + * as temporary storage of a register. 
+ */ }; #endif /* __LINUX_FILTER_H__ */ diff --git a/include/net/tcp.h b/include/net/tcp.h index de92c59012d0..142d4bb05131 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2155,7 +2155,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op) struct bpf_sock_ops_kern sock_ops; int ret; - memset(&sock_ops, 0, sizeof(sock_ops)); + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; sock_owned_by_me(sk); diff --git a/net/core/filter.c b/net/core/filter.c index 0e3961210247..7800eb86e47b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4586,6 +4586,54 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, offsetof(OBJ, OBJ_FIELD)); \ } while (0) +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other register. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. + */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + } while (0) + case offsetof(struct bpf_sock_ops, snd_cwnd): SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); break;
From 37204376e0333768274fa35dde80d8817dbb24aa Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 4 Oct 2017 10:03:44 -0700 Subject: [PATCH 0157/1640] UPSTREAM: tcp: uniform the set up of sockets after successful connection Currently in the TCP code, the initialization sequence for cached metrics, congestion control, BPF, etc, after successful connection is very inconsistent. This introduces inconsistent behavior and is prone to bugs.
The current call sequence is as follows: (1) for active case (tcp_finish_connect() case): tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); (2) for passive case (tcp_rcv_state_process() TCP_SYN_RECV case): icsk->icsk_af_ops->rebuild_header(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_mtup_init(sk); tcp_init_buffer_space(sk); tcp_init_metrics(sk); (3) for TFO passive case (tcp_fastopen_create_child()): inet_csk(child)->icsk_af_ops->rebuild_header(child); tcp_init_congestion_control(child); tcp_mtup_init(child); tcp_init_metrics(child); tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_buffer_space(child); This commit unifies the above functions so that they all use the following sequence: tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE/PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); This sequence is the same as the (1) active case. We pick this sequence because this order correctly allows BPF to override the settings, including the congestion control module and initial cwnd, from the route, and then allows the CC module to see those settings. Suggested-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: Wei Wang Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/ipv4/tcp_fastopen.c | 7 +------ net/ipv4/tcp_input.c | 21 +++------------------ 4 files changed, 17 insertions(+), 24 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 142d4bb05131..6db14b3948e8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -438,6 +438,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_disable_fack(struct tcp_sock *tp); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); +void tcp_init_transfer(struct sock *sk, int bpf_op); unsigned int tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c55ba90b3a9e..1cd78294ea7c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -462,6 +462,18 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); +void tcp_init_transfer(struct sock *sk, int bpf_op) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_mtup_init(sk); + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_metrics(sk); + tcp_call_bpf(sk, bpf_op); + tcp_init_congestion_control(sk); + tcp_init_buffer_space(sk); +} + static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) { if (tsflags && skb) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 3649ec284d40..afdc5e72c578 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -237,12 +237,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket.
*/ - inet_csk(child)->icsk_af_ops->rebuild_header(child); - tcp_init_congestion_control(child); - tcp_mtup_init(child); - tcp_init_metrics(child); - tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_buffer_space(child); + tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b0448d64529f..3a5e1937dccb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5718,20 +5718,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) security_inet_conn_established(sk, skb); } - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - tcp_init_metrics(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ tp->lsndtime = tcp_jiffies32; - tcp_init_buffer_space(sk); - if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); @@ -5898,7 +5891,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tcp_is_sack(tp) && sysctl_tcp_fack) tcp_enable_fack(tp); - tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -6127,14 +6119,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); } else { - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); - - tcp_mtup_init(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; - tcp_init_buffer_space(sk); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); @@ -6164,8 +6150,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * are sent out. */ tcp_rearm_rto(sk); - } else - tcp_init_metrics(sk); + } if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk);
From af5a4688300b4c22c7d7556d2136c2cb6df7cef5 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:41 -0700 Subject: [PATCH 0158/1640] UPSTREAM: bpf: Add BPF_SOCKET_OPS_BASE_RTT support to tcp_nv TCP_NV will try to get the base RTT from a socket_ops BPF program if one is loaded. NV will then use the base RTT to bound its min RTT (its notion of the base RTT). It uses the base RTT as an upper bound and 80% of the base RTT as its lower bound. In other words, NV will consider filtered RTTs larger than base RTT as a sign of congestion. As a result, there is no minRTT inflation when there is a lot of congestion. For example, in a DC where the RTTs are less than 40us when there is no congestion, a base RTT value of 80us improves the performance of NV. The difference between the uncongested RTT and the base RTT provided represents how much queueing we are willing to have (in practice it can be higher). NV has been tuned to reduce congestion when there are many flows at the cost of one flow not achieving full bandwidth utilization. When a reasonable base RTT is provided, one NV flow can now fully utilize the full bandwidth. In addition, the performance is also improved when there are many flows. In the following examples the NV results are using a kernel with this patch set (i.e. both NV results are using the new nv_loss_dec_factor).
With one host sending to another host and only one flow, the goodputs are: Cubic: 9.3 Gbps, NV: 5.5 Gbps, NV (baseRTT=80us): 9.2 Gbps With 2 hosts sending to one host (1 flow per host), the goodput per flow is: Cubic: 4.6 Gbps, NV: 4.5 Gbps, NV (baseRTT=80us): 4.6 Gbps But the RTTs seen by a ping process at the sender are: Cubic: 3.3ms, NV: 97us, NV (baseRTT=80us): 146us With a lot of flows things look even better for NV with baseRTT. Here we have 3 hosts sending to one host. Each sending host has 6 flows: 1 stream, 4x1MB RPC, 1x10KB RPC. Cubic, NV and NV with baseRTT all fully utilize the available bandwidth. However, the distribution of bandwidth among the flows is very different. For the 10KB RPC flow: Cubic: 27Mbps, NV: 111Mbps, NV (baseRTT=80us): 222Mbps The 99% latencies for the 10KB flows are: Cubic: 26ms, NV: 1ms, NV (baseRTT=80us): 500us The RTT seen by a ping process at the senders: Cubic: 3.2ms, NV: 720us, NV (baseRTT=80us): 330us Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/tcp_nv.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 09f8773fd769..dd8cf0a034f1 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -39,7 +39,7 @@ * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected * nv_ssthresh_factor On congestion set ssthresh to this * / 8 * nv_rtt_factor RTT averaging factor - * nv_loss_dec_factor Decrease cwnd by this (50%) when losses occur + * nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping @@ -61,7 +61,7 @@ static int nv_min_cwnd __read_mostly = 2; static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ -static int nv_loss_dec_factor __read_mostly = 512; /* => 50% */ +static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */ static int nv_cwnd_growth_rate_neg __read_mostly = 8; static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ static int nv_dec_eval_min_calls __read_mostly = 60; @@ -101,6 +101,11 @@ struct tcpnv { u32 nv_last_rtt; /* last rtt */ u32 nv_min_rtt; /* active min rtt. Used to determine slope */ u32 nv_min_rtt_new; /* min rtt for future use */ + u32 nv_base_rtt; /* If non-zero it represents the threshold for + * congestion */ + u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is + * set to 80% of nv_base_rtt. It helps reduce + * unfairness between flows */ u32 nv_rtt_max_rate; /* max rate seen during current RTT */ u32 nv_rtt_start_seq; /* current RTT ends when packet arrives * acking beyond nv_rtt_start_seq */ @@ -132,9 +137,24 @@ static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) static void tcpnv_init(struct sock *sk) { struct tcpnv *ca = inet_csk_ca(sk); + int base_rtt; tcpnv_reset(ca, sk); + /* See if base_rtt is available from socket_ops bpf program.
+ * It is meant to be used in environments, such as communication + * within a datacenter, where we have reasonable estimates of + * RTTs + */ + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + if (base_rtt > 0) { + ca->nv_base_rtt = base_rtt; + ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ + } else { + ca->nv_base_rtt = 0; + ca->nv_lower_bound_rtt = 0; + } + ca->nv_allow_cwnd_growth = 1; ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; ca->nv_min_rtt = NV_INIT_RTT; @@ -144,6 +164,19 @@ static void tcpnv_init(struct sock *sk) ca->cwnd_growth_factor = 0; } +/* If provided, apply upper (base_rtt) and lower (lower_bound_rtt) + * bounds to RTT. + */ +inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val) +{ + if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt) + return ca->nv_lower_bound_rtt; + else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt) + return ca->nv_base_rtt; + else + return val; +} + static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -265,6 +298,9 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) if (ca->nv_eval_call_cnt < 255) ca->nv_eval_call_cnt++; + /* Apply bounds to rtt. Only used to update min_rtt */ + avg_rtt = nv_get_bounded_rtt(ca, avg_rtt); + /* update min rtt if necessary */ if (avg_rtt < ca->nv_min_rtt) ca->nv_min_rtt = avg_rtt;
From ed8738432506038a5be27ee9958ce5caa2419c31 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:09 -0800 Subject: [PATCH 0159/1640] UPSTREAM: bpf: Support passing args to sock_ops bpf function Adds support for passing up to 4 arguments to sock_ops bpf functions. It reuses the reply union, so the bpf_sock_ops structures are not increased in size. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 1 + include/net/tcp.h | 40 +++++++++++++++++++++++++++++++++++----- include/uapi/linux/bpf.h | 5 +++-- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_nv.c | 2 +- net/ipv4/tcp_output.c | 2 +- 6 files changed, 42 insertions(+), 10 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index dc396d664e85..e701c8746a93 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1084,6 +1084,7 @@ struct bpf_sock_ops_kern { struct sock *sk; u32 op; union { + u32 args[4]; u32 reply; u32 replylong[4]; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index 6db14b3948e8..bead6c60990b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2151,7 +2151,7 @@ void tcp_cleanup_ulp(struct sock *sk); * program loaded).
*/ #ifdef CONFIG_BPF -static inline int tcp_call_bpf(struct sock *sk, int op) +static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { struct bpf_sock_ops_kern sock_ops; int ret; @@ -2164,6 +2164,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op) sock_ops.sk = sk; sock_ops.op = op; + if (nargs > 0) + memcpy(sock_ops.args, args, nargs * sizeof(*args)); ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); if (ret == 0) @@ -2172,18 +2174,46 @@ static inline int tcp_call_bpf(struct sock *sk, int op) ret = -1; return ret; } + +static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) +{ + u32 args[2] = {arg1, arg2}; + + return tcp_call_bpf(sk, op, 2, args); +} + +static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, + u32 arg3) +{ + u32 args[3] = {arg1, arg2, arg3}; + + return tcp_call_bpf(sk, op, 3, args); +} + #else -static inline int tcp_call_bpf(struct sock *sk, int op) +static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { return -EPERM; } + +static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) +{ + return -EPERM; +} + +static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, + u32 arg3) +{ + return -EPERM; +} + #endif static inline u32 tcp_timeout_init(struct sock *sk) { int timeout; - timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT); + timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL); if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; @@ -2194,7 +2224,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk) { int rwnd; - rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT); + rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL); if (rwnd < 0) rwnd = 0; @@ -2203,6 +2233,6 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk) static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { - return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); + return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } #endif /* _TCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85294593ba41..747c702bb211 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1049,8 +1049,9 @@ struct bpf_map_info { struct bpf_sock_ops { __u32 op; union { - __u32 reply; - __u32 replylong[4]; + __u32 args[4]; /* Optionally passed to bpf program */ + __u32 reply; /* Returned by bpf program */ + __u32 replylong[4]; /* Optionally returned by bpf prog */ }; __u32 family; __u32 remote_ip4; /* Stored in network byte order */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1cd78294ea7c..f57d9e1cb03b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -469,7 +469,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); - tcp_call_bpf(sk, bpf_op); + tcp_call_bpf(sk, bpf_op, 0, NULL); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index dd8cf0a034f1..37f343470b28 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk) * within a datacenter, where we have reasonable estimates of * RTTs */ - base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL); if (base_rtt > 0) { ca->nv_base_rtt = base_rtt; ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 84dbb5d9f928..98a8790ee4b6 100644 --- 
a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3542,7 +3542,7 @@ int tcp_connect(struct sock *sk) struct sk_buff *buff; int err; - tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */
From 840a5a8194014ca499954677c76344f93e18a73c Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:10 -0800 Subject: [PATCH 0160/1640] BACKPORT: bpf: Adds field bpf_sock_ops_cb_flags to tcp_sock Adds field bpf_sock_ops_cb_flags to tcp_sock and bpf_sock_ops. Its primary use is to determine if there should be calls to the sock_ops bpf program at various points in the TCP code. The field is initialized to zero, disabling the calls. A sock_ops BPF program can set it, per connection and as necessary, when the connection is established. It also adds support for reading and writing the field within a sock_ops BPF program. Reading is done by accessing the field directly. However, writing is done through the helper function bpf_sock_ops_cb_flags_set, in order to return an error if a BPF program is trying to set a callback that is not supported in the current kernel (i.e. running an older kernel). The helper function returns 0 if it was able to set all of the bits set in the argument, a positive number containing the bits that could not be set, or -EINVAL if the socket is not a full TCP socket. Examples of where one could call the bpf program: 1) When RTO fires 2) When a packet is retransmitted 3) When the connection terminates 4) When a packet is sent 5) When a packet is received Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/tcp.h | 11 +++++++++++ include/uapi/linux/bpf.h | 14 ++++++++++++++ net/core/filter.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 45a85277c2ea..e109913dfbd6 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -327,6 +327,17 @@ struct tcp_sock { int linger2; + +/* Sock_ops bpf program related variables */ +#ifdef CONFIG_BPF + u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs + * values defined in uapi/linux/tcp.h + */ +#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) +#else +#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 +#endif + /* Receiver side RTT estimation */ struct { u32 rtt_us; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 747c702bb211..7c489904edda 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -642,6 +642,14 @@ union bpf_attr { * @optlen: length of optval in bytes * Return: 0 or negative error * + * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags) * Set callback flags for sock_ops + * @bpf_sock_ops: pointer to bpf_sock_ops_kern struct + * @flags: flags value + * Return: 0 for no error + * -EINVAL if there is no full tcp socket + * bits in flags that are not supported by current kernel + * * int bpf_skb_adjust_room(skb, len_diff, mode, flags) * Grow or shrink room in sk_buff. * @skb: pointer to skb @@ -1066,8 +1074,14 @@ struct bpf_sock_ops { */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ }; +/* Definitions for bpf_sock_ops_cb_flags */ +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0 /* Mask of all currently + * supported cb flags + */ + /* List of known BPF sock_ops operators.
* New entries can only be added at the end */ diff --git a/net/core/filter.c b/net/core/filter.c index 7800eb86e47b..9a727a970018 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3415,6 +3415,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, + int, argval) +{ + struct sock *sk = bpf_sock->sk; + int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + + if (!sk_fullsock(sk)) + return -EINVAL; + +#ifdef CONFIG_INET + if (val) + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + + return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); +#else + return -EINVAL; +#endif +} + +static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { + .func = bpf_sock_ops_cb_flags_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3605,6 +3632,8 @@ static const struct bpf_func_proto * return &bpf_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_getsockopt_proto; + case BPF_FUNC_sock_ops_cb_flags_set: + return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: @@ -4641,6 +4670,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, srtt_us): SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); break; + + case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): + SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, + struct tcp_sock); + break; } return insn - insn_buf; }
From 31746b259fbd28bd6456d2644f1b67c6a3842c86 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 12 Dec 2017 13:10:40 -0800 Subject: [PATCH 0161/1640] BACKPORT: tcp: pause Fast Open globally after third consecutive timeout Prior to this patch, active Fast Open is paused on a specific destination IP address if the previous connections to the IP address have experienced recurring timeouts. But recent experiments by Microsoft (https://goo.gl/cykmn7) and Mozilla browsers indicate the issue is often caused by broken middle-boxes sitting close to the client. Therefore the user experience is much better if Fast Open is disabled outright globally to avoid experiencing further timeouts on connections toward other destinations. This patch changes the destination-IP disablement to global disablement if a connection experiences recurring timeouts or aborts due to timeout. Repeated incidents would still exponentially increase the pause time, starting from an hour. This is extremely conservative but an unfortunate compromise to minimize bad experience due to broken middle-boxes. Reported-by: Dragana Damjanovic Reported-by: Patrick McManus Signed-off-by: Yuchung Cheng Reviewed-by: Wei Wang Reviewed-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S.
Miller --- Documentation/networking/ip-sysctl.txt | 1 + include/net/tcp.h | 5 ++--- net/ipv4/tcp_fastopen.c | 31 +++++++++++++++++--------- net/ipv4/tcp_metrics.c | 5 +---- net/ipv4/tcp_timer.c | 17 +------------- 5 files changed, 25 insertions(+), 34 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index b6f70207146f..d2a40420317f 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -612,6 +612,7 @@ tcp_fastopen_blackhole_timeout_sec - INTEGER This time period will grow exponentially when more blackhole issues get detected right after Fastopen is re-enabled and will reset to initial value when the blackhole issue goes away. + 0 to disable the blackhole detection. By default, it is set to 1hr. tcp_fwmark_accept - BOOLEAN diff --git a/include/net/tcp.h b/include/net/tcp.h index bead6c60990b..97a8dfdb9b0a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1599,8 +1599,7 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, /* From tcp_fastopen.c */ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, - struct tcp_fastopen_cookie *cookie, int *syn_loss, - unsigned long *last_syn_loss); + struct tcp_fastopen_cookie *cookie); void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost, u16 try_exp); @@ -1635,7 +1634,7 @@ struct tcp_fastopen_context { void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); -void tcp_fastopen_active_timeout_reset(void); +void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired); /* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive. diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index afdc5e72c578..813cc24b4b37 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -348,17 +348,7 @@ fastopen: bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { - unsigned long last_syn_loss = 0; - int syn_loss = 0; - - tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); - - /* Recurring FO SYN losses: no cookie or data in SYN */ - if (syn_loss > 1 && - time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { - cookie->len = -1; - return false; - } + tcp_fastopen_cache_get(sk, mss, cookie); /* Firewall blackhole issue check */ if (tcp_fastopen_active_should_disable(sk)) { @@ -414,6 +404,8 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * following circumstances: * 1. client side TFO socket receives out of order FIN * 2. client side TFO socket receives out of order RST + * 3. client side TFO socket has timed out three times consecutively during + * or after handshake * We disable active side TFO globally for 1hr at first. Then if it * happens again, we disable it for 2h, then 4h, 8h, ... * And we reset the timeout back to 1hr when we see a successful active @@ -490,3 +482,20 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) dst_release(dst); } } + +void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired) +{ + u32 timeouts = inet_csk(sk)->icsk_retransmits; + struct tcp_sock *tp = tcp_sk(sk); + + /* Broken middle-boxes may black-hole Fast Open connection during or + * even after the handshake. 
Be extremely conservative and pause + * Fast Open globally after hitting the third consecutive timeout or + * exceeding the configured timeout limit. + */ + if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) && + (timeouts == 2 || (timeouts < 2 && expired))) { + tcp_fastopen_active_disable(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); + } +} diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index e433b222368c..7688e07b7387 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -567,8 +567,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) } void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, - struct tcp_fastopen_cookie *cookie, - int *syn_loss, unsigned long *last_syn_loss) + struct tcp_fastopen_cookie *cookie) { struct tcp_metrics_block *tm; @@ -585,8 +584,6 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, *cookie = tfom->cookie; if (cookie->len <= 0 && tfom->try_exp == 1) cookie->exp = true; - *syn_loss = tfom->syn_loss; - *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0; } while (read_seqretry(&fastopen_seqlock, seq)); } rcu_read_unlock(); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 49ca020bf5ac..680654c137dd 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -232,11 +232,6 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) { dst_negative_advice(sk); - if (tp->syn_fastopen || tp->syn_data) - tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (tp->syn_data && icsk->icsk_retransmits == 1) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); } else if (!tp->syn_data && !tp->syn_fastopen) { sk_rethink_txhash(sk); } @@ -244,17 +239,6 @@ static int tcp_write_timeout(struct sock *sk) expired = icsk->icsk_retransmits >= retry_until; } else { if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { - /* Some middle-boxes may black-hole Fast Open _after_ - * the handshake. Therefore we conservatively disable - * Fast Open on this path on recurring timeouts after - * successful Fast Open. - */ - if (tp->syn_data_acked) { - tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); - } /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -277,6 +261,7 @@ static int tcp_write_timeout(struct sock *sk) expired = retransmits_timed_out(sk, retry_until, icsk->icsk_user_timeout); } + tcp_fastopen_active_detect_blackhole(sk, expired); if (expired) { /* Has it gone just too far? */ tcp_write_err(sk);
From a149e9839bacdf90aaf25fb013d59af5482998a5 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:11 -0800 Subject: [PATCH 0162/1640] UPSTREAM: bpf: Add sock_ops RTO callback Adds an optional call to the sock_ops BPF program based on whether the BPF_SOCK_OPS_RTO_CB_FLAG is set in bpf_sock_ops_cb_flags. The BPF program is passed three arguments: the value of icsk_retransmits, the value of icsk_rto, and whether the RTO has expired.
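For illustration, the consumer side might look like the following minimal sketch (the program and function names are hypothetical, and the helper declarations are assumed to come from a samples/bpf-style bpf_helpers.h of this era): the program opts in per connection at establishment time and then receives the new op.

  #include <uapi/linux/bpf.h>
  #include "bpf_helpers.h" /* SEC(), bpf_sock_ops_cb_flags_set() (assumed) */

  SEC("sockops")
  int watch_rto(struct bpf_sock_ops *skops)
  {
  	switch (skops->op) {
  	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
  	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
  		/* opt in to RTO callbacks for this connection */
  		bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_RTO_CB_FLAG);
  		break;
  	case BPF_SOCK_OPS_RTO_CB: {
  		__u32 retransmits = skops->args[0]; /* icsk_retransmits */
  		__u32 rto = skops->args[1];         /* icsk_rto */
  		__u32 expired = skops->args[2];     /* write timeout expired */

  		/* a real program would export these via a map or perf
  		 * event; this sketch only acknowledges the event
  		 */
  		if (expired && rto && retransmits)
  			skops->reply = 1;
  		break;
  	}
  	}
  	return 1;
  }
  char _license[] SEC("license") = "GPL";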
Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 +++++++- net/ipv4/tcp_timer.c | 7 +++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7c489904edda..0cab94fc5cc1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1078,7 +1078,8 @@ struct bpf_sock_ops { }; /* Definitions for bpf_sock_ops_cb_flags */ -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0 /* Mask of all currently +#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x1 /* Mask of all currently * supported cb flags */ @@ -1115,6 +1116,11 @@ enum { * a congestion threshold. RTTs above * this indicate congestion */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 680654c137dd..abe1b222555b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -262,11 +262,18 @@ static int tcp_write_timeout(struct sock *sk) icsk->icsk_user_timeout); } tcp_fastopen_active_detect_blackhole(sk, expired); + + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB, + icsk->icsk_retransmits, + icsk->icsk_rto, (int)expired); + if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; } + return 0; } From fc64ea816a8226e2441488dd87bc6175e6bcc704 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:12 -0800 Subject: [PATCH 0163/1640] UPSTREAM: bpf: Add support for reading sk_state and more Add support for reading many more tcp_sock fields state, same as sk->sk_state rtt_min same as sk->rtt_min.s[0].v (current rtt_min) snd_ssthresh rcv_nxt snd_nxt snd_una mss_cache ecn_flags rate_delivered rate_interval_us packets_out retrans_out total_retrans segs_in data_segs_in segs_out data_segs_out lost_out sacked_out sk_txhash bytes_received (__u64) bytes_acked (__u64) Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 22 ++++++ net/core/filter.c | 149 +++++++++++++++++++++++++++++++++++---- 2 files changed, 157 insertions(+), 14 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0cab94fc5cc1..cc35a33b8594 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1075,6 +1075,28 @@ struct bpf_sock_ops { __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; }; /* Definitions for bpf_sock_ops_cb_flags */ diff --git a/net/core/filter.c b/net/core/filter.c index 9a727a970018..b2c4d2e0bc17 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3950,33 +3950,43 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static bool __is_valid_sock_ops_access(int off, int size) -{ - if (off < 0 || off >= sizeof(struct bpf_sock_ops)) - return false; - 
/* The verifier guarantees that size > 0. */ - if (off % size != 0) - return false; - if (size != sizeof(__u32)) - return false; - - return true; -} - static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) + return false; + + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): + if (size != size_default) + return false; break; default: return false; } + } else { + switch (off) { + case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, + bytes_acked): + if (size != sizeof(__u64)) + return false; + break; + default: + if (size != size_default) + return false; + break; + } } - return __is_valid_sock_ops_access(off, size); + return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, @@ -4593,6 +4603,32 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, is_fullsock)); break; + case offsetof(struct bpf_sock_ops, state): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_state)); + break; + + case offsetof(struct bpf_sock_ops, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct tcp_sock, rtt_min) + + FIELD_SIZEOF(struct minmax_sample, t)); + break; + /* Helper macro for adding read access to tcp_sock or sock fields. 
*/ #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ @@ -4675,6 +4711,91 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; + + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_FIELD(total_retrans, total_retrans, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_out): + SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sk_txhash): + SOCK_OPS_GET_FIELD(sk_txhash, sk_txhash, struct sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_FIELD(bytes_received, bytes_received, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); + break; } return insn - insn_buf; } From 8124e8dafd1d45e438b8dce55c5bab8e76800031 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:13 -0800 Subject: [PATCH 0164/1640] BACKPORT: bpf: Add sock_ops R/W access to tclass Adds direct write access to sk_txhash and access to tclass for ipv6 flows through getsockopt and setsockopt. Sample usage for tclass: bpf_getsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, sizeof(v)) where skops is a pointer to the ctx (struct bpf_sock_ops). 
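For illustration, a minimal sock_ops sketch using the new option (the program name and example value are hypothetical; the SOL_IPV6/IPV6_TCLASS constants are spelled out since the relevant UAPI headers may not be pulled in by a BPF build):

  #include <uapi/linux/bpf.h>
  #include "bpf_helpers.h" /* SEC(), bpf_getsockopt(), bpf_setsockopt() (assumed) */

  #define SOL_IPV6 41    /* from include/linux/socket.h */
  #define IPV6_TCLASS 67 /* from include/uapi/linux/in6.h */

  SEC("sockops")
  int set_tclass(struct bpf_sock_ops *skops)
  {
  	int v = 0;

  	if (skops->family != 10 /* AF_INET6 */)
  		return 1;

  	/* read the current traffic class and, if unset, raise it to an
  	 * arbitrary example value
  	 */
  	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB &&
  	    !bpf_getsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, sizeof(v)) &&
  	    v == 0) {
  		v = 0x80;
  		bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, sizeof(v));
  	}
  	return 1;
  }
  char _license[] SEC("license") = "GPL";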
Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index b2c4d2e0bc17..893e1669b407 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3318,6 +3318,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } #ifdef CONFIG_INET +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + if (val < -1 || val > 0xff) { + ret = -EINVAL; + } else { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (val == -1) + val = 0; + np->tclass = val; + } + break; + default: + ret = -EINVAL; + } +#endif } else if (level == SOL_TCP && sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { @@ -3394,6 +3417,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + goto err_clear; + + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + *((int *)optval) = (int)np->tclass; + break; + default: + goto err_clear; + } +#endif } else { goto err_clear; } @@ -3966,6 +4005,7 @@ static bool sock_ops_is_valid_access(int off, int size, if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): + case offsetof(struct bpf_sock_ops, sk_txhash): if (size != size_default) return false; break; @@ -4785,7 +4825,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, sk_txhash): - SOCK_OPS_GET_FIELD(sk_txhash, sk_txhash, struct sock); + SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, + struct sock, type); break; case offsetof(struct bpf_sock_ops, bytes_received): @@ -4796,6 +4837,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, bytes_acked): SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); break; + } return insn - insn_buf; } From e77fd1fb1c6df77ed2d82fcf87cddbac8adca61d Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:14 -0800 Subject: [PATCH 0165/1640] UPSTREAM: bpf: Add BPF_SOCK_OPS_RETRANS_CB Adds support for calling sock_ops BPF program when there is a retransmission. Three arguments are used; one for the sequence number, another for the number of segments retransmitted, and the last one for the return value of tcp_transmit_skb (0 => success). Does not include syn-ack retransmissions. New op: BPF_SOCK_OPS_RETRANS_CB. 
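For illustration, a program could count retransmitted segments through the new callback; a minimal sketch (hypothetical names; the map and helper idioms follow the samples/bpf style of this era):

  #include <uapi/linux/bpf.h>
  #include "bpf_helpers.h" /* SEC(), bpf_map_def, map helpers (assumed) */

  struct bpf_map_def SEC("maps") retrans_cnt = {
  	.type = BPF_MAP_TYPE_ARRAY,
  	.key_size = sizeof(__u32),
  	.value_size = sizeof(__u64),
  	.max_entries = 1,
  };

  SEC("sockops")
  int count_retrans(struct bpf_sock_ops *skops)
  {
  	__u32 key = 0;
  	__u64 *cnt;

  	switch (skops->op) {
  	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
  	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
  		bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_RETRANS_CB_FLAG);
  		break;
  	case BPF_SOCK_OPS_RETRANS_CB:
  		/* args[0]: seq of 1st byte, args[1]: # segments,
  		 * args[2]: return value of tcp_transmit_skb()
  		 */
  		cnt = bpf_map_lookup_elem(&retrans_cnt, &key);
  		if (cnt)
  			__sync_fetch_and_add(cnt, skops->args[1]);
  		break;
  	}
  	return 1;
  }
  char _license[] SEC("license") = "GPL";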
Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 +++++++- net/ipv4/tcp_timer.c | 7 +++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7c489904edda..0cab94fc5cc1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1078,7 +1078,8 @@ struct bpf_sock_ops { }; /* Definitions for bpf_sock_ops_cb_flags */ -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0 /* Mask of all currently +#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x1 /* Mask of all currently * supported cb flags */ @@ -1115,6 +1116,11 @@ enum { * a congestion threshold. RTTs above * this indicate congestion */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 680654c137dd..abe1b222555b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -262,11 +262,18 @@ static int tcp_write_timeout(struct sock *sk) icsk->icsk_user_timeout); } tcp_fastopen_active_detect_blackhole(sk, expired); + + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB, + icsk->icsk_retransmits, + icsk->icsk_rto, (int)expired); + if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; } + return 0; }
From fc64ea816a8226e2441488dd87bc6175e6bcc704 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:12 -0800 Subject: [PATCH 0163/1640] UPSTREAM: bpf: Add support for reading sk_state and more Add support for reading many more tcp_sock fields state, same as sk->sk_state rtt_min same as sk->rtt_min.s[0].v (current rtt_min) snd_ssthresh rcv_nxt snd_nxt snd_una mss_cache ecn_flags rate_delivered rate_interval_us packets_out retrans_out total_retrans segs_in data_segs_in segs_out data_segs_out lost_out sacked_out sk_txhash bytes_received (__u64) bytes_acked (__u64) Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 22 ++++++ net/core/filter.c | 149 +++++++++++++++++++++++++++++++++++---- 2 files changed, 157 insertions(+), 14 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0cab94fc5cc1..cc35a33b8594 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1075,6 +1075,28 @@ struct bpf_sock_ops { __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; }; /* Definitions for bpf_sock_ops_cb_flags */ diff --git a/net/core/filter.c b/net/core/filter.c index 9a727a970018..b2c4d2e0bc17 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3950,33 +3950,43 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static bool __is_valid_sock_ops_access(int off, int size) -{ - if (off < 0 || off >= sizeof(struct bpf_sock_ops)) - return false; -
+ * If it does, we need to add code to convert them before calling + * the BPF sock_ops function. + */ +enum { + BPF_TCP_ESTABLISHED = 1, + BPF_TCP_SYN_SENT, + BPF_TCP_SYN_RECV, + BPF_TCP_FIN_WAIT1, + BPF_TCP_FIN_WAIT2, + BPF_TCP_TIME_WAIT, + BPF_TCP_CLOSE, + BPF_TCP_CLOSE_WAIT, + BPF_TCP_LAST_ACK, + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, + + BPF_TCP_MAX_STATES /* Leave at the end! */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f57d9e1cb03b..f1aa2c467f85 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2056,6 +2056,30 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; + /* We defined a new enum for TCP states that are exported in BPF + * so as not force the internal TCP states to be frozen. The + * following checks will detect if an internal state value ever + * differs from the BPF value. If this ever happens, then we will + * need to remap the internal value to the BPF value before calling + * tcp_call_bpf_2arg. + */ + BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); + BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); + BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); + BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); + BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); + BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); + BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); + BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); + BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); + BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); + + if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); + switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) From 8ba03c67731504f5189d60da3a00e0d759410bd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Fri, 26 Jan 2018 00:54:02 +0100 Subject: [PATCH 0167/1640] UPSTREAM: bpf: Use the IS_FD_ARRAY() macro in map_update_elem() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the code more readable. 
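Judging purely from the open-coded condition this patch removes, the macro is presumably equivalent to something along these lines (a reconstruction for the reader, not the verbatim definition from kernel/bpf/syscall.c):

  /* sketch reconstructed from the replaced condition */
  #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
  			  (map)->map_type == BPF_MAP_TYPE_PROG_ARRAY ||       \
  			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY ||     \
  			  (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)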
Signed-off-by: Mickaël Salaün Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3d95106061ae..01595089c0ac 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -724,10 +724,7 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || - map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || - map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags);
From 02b10b32a0e64406948de9c8115fe488c958d008 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:36 +0100 Subject: [PATCH 0168/1640] UPSTREAM: bpf: xor of a/x in cbpf can be done in 32 bit alu Very minor optimization; saves 1 byte per program in x86_64 JIT in cBPF prologue. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 893e1669b407..c497fd68325f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -401,8 +401,8 @@ do_pass: /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. * In eBPF case it's done by the compiler, here we need to
From d4749868f1ebca67b85edea73434e185294c8223 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:37 +0100 Subject: [PATCH 0169/1640] UPSTREAM: bpf: improve dead code sanitizing Given we recently had c131187db2d3 ("bpf: fix branch pruning logic") and 95a762e2c8c9 ("bpf: fix incorrect sign extension in check_alu_op()") in particular, where the verifier previously skipped verification of the wrongly assumed dead branch, we should not just replace the dead code parts with nops (mov r0,r0). If there is again a bug such as the one fixed in 95a762e2c8c9, where the runtime could execute those insns, then one of the potential issues with the current setting would be that given the nops would be at the end of the program, we could execute out of bounds at some point. The best option in such a case would be to just exit the BPF program altogether and return an exception code. However, given this would require two instructions, and such a dead code gap could just be a single insn long, we would need to place an 'r0 = X; ret' snippet at the very end after the user program or at the start before the program (where we'd skip that region on prog entry), and then place unconditional ja's into the dead code gap. While more complex, this would be possible; however, there is still another roadblock that currently prevents it, namely BPF to BPF calls.
The issue here is that such exception could be returned from a callee, but the caller would not know that it's an exception that needs to be propagated further down. Alternative that has little complexity is to just use a ja-1 code for now which will trap the execution here instead of silently doing bad things if we ever get there due to bugs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c2589279087d..ef35ffa4edef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5404,14 +5404,21 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } -/* The verifier does more data flow analysis than llvm and will not explore - * branches that are dead at run time. Malicious programs can have dead code - * too. Therefore replace all dead at-run-time code with nops. +/* The verifier does more data flow analysis than llvm and will not + * explore branches that are dead at run time. Malicious programs can + * have dead code too. Therefore replace all dead at-run-time code + * with 'ja -1'. + * + * Just nops are not optimal, e.g. if they would sit at the end of the + * program and through another bug we would manage to jump there, then + * we'd execute beyond program memory otherwise. Returning exception + * code also wouldn't work since we can have subprogs where the dead + * code could be located. */ static void sanitize_dead_code(struct bpf_verifier_env *env) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); + struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1); struct bpf_insn *insn = env->prog->insnsi; const int insn_cnt = env->prog->len; int i; @@ -5419,7 +5426,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { if (aux_data[i].seen) continue; - memcpy(insn + i, &nop, sizeof(nop)); + memcpy(insn + i, &trap, sizeof(trap)); } } From ac1d1428fc4811266511d0bd41ff2984b31948df Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:38 +0100 Subject: [PATCH 0170/1640] UPSTREAM: bpf: make unknown opcode handling more robust Recent findings by syzcaller fixed in 7891a87efc71 ("bpf: arsh is not supported in 32 bit alu thus reject it") triggered a warning in the interpreter due to unknown opcode not being rejected by the verifier. The 'return 0' for an unknown opcode is really not optimal, since with BPF to BPF calls, this would go untracked by the verifier. Do two things here to improve the situation: i) perform basic insn sanity check early on in the verification phase and reject every non-uapi insn right there. The bpf_opcode_in_insntable() table reuses the same mapping as the jumptable in ___bpf_prog_run() sans the non-public mappings. And ii) in ___bpf_prog_run() we do need to BUG in the case where the verifier would ever create an unknown opcode due to some rewrites. Note that JITs do not have such issues since they would punt to interpreter in these situations. Moreover, the BPF_JIT_ALWAYS_ON would also help to avoid such unknown opcodes in the first place. 
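The bpf_opcode_in_insntable() whitelist and the interpreter jumptable described above are both generated from the single BPF_INSN_MAP list, a classic X-macro: one authoritative list expanded with different per-entry macros, so the two tables cannot drift apart. A self-contained sketch of the idiom with made-up opcode bytes (not the real BPF encoding):

#include <stdbool.h>
#include <stdio.h>

/* Single source of truth: each entry is (name, opcode byte). */
#define OP_MAP(OP)      \
        OP(ADD,  0x07)  \
        OP(SUB,  0x17)  \
        OP(EXIT, 0x95)

#define OP_VALID(name, code)    [code] = true,
#define OP_NAME(name, code)     [code] = #name,

static const bool valid_op[256] = { OP_MAP(OP_VALID) };
static const char *op_name[256] = { OP_MAP(OP_NAME) };

int main(void)
{
        unsigned char c = 0x17;

        if (valid_op[c])
                printf("0x%02x -> %s\n", c, op_name[c]);
        return 0;
}

In the patch below, BPF_INSN_2_TBL/BPF_INSN_3_TBL and BPF_INSN_2_LBL/BPF_INSN_3_LBL play the role of OP_VALID and OP_NAME against the shared BPF_INSN_MAP list.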
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 + kernel/bpf/core.c | 250 ++++++++++++++++++++++++----------------- kernel/bpf/verifier.c | 7 ++ 3 files changed, 154 insertions(+), 105 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index e701c8746a93..5e92d1c9c70c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -765,6 +765,8 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb) struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); +bool bpf_opcode_in_insntable(u8 code); + struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5151fb294678..1fedcc11fbb3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -921,6 +921,137 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +/* All UAPI available opcodes. */ +#define BPF_INSN_MAP(INSN_2, INSN_3) \ + /* 32 bit ALU operations. */ \ + /* Register based. */ \ + INSN_3(ALU, ADD, X), \ + INSN_3(ALU, SUB, X), \ + INSN_3(ALU, AND, X), \ + INSN_3(ALU, OR, X), \ + INSN_3(ALU, LSH, X), \ + INSN_3(ALU, RSH, X), \ + INSN_3(ALU, XOR, X), \ + INSN_3(ALU, MUL, X), \ + INSN_3(ALU, MOV, X), \ + INSN_3(ALU, DIV, X), \ + INSN_3(ALU, MOD, X), \ + INSN_2(ALU, NEG), \ + INSN_3(ALU, END, TO_BE), \ + INSN_3(ALU, END, TO_LE), \ + /* Immediate based. */ \ + INSN_3(ALU, ADD, K), \ + INSN_3(ALU, SUB, K), \ + INSN_3(ALU, AND, K), \ + INSN_3(ALU, OR, K), \ + INSN_3(ALU, LSH, K), \ + INSN_3(ALU, RSH, K), \ + INSN_3(ALU, XOR, K), \ + INSN_3(ALU, MUL, K), \ + INSN_3(ALU, MOV, K), \ + INSN_3(ALU, DIV, K), \ + INSN_3(ALU, MOD, K), \ + /* 64 bit ALU operations. */ \ + /* Register based. */ \ + INSN_3(ALU64, ADD, X), \ + INSN_3(ALU64, SUB, X), \ + INSN_3(ALU64, AND, X), \ + INSN_3(ALU64, OR, X), \ + INSN_3(ALU64, LSH, X), \ + INSN_3(ALU64, RSH, X), \ + INSN_3(ALU64, XOR, X), \ + INSN_3(ALU64, MUL, X), \ + INSN_3(ALU64, MOV, X), \ + INSN_3(ALU64, ARSH, X), \ + INSN_3(ALU64, DIV, X), \ + INSN_3(ALU64, MOD, X), \ + INSN_2(ALU64, NEG), \ + /* Immediate based. */ \ + INSN_3(ALU64, ADD, K), \ + INSN_3(ALU64, SUB, K), \ + INSN_3(ALU64, AND, K), \ + INSN_3(ALU64, OR, K), \ + INSN_3(ALU64, LSH, K), \ + INSN_3(ALU64, RSH, K), \ + INSN_3(ALU64, XOR, K), \ + INSN_3(ALU64, MUL, K), \ + INSN_3(ALU64, MOV, K), \ + INSN_3(ALU64, ARSH, K), \ + INSN_3(ALU64, DIV, K), \ + INSN_3(ALU64, MOD, K), \ + /* Call instruction. */ \ + INSN_2(JMP, CALL), \ + /* Exit instruction. */ \ + INSN_2(JMP, EXIT), \ + /* Jump instructions. */ \ + /* Register based. */ \ + INSN_3(JMP, JEQ, X), \ + INSN_3(JMP, JNE, X), \ + INSN_3(JMP, JGT, X), \ + INSN_3(JMP, JLT, X), \ + INSN_3(JMP, JGE, X), \ + INSN_3(JMP, JLE, X), \ + INSN_3(JMP, JSGT, X), \ + INSN_3(JMP, JSLT, X), \ + INSN_3(JMP, JSGE, X), \ + INSN_3(JMP, JSLE, X), \ + INSN_3(JMP, JSET, X), \ + /* Immediate based. */ \ + INSN_3(JMP, JEQ, K), \ + INSN_3(JMP, JNE, K), \ + INSN_3(JMP, JGT, K), \ + INSN_3(JMP, JLT, K), \ + INSN_3(JMP, JGE, K), \ + INSN_3(JMP, JLE, K), \ + INSN_3(JMP, JSGT, K), \ + INSN_3(JMP, JSLT, K), \ + INSN_3(JMP, JSGE, K), \ + INSN_3(JMP, JSLE, K), \ + INSN_3(JMP, JSET, K), \ + INSN_2(JMP, JA), \ + /* Store instructions. */ \ + /* Register based. 
*/ \ + INSN_3(STX, MEM, B), \ + INSN_3(STX, MEM, H), \ + INSN_3(STX, MEM, W), \ + INSN_3(STX, MEM, DW), \ + INSN_3(STX, XADD, W), \ + INSN_3(STX, XADD, DW), \ + /* Immediate based. */ \ + INSN_3(ST, MEM, B), \ + INSN_3(ST, MEM, H), \ + INSN_3(ST, MEM, W), \ + INSN_3(ST, MEM, DW), \ + /* Load instructions. */ \ + /* Register based. */ \ + INSN_3(LDX, MEM, B), \ + INSN_3(LDX, MEM, H), \ + INSN_3(LDX, MEM, W), \ + INSN_3(LDX, MEM, DW), \ + /* Immediate based. */ \ + INSN_3(LD, IMM, DW), \ + /* Misc (old cBPF carry-over). */ \ + INSN_3(LD, ABS, B), \ + INSN_3(LD, ABS, H), \ + INSN_3(LD, ABS, W), \ + INSN_3(LD, IND, B), \ + INSN_3(LD, IND, H), \ + INSN_3(LD, IND, W) + +bool bpf_opcode_in_insntable(u8 code) +{ +#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true +#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true + static const bool public_insntable[256] = { + [0 ... 255] = false, + /* Now overwrite non-defaults ... */ + BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + }; +#undef BPF_INSN_3_TBL +#undef BPF_INSN_2_TBL + return public_insntable[code]; +} + #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context @@ -932,115 +1063,18 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { u64 tmp; +#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y +#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ - /* 32 bit ALU operations */ - [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, - [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, - [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, - [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, - [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, - [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, - [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, - [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, - [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, - [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, - [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, - [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, - [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, - [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, - [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, - [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, - [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, - [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, - [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, - [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, - [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, - [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, - [BPF_ALU | BPF_NEG] = &&ALU_NEG, - [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, - [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, - /* 64 bit ALU operations */ - [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, - [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, - [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, - [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, - [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, - [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, - [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, - [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, - [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, - [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, - [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, - [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, - [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, - [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, - [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, - [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, - 
[BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, - [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, - [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, - [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, - [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, - [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, - [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, - [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, - [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, - /* Call instruction */ - [BPF_JMP | BPF_CALL] = &&JMP_CALL, + BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), + /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, - /* Jumps */ - [BPF_JMP | BPF_JA] = &&JMP_JA, - [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, - [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, - [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, - [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, - [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, - [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, - [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, - [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, - [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, - [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, - [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, - [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, - [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, - [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, - [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, - [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, - [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, - [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, - [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, - [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, - [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, - [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, - /* Program return */ - [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, - /* Store instructions */ - [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, - [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, - [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, - [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, - [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, - [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, - [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, - [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, - [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, - [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, - /* Load instructions */ - [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, - [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, - [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, - [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, - [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, - [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, - [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, - [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, - [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, - [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, - [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; +#undef BPF_INSN_3_LBL +#undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; void *ptr; int off; @@ -1433,8 +1467,14 @@ load_byte: goto load_byte; default_label: - /* If we ever reach this, we have a bug somewhere. */ - WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + /* If we ever reach this, we have a bug somewhere. Die hard here + * instead of just returning 0; we could be somewhere in a subprog, + * so execution could continue otherwise which we do /not/ want. + * + * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). 
+ */ + pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); + BUG_ON(1); return 0; } STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ef35ffa4edef..0581763702e3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5321,6 +5321,13 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) next_insn: insn++; i++; + continue; + } + + /* Basic sanity check before we invest more work here. */ + if (!bpf_opcode_in_insntable(insn->code)) { + verbose(env, "unknown opcode %02x\n", insn->code); + return -EINVAL; } } From 6aabf05d32b077ab9710c0ec7cfaa74ef986939c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Jan 2018 15:06:07 -0800 Subject: [PATCH 0171/1640] UPSTREAM: bpf: fix kernel page fault in lpm map trie_get_next_key Commit b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map") introduces a bug likes below: if (!rcu_dereference(trie->root)) return -ENOENT; if (!key || key->prefixlen > trie->max_prefixlen) { root = &trie->root; goto find_leftmost; } ...... find_leftmost: for (node = rcu_dereference(*root); node;) { In the code after label find_leftmost, it is assumed that *root should not be NULL, but it is not true as it is possbile trie->root is changed to NULL by an asynchronous delete operation. The issue is reported by syzbot and Eric Dumazet with the below error log: ...... kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 8033 Comm: syz-executor3 Not tainted 4.15.0-rc8+ #4 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:trie_get_next_key+0x3c2/0xf10 kernel/bpf/lpm_trie.c:682 ...... This patch fixed the issue by use local rcu_dereferenced pointer instead of *(&trie->root) later on. 
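The underlying rule here is general RCU hygiene: dereference a shared pointer once, test that snapshot, and keep walking from the snapshot; never re-read the shared location after the check, since a concurrent writer may have cleared it in between. A user-space analogue, using C11 atomics to stand in for rcu_dereference() (simplified structure, not the trie code):

#include <stdatomic.h>

struct node {
        _Atomic(struct node *) child;
};

static _Atomic(struct node *) shared_root;

/* Count the left spine; returns -1 if the tree is empty. */
static int count_spine(void)
{
        /* One snapshot. Re-reading shared_root after the NULL check
         * could observe a concurrent delete and crash mid-walk.
         */
        struct node *snap = atomic_load(&shared_root);
        int n = 0;

        if (!snap)
                return -1;
        for (struct node *p = snap; p; p = atomic_load(&p->child))
                n++;
        return n;
}

int main(void)
{
        return count_spine() == -1 ? 0 : 1;     /* tree is empty here */
}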
Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command or LPM_TRIE map") Reported-by: syzbot Reported-by: Eric Dumazet Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/lpm_trie.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index fc83516df724..af67820ea804 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -598,11 +598,10 @@ out: static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { + struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; - struct lpm_trie_node *node, *next_node = NULL, *parent; struct lpm_trie_node **node_stack = NULL; - struct lpm_trie_node __rcu **root; int err = 0, stack_ptr = -1; unsigned int next_bit; size_t matchlen; @@ -619,14 +618,13 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) */ /* Empty trie */ - if (!rcu_dereference(trie->root)) + search_root = rcu_dereference(trie->root); + if (!search_root) return -ENOENT; /* For invalid key, find the leftmost node in the trie */ - if (!key || key->prefixlen > trie->max_prefixlen) { - root = &trie->root; + if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - } node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), GFP_ATOMIC | __GFP_NOWARN); @@ -634,7 +632,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) return -ENOMEM; /* Try to find the exact node for the given key */ - for (node = rcu_dereference(trie->root); node;) { + for (node = search_root; node;) { node_stack[++stack_ptr] = node; matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -645,10 +643,8 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) node = rcu_dereference(node->child[next_bit]); } if (!node || node->prefixlen != key->prefixlen || - (node->flags & LPM_TREE_NODE_FLAG_IM)) { - root = &trie->root; + (node->flags & LPM_TREE_NODE_FLAG_IM)) goto find_leftmost; - } /* The node with the exactly-matching key has been found, * find the first node in postorder after the matched node. @@ -656,10 +652,10 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) node = node_stack[stack_ptr]; while (stack_ptr > 0) { parent = node_stack[stack_ptr - 1]; - if (rcu_dereference(parent->child[0]) == node && - rcu_dereference(parent->child[1])) { - root = &parent->child[1]; - goto find_leftmost; + if (rcu_dereference(parent->child[0]) == node) { + search_root = rcu_dereference(parent->child[1]); + if (search_root) + goto find_leftmost; } if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { next_node = parent; @@ -678,7 +674,7 @@ find_leftmost: /* Find the leftmost non-intermediate node, all intermediate nodes * have exact two children, so this function will never return NULL. 
*/ - for (node = rcu_dereference(*root); node;) { + for (node = search_root; node;) { if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) next_node = node; node = rcu_dereference(node->child[0]); From 003b2b86015b67ce8856bb1a0da12c30f44c2af8 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Jan 2018 11:26:09 +0100 Subject: [PATCH 0172/1640] BACKPORT: xdp: generic XDP handling of xdp_rxq_info Hook points for xdp_rxq_info: * reg : netif_alloc_rx_queues * unreg: netif_free_rx_queues The net_device have some members (num_rx_queues + real_num_rx_queues) and data-area (dev->_rx with struct netdev_rx_queue's) that were primarily used for exporting information about RPS (CONFIG_RPS) queues to sysfs (CONFIG_SYSFS). For generic XDP extend struct netdev_rx_queue with the xdp_rxq_info, and remove some of the CONFIG_SYSFS ifdefs. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 69 +++++++++++++++++++++++++++++++++------ 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d4d2998f7f6e..d3e0cf066267 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -44,6 +44,7 @@ #include #endif #include +#include #include #include @@ -702,6 +703,7 @@ struct netdev_rx_queue { #endif struct kobject kobj; struct net_device *dev; + struct xdp_rxq_info xdp_rxq; } ____cacheline_aligned_in_smp; /* diff --git a/net/core/dev.c b/net/core/dev.c index 2b9e3e0a487d..4eb45598b79d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3942,9 +3942,33 @@ drop: return NET_RX_DROP; } +static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct netdev_rx_queue *rxqueue; + + rxqueue = dev->_rx; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + + return rxqueue; /* Return first rxqueue */ + } + rxqueue += index; + } + return rxqueue; +} + static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + struct netdev_rx_queue *rxqueue; u32 metalen, act = XDP_DROP; struct xdp_buff xdp; void *orig_data; @@ -3988,6 +4012,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; + rxqueue = netif_get_rxqueue(skb); + xdp.rxq = &rxqueue->xdp_rxq; + act = bpf_prog_run_xdp(xdp_prog, &xdp); off = xdp.data - orig_data; @@ -7636,12 +7663,12 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_SYSFS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); + int err = 0; BUG_ON(count < 1); @@ -7651,11 +7678,39 @@ static int netif_alloc_rx_queues(struct net_device *dev) dev->_rx = rx; - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { rx[i].dev = dev; + + /* XDP RX-queue setup */ + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + if (err < 0) + goto err_rxq_info; + } return 0; + +err_rxq_info: + /* Rollback successful reg's and free other resources */ + while (i--) + xdp_rxq_info_unreg(&rx[i].xdp_rxq); + kfree(dev->_rx); + dev->_rx = NULL; + return err; +} + +static void 
netif_free_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + struct netdev_rx_queue *rx; + + /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ + if (!dev->_rx) + return; + + rx = dev->_rx; + + for (i = 0; i < count; i++) + xdp_rxq_info_unreg(&rx[i].xdp_rxq); } -#endif static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) @@ -8239,12 +8294,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } -#ifdef CONFIG_SYSFS if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } -#endif alloc_size = sizeof(struct net_device); if (sizeof_priv) { @@ -8301,12 +8354,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (netif_alloc_netdev_queues(dev)) goto free_all; -#ifdef CONFIG_SYSFS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; -#endif strcpy(dev->name, name); dev->name_assign_type = name_assign_type; @@ -8346,9 +8397,7 @@ void free_netdev(struct net_device *dev) might_sleep(); netif_free_tx_queues(dev); -#ifdef CONFIG_SYSFS - kvfree(dev->_rx); -#endif + netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); From 7bd02bf8c16a5043c80ce13bc1c509301f9c5e08 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 31 Jan 2018 12:58:56 +0100 Subject: [PATCH 0173/1640] BACKPORT: bpf: fix null pointer deref in bpf_prog_test_run_xdp syzkaller was able to generate the following XDP program ... (18) r0 = 0x0 (61) r5 = *(u32 *)(r1 +12) (04) (u32) r0 += (u32) 0 (95) exit ... and trigger a NULL pointer dereference in ___bpf_prog_run() via bpf_prog_test_run_xdp() where this was attempted to run. Reason is that recent xdp_rxq_info addition to XDP programs updated all drivers, but not bpf_prog_test_run_xdp(), where xdp_buff is set up. Thus when context rewriter does the deref on the netdev it's NULL at runtime. Fix it by using xdp_rxq from loopback dev. __netif_get_rx_queue() helper can also be reused in various other locations later on. 
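One idiom worth noting from netif_alloc_rx_queues() in the previous patch: on a mid-loop registration failure, the while (i--) unwind tears down exactly the entries that were successfully set up, newest first. A generic standalone sketch, with setup()/teardown() as placeholders:

#include <stdio.h>

static int setup(int i)
{
        return i == 3 ? -1 : 0;         /* simulate failure at entry 3 */
}

static void teardown(int i)
{
        printf("teardown %d\n", i);
}

static int setup_all(int count)
{
        int i, err = 0;

        for (i = 0; i < count; i++) {
                err = setup(i);
                if (err < 0)
                        goto rollback;
        }
        return 0;

rollback:
        while (i--)                     /* undo entries 0..i-1, newest first */
                teardown(i);
        return err;
}

int main(void)
{
        return setup_all(5) ? 1 : 0;    /* prints teardown 2, 1, 0 */
}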
Fixes: 02dd3291b2f0 ("bpf: finally expose xdp_rxq_info to XDP bpf-programs") Reported-by: syzbot+1eb094057b338eb1fc00@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Cc: Jesper Dangaard Brouer Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 6 ++++++ net/bpf/test_run.c | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d3e0cf066267..17fc9b202008 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3303,6 +3303,12 @@ static inline int netif_set_real_num_rx_queues(struct net_device *dev, } #endif +static inline struct netdev_rx_queue * +__netif_get_rx_queue(struct net_device *dev, unsigned int rxq) +{ + return dev->_rx + rxq; +} + #ifdef CONFIG_SYSFS static inline unsigned int get_netdev_rx_queue_index( struct netdev_rx_queue *queue) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 0e314d5feef4..489e3c64bdae 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -163,6 +163,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, { u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; + struct netdev_rx_queue *rxqueue; struct xdp_buff xdp = {}; u32 retval, duration; void *data; @@ -177,6 +178,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; + rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); + xdp.rxq = &rxqueue->xdp_rxq; + retval = bpf_test_run(prog, &xdp, repeat, &duration); if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN) size = xdp.data_end - xdp.data; From 003e28144ac85039cb2f150acca8aae41d3825d3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Feb 2018 15:14:05 -0800 Subject: [PATCH 0174/1640] UPSTREAM: bpf: fix bpf_prog_array_copy_to_user() issues 1. move copy_to_user out of rcu section to fix the following issue: ./include/linux/rcupdate.h:302 Illegal context switch in RCU read-side critical section! stack backtrace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4592 rcu_preempt_sleep_check include/linux/rcupdate.h:301 [inline] ___might_sleep+0x385/0x470 kernel/sched/core.c:6079 __might_sleep+0x95/0x190 kernel/sched/core.c:6067 __might_fault+0xab/0x1d0 mm/memory.c:4532 _copy_to_user+0x2c/0xc0 lib/usercopy.c:25 copy_to_user include/linux/uaccess.h:155 [inline] bpf_prog_array_copy_to_user+0x217/0x4d0 kernel/bpf/core.c:1587 bpf_prog_array_copy_info+0x17b/0x1c0 kernel/bpf/core.c:1685 perf_event_query_prog_array+0x196/0x280 kernel/trace/bpf_trace.c:877 _perf_ioctl kernel/events/core.c:4737 [inline] perf_ioctl+0x3e1/0x1480 kernel/events/core.c:4757 2. move *prog under rcu, since it's not ok to dereference it afterwards 3. 
in a rare case of prog array being swapped between bpf_prog_array_length() and bpf_prog_array_copy_to_user() calls make sure to copy zeros to user space, so the user doesn't walk over uninited prog_ids while kernel reported uattr->query.prog_cnt > 0 Reported-by: syzbot+7dbcd2d3b85f9b608b23@syzkaller.appspotmail.com Fixes: 468e2f64d220 ("bpf: introduce BPF_PROG_QUERY command") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1fedcc11fbb3..4e48a85e722a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1715,25 +1715,41 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, __u32 __user *prog_ids, u32 cnt) { struct bpf_prog **prog; - u32 i = 0, id; + unsigned long err = 0; + u32 i = 0, *ids; + bool nospc; + /* users of this function are doing: + * cnt = bpf_prog_array_length(); + * if (cnt > 0) + * bpf_prog_array_copy_to_user(..., cnt); + * so below kcalloc doesn't need extra cnt > 0 check, but + * bpf_prog_array_length() releases rcu lock and + * prog array could have been swapped with empty or larger array, + * so always copy 'cnt' prog_ids to the user. + * In a rare race the user will see zero prog_ids + */ + ids = kcalloc(cnt, sizeof(u32), GFP_USER); + if (!ids) + return -ENOMEM; rcu_read_lock(); prog = rcu_dereference(progs)->progs; for (; *prog; prog++) { if (*prog == &dummy_bpf_prog.prog) continue; - id = (*prog)->aux->id; - if (copy_to_user(prog_ids + i, &id, sizeof(id))) { - rcu_read_unlock(); - return -EFAULT; - } + ids[i] = (*prog)->aux->id; if (++i == cnt) { prog++; break; } } + nospc = !!(*prog); rcu_read_unlock(); - if (*prog) + err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); + kfree(ids); + if (err) + return -EFAULT; + if (nospc) return -ENOSPC; return 0; } From d8ea22d910cffeb8f898b2a7dbf1ca0417fcb730 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 5 Feb 2018 10:17:49 -0800 Subject: [PATCH 0175/1640] UPSTREAM: bpf: sockmap, add sock close() hook to remove socks The selftests test_maps program was leaving dangling BPF sockmap programs around because not all psock elements were removed from the map. The elements in turn hold a reference on the BPF program they are attached to causing BPF programs to stay open even after test_maps has completed. The original intent was that sk_state_change() would be called when TCP socks went through TCP_CLOSE state. However, because socks may be in SOCK_DEAD state or the sock may be a listening socket the event is not always triggered. To resolve this use the ULP infrastructure and register our own proto close() handler. This fixes the above case. 
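The mechanism described above is ops-table interposition: save the original close() out of sk_prot, install a wrapper that performs the map cleanup, then chain to the saved pointer. Reduced to plain function pointers (illustrative names, not the sockmap code):

#include <stdio.h>

struct ops {
        void (*close)(void *obj);
};

static void close_orig(void *obj)
{
        printf("original close\n");
}

static void (*saved_close)(void *obj);

static void close_hooked(void *obj)
{
        printf("detach obj from map\n");        /* extra teardown first */
        saved_close(obj);                       /* then chain to the original */
}

int main(void)
{
        struct ops o = { .close = close_orig };

        saved_close = o.close;          /* save, like psock->save_close */
        o.close = close_hooked;         /* override, like tcp_bpf_proto.close */
        o.close(NULL);
        return 0;
}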
Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Reported-by: Prashant Bhole Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/net/tcp.h | 2 + kernel/bpf/sockmap.c | 168 ++++++++++++++++++++++++++----------------- 2 files changed, 103 insertions(+), 67 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 97a8dfdb9b0a..c7a7519666fb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2119,6 +2119,7 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); enum { TCP_ULP_TLS, + TCP_ULP_BPF, }; struct tcp_ulp_ops { @@ -2137,6 +2138,7 @@ struct tcp_ulp_ops { int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); +int tcp_set_ulp_id(struct sock *sk, const int ulp); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index d061756bb814..9fee2428f711 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -86,9 +86,10 @@ struct smap_psock { struct work_struct tx_work; struct work_struct gc_work; + struct proto *sk_proto; + void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); - void (*save_state_change)(struct sock *sk); }; static inline struct smap_psock *smap_psock_sk(const struct sock *sk) @@ -96,12 +97,102 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +static struct proto tcp_bpf_proto; +static int bpf_tcp_init(struct sock *sk) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return -EINVAL; + } + + if (unlikely(psock->sk_proto)) { + rcu_read_unlock(); + return -EBUSY; + } + + psock->save_close = sk->sk_prot->close; + psock->sk_proto = sk->sk_prot; + sk->sk_prot = &tcp_bpf_proto; + rcu_read_unlock(); + return 0; +} + +static void bpf_tcp_release(struct sock *sk) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + + if (likely(psock)) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; + } + rcu_read_unlock(); +} + +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); + +static void bpf_tcp_close(struct sock *sk, long timeout) +{ + void (*close_fun)(struct sock *sk, long timeout); + struct smap_psock_map_entry *e, *tmp; + struct smap_psock *psock; + struct sock *osk; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return sk->sk_prot->close(sk, timeout); + } + + /* The psock may be destroyed anytime after exiting the RCU critial + * section so by the time we use close_fun the psock may no longer + * be valid. However, bpf_tcp_close is called with the sock lock + * held so the close hook and sk are still valid. 
+ */ + close_fun = psock->save_close; + + write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } + write_unlock_bh(&sk->sk_callback_lock); + rcu_read_unlock(); + close_fun(sk, timeout); +} + enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, }; +static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { + .name = "bpf_tcp", + .uid = TCP_ULP_BPF, + .user_visible = false, + .owner = NULL, + .init = bpf_tcp_init, + .release = bpf_tcp_release, +}; + +static int bpf_tcp_ulp_register(void) +{ + tcp_bpf_proto = tcp_prot; + tcp_bpf_proto.close = bpf_tcp_close; + return tcp_register_ulp(&bpf_tcp_ulp_ops); +} + static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); @@ -166,68 +257,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err) sk->sk_error_report(sk); } -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); - -/* Called with lock_sock(sk) held */ -static void smap_state_change(struct sock *sk) -{ - struct smap_psock_map_entry *e, *tmp; - struct smap_psock *psock; - struct socket_wq *wq; - struct sock *osk; - - rcu_read_lock(); - - /* Allowing transitions into an established syn_recv states allows - * for early binding sockets to a smap object before the connection - * is established. - */ - switch (sk->sk_state) { - case TCP_SYN_SENT: - case TCP_SYN_RECV: - case TCP_ESTABLISHED: - break; - case TCP_CLOSE_WAIT: - case TCP_CLOSING: - case TCP_LAST_ACK: - case TCP_FIN_WAIT1: - case TCP_FIN_WAIT2: - case TCP_LISTEN: - break; - case TCP_CLOSE: - /* Only release if the map entry is in fact the sock in - * question. There is a case where the operator deletes - * the sock from the map, but the TCP sock is closed before - * the psock is detached. Use cmpxchg to verify correct - * sock is removed. 
- */ - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - break; - write_lock_bh(&sk->sk_callback_lock); - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - osk = cmpxchg(e->entry, sk, NULL); - if (osk == sk) { - list_del(&e->list); - smap_release_sock(psock, sk); - } - } - write_unlock_bh(&sk->sk_callback_lock); - break; - default: - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - break; - smap_report_sk_error(psock, EPIPE); - break; - } - - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible_all(&wq->wait); - rcu_read_unlock(); -} - static void smap_read_sock_strparser(struct strparser *strp, struct sk_buff *skb) { @@ -325,10 +354,8 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) return; sk->sk_data_ready = psock->save_data_ready; sk->sk_write_space = psock->save_write_space; - sk->sk_state_change = psock->save_state_change; psock->save_data_ready = NULL; psock->save_write_space = NULL; - psock->save_state_change = NULL; strp_stop(&psock->strp); psock->strp_enabled = false; } @@ -353,6 +380,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock) if (psock->refcnt) return; + tcp_cleanup_ulp(sock); smap_stop_sock(psock, sock); clear_bit(SMAP_TX_RUNNING, &psock->state); rcu_assign_sk_user_data(sock, NULL); @@ -430,10 +458,8 @@ static void smap_start_sock(struct smap_psock *psock, struct sock *sk) return; psock->save_data_ready = sk->sk_data_ready; psock->save_write_space = sk->sk_write_space; - psock->save_state_change = sk->sk_state_change; sk->sk_data_ready = smap_data_ready; sk->sk_write_space = smap_write_space; - sk->sk_state_change = smap_state_change; psock->strp_enabled = true; } @@ -512,6 +538,10 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (attr->value_size > KMALLOC_MAX_SIZE) return ERR_PTR(-E2BIG); + err = bpf_tcp_ulp_register(); + if (err && err != -EEXIST) + return ERR_PTR(err); + stab = kzalloc(sizeof(*stab), GFP_USER); if (!stab) return ERR_PTR(-ENOMEM); @@ -752,6 +782,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } + err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + if (err) + goto out_progs; + set_bit(SMAP_TX_RUNNING, &psock->state); } From fdfa45c4219c3e3447fff8743c047590c8a5de7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2018 15:33:52 -0800 Subject: [PATCH 0176/1640] UPSTREAM: bpf: fix sock_map_alloc() error path In case user program provides silly parameters, we want a map_alloc() handler to return an error, not a NULL pointer, otherwise we crash later in find_and_alloc_map() Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 9fee2428f711..242e6de49a64 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -524,8 +524,8 @@ static struct smap_psock *smap_init_psock(struct sock *sock, static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; - int err = -EINVAL; u64 cost; + int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -550,6 +550,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); + err = -EINVAL; if (cost >= U32_MAX - PAGE_SIZE) goto free_stab; From 
e8111115b2b7561c21d483cc53e4f61933e614a8 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 14 Feb 2018 22:17:34 +0800 Subject: [PATCH 0177/1640] UPSTREAM: bpf: cpumap: use GFP_KERNEL instead of GFP_ATOMIC in __cpu_map_entry_alloc() There're several implications after commit 0bf7800f1799 ("ptr_ring: try vmalloc() when kmalloc() fails") with the using of vmalloc() since can't allow GFP_ATOMIC but mandate GFP_KERNEL. This will lead a WARN since cpumap try to call with GFP_ATOMIC. Fortunately, entry allocation of cpumap can only be done through syscall path which means GFP_ATOMIC is not necessary, so fixing this by replacing GFP_ATOMIC with GFP_KERNEL. Reported-by: syzbot+1a240cdb1f4cc88819df@syzkaller.appspotmail.com Fixes: 0bf7800f1799 ("ptr_ring: try vmalloc() when kmalloc() fails") Cc: Michal Hocko Cc: Daniel Borkmann Cc: Matthew Wilcox Cc: Jesper Dangaard Brouer Cc: akpm@linux-foundation.org Cc: dhowells@redhat.com Cc: hannes@cmpxchg.org Signed-off-by: Jason Wang Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index fbfdada6caee..a4bb0b34375a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -334,7 +334,7 @@ static int cpu_map_kthread_run(void *data) static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) { - gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; int numa, err; From 33f8a6306dd3aafea1a4228d472b6065427c205b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 14 Feb 2018 15:31:00 +0100 Subject: [PATCH 0178/1640] UPSTREAM: bpf: fix bpf_prog_array_copy_to_user warning from perf event prog query syzkaller tried to perform a prog query in perf_event_query_prog_array() where struct perf_event_query_bpf had an ids_len of 1,073,741,353 and thus causing a warning due to failed kcalloc() allocation out of the bpf_prog_array_copy_to_user() helper. Given we cannot attach more than 64 programs to a perf event, there's no point in allowing huge ids_len. Therefore, allow a buffer that would fix the maximum number of ids and also add a __GFP_NOWARN to the temporary ids buffer. Fixes: f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") Fixes: 0911287ce32b ("bpf: fix bpf_prog_array_copy_to_user() issues") Reported-by: syzbot+cab5816b0edbabf598b3@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- kernel/trace/bpf_trace.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4e48a85e722a..7d9bbf925e07 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1729,7 +1729,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, * so always copy 'cnt' prog_ids to the user. 
* In a rare race the user will see zero prog_ids */ - ids = kcalloc(cnt, sizeof(u32), GFP_USER); + ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; rcu_read_lock(); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ed7f7fa7b39..d4c909a8dfcc 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -880,6 +880,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) return -EINVAL; if (copy_from_user(&query, uquery, sizeof(query))) return -EFAULT; + if (query.ids_len > BPF_TRACE_MAX_PROGS) + return -E2BIG; mutex_lock(&bpf_event_mutex); ret = bpf_prog_array_copy_info(event->tp_event->prog_array, From 32c70093c26e987d83df69590973ba4ae998dd70 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 14 Feb 2018 13:50:37 -0800 Subject: [PATCH 0179/1640] UPSTREAM: bpf: Remove unused callee_saved array This array appears to be completely unused, remove it. Signed-off-by: Joe Stringer Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0581763702e3..03437d915af5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -511,10 +511,6 @@ err: static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -#define CALLEE_SAVED_REGS 5 -static const int callee_saved[CALLEE_SAVED_REGS] = { - BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 -}; static void __mark_reg_not_init(struct bpf_reg_state *reg); From 6822ebcd4cf99ff23b034f82983cf2fc23fa325f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 Feb 2018 23:07:33 +0100 Subject: [PATCH 0180/1640] UPSTREAM: bpf: clean up unused-variable warning The only user of this variable is inside of an #ifdef, causing a warning without CONFIG_INET: net/core/filter.c: In function '____bpf_sock_ops_cb_flags_set': net/core/filter.c:3382:6: error: unused variable 'val' [-Werror=unused-variable] int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; This replaces the #ifdef with a nicer IS_ENABLED() check that makes the code more readable and avoids the warning. Fixes: b13d88072172 ("bpf: Adds field bpf_sock_ops_cb_flags to tcp_sock") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann --- net/core/filter.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index c497fd68325f..09fc3676c2ef 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3460,17 +3460,13 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; - if (!sk_fullsock(sk)) + if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; -#ifdef CONFIG_INET if (val) tcp_sk(sk)->bpf_sock_ops_cb_flags = val; return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); -#else - return -EINVAL; -#endif } static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { From e76cf1336acc5175c26ca37772327b3058d3b987 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 23 Feb 2018 14:58:41 +0800 Subject: [PATCH 0181/1640] UPSTREAM: bpf: NULL pointer check is not needed in BPF_CGROUP_RUN_PROG_INET_SOCK sk is already allocated in inet_create/inet6_create, hence when BPF_CGROUP_RUN_PROG_INET_SOCK is executed sk will never be NULL. 
The logic is as bellow, sk = sk_alloc(); if (!sk) goto out; BPF_CGROUP_RUN_PROG_INET_SOCK(sk); Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a7f16e0f8d68..8a4566691c8f 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -96,7 +96,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && sk) { \ + if (cgroup_bpf_enabled) { \ __ret = __cgroup_bpf_run_filter_sk(sk, \ BPF_CGROUP_INET_SOCK_CREATE); \ } \ From e08744695b609f1bb99a299bc8cf796a5e3b399f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:08 -0700 Subject: [PATCH 0182/1640] UPSTREAM: ipv6: speedup ipv6 tunnels dismantle Implement exit_batch() method to dismantle more devices per round. (rtnl_lock() ... unregister_netdevice_many() ... rtnl_unlock()) Tested: $ cat add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait ; grep net_namespace /proc/slabinfo Before patch : $ time ./add_del_unshare.sh net_namespace 110 267 5504 1 2 : tunables 8 4 0 : slabdata 110 267 0 real 3m25.292s user 0m0.644s sys 0m40.153s After patch: $ time ./add_del_unshare.sh net_namespace 126 282 5504 1 2 : tunables 8 4 0 : slabdata 126 282 0 real 1m38.965s user 0m0.688s sys 0m37.017s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 8 +++++--- net/ipv6/ip6_tunnel.c | 20 +++++++++++--------- net/ipv6/ip6_vti.c | 23 ++++++++++++++--------- net/ipv6/sit.c | 9 ++++++--- 4 files changed, 36 insertions(+), 24 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 5ca2ab56901d..99606ee387b4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1205,19 +1205,21 @@ err_alloc_dev: return err; } -static void __net_exit ip6gre_exit_net(struct net *net) +static void __net_exit ip6gre_exit_batch_net(struct list_head *net_list) { + struct net *net; LIST_HEAD(list); rtnl_lock(); - ip6gre_destroy_tunnels(net, &list); + list_for_each_entry(net, net_list, exit_list) + ip6gre_destroy_tunnels(net, &list); unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit = ip6gre_exit_net, + .exit_batch = ip6gre_exit_batch_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 7fa0d474d47a..80d76ecb1a26 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2193,17 +2193,16 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .priority = 1, }; -static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) +static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; - LIST_HEAD(list); for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) - unregister_netdevice_queue(dev, &list); + unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); @@ -2212,12 +2211,10 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) * been added to the list by the previous loop. 
*/ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } - - unregister_netdevice_many(&list); } static int __net_init ip6_tnl_init_net(struct net *net) @@ -2261,16 +2258,21 @@ err_alloc_dev: return err; } -static void __net_exit ip6_tnl_exit_net(struct net *net) +static void __net_exit ip6_tnl_exit_batch_net(struct list_head *net_list) { + struct net *net; + LIST_HEAD(list); + rtnl_lock(); - ip6_tnl_destroy_tunnels(net); + list_for_each_entry(net, net_list, exit_list) + ip6_tnl_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit = ip6_tnl_exit_net, + .exit_batch = ip6_tnl_exit_batch_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 312c80912401..534dac0e4385 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1086,23 +1086,22 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; -static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n) +static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, + struct list_head *list) { int h; struct ip6_tnl *t; - LIST_HEAD(list); for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { - unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } t = rtnl_dereference(ip6n->tnls_wc[0]); - unregister_netdevice_queue(t->dev, &list); - unregister_netdevice_many(&list); + unregister_netdevice_queue(t->dev, list); } static int __net_init vti6_init_net(struct net *net) @@ -1142,18 +1141,24 @@ err_alloc_dev: return err; } -static void __net_exit vti6_exit_net(struct net *net) +static void __net_exit vti6_exit_batch_net(struct list_head *net_list) { - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n; + struct net *net; + LIST_HEAD(list); rtnl_lock(); - vti6_destroy_tunnels(ip6n); + list_for_each_entry(net, net_list, exit_list) { + ip6n = net_generic(net, vti6_net_id); + vti6_destroy_tunnels(ip6n, &list); + } + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit = vti6_exit_net, + .exit_batch = vti6_exit_batch_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a64d292fd4c7..5f6471225d31 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1868,19 +1868,22 @@ err_alloc_dev: return err; } -static void __net_exit sit_exit_net(struct net *net) +static void __net_exit sit_exit_batch_net(struct list_head *net_list) { LIST_HEAD(list); + struct net *net; rtnl_lock(); - sit_destroy_tunnels(net, &list); + list_for_each_entry(net, net_list, exit_list) + sit_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit = sit_exit_net, + .exit_batch = sit_exit_batch_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; From 97117912aa1648903517e0dde56a0f9d8ae705f5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 11 Nov 2017 19:06:49 +0800 Subject: [PATCH 0183/1640] UPSTREAM: ip6_gre: add the process for redirect in ip6gre_err This patch is to add redirect icmp packet process for ip6gre by calling ip6_redirect() in ip6gre_err(), as in 
vti6_err. Prior to this patch, there's even no route cache generated after receiving redirect. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 99606ee387b4..7615c14109c2 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -374,6 +374,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { + struct net *net = dev_net(skb->dev); const struct gre_base_hdr *greh; const struct ipv6hdr *ipv6h; int grehlen = sizeof(*greh); @@ -447,6 +448,10 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; return; + case NDISC_REDIRECT: + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); + return; } if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) From bc0e23d045760f9821c76a3fcf9768e258d6d6c1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 11 Nov 2017 19:06:50 +0800 Subject: [PATCH 0184/1640] BACKPORT: ip6_gre: process toobig in a better way Now ip6gre processes toobig icmp packet by setting gre dev's mtu in ip6gre_err, which would cause few things not good: - It couldn't set mtu with dev_set_mtu due to it's not in user context, which causes route cache and idev->cnf.mtu6 not to be updated. - It has to update sk dst pmtu in tx path according to gredev->mtu for ip6gre, while it updates pmtu again according to lower dst pmtu in ip6_tnl_xmit. - To change dev->mtu by toobig icmp packet is not a good idea, it should only work on pmtu. This patch is to process toobig by updating the lower dst's pmtu, as later sk dst pmtu will be updated in ip6_tnl_xmit, the same way as in ip4gre. Note that gre dev's mtu will not be updated any more, it doesn't make any sense to change dev's mtu after receiving a toobig packet. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- net/ipv6/ip6_gre.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 7615c14109c2..7ae24dc63e5c 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -408,9 +408,8 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; switch (type) { - __u32 teli; struct ipv6_tlv_tnl_enc_lim *tel; - __u32 mtu; + __u32 teli; case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); @@ -441,12 +440,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } return; case ICMPV6_PKT_TOOBIG: - mtu = be32_to_cpu(info) - offset - t->tun_hlen; - if (t->dev->type == ARPHRD_ETHER) - mtu -= ETH_HLEN; - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - t->dev->mtu = mtu; + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); return; case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, @@ -513,7 +507,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); - struct dst_entry *dst = skb_dst(skb); __be16 protocol; if (dev->type == ARPHRD_ETHER) @@ -532,10 +525,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); - /* TooBig packet may have updated dst->dev's mtu */ - if (dst && dst_mtu(dst) > dst->dev->mtu) - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false); - return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); } From b2d4c9e8806fa241c4b063fae9ec39223828433a Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 30 Nov 2017 11:51:28 -0800 Subject: [PATCH 0185/1640] UPSTREAM: ip6_gre: Refactor ip6gre xmit codes This patch refactors the ip6gre_xmit_{ipv4, ipv6}. It is a prep work to add the ip6erspan tunnel. Signed-off-by: William Tu Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 123 +++++++++++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 48 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 7ae24dc63e5c..243b2259b8b4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -501,6 +501,78 @@ static int gre_handle_offloads(struct sk_buff *skb, bool csum) csum ? 
SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } +static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb, + struct net_device *dev, + struct flowi6 *fl6, __u8 *dsfield, + int *encap_limit) +{ + const struct iphdr *iph = ip_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + *encap_limit = t->parms.encap_limit; + + memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + *dsfield = ipv4_get_dsfield(iph); + else + *dsfield = ip6_tclass(t->parms.flowinfo); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6->flowi6_mark = skb->mark; + else + fl6->flowi6_mark = t->parms.fwmark; + + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); +} + +static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb, + struct net_device *dev, + struct flowi6 *fl6, __u8 *dsfield, + int *encap_limit) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + __u16 offset; + + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ + + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + + tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + *encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { + *encap_limit = t->parms.encap_limit; + } + + memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + *dsfield = ipv6_get_dsfield(ipv6h); + else + *dsfield = ip6_tclass(t->parms.flowinfo); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6->flowlabel |= ip6_flowlabel(ipv6h); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6->flowi6_mark = skb->mark; + else + fl6->flowi6_mark = t->parms.fwmark; + + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + return 0; +} + static netdev_tx_t __gre6_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, @@ -532,7 +604,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - const struct iphdr *iph = ip_hdr(skb); int encap_limit = -1; struct flowi6 fl6; __u8 dsfield; @@ -541,21 +612,7 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv4_get_dsfield(iph); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, &dsfield, &encap_limit); err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) @@ -579,7 +636,6 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h = ipv6_hdr(skb); int encap_limit = -1; - __u16 offset; struct flowi6 fl6; __u8 dsfield; __u32 mtu; @@ -588,37 +644,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) if 
(ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) return -1; - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ - ipv6h = ipv6_hdr(skb); - - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); - return -1; - } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv6_get_dsfield(ipv6h); - else - dsfield = ip6_tclass(t->parms.flowinfo); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) + return -1; if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) return -1; From 620e6a77e7ff571051a182ff6f61aeccf6f60908 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 30 Nov 2017 11:51:27 -0800 Subject: [PATCH 0186/1640] BACKPORT: ip_gre: Refactor the erspan tunnel code. Move two erspan functions to the header file, erspan.h, so the ipv6 erspan implementation can use them. Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/erspan.h | 51 +++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_gre.c | 54 ++++---------------------------------------- 2 files changed, 56 insertions(+), 49 deletions(-) diff --git a/include/net/erspan.h b/include/net/erspan.h index ca94fc86865e..6e758d08c9ee 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -58,4 +58,55 @@ struct erspanhdr { struct erspan_metadata md; }; +static inline u8 tos_to_cos(u8 tos) +{ + u8 dscp, cos; + + dscp = tos >> 2; + cos = dscp >> 3; + return cos; +} + +static inline void erspan_build_header(struct sk_buff *skb, + __be32 id, u32 index, + bool truncate, bool is_ipv4) +{ + struct ethhdr *eth = eth_hdr(skb); + enum erspan_encap_type enc_type; + struct erspanhdr *ershdr; + struct qtag_prefix { + __be16 eth_type; + __be16 tci; + } *qp; + u16 vlan_tci = 0; + u8 tos; + + tos = is_ipv4 ? ip_hdr(skb)->tos : + (ipv6_hdr(skb)->priority << 4) + + (ipv6_hdr(skb)->flow_lbl[0] >> 4); + + enc_type = ERSPAN_ENCAP_NOVLAN; + + /* If mirrored packet has vlan tag, extract tci and + * preserve vlan header in the mirrored frame.
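+ * (The tag stays in the mirrored payload; its TCI is also copied
+ * into the ERSPAN header's ver_vlan field below.)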
+ */ + if (eth->h_proto == htons(ETH_P_8021Q)) { + qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); + vlan_tci = ntohs(qp->tci); + enc_type = ERSPAN_ENCAP_INFRAME; + } + + skb_push(skb, sizeof(*ershdr)); + ershdr = (struct erspanhdr *)skb->data; + memset(ershdr, 0, sizeof(*ershdr)); + + ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | + (ERSPAN_VERSION << VER_OFFSET)); + ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | + ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) | + (enc_type << EN_OFFSET & EN_MASK) | + ((truncate << T_OFFSET) & T_MASK)); + ershdr->md.index = htonl(index & INDEX_MASK); +} + #endif diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index eacffdb0bae0..056803ad41ef 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -114,7 +114,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, bool truncate); + __be32 id, u32 index, + bool truncate, bool is_ipv4); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; @@ -598,7 +599,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_rt; erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), - ntohl(md->index), truncate); + ntohl(md->index), truncate, true); gre_build_header(skb, 8, TUNNEL_SEQ, htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); @@ -680,52 +681,6 @@ free_skb: return NETDEV_TX_OK; } -static inline u8 tos_to_cos(u8 tos) -{ - u8 dscp, cos; - - dscp = tos >> 2; - cos = dscp >> 3; - return cos; -} - -static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, bool truncate) -{ - struct iphdr *iphdr = ip_hdr(skb); - struct ethhdr *eth = (struct ethhdr *)skb->data; - enum erspan_encap_type enc_type; - struct erspanhdr *ershdr; - struct qtag_prefix { - __be16 eth_type; - __be16 tci; - } *qp; - u16 vlan_tci = 0; - - enc_type = ERSPAN_ENCAP_NOVLAN; - - /* If mirrored packet has vlan tag, extract tci and - * perserve vlan header in the mirrored frame. - */ - if (eth->h_proto == htons(ETH_P_8021Q)) { - qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); - vlan_tci = ntohs(qp->tci); - enc_type = ERSPAN_ENCAP_INFRAME; - } - - skb_push(skb, sizeof(*ershdr)); - ershdr = (struct erspanhdr *)skb->data; - memset(ershdr, 0, sizeof(*ershdr)); - - ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | - (ERSPAN_VERSION << VER_OFFSET)); - ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | - ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) | - (enc_type << EN_OFFSET & EN_MASK) | - ((truncate << T_OFFSET) & T_MASK)); - ershdr->md.index = htonl(index & INDEX_MASK); -} - static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -752,7 +707,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate); + erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, + truncate, true); tunnel->parms.o_flags &= ~TUNNEL_KEY; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); return NETDEV_TX_OK; From ff989cf9fb225793f90dcb272564e4e37f06fbd4 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 30 Nov 2017 11:51:29 -0800 Subject: [PATCH 0187/1640] BACKPORT: ip6_gre: Add ERSPAN native tunnel support The patch adds support for ERSPAN tunnel over ipv6. 
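The ipv6 path reuses the erspan_build_header() and tos_to_cos() helpers that the previous patch moved into erspan.h. As a quick illustration of the ToS-to-CoS mapping carried in the ERSPAN session ID field, here is a minimal standalone sketch (userspace; the arithmetic mirrors tos_to_cos(), and the ToS value is just an example):

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as tos_to_cos() in include/net/erspan.h */
static uint8_t tos_to_cos(uint8_t tos)
{
	uint8_t dscp = tos >> 2;	/* drop the two ECN bits */

	return dscp >> 3;		/* keep the 3-bit class selector */
}

int main(void)
{
	/* ToS 0xb8 is DSCP 46 (EF); its class selector, and therefore
	 * the CoS written into the ERSPAN header, is 5.
	 */
	printf("cos=%u\n", tos_to_cos(0xb8));
	return 0;
}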
Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 1 + net/ipv6/ip6_gre.c | 270 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 267 insertions(+), 4 deletions(-) diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 3b0e3cdee1c3..6ca6e5834b22 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -36,6 +36,7 @@ struct __ip6_tnl_parm { __be32 o_key; __u32 fwmark; + __u32 index; /* ERSPAN type II index */ }; /* IPv6 tunnel */ diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 243b2259b8b4..ec5661c3229a 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -55,6 +55,7 @@ #include #include #include +#include static bool log_ecn_error = true; @@ -73,6 +74,7 @@ struct ip6gre_net { static struct rtnl_link_ops ip6gre_link_ops __read_mostly; static struct rtnl_link_ops ip6gre_tap_ops __read_mostly; +static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly; static int ip6gre_tunnel_init(struct net_device *dev); static void ip6gre_tunnel_setup(struct net_device *dev); static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t); @@ -121,7 +123,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, unsigned int h1 = HASH_KEY(key); struct ip6_tnl *t, *cand = NULL; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - int dev_type = (gre_proto == htons(ETH_P_TEB)) ? + int dev_type = (gre_proto == htons(ETH_P_TEB) || + gre_proto == htons(ETH_P_ERSPAN)) ? ARPHRD_ETHER : ARPHRD_IP6GRE; int score, cand_score = 4; struct net_device *ndev; @@ -473,6 +476,41 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) return PACKET_REJECT; } +static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len, + struct tnl_ptk_info *tpi) +{ + const struct ipv6hdr *ipv6h; + struct erspanhdr *ershdr; + struct ip6_tnl *tunnel; + __be32 index; + + ipv6h = ipv6_hdr(skb); + ershdr = (struct erspanhdr *)skb->data; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) + return PACKET_REJECT; + + tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); + index = ershdr->md.index; + + tunnel = ip6gre_tunnel_lookup(skb->dev, + &ipv6h->saddr, &ipv6h->daddr, tpi->key, + tpi->proto); + if (tunnel) { + if (__iptunnel_pull_header(skb, sizeof(*ershdr), + htons(ETH_P_TEB), + false, false) < 0) + return PACKET_REJECT; + + tunnel->parms.index = ntohl(index); + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + + return PACKET_RCVD; + } + + return PACKET_REJECT; +} + static int gre_rcv(struct sk_buff *skb) { struct tnl_ptk_info tpi; @@ -486,6 +524,12 @@ static int gre_rcv(struct sk_buff *skb) if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false)) goto drop; + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) { + if (ip6erspan_rcv(skb, hdr_len, &tpi) == PACKET_RCVD) + return 0; + goto drop; + } + if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD) return 0; @@ -740,6 +784,88 @@ tx_err: return NETDEV_TX_OK; } +static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ip6_tnl *t = netdev_priv(dev); + struct dst_entry *dst = skb_dst(skb); + struct net_device_stats *stats; + bool truncate = false; + int encap_limit = -1; + __u8 dsfield = false; + struct flowi6 fl6; + int err = -EINVAL; + __u32 mtu; + + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) + goto tx_err; + + if (gre_handle_offloads(skb, false)) + goto tx_err; + + switch (skb->protocol) { + case htons(ETH_P_IP): + memset(&(IPCB(skb)->opt), 
0, sizeof(IPCB(skb)->opt)); + prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, + &dsfield, &encap_limit); + break; + case htons(ETH_P_IPV6): + if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + goto tx_err; + if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, + &dsfield, &encap_limit)) + goto tx_err; + break; + default: + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + break; + } + + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); + truncate = true; + } + + erspan_build_header(skb, t->parms.o_key, t->parms.index, + truncate, false); + t->parms.o_flags &= ~TUNNEL_KEY; + + IPCB(skb)->flags = 0; + fl6.daddr = t->parms.raddr; + + /* Push GRE header. */ + gre_build_header(skb, 8, TUNNEL_SEQ, + htons(ETH_P_ERSPAN), 0, htonl(t->o_seqno++)); + + /* TooBig packet may have updated dst->dev's mtu */ + if (dst && dst_mtu(dst) > dst->dev->mtu) + dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false); + + err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, + NEXTHDR_GRE); + if (err != 0) { + /* XXX: send ICMP error even if DF is not set. */ + if (err == -EMSGSIZE) { + if (skb->protocol == htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } + + goto tx_err; + } + return NETDEV_TX_OK; + +tx_err: + stats = &t->dev->stats; + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) { struct net_device *dev = t->dev; @@ -1151,7 +1277,6 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) tunnel->hlen = sizeof(struct ipv6hdr) + 4; } - static struct inet6_protocol ip6gre_protocol __read_mostly = { .handler = gre_rcv, .err_handler = ip6gre_err, @@ -1166,7 +1291,8 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6gre_link_ops || - dev->rtnl_link_ops == &ip6gre_tap_ops) + dev->rtnl_link_ops == &ip6gre_tap_ops || + dev->rtnl_link_ops == &ip6erspan_tap_ops) unregister_netdevice_queue(dev, head); for (prio = 0; prio < 4; prio++) { @@ -1289,6 +1415,47 @@ out: return ip6gre_tunnel_validate(tb, data, extack); } +static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + __be16 flags = 0; + int ret; + + if (!data) + return 0; + + ret = ip6gre_tap_validate(tb, data, extack); + if (ret) + return ret; + + /* ERSPAN should only have GRE sequence and key flag */ + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (!data[IFLA_GRE_COLLECT_METADATA] && + flags != (GRE_SEQ | GRE_KEY)) + return -EINVAL; + + /* ERSPAN Session ID only has 10-bit. Since we reuse + * 32-bit key field as ID, check it's range. 
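+ * (Ten bits means valid session IDs are 0..1023; keys with any
+ * higher bit set are rejected below.)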
+ */ + if (data[IFLA_GRE_IKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_OKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_ERSPAN_INDEX]) { + u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + + if (index & ~INDEX_MASK) + return -EINVAL; + } + return 0; +} static void ip6gre_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) @@ -1335,6 +1502,9 @@ static void ip6gre_netlink_parms(struct nlattr *data[], if (data[IFLA_GRE_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + + if (data[IFLA_GRE_ERSPAN_INDEX]) + parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); } static int ip6gre_tap_init(struct net_device *dev) @@ -1361,6 +1531,59 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_get_iflink = ip6_tnl_get_iflink, }; +static int ip6erspan_tap_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + int t_hlen; + int ret; + + tunnel = netdev_priv(dev); + + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + + tunnel->tun_hlen = 8; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + sizeof(struct erspanhdr); + t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); + + dev->hard_header_len = LL_MAX_HEADER + t_hlen; + dev->mtu = ETH_DATA_LEN - t_hlen; + if (dev->type == ARPHRD_ETHER) + dev->mtu -= ETH_HLEN; + if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu -= 8; + + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + tunnel = netdev_priv(dev); + ip6gre_tnl_link_config(tunnel, 1); + + return 0; +} + +static const struct net_device_ops ip6erspan_netdev_ops = { + .ndo_init = ip6erspan_tap_init, + .ndo_uninit = ip6gre_tunnel_uninit, + .ndo_start_xmit = ip6erspan_tunnel_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ip6_tnl_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip6_tnl_get_iflink, +}; + static void ip6gre_tap_setup(struct net_device *dev) { @@ -1537,6 +1760,8 @@ static size_t ip6gre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_INDEX */ + nla_total_size(4) + 0; } @@ -1558,7 +1783,8 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) || - nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark)) + nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark) || + nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, @@ -1594,8 +1820,23 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, }; +static void ip6erspan_tap_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->netdev_ops = &ip6erspan_netdev_ops; + dev->needs_free_netdev = true; + dev->priv_destructor = ip6gre_dev_free; + + dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= 
~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); +} + static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { .kind = "ip6gre", .maxtype = IFLA_GRE_MAX, @@ -1625,6 +1866,20 @@ static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; +static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = { + .kind = "ip6erspan", + .maxtype = IFLA_GRE_MAX, + .policy = ip6gre_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = ip6erspan_tap_setup, + .validate = ip6erspan_tap_validate, + .newlink = ip6gre_newlink, + .changelink = ip6gre_changelink, + .get_size = ip6gre_get_size, + .fill_info = ip6gre_fill_info, + .get_link_net = ip6_tnl_get_link_net, +}; + /* * And now the modules code and kernel interface. */ @@ -1653,9 +1908,15 @@ static int __init ip6gre_init(void) if (err < 0) goto tap_ops_failed; + err = rtnl_link_register(&ip6erspan_tap_ops); + if (err < 0) + goto erspan_link_failed; + out: return err; +erspan_link_failed: + rtnl_link_unregister(&ip6gre_tap_ops); tap_ops_failed: rtnl_link_unregister(&ip6gre_link_ops); rtnl_link_failed: @@ -1669,6 +1930,7 @@ static void __exit ip6gre_fini(void) { rtnl_link_unregister(&ip6gre_tap_ops); rtnl_link_unregister(&ip6gre_link_ops); + rtnl_link_unregister(&ip6erspan_tap_ops); inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE); unregister_pernet_device(&ip6gre_net_ops); } From c0cc8449e4d888f27b7255deaa54d68912876e38 Mon Sep 17 00:00:00 2001 From: William Tu Date: Fri, 1 Dec 2017 15:26:08 -0800 Subject: [PATCH 0188/1640] BACKPORT: ip6_gre: add ip6 gre and gretap collect_md mode Similar to gre, vxlan, geneve, ipip tunnels, allow ip6 gre and gretap tunnels to operate in collect metadata mode. bpf_skb_[gs]et_tunnel_key() helpers can make use of it right away. OVS can use it as well in the future. Signed-off-by: William Tu Signed-off-by: David S. 
Miller --- net/ipv6/ip6_gre.c | 104 ++++++++++++++++++++++++++++++++++++++---- net/ipv6/ip6_tunnel.c | 5 +- 2 files changed, 98 insertions(+), 11 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index ec5661c3229a..ec057b689c22 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -56,6 +56,7 @@ #include #include #include +#include static bool log_ecn_error = true; @@ -69,6 +70,7 @@ static unsigned int ip6gre_net_id __read_mostly; struct ip6gre_net { struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; + struct ip6_tnl __rcu *collect_md_tun; struct net_device *fb_tunnel_dev; }; @@ -230,6 +232,10 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, if (cand) return cand; + t = rcu_dereference(ign->collect_md_tun); + if (t && t->dev->flags & IFF_UP) + return t; + ndev = READ_ONCE(ign->fb_tunnel_dev); if (ndev && ndev->flags & IFF_UP) return netdev_priv(ndev); @@ -265,6 +271,9 @@ static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t); + if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun, t); + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } @@ -274,6 +283,9 @@ static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t) struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; + if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun, NULL); + for (tp = ip6gre_bucket(ign, t); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { @@ -468,7 +480,22 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) &ipv6h->saddr, &ipv6h->daddr, tpi->key, tpi->proto); if (tunnel) { - ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + if (tunnel->parms.collect_md) { + struct metadata_dst *tun_dst; + __be64 tun_id; + __be16 flags; + + flags = tpi->flags; + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0); + if (!tun_dst) + return PACKET_REJECT; + + ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + } else { + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + } return PACKET_RCVD; } @@ -638,8 +665,38 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? 
htons(ETH_P_TEB) : proto; - gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, - protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); + + if (tunnel->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + __be16 flags; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || + !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -EINVAL; + + key = &tun_info->key; + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_proto = IPPROTO_GRE; + fl6->daddr = key->u.ipv6.dst; + fl6->flowlabel = key->label; + fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + dsfield = key->tos; + flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + tunnel->tun_hlen = gre_calc_hlen(flags); + + gre_build_header(skb, tunnel->tun_hlen, + flags, protocol, + tunnel_id_to_key32(tun_info->key.tun_id), 0); + + } else { + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + protocol, tunnel->parms.o_key, + htonl(tunnel->o_seqno)); + } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); @@ -650,13 +707,15 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); int encap_limit = -1; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield = 0; __u32 mtu; int err; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, &dsfield, &encap_limit); + if (!t->parms.collect_md) + prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, + &dsfield, &encap_limit); err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) @@ -681,14 +740,15 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) struct ipv6hdr *ipv6h = ipv6_hdr(skb); int encap_limit = -1; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield = 0; __u32 mtu; int err; if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) return -1; - if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) + if (!t->parms.collect_md && + prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) @@ -736,7 +796,8 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + if (!t->parms.collect_md) + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); if (err) @@ -1241,6 +1302,10 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; + if (tunnel->parms.collect_md) { + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } ip6gre_tnl_init_features(dev); return 0; @@ -1257,6 +1322,9 @@ static int ip6gre_tunnel_init(struct net_device *dev) tunnel = netdev_priv(dev); + if (tunnel->parms.collect_md) + return 0; + memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); @@ -1505,6 +1573,9 @@ static void ip6gre_netlink_parms(struct nlattr *data[], if (data[IFLA_GRE_ERSPAN_INDEX]) parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + + if (data[IFLA_GRE_COLLECT_METADATA]) + parms->collect_md = true; } static int ip6gre_tap_init(struct net_device *dev) @@ -1654,8 +1725,13 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, 
ip6gre_netlink_parms(data, &nt->parms); - if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) - return -EEXIST; + if (nt->parms.collect_md) { + if (rtnl_dereference(ign->collect_md_tun)) + return -EEXIST; + } else { + if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) + return -EEXIST; + } if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); @@ -1758,6 +1834,8 @@ static size_t ip6gre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_GRE_COLLECT_METADATA */ + nla_total_size(0) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + /* IFLA_GRE_ERSPAN_INDEX */ @@ -1797,6 +1875,11 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (p->collect_md) { + if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) + goto nla_put_failure; + } + return 0; nla_put_failure: @@ -1819,6 +1902,7 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, }; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 80d76ecb1a26..0be6cb160345 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -879,7 +879,7 @@ int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, if (tpi->proto == htons(ETH_P_IP)) dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate; - return __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate, + return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); @@ -998,6 +998,9 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, int ret = 0; struct net *net = t->net; + if (t->parms.collect_md) + return 1; + if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { From ea1f5600ca181295695755a160db2a018d17d122 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 1 Mar 2018 13:49:57 -0800 Subject: [PATCH 0189/1640] UPSTREAM: gre: add sequence number for collect md mode. Currently GRE sequence number can only be used in native tunnel mode. This patch adds sequence number support for gre collect metadata mode. RFC2890 defines GRE sequence number to be specific to the traffic flow identified by the key. However, this patch does not implement per-key seqno. The sequence number is shared in the same tunnel device. That is, different tunnel keys using the same collect_md tunnel share single sequence number. Signed-off-by: William Tu Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 4 +++- net/ipv4/ip_gre.c | 7 +++++-- net/ipv6/ip6_gre.c | 13 ++++++++----- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 397889abec69..77add95817fa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -890,6 +890,7 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key flags. */ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) #define BPF_F_DONT_FRAGMENT (1ULL << 2) +#define BPF_F_SEQ_NUMBER (1ULL << 3) /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. 
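From a BPF program the new flag is simply OR'd into the bpf_skb_set_tunnel_key() flags argument. A minimal tc classifier sketch in the style of the kernel's tunnel selftests (bpf_helpers.h is the selftests helper header; the tunnel id and remote address are placeholders):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"

SEC("gre_set_tunnel_seq")
int gre_set_tunnel_seq(struct __sk_buff *skb)
{
	struct bpf_tunnel_key key = {};

	key.tunnel_id = 2;
	key.remote_ipv4 = 0xac100164;	/* 172.16.1.100 */
	key.tunnel_ttl = 64;

	/* Request a GRE sequence number on every transmitted packet */
	if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
				   BPF_F_ZERO_CSUM_TX | BPF_F_SEQ_NUMBER) < 0)
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}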
diff --git a/net/core/filter.c b/net/core/filter.c index 09fc3676c2ef..e4fccc35c872 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3065,7 +3065,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | - BPF_F_DONT_FRAGMENT))) + BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -3099,6 +3099,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; if (flags & BPF_F_ZERO_CSUM_TX) info->key.tun_flags &= ~TUNNEL_CSUM; + if (flags & BPF_F_SEQ_NUMBER) + info->key.tun_flags |= TUNNEL_SEQ; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 056803ad41ef..f905a59b7368 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -516,6 +516,7 @@ err_free_skb: static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { + struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct rtable *rt = NULL; @@ -539,9 +540,11 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) goto err_free_rt; - flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + flags = tun_info->key.tun_flags & + (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); gre_build_header(skb, tunnel_hlen, flags, proto, - tunnel_id_to_key32(tun_info->key.tun_id), 0); + tunnel_id_to_key32(tun_info->key.tun_id), + (flags | TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index ec057b689c22..4466b8a5e2b4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -660,9 +660,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, else fl6->daddr = tunnel->parms.raddr; - if (tunnel->parms.o_flags & TUNNEL_SEQ) - tunnel->o_seqno++; - /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; @@ -685,14 +682,20 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = key->tos; - flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + flags = key->tun_flags & + (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); tunnel->tun_hlen = gre_calc_hlen(flags); gre_build_header(skb, tunnel->tun_hlen, flags, protocol, - tunnel_id_to_key32(tun_info->key.tun_id), 0); + tunnel_id_to_key32(tun_info->key.tun_id), + (flags | TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) + : 0); } else { + if (tunnel->parms.o_flags & TUNNEL_SEQ) + tunnel->o_seqno++; + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); From a2f3b1d594d1fd8bd15f323f1dd57ffc82b2cfd6 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 8 Mar 2018 23:46:33 -0800 Subject: [PATCH 0190/1640] UPSTREAM: bpf: comment why dots in filenames under BPF virtual FS are not allowed When pinning a file under the BPF virtual file system (traditionally /sys/fs/bpf), using a dot in the name of the location to pin at is not allowed. For example, trying to pin at "/sys/fs/bpf/foo.bar" will be rejected with -EPERM. This check was introduced at the same time as the BPF file system itself, with commit b2197755b263 ("bpf: add support for persistent maps/progs"). 
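A minimal userspace illustration of the restriction (a sketch assuming libbpf's bpf_obj_pin() and an already loaded program fd; error handling trimmed, and the header path may vary by libbpf version):

#include <bpf/bpf.h>	/* libbpf */
#include <stdio.h>

void pin_demo(int prog_fd)
{
	/* Rejected with EPERM: the name contains a dot */
	if (bpf_obj_pin(prog_fd, "/sys/fs/bpf/foo.bar"))
		perror("pin foo.bar");

	/* A dot-free name pins fine */
	if (bpf_obj_pin(prog_fd, "/sys/fs/bpf/foo_bar"))
		perror("pin foo_bar");
}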
At the time the check was introduced, it lived in a function called "bpf_dname_reserved()", which made clear that using a dot was reserved for future extensions. This function disappeared and the check was moved elsewhere with commit 0c93b7d85d40 ("bpf: reject invalid names right in ->lookup()"), and the meaning of the dot ban was lost. The present commit simply adds a comment in the source to explain to the reader that the use of dots is reserved for future extensions. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index dc26065e79f7..e98e2702ae4e 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -178,6 +178,9 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { + /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future + * extensions. + */ if (strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); From 1854029ed8f3d2f1306a0a5f42a18d8bb5be9eae Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 14 Mar 2018 10:23:21 -0700 Subject: [PATCH 0191/1640] BACKPORT: bpf: extend stackmap to save binary_build_id+offset instead of address Currently, a bpf stackmap stores an address for each entry in the call trace. To map these addresses to user space files, it is necessary to maintain the mapping from these virtual addresses to symbols in the binary. Usually, the user space profiler (such as perf) has to scan /proc/pid/maps at the beginning of profiling, and monitor mmap2() calls afterwards. Given the cost of maintaining the address map, this solution is not practical for system wide profiling that is always on. This patch tries to solve this problem with a variation of stackmap. This variation is enabled by flag BPF_F_STACK_BUILD_ID. Instead of storing addresses, the variation stores ELF file build_id + offset. Build ID is a 20-byte unique identifier for ELF files. The following command shows the Build ID of /bin/bash: [user@]$ readelf -n /bin/bash ... Build ID: XXXXXXXXXX ... With BPF_F_STACK_BUILD_ID, bpf_get_stackid() tries to parse the Build ID for each entry in the call trace, and translate it into the following struct: struct bpf_stack_build_id { __s32 status; unsigned char build_id[BPF_BUILD_ID_SIZE]; union { __u64 offset; __u64 ip; }; }; The search for the build_id is limited to the first page of the file, and this page should be in the page cache. Otherwise, we fall back to storing the ip for this entry (the ip field in struct bpf_stack_build_id). This requires the build_id to be stored in the first page. A quick survey of binary and dynamic library files in a few different systems shows that almost all binary and dynamic library files have the build_id in the first page. Build_id is only meaningful for user stacks. If a kernel stack is added to a stackmap with BPF_F_STACK_BUILD_ID, it will automatically fall back to storing only the ip (status == BPF_STACK_BUILD_ID_IP). Similarly, if the build_id lookup fails for some reason, it will also fall back to storing the ip. User space can access struct bpf_stack_build_id with the bpf syscall BPF_MAP_LOOKUP_ELEM. It is necessary for user space to maintain the mapping from build id to binary files. This mostly static mapping is much easier to maintain than per-process address maps. Note: Stackmap with build_id only works in non-nmi context at this time. This is because we need to take mm->mmap_sem for find_vma().
If this changes, we would like to allow build_id lookup in nmi context. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 22 ++++ kernel/bpf/stackmap.c | 257 +++++++++++++++++++++++++++++++++++---- 2 files changed, 257 insertions(+), 22 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77add95817fa..0e5cd1f7dfe1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -231,6 +231,28 @@ enum bpf_attach_type { #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) +/* Flag for stack_map, store build_id+offset instead of pointer */ +#define BPF_F_STACK_BUILD_ID (1U << 5) + +enum bpf_stack_build_id_status { + /* user space need an empty entry to identify end of a trace */ + BPF_STACK_BUILD_ID_EMPTY = 0, + /* with valid build_id and offset */ + BPF_STACK_BUILD_ID_VALID = 1, + /* couldn't get build_id, fallback to ip */ + BPF_STACK_BUILD_ID_IP = 2, +}; + +#define BPF_BUILD_ID_SIZE 20 +struct bpf_stack_build_id { + __s32 status; + unsigned char build_id[BPF_BUILD_ID_SIZE]; + union { + __u64 offset; + __u64 ip; + }; +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 43aa79610dee..a3b8c4c78db3 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,16 +9,19 @@ #include #include #include +#include +#include #include "percpu_freelist.h" -#define STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ + BPF_F_STACK_BUILD_ID) struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; u32 nr; - u64 ip[]; + u64 data[]; }; struct bpf_stack_map { @@ -29,6 +32,17 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; +static inline bool stack_map_use_build_id(struct bpf_map *map) +{ + return (map->map_flags & BPF_F_STACK_BUILD_ID); +} + +static inline int stack_map_data_size(struct bpf_map *map) +{ + return stack_map_use_build_id(map) ? + sizeof(struct bpf_stack_build_id) : sizeof(u64); +} + static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u64 elem_size = sizeof(struct stack_map_bucket) + @@ -69,8 +83,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - value_size < 8 || value_size % 8 || - value_size / 8 > sysctl_perf_event_max_stack) + value_size < 8 || value_size % 8) + return ERR_PTR(-EINVAL); + + BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); + if (attr->map_flags & BPF_F_STACK_BUILD_ID) { + if (value_size % sizeof(struct bpf_stack_build_id) || + value_size / sizeof(struct bpf_stack_build_id) + > sysctl_perf_event_max_stack) + return ERR_PTR(-EINVAL); + } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2; roundup_pow_of_two() can overflow @@ -120,13 +142,184 @@ free_smap: return ERR_PTR(err); } +#define BPF_BUILD_ID 3 +/* + * Parse build id from the note segment. This logic can be shared between + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are + * identical. 
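+ * (Both are three 32-bit words, n_namesz, n_descsz and n_type,
+ * because Elf64_Word is also 32 bits wide.)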
+ */ +static inline int stack_map_parse_build_id(void *page_addr, + unsigned char *build_id, + void *note_start, + Elf32_Word note_size) +{ + Elf32_Word note_offs = 0, new_offs; + + /* check for overflow */ + if (note_start < page_addr || note_start + note_size < note_start) + return -EINVAL; + + /* only supports note that fits in the first page */ + if (note_start + note_size > page_addr + PAGE_SIZE) + return -EINVAL; + + while (note_offs + sizeof(Elf32_Nhdr) < note_size) { + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + + if (nhdr->n_type == BPF_BUILD_ID && + nhdr->n_namesz == sizeof("GNU") && + nhdr->n_descsz == BPF_BUILD_ID_SIZE) { + memcpy(build_id, + note_start + note_offs + + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), + BPF_BUILD_ID_SIZE); + return 0; + } + new_offs = note_offs + sizeof(Elf32_Nhdr) + + ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); + if (new_offs <= note_offs) /* overflow */ + break; + note_offs = new_offs; + } + return -EINVAL; +} + +/* Parse build ID from 32-bit ELF */ +static int stack_map_get_build_id_32(void *page_addr, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; + Elf32_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + return -EINVAL; + + phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID from 64-bit ELF */ +static int stack_map_get_build_id_64(void *page_addr, + unsigned char *build_id) +{ + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; + Elf64_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + return -EINVAL; + + phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID of ELF file mapped to vma */ +static int stack_map_get_build_id(struct vm_area_struct *vma, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr; + struct page *page; + void *page_addr; + int ret; + + /* only works for page backed storage */ + if (!vma->vm_file) + return -EINVAL; + + page = find_get_page(vma->vm_file->f_mapping, 0); + if (!page) + return -EFAULT; /* page not mapped */ + + ret = -EINVAL; + page_addr = page_address(page); + ehdr = (Elf32_Ehdr *)page_addr; + + /* compare magic x7f "ELF" */ + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + /* only support executable file and shared object file */ + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) + goto out; + + if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) + ret = stack_map_get_build_id_32(page_addr, build_id); + else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) + ret = stack_map_get_build_id_64(page_addr, build_id); +out: + put_page(page); + return ret; +} + +static void stack_map_get_build_id_offset(struct bpf_map *map, + struct stack_map_bucket *bucket, + u64 *ips, u32 trace_nr, bool user) +{ + int i; + struct vm_area_struct *vma; + struct bpf_stack_build_id *id_offs; + + bucket->nr = trace_nr; + id_offs = (struct bpf_stack_build_id *)bucket->data; + + /* + * We cannot do up_read() in nmi context, so build_id lookup is 
+ * only supported for non-nmi events. If at some point, it is + * possible to run find_vma() without taking the semaphore, we + * would like to allow build_id lookup in nmi context. + * + * Same fallback is used for kernel stack (!user) on a stackmap + * with build_id. + */ + if (!user || !current || !current->mm || in_nmi() || + down_read_trylock(¤t->mm->mmap_sem) == 0) { + /* cannot access current->mm, fall back to ips */ + for (i = 0; i < trace_nr; i++) { + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + } + return; + } + + for (i = 0; i < trace_nr; i++) { + vma = find_vma(current->mm, ips[i]); + if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { + /* per entry fall back to ips */ + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + continue; + } + id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] + - vma->vm_start; + id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + } + up_read(¤t->mm->mmap_sem); +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / 8; + u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -134,6 +327,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, bool user = flags & BPF_F_USER_STACK; bool kernel = !user; u64 *ips; + bool hash_matches; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) @@ -162,24 +356,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); - if (bucket && bucket->hash == hash) { - if (flags & BPF_F_FAST_STACK_CMP) + hash_matches = bucket && bucket->hash == hash; + /* fast cmp */ + if (hash_matches && flags & BPF_F_FAST_STACK_CMP) + return id; + + if (stack_map_use_build_id(map)) { + /* for build_id+offset, pop a bucket before slow cmp */ + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + stack_map_get_build_id_offset(map, new_bucket, ips, + trace_nr, user); + trace_len = trace_nr * sizeof(struct bpf_stack_build_id); + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, new_bucket->data, trace_len) == 0) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return id; - if (bucket->nr == trace_nr && - memcmp(bucket->ip, ips, trace_len) == 0) + } + if (bucket && !(flags & BPF_F_REUSE_STACKID)) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); + return -EEXIST; + } + } else { + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, ips, trace_len) == 0) return id; + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + memcpy(new_bucket->data, ips, trace_len); } - /* this call stack is not in the map, try to add it */ - if (bucket && !(flags & BPF_F_REUSE_STACKID)) - return -EEXIST; - - new_bucket = (struct stack_map_bucket *) - pcpu_freelist_pop(&smap->freelist); - if (unlikely(!new_bucket)) - return -ENOMEM; - - 
memcpy(new_bucket->ip, ips, trace_len); new_bucket->hash = hash; new_bucket->nr = trace_nr; @@ -218,8 +431,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) if (!bucket) return -ENOENT; - trace_len = bucket->nr * sizeof(u64); - memcpy(value, bucket->ip, trace_len); + trace_len = bucket->nr * stack_map_data_size(map); + memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); old_bucket = xchg(&smap->buckets[id], bucket); From cb1294fdb5dfc274f9f28c695a587fd582916c56 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:56:54 -0700 Subject: [PATCH 0192/1640] UPSTREAM: sockmap: convert refcnt to an atomic refcnt The sockmap refcnt has up until now been wrapped in the sk_callback_lock(), so it does not actually need any locking of its own. The counter itself tracks the lifetime of the psock object. Sockets in a sockmap have a lifetime that is independent of the map they are part of. This is possible because a single socket may be in multiple maps. When this happens we can only release the psock data associated with the socket when the refcnt reaches zero. There are three paths that decrement the sock reference: first, the normal sockmap path, where the user deletes the socket from the map. Second, the map itself is removed and all sockets in the map are removed; this delete path is similar to case 1. The third case is an asynchronous socket event, such as the socket being closed. The last case handles removing sockets that are no longer available. For completeness: although inc does not pose any problems in this patch series, the inc case only happens when a psock is added to a map. Next we plan to add another socket prog type to handle policy and monitoring on the TX path. When we do this, however, we will need to keep a reference count open across the sendmsg/sendpage call, and holding the sk_callback_lock() there (on every send) seems less than ideal; it may also sleep in cases where we hit memory pressure. Instead of dealing with these issues in some clever way, simply make the reference counting a refcount_t type and do proper atomic ops.
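The resulting lifetime rule is the standard refcount_t get/put pattern. In miniature (a hypothetical object, not the actual sockmap code):

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
	refcount_t refcnt;
};

static struct obj *obj_alloc(void)
{
	struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (o)
		refcount_set(&o->refcnt, 1);	/* caller holds the first ref */
	return o;
}

/* Take a reference; fails once the count has already hit zero */
static struct obj *obj_get(struct obj *o)
{
	return refcount_inc_not_zero(&o->refcnt) ? o : NULL;
}

/* Drop a reference; the last put frees, no external lock required */
static void obj_put(struct obj *o)
{
	if (refcount_dec_and_test(&o->refcnt))
		kfree(o);
}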
Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 242e6de49a64..98103d2c4ff0 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -62,8 +62,7 @@ struct smap_psock_map_entry { struct smap_psock { struct rcu_head rcu; - /* refcnt is used inside sk_callback_lock */ - u32 refcnt; + refcount_t refcnt; /* datapath variables */ struct sk_buff_head rxqueue; @@ -376,15 +375,13 @@ static void smap_destroy_psock(struct rcu_head *rcu) static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { - psock->refcnt--; - if (psock->refcnt) - return; - - tcp_cleanup_ulp(sock); - smap_stop_sock(psock, sock); - clear_bit(SMAP_TX_RUNNING, &psock->state); - rcu_assign_sk_user_data(sock, NULL); - call_rcu_sched(&psock->rcu, smap_destroy_psock); + if (refcount_dec_and_test(&psock->refcnt)) { + tcp_cleanup_ulp(sock); + smap_stop_sock(psock, sock); + clear_bit(SMAP_TX_RUNNING, &psock->state); + rcu_assign_sk_user_data(sock, NULL); + call_rcu_sched(&psock->rcu, smap_destroy_psock); + } } static int smap_parse_func_strparser(struct strparser *strp, @@ -514,7 +511,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); INIT_LIST_HEAD(&psock->maps); - psock->refcnt = 1; + refcount_set(&psock->refcnt, 1); rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -775,7 +772,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = -EBUSY; goto out_progs; } - psock->refcnt++; + refcount_inc(&psock->refcnt); } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { From 1f2e3124addeca59ced7da66d56c1f4e22a3e88e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:10 -0700 Subject: [PATCH 0193/1640] BACKPORT: bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data This implements a BPF ULP layer to allow policy enforcement and monitoring at the socket layer. In order to support this a new program type BPF_PROG_TYPE_SK_MSG is used to run the policy at the sendmsg/sendpage hook. To attach the policy to sockets a sockmap is used with a new program attach type BPF_SK_MSG_VERDICT. As with previous sockmap usage, when a sock is added to a sockmap via a map update and the map has a BPF_SK_MSG_VERDICT program attached, the BPF ULP layer is created on the socket and the attached BPF_PROG_TYPE_SK_MSG program is run for every msg in the sendmsg case and for every page/offset in the sendpage case. BPF_PROG_TYPE_SK_MSG Semantics/API: BPF_PROG_TYPE_SK_MSG supports only two return codes SK_PASS and SK_DROP. Returning SK_DROP frees the copied data in the sendmsg case and leaves the data untouched in the sendpage case. Both cases return -EACCES to the user. Returning SK_PASS will allow the msg to be sent. In the sendmsg case data is copied into kernel space buffers before running the BPF program. The kernel space buffers are stored in a scatterlist object where each element is a kernel memory buffer. Some effort is made to coalesce data from the sendmsg call here. For example, a sendmsg call with many one-byte iov entries will likely be coalesced into a single entry. The BPF program is run with data pointers (start/end) pointing to the first sg element. In the sendpage case data is not copied.
We opt not to copy the data by default here, because the BPF infrastructure does not know what bytes will be needed nor when they will be needed. So copying all bytes may be wasteful. Because of this the initial start/end data pointers are (0,0). Meaning no data can be read or written. This avoids reading data that may be modified by the user. A new helper is added later in this series if reading and writing the data is needed. The helper call will do a copy by default so that the page is exclusively owned by the BPF call. The verdict from the BPF_PROG_TYPE_SK_MSG applies to the entire msg in the sendmsg() case and the entire page/offset in the sendpage case. This avoids ambiguity on how to handle mixed return codes in the sendmsg case. Again a helper is added later in the series if a verdict needs to apply to multiple system calls and/or only a subpart of the currently being processed message. The helper msg_redirect_map() can be used to select the socket to send the data on. This is used similar to existing redirect use cases. This allows policy to redirect msgs. Pseudo code simple example: The basic logic to attach a program to a socket is as follows, // load the programs bpf_prog_load(SOCKMAP_TCP_MSG_PROG, BPF_PROG_TYPE_SK_MSG, &obj, &msg_prog); // lookup the sockmap bpf_map_msg = bpf_object__find_map_by_name(obj, "my_sock_map"); // get fd for sockmap map_fd_msg = bpf_map__fd(bpf_map_msg); // attach program to sockmap bpf_prog_attach(msg_prog, map_fd_msg, BPF_SK_MSG_VERDICT, 0); Adding sockets to the map is done in the normal way, // Add a socket 'fd' to sockmap at location 'i' bpf_map_update_elem(map_fd_msg, &i, fd, BPF_ANY); After the above any socket attached to "my_sock_map", in this case 'fd', will run the BPF msg verdict program (msg_prog) on every sendmsg and sendpage system call. For a complete example see BPF selftests or sockmap samples. Implementation notes: It seemed the simplest, to me at least, to use a refcnt to ensure psock is not lost across the sendmsg copy into the sg, the bpf program running on the data in sg_data, and the final pass to the TCP stack. Some performance testing may show a better method to do this and avoid the refcnt cost, but for now use the simpler method. Another item that will come after basic support is in place is supporting MSG_MORE flag. At the moment we call sendpages even if the MSG_MORE flag is set. An enhancement would be to collect the pages into a larger scatterlist and pass down the stack. Notice that bpf_tcp_sendmsg() could support this with some additional state saved across sendmsg calls. I built the code to support this without having to do refactoring work. Other features TBD include ZEROCOPY and the TCP_RECV_QUEUE/TCP_NO_QUEUE support. This will follow initial series shortly. Future work could improve size limits on the scatterlist rings used here. Currently, we use MAX_SKB_FRAGS simply because this was being used already in the TLS case. Future work could extend the kernel sk APIs to tune this depending on workload. This is a trade-off between memory usage and throughput performance. Signed-off-by: John Fastabend Acked-by: David S. 
Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/linux/filter.h | 17 + include/uapi/linux/bpf.h | 19 + kernel/bpf/sockmap.c | 712 +++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 14 +- kernel/bpf/verifier.c | 5 +- net/core/filter.c | 106 ++++++ 8 files changed, 855 insertions(+), 20 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4734a73db06f..a774cb63a94f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -21,6 +21,7 @@ struct bpf_verifier_env; struct perf_event; struct bpf_prog; struct bpf_map; +struct sock; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 19b8349a3809..5e2e8a49fb21 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -13,6 +13,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) #endif #ifdef CONFIG_BPF_EVENTS BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) diff --git a/include/linux/filter.h b/include/linux/filter.h index 5e92d1c9c70c..262f22202560 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -587,6 +587,22 @@ struct xdp_buff { struct xdp_rxq_info *rxq; }; +struct sk_msg_buff { + void *data; + void *data_end; + __u32 apply_bytes; + __u32 cork_bytes; + int sg_copybreak; + int sg_start; + int sg_curr; + int sg_end; + struct scatterlist sg_data[MAX_SKB_FRAGS]; + bool sg_copy[MAX_SKB_FRAGS]; + __u32 key; + __u32 flags; + struct bpf_map *map; +}; + /* compute the linear packet data range [data, data_end) which * will be accessed by cls_bpf, act_bpf and lwt programs */ @@ -848,6 +864,7 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); +struct sock *do_msg_redirect_map(struct sk_msg_buff *md); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0e5cd1f7dfe1..ad74f526eb80 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SK_MSG, }; enum bpf_attach_type { @@ -143,6 +144,7 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, + BPF_SK_MSG_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -719,6 +721,15 @@ union bpf_attr { * @pt_regs: pointer to struct pt_regs * @rc: the return value to set * + * + * int bpf_msg_redirect_map(map, key, flags) + * Redirect msg to a sock in map using key as a lookup key for the + * sock in map. 
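+ * The helper only records the redirect target; the actual redirect
+ * is performed later on the sendmsg/sendpage path.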
+ * @map: pointer to sockmap + * @key: key to lookup sock in map + * @flags: reserved for future use + * Return: SK_PASS + * * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that @@ -1038,6 +1049,14 @@ enum sk_action { SK_PASS, }; +/* user accessible metadata for SK_MSG packet hook, new fields must + * be added to the end of this structure + */ +struct sk_msg_md { + void *data; + void *data_end; +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 98103d2c4ff0..8a5e251a57e8 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; }; @@ -73,7 +75,16 @@ struct smap_psock { int save_off; struct sk_buff *save_skb; + /* datapath variables for tx_msg ULP */ + struct sock *sk_redir; + int apply_bytes; + int cork_bytes; + int sg_size; + int eval; + struct sk_msg_buff *cork; + struct strparser strp; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; @@ -91,6 +102,11 @@ struct smap_psock { void (*save_write_space)(struct sock *sk); }; +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); + static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { return rcu_dereference_sk_user_data(sk); @@ -115,27 +131,41 @@ static int bpf_tcp_init(struct sock *sk) psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; + + if (psock->bpf_tx_msg) { + tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; + tcp_bpf_proto.sendpage = bpf_tcp_sendpage; + } + sk->sk_prot = &tcp_bpf_proto; rcu_read_unlock(); return 0; } +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); + static void bpf_tcp_release(struct sock *sk) { struct smap_psock *psock; rcu_read_lock(); psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; - if (likely(psock)) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; } + + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; +out: rcu_read_unlock(); } -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); - static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); @@ -174,6 +204,7 @@ enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, + __SK_NONE, }; static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { @@ -185,10 +216,621 @@ static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { .release = bpf_tcp_release, }; +static int memcopy_from_iter(struct sock *sk, + struct sk_msg_buff *md, + struct iov_iter *from, int bytes) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_curr, rc = -ENOSPC; + + do { + int copy; + char *to; + + if (md->sg_copybreak >= sg[i].length) { + md->sg_copybreak = 0; + + if (++i == MAX_SKB_FRAGS) + i = 0; + + if (i == md->sg_end) + break; + } + + copy = 
sg[i].length - md->sg_copybreak; + to = sg_virt(&sg[i]) + md->sg_copybreak; + md->sg_copybreak += copy; + + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + rc = copy_from_iter_nocache(to, copy, from); + else + rc = copy_from_iter(to, copy, from); + + if (rc != copy) { + rc = -EFAULT; + goto out; + } + + bytes -= copy; + if (!bytes) + break; + + md->sg_copybreak = 0; + if (++i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +out: + md->sg_curr = i; + return rc; +} + +static int bpf_tcp_push(struct sock *sk, int apply_bytes, + struct sk_msg_buff *md, + int flags, bool uncharge) +{ + bool apply = apply_bytes; + struct scatterlist *sg; + int offset, ret = 0; + struct page *p; + size_t size; + + while (1) { + sg = md->sg_data + md->sg_start; + size = (apply && apply_bytes < sg->length) ? + apply_bytes : sg->length; + offset = sg->offset; + + tcp_rate_check_app_limited(sk); + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, flags); + if (ret != size) { + if (ret > 0) { + if (apply) + apply_bytes -= ret; + size -= ret; + offset += ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + goto retry; + } + + sg->length = size; + sg->offset = offset; + return ret; + } + + if (apply) + apply_bytes -= ret; + sg->offset += ret; + sg->length -= ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + + if (!sg->length) { + put_page(p); + md->sg_start++; + if (md->sg_start == MAX_SKB_FRAGS) + md->sg_start = 0; + memset(sg, 0, sizeof(*sg)); + + if (md->sg_start == md->sg_end) + break; + } + + if (apply && !apply_bytes) + break; + } + return 0; +} + +static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data + md->sg_start; + + if (md->sg_copy[md->sg_start]) { + md->data = md->data_end = 0; + } else { + md->data = sg_virt(sg); + md->data_end = md->data + sg->length; + } +} + +static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start; + + do { + int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + if (!bytes) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +} + +static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start, free; + + while (bytes && sg[i].length) { + free = sg[i].length; + if (bytes < free) { + sg[i].length -= bytes; + sg[i].offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + bytes -= sg[i].length; + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } +} + +static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = start, free = 0; + + while (sg[i].length) { + free += sg[i].length; + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } + + return free; +} + +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +{ + int free = free_sg(sk, md->sg_start, md); + + md->sg_start = md->sg_end; + return free; +} + +static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) +{ + return free_sg(sk, md->sg_curr, md); +} + +static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) +{ + return ((_rc == SK_PASS) ? + (md->map ? 
__SK_REDIRECT : __SK_PASS) : + __SK_DROP); +} + +static unsigned int smap_do_tx_msg(struct sock *sk, + struct smap_psock *psock, + struct sk_msg_buff *md) +{ + struct bpf_prog *prog; + unsigned int rc, _rc; + + preempt_disable(); + rcu_read_lock(); + + /* If the policy was removed mid-send then default to 'accept' */ + prog = READ_ONCE(psock->bpf_tx_msg); + if (unlikely(!prog)) { + _rc = SK_PASS; + goto verdict; + } + + bpf_compute_data_pointers_sg(md); + rc = (*prog->bpf_func)(md, prog->insnsi); + psock->apply_bytes = md->apply_bytes; + + /* Moving return codes from UAPI namespace into internal namespace */ + _rc = bpf_map_msg_verdict(rc, md); + + /* The psock has a refcount on the sock but not on the map and because + * we need to drop rcu read lock here its possible the map could be + * removed between here and when we need it to execute the sock + * redirect. So do the map lookup now for future use. + */ + if (_rc == __SK_REDIRECT) { + if (psock->sk_redir) + sock_put(psock->sk_redir); + psock->sk_redir = do_msg_redirect_map(md); + if (!psock->sk_redir) { + _rc = __SK_DROP; + goto verdict; + } + sock_hold(psock->sk_redir); + } +verdict: + rcu_read_unlock(); + preempt_enable(); + + return _rc; +} + +static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, + struct sk_msg_buff *md, + int flags) +{ + struct smap_psock *psock; + struct scatterlist *sg; + int i, err, free = 0; + + sg = md->sg_data; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out_rcu; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto out_rcu; + + rcu_read_unlock(); + lock_sock(sk); + err = bpf_tcp_push(sk, send, md, flags, false); + release_sock(sk); + smap_release_sock(psock, sk); + if (unlikely(err)) + goto out; + return 0; +out_rcu: + rcu_read_unlock(); +out: + i = md->sg_start; + while (sg[i].length) { + free += sg[i].length; + put_page(sg_page(&sg[i])); + sg[i].length = 0; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } + return free; +} + +static inline void bpf_md_init(struct smap_psock *psock) +{ + if (!psock->apply_bytes) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } +} + +static void apply_bytes_dec(struct smap_psock *psock, int i) +{ + if (psock->apply_bytes) { + if (psock->apply_bytes < i) + psock->apply_bytes = 0; + else + psock->apply_bytes -= i; + } +} + +static int bpf_exec_tx_verdict(struct smap_psock *psock, + struct sk_msg_buff *m, + struct sock *sk, + int *copied, int flags) +{ + bool cork = false, enospc = (m->sg_start == m->sg_end); + struct sock *redir; + int err = 0; + int send; + +more_data: + if (psock->eval == __SK_NONE) + psock->eval = smap_do_tx_msg(sk, psock, m); + + if (m->cork_bytes && + m->cork_bytes > psock->sg_size && !enospc) { + psock->cork_bytes = m->cork_bytes - psock->sg_size; + if (!psock->cork) { + psock->cork = kcalloc(1, + sizeof(struct sk_msg_buff), + GFP_ATOMIC | __GFP_NOWARN); + + if (!psock->cork) { + err = -ENOMEM; + goto out_err; + } + } + memcpy(psock->cork, m, sizeof(*m)); + goto out_err; + } + + send = psock->sg_size; + if (psock->apply_bytes && psock->apply_bytes < send) + send = psock->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + err = bpf_tcp_push(sk, send, m, flags, true); + if (unlikely(err)) { + *copied -= free_start_sg(sk, m); + break; + } + + apply_bytes_dec(psock, send); + psock->sg_size -= send; + break; + case __SK_REDIRECT: + redir = psock->sk_redir; + apply_bytes_dec(psock, send); + + if (psock->cork) { + cork = true; + 
psock->cork = NULL; + } + + return_mem_sg(sk, send, m); + release_sock(sk); + + err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); + lock_sock(sk); + + if (cork) { + free_start_sg(sk, m); + kfree(m); + m = NULL; + } + if (unlikely(err)) + *copied -= err; + else + psock->sg_size -= send; + break; + case __SK_DROP: + default: + free_bytes_sg(sk, send, m); + apply_bytes_dec(psock, send); + *copied -= send; + psock->sg_size -= send; + err = -EACCES; + break; + } + + if (likely(!err)) { + bpf_md_init(psock); + if (m && + m->sg_data[m->sg_start].page_link && + m->sg_data[m->sg_start].length) + goto more_data; + } + +out_err: + return err; +} + +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; + struct sk_msg_buff md = {0}; + unsigned int sg_copy = 0; + struct smap_psock *psock; + int copied = 0, err = 0; + struct scatterlist *sg; + long timeo; + + /* Its possible a sock event or user removed the psock _but_ the ops + * have not been reprogrammed yet so we get here. In this case fallback + * to tcp_sendmsg. Note this only works because we _only_ ever allow + * a single ULP there is no hierarchy here. + */ + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + /* Increment the psock refcnt to ensure its not released while sending a + * message. Required because sk lookup and bpf programs are used in + * separate rcu critical sections. Its OK if we lose the map entry + * but we can't lose the sock reference. + */ + if (!refcount_inc_not_zero(&psock->refcnt)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + sg = md.sg_data; + sg_init_table(sg, MAX_SKB_FRAGS); + rcu_read_unlock(); + + lock_sock(sk); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + while (msg_data_left(msg)) { + struct sk_msg_buff *m; + bool enospc = false; + int copy; + + if (sk->sk_err) { + err = sk->sk_err; + goto out_err; + } + + copy = msg_data_left(msg); + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + m = psock->cork_bytes ? psock->cork : &md; + m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; + err = sk_alloc_sg(sk, copy, m->sg_data, + m->sg_start, &m->sg_end, &sg_copy, + m->sg_end - 1); + if (err) { + if (err != -ENOSPC) + goto wait_for_memory; + enospc = true; + copy = sg_copy; + } + + err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); + if (err < 0) { + free_curr_sg(sk, m); + goto out_err; + } + + psock->sg_size += copy; + copied += copy; + sg_copy = 0; + + /* When bytes are being corked skip running BPF program and + * applying verdict unless there is no more buffer space. In + * the ENOSPC case simply run BPF prorgram with currently + * accumulated data. We don't have much choice at this point + * we could try extending the page frags or chaining complex + * frags but even in these cases _eventually_ we will hit an + * OOM scenario. More complex recovery schemes may be + * implemented in the future, but BPF programs must handle + * the case where apply_cork requests are not honored. The + * canonical method to verify this is to check data length. 
+ */ + if (psock->cork_bytes) { + if (copy > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= copy; + + if (psock->cork_bytes && !enospc) + goto out_cork; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); + if (unlikely(err < 0)) + goto out_err; + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_err; + } +out_err: + if (err < 0) + err = sk_stream_error(sk, msg->msg_flags, err); +out_cork: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +} + +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct sk_msg_buff md = {0}, *m = NULL; + int err = 0, copied = 0; + struct smap_psock *psock; + struct scatterlist *sg; + bool enospc = false; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto accept; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto accept; + rcu_read_unlock(); + + lock_sock(sk); + + if (psock->cork_bytes) + m = psock->cork; + else + m = &md; + + /* Catch case where ring is full and sendpage is stalled. */ + if (unlikely(m->sg_end == m->sg_start && + m->sg_data[m->sg_end].length)) + goto out_err; + + psock->sg_size += size; + sg = &m->sg_data[m->sg_end]; + sg_set_page(sg, page, size, offset); + get_page(page); + m->sg_copy[m->sg_end] = true; + sk_mem_charge(sk, size); + m->sg_end++; + copied = size; + + if (m->sg_end == MAX_SKB_FRAGS) + m->sg_end = 0; + + if (m->sg_end == m->sg_start) + enospc = true; + + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + + if (psock->cork_bytes && !enospc) + goto out_err; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); +out_err: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +accept: + rcu_read_unlock(); + return tcp_sendpage(sk, page, offset, size, flags); +} + +static void bpf_tcp_msg_add(struct smap_psock *psock, + struct sock *sk, + struct bpf_prog *tx_msg) +{ + struct bpf_prog *orig_tx_msg; + + orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); + if (orig_tx_msg) + bpf_prog_put(orig_tx_msg); +} + static int bpf_tcp_ulp_register(void) { tcp_bpf_proto = tcp_prot; tcp_bpf_proto.close = bpf_tcp_close; + /* Once BPF TX ULP is registered it is never unregistered. It + * will be in the ULP list for the lifetime of the system. Doing + * duplicate registers is not a problem. 
+ */ return tcp_register_ulp(&bpf_tcp_ulp_ops); } @@ -415,7 +1057,6 @@ static int smap_parse_func_strparser(struct strparser *strp, return rc; } - static int smap_read_sock_done(struct strparser *strp, int err) { return err; @@ -485,12 +1126,22 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_parse); if (psock->bpf_verdict) bpf_prog_put(psock->bpf_verdict); + if (psock->bpf_tx_msg) + bpf_prog_put(psock->bpf_tx_msg); + + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + } list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_del(&e->list); kfree(e); } + if (psock->sk_redir) + sock_put(psock->sk_redir); + sock_put(psock->sock); kfree(psock); } @@ -506,6 +1157,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, if (!psock) return ERR_PTR(-ENOMEM); + psock->eval = __SK_NONE; psock->sock = sock; skb_queue_head_init(&psock->rxqueue); INIT_WORK(&psock->tx_work, smap_tx_work); @@ -714,10 +1366,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct smap_psock_map_entry *e = NULL; - struct bpf_prog *verdict, *parse; + struct bpf_prog *verdict, *parse, *tx_msg; struct sock *osock, *sock; struct smap_psock *psock; u32 i = *(u32 *)key; + bool new = false; int err; if (unlikely(flags > BPF_EXIST)) @@ -740,6 +1393,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, */ verdict = READ_ONCE(stab->bpf_verdict); parse = READ_ONCE(stab->bpf_parse); + tx_msg = READ_ONCE(stab->bpf_tx_msg); if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation @@ -758,6 +1412,17 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } } + if (tx_msg) { + tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + if (IS_ERR(tx_msg)) { + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); + return PTR_ERR(tx_msg); + } + } + write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); @@ -772,7 +1437,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = -EBUSY; goto out_progs; } - refcount_inc(&psock->refcnt); + if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { + err = -EBUSY; + goto out_progs; + } + if (!refcount_inc_not_zero(&psock->refcnt)) { + err = -EAGAIN; + goto out_progs; + } } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { @@ -780,11 +1452,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } - err = tcp_set_ulp_id(sock, TCP_ULP_BPF); - if (err) - goto out_progs; - set_bit(SMAP_TX_RUNNING, &psock->state); + new = true; } e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); @@ -797,6 +1466,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. 
*/ + if (tx_msg) + bpf_tcp_msg_add(psock, sock, tx_msg); + if (new) { + err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + if (err) + goto out_free; + } + if (parse && verdict && !psock->strp_enabled) { err = smap_init_sock(psock, sock); if (err) @@ -818,8 +1495,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - if (osock != sock && parse) - smap_stop_sock(opsock, osock); smap_list_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); @@ -832,6 +1507,8 @@ out_progs: bpf_prog_put(verdict); if (parse) bpf_prog_put(parse); + if (tx_msg) + bpf_prog_put(tx_msg); write_unlock_bh(&sock->sk_callback_lock); kfree(e); return err; @@ -846,6 +1523,9 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return -EINVAL; switch (type) { + case BPF_SK_MSG_VERDICT: + orig = xchg(&stab->bpf_tx_msg, prog); + break; case BPF_SK_SKB_STREAM_PARSER: orig = xchg(&stab->bpf_parse, prog); break; @@ -907,6 +1587,10 @@ static void sock_map_release(struct bpf_map *map) orig = xchg(&stab->bpf_verdict, NULL); if (orig) bpf_prog_put(orig); + + orig = xchg(&stab->bpf_tx_msg, NULL); + if (orig) + bpf_prog_put(orig); } const struct bpf_map_ops sock_map_ops = { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 01595089c0ac..dd7e11606995 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1338,7 +1338,8 @@ static int bpf_obj_get(const union bpf_attr *attr) #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) +static int sockmap_get_from_fd(const union bpf_attr *attr, + int type, bool attach) { struct bpf_prog *prog = NULL; int ufd = attr->target_fd; @@ -1352,8 +1353,7 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return PTR_ERR(map); if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_SK_SKB); + prog = bpf_prog_get_type(attr->attach_bpf_fd, type); if (IS_ERR(prog)) { fdput(f); return PTR_ERR(prog); @@ -1405,9 +1405,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, true); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); default: return -EINVAL; } @@ -1460,9 +1462,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); default: return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 03437d915af5..293255530684 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1305,6 +1305,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: if (meta) return meta->pkt_access; @@ -2129,7 +2130,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKMAP: if (func_id != BPF_FUNC_sk_redirect_map && 
func_id != BPF_FUNC_sock_map_update && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_map) goto error; break; default: @@ -2167,6 +2169,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_sk_redirect_map: + case BPF_FUNC_msg_redirect_map: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; diff --git a/net/core/filter.c b/net/core/filter.c index e4fccc35c872..7a1f16be6c34 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1943,6 +1943,44 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, + struct bpf_map *, map, u32, key, u64, flags) +{ + /* If user passes invalid input drop the packet. */ + if (unlikely(flags)) + return SK_DROP; + + msg->key = key; + msg->flags = flags; + msg->map = map; + + return SK_PASS; +} + +struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +{ + struct sock *sk = NULL; + + if (msg->map) { + sk = __sock_map_lookup_elem(msg->map, msg->key); + + msg->key = 0; + msg->map = NULL; + } + + return sk; +} + +static const struct bpf_func_proto bpf_msg_redirect_map_proto = { + .func = bpf_msg_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3678,6 +3716,16 @@ static const struct bpf_func_proto * } } +static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_msg_redirect_map: + return &bpf_msg_redirect_map_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -4067,6 +4115,32 @@ static bool sk_skb_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } +static bool sk_msg_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + switch (off) { + case offsetof(struct sk_msg_md, data): + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct sk_msg_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + if (off < 0 || off >= sizeof(struct sk_msg_md)) + return false; + if (off % size != 0) + return false; + if (size != sizeof(__u64)) + return false; + + return true; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -4865,6 +4939,29 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_msg_md, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, data)); + break; + case offsetof(struct sk_msg_md, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, data_end)); + break; + } + + return insn - insn_buf; +} + const struct bpf_verifier_ops 
sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -4955,6 +5052,15 @@ const struct bpf_verifier_ops sk_skb_verifier_ops = { const struct bpf_prog_ops sk_skb_prog_ops = { }; +const struct bpf_verifier_ops sk_msg_verifier_ops = { + .get_func_proto = sk_msg_func_proto, + .is_valid_access = sk_msg_is_valid_access, + .convert_ctx_access = sk_msg_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_msg_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT;
From 7857c173a2d62f1a9a67a5fdb03d6260251814e6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:15 -0700 Subject: [PATCH 0194/1640] BACKPORT: bpf: sockmap, add bpf_msg_apply_bytes() helper
A single sendmsg or sendfile system call can contain multiple logical messages that a BPF program may want to read and apply a verdict to. But, without an apply_bytes helper, any verdict on the data applies to all bytes in the sendmsg/sendfile. Alternatively, a BPF program may only care to read the first N bytes of a msg. If the payload is large, say MB or even GB, setting up and calling the BPF program repeatedly for all bytes, even though the verdict is already known, creates unnecessary overhead. To allow BPF programs to control how many bytes a given verdict applies to, we implement a bpf_msg_apply_bytes() helper. When called from within a BPF program this sets a counter, internal to the BPF infrastructure, that applies the last verdict to the next N bytes. If N is smaller than the current data being processed from a sendmsg/sendfile call, the first N bytes will be sent and the BPF program will be re-run with start_data pointing to the N+1 byte. If N is larger than the current data being processed, the BPF verdict will be applied to multiple sendmsg/sendfile calls until N bytes are consumed. Note: if a socket closes with the apply_bytes counter non-zero this is not a problem, because data is not being buffered for N bytes and is sent as it's received.
Signed-off-by: John Fastabend Acked-by: David S. Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann
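To make the semantics concrete, a minimal SK_MSG program using the new helper could look like the sketch below. This is an illustrative sketch, not part of the patch: the SEC() annotation and the bpf_helpers.h wrappers are assumed from the usual samples/selftests conventions, the program is assumed to be attached to a sockmap via BPF_SK_MSG_VERDICT, and the 4096-byte figure is an arbitrary example value.

#include <linux/bpf.h>
#include "bpf_helpers.h" /* assumed: SEC() and helper wrappers as in samples/bpf */

SEC("sk_msg")
int msg_verdict(struct sk_msg_md *msg)
{
	/* The verdict returned below should cover the next 4096 bytes
	 * of this stream, so the program is not re-run for every chunk
	 * once the decision is already known.
	 */
	bpf_msg_apply_bytes(msg, 4096);
	return SK_PASS;
}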
--- net/core/filter.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+)
diff --git a/net/core/filter.c b/net/core/filter.c index 7a1f16be6c34..5ccf5bfcf34e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1981,6 +1981,20 @@ static const struct bpf_func_proto bpf_msg_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes) +{ + msg->apply_bytes = bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { + .func = bpf_msg_apply_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3721,6 +3735,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; + case BPF_FUNC_msg_apply_bytes: + return &bpf_msg_apply_bytes_proto; default: return bpf_base_func_proto(func_id); }
From 525497f048ea0a425e2f2343fa06730e58100298 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:20 -0700 Subject: [PATCH 0195/1640] BACKPORT: bpf: sockmap, add msg_cork_bytes() helper
In the case where we need a specific number of bytes before a verdict can be assigned, even if the data spans multiple sendmsg or sendfile calls, the BPF program may use msg_cork_bytes(). The extreme case is a user can call sendmsg repeatedly with 1-byte msg segments. Obviously, this is bad for performance but is still valid. If the BPF program needs N bytes to validate a header, it can use msg_cork_bytes to specify N bytes and the BPF program will not be called again until N bytes have been accumulated. The infrastructure will attempt to coalesce data if possible, so in many cases (most of my use cases at least) the data will be in a single scatterlist element with data pointers pointing to the start/end of the element. However, this is dependent on available memory, so it is not guaranteed. So BPF programs must validate data pointer ranges; but this is the case anyway to convince the verifier the accesses are valid.
Signed-off-by: John Fastabend Acked-by: David S. Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann
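A hedged sketch of the corking pattern described above, under the same samples/selftests assumptions as the previous example (the 8-byte header length is an arbitrary example):

#include <linux/bpf.h>
#include "bpf_helpers.h" /* assumed: SEC() and helper wrappers as in samples/bpf */

SEC("sk_msg")
int msg_cork_header(struct sk_msg_md *msg)
{
	void *data = msg->data;
	void *data_end = msg->data_end;

	/* Header not fully visible yet: ask the infrastructure to
	 * accumulate 8 bytes before calling this program again.
	 */
	if (data + 8 > data_end) {
		bpf_msg_cork_bytes(msg, 8);
		return SK_PASS;
	}

	/* All 8 header bytes are now visible (and, per the note above,
	 * usually linear); parse them and return a verdict here.
	 */
	return SK_PASS;
}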
--- net/core/filter.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+)
diff --git a/net/core/filter.c b/net/core/filter.c index 5ccf5bfcf34e..7741b302b2b0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1995,6 +1995,20 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes) +{ + msg->cork_bytes = bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { + .func = bpf_msg_cork_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3737,6 +3751,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) return &bpf_msg_redirect_map_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; + case BPF_FUNC_msg_cork_bytes: + return &bpf_msg_cork_bytes_proto; default: return bpf_base_func_proto(func_id); }
From 5fef549e3bc1d82693581f5a50f674f5759f61c0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:25 -0700 Subject: [PATCH 0196/1640] BACKPORT: bpf: sk_msg program helper bpf_sk_msg_pull_data
Currently, if a bpf sk msg program is run, the program can only parse data that the (start, end) pointers have already consumed. For sendmsg hooks this is likely the first scatterlist element. For sendpage this will be the range (0,0) because the data is shared with userspace and by default we want to avoid allowing userspace to modify data while (or after) the BPF verdict is being decided. To support pulling in additional bytes for parsing, use a new helper bpf_sk_msg_pull(start, end, flags) which works similarly to the cls tc logic. This helper will attempt to point the data start pointer at 'start' bytes offset into the msg and the data end pointer at 'end' bytes offset into the message. After basic sanity checks to ensure 'start' <= 'end' and 'end' <= msg_length, there are a few cases we need to handle. First, the sendmsg hook has already copied the data from userspace and has exclusive access to it. Therefore, it is not necessary to copy the data. However, it may be required. After finding the scatterlist element with the 'start' offset byte in it, there are two cases. In one, the range (start, end) is entirely contained in the sg element and is already linear. All that is needed is to update the data pointers; no allocate/copy is needed. In the other case, (start, end) crosses sg element boundaries. Here we allocate a block of size 'end - start' and copy the data to linearize it. Next, the sendpage hook has not copied any data in its initial state, so the data pointers are (0,0). This case is handled similarly to the sendmsg case above, except the allocation/copy must always happen. Then when sending the data we have possibly three memory regions that need to be sent, (0, start - 1), (start, end), and (end + 1, msg_length). This is required to ensure any writes by the BPF program are correctly transmitted. Lastly, this operation invalidates any previous data checks, so BPF programs will have to revalidate pointers after making this BPF call.
Signed-off-by: John Fastabend Acked-by: David S.
Miller Signed-off-by: Daniel Borkmann --- net/core/filter.c | 135 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 134 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 7741b302b2b0..cc46f2d22338 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2009,6 +2009,136 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_pull_data, + struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) +{ + unsigned int len = 0, offset = 0, copy = 0; + struct scatterlist *sg = msg->sg_data; + int first_sg, last_sg, i, shift; + unsigned char *p, *to, *from; + int bytes = end - start; + struct page *page; + + if (unlikely(flags || end <= start)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg_start; + do { + len = sg[i].length; + offset += len; + if (start < offset + len) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != msg->sg_end); + + if (unlikely(start >= offset + len)) + return -EINVAL; + + if (!msg->sg_copy[i] && bytes <= len) + goto out; + + first_sg = i; + + /* At this point we need to linearize multiple scatterlist + * elements or a single shared page. Either way we need to + * copy into a linear buffer exclusively owned by BPF. Then + * place the buffer in the scatterlist and fixup the original + * entries by removing the entries now in the linear buffer + * and shifting the remaining entries. For now we do not try + * to copy partial entries to avoid complexity of running out + * of sg_entry slots. The downside is reading a single byte + * will copy the entire sg entry. + */ + do { + copy += sg[i].length; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + if (bytes < copy) + break; + } while (i != msg->sg_end); + last_sg = i; + + if (unlikely(copy < end - start)) + return -EINVAL; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy)); + if (unlikely(!page)) + return -ENOMEM; + p = page_address(page); + offset = 0; + + i = first_sg; + do { + from = sg_virt(&sg[i]); + len = sg[i].length; + to = p + offset; + + memcpy(to, from, len); + offset += len; + sg[i].length = 0; + put_page(sg_page(&sg[i])); + + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != last_sg); + + sg[first_sg].length = copy; + sg_set_page(&sg[first_sg], page, copy, 0); + + /* To repair sg ring we need to shift entries. If we only + * had a single entry though we can just replace it and + * be done. Otherwise walk the ring and shift the entries. 
+ */ + shift = last_sg - first_sg - 1; + if (!shift) + goto out; + + i = first_sg + 1; + do { + int move_from; + + if (i + shift >= MAX_SKB_FRAGS) + move_from = i + shift - MAX_SKB_FRAGS; + else + move_from = i + shift; + + if (move_from == msg->sg_end) + break; + + sg[i] = sg[move_from]; + sg[move_from].length = 0; + sg[move_from].page_link = 0; + sg[move_from].offset = 0; + + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (1); + msg->sg_end -= shift; + if (msg->sg_end < 0) + msg->sg_end += MAX_SKB_FRAGS; +out: + msg->data = sg_virt(&sg[i]) + start - offset; + msg->data_end = msg->data + bytes; + + return 0; +} + +static const struct bpf_func_proto bpf_msg_pull_data_proto = { + .func = bpf_msg_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -2971,7 +3101,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta) + func == bpf_xdp_adjust_meta || + func == bpf_msg_pull_data) return true; return false; @@ -3753,6 +3884,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: return &bpf_msg_cork_bytes_proto; + case BPF_FUNC_msg_pull_data: + return &bpf_msg_pull_data_proto; default: return bpf_base_func_proto(func_id); } From 1744724fa965d7a2e85fdae86e9b6c527a73d747 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Mar 2018 11:41:28 +0100 Subject: [PATCH 0197/1640] BACKPORT: bpf: Remove struct bpf_verifier_env argument from print_bpf_insn We use print_bpf_insn in user space (bpftool and soon perf), so it'd be nice to keep it generic and strip it off the kernel struct bpf_verifier_env argument. This argument can be safely removed, because its users can use the struct bpf_insn_cbs::private_data to pass it. By changing the argument type we can no longer have clean 'verbose' alias to 'bpf_verifier_log_write' in verifier.c. Instead we're adding the 'verbose' cb_print callback and removing the alias. This way we have new cb_print callback in place, and all the 'verbose(env, ...) calls in verifier.c will cleanly cast to 'verbose(void *, ...)' so no other change is needed. Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann --- kernel/bpf/disasm.c | 52 +++++++++++++++++++++---------------------- kernel/bpf/disasm.h | 5 +---- kernel/bpf/verifier.c | 44 ++++++++++++++++++++++-------------- 3 files changed, 54 insertions(+), 47 deletions(-) diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 8740406df2cd..d6b76377cb6e 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -113,16 +113,16 @@ static const char *const bpf_jmp_string[16] = { }; static void print_bpf_end_insn(bpf_insn_print_t verbose, - struct bpf_verifier_env *env, + void *private_data, const struct bpf_insn *insn) { - verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + verbose(private_data, "(%02x) r%d = %s%d r%d\n", + insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? 
"be" : "le", insn->imm, insn->dst_reg); } void print_bpf_insn(const struct bpf_insn_cbs *cbs, - struct bpf_verifier_env *env, const struct bpf_insn *insn, bool allow_ptr_leaks) { @@ -132,23 +132,23 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) - verbose(env, "BUG_alu64_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code); else - print_bpf_end_insn(verbose, env, insn); + print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(env, "(%02x) r%d = %s-r%d\n", + verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n", insn->code, insn->dst_reg, class == BPF_ALU ? "(u32) " : "", insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) %sr%d %s %sr%d\n", + verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); } else { - verbose(env, "(%02x) %sr%d %s %s%d\n", + verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], @@ -157,46 +157,46 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) - verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_XADD) - verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else - verbose(env, "BUG_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_%02x\n", insn->code); } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_st_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); return; } - verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_ldx_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); return; } - verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", + verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { - verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", + verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", + verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); @@ -212,12 +212,12 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, if (map_ptr && !allow_ptr_leaks) imm = 0; - verbose(env, "(%02x) r%d = %s\n", + verbose(cbs->private_data, "(%02x) r%d = %s\n", insn->code, insn->dst_reg, __func_imm_name(cbs, insn, 
imm, tmp, sizeof(tmp))); } else { - verbose(env, "BUG_ld_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP) { @@ -227,35 +227,35 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, char tmp[64]; if (insn->src_reg == BPF_PSEUDO_CALL) { - verbose(env, "(%02x) call pc%s\n", + verbose(cbs->private_data, "(%02x) call pc%s\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp))); } else { strcpy(tmp, "unknown"); - verbose(env, "(%02x) call %s#%d\n", insn->code, + verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp)), insn->imm); } } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose(env, "(%02x) goto pc%+d\n", + verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose(env, "(%02x) exit\n", insn->code); + verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", + verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->src_reg, insn->off); } else { - verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", + verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } } else { - verbose(env, "(%02x) %s\n", + verbose(cbs->private_data, "(%02x) %s\n", insn->code, bpf_class_string[class]); } } diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index 266fe8ee542b..e1324a834a24 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -22,14 +22,12 @@ #include #endif -struct bpf_verifier_env; - extern const char *const bpf_alu_string[16]; extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env, +typedef __printf(2, 3) void (*bpf_insn_print_t)(void *private_data, const char *, ...); typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, const struct bpf_insn *insn); @@ -45,7 +43,6 @@ struct bpf_insn_cbs { }; void print_bpf_insn(const struct bpf_insn_cbs *cbs, - struct bpf_verifier_env *env, const struct bpf_insn *insn, bool allow_ptr_leaks); #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 293255530684..b5cef9a2e4bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -168,23 +168,16 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); -/* log_level controls verbosity level of eBPF verifier. - * bpf_verifier_log_write() is used to dump the verification trace to the log, - * so the user can figure out what's wrong with the program - */ -__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, - const char *fmt, ...) 
+static void log_write(struct bpf_verifier_env *env, const char *fmt, + va_list args) { struct bpf_verifer_log *log = &env->log; unsigned int n; - va_list args; if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) return; - va_start(args, fmt); n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); - va_end(args); WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, "verifier log line truncated - local buffer too short\n"); @@ -197,14 +190,30 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, else log->ubuf = NULL; } -EXPORT_SYMBOL_GPL(bpf_verifier_log_write); -/* Historically bpf_verifier_log_write was called verbose, but the name was too - * generic for symbol export. The function was renamed, but not the calls in - * the verifier to avoid complicating backports. Hence the alias below. + +/* log_level controls verbosity level of eBPF verifier. + * bpf_verifier_log_write() is used to dump the verification trace to the log, + * so the user can figure out what's wrong with the program */ -static __printf(2, 3) void verbose(struct bpf_verifier_env *env, - const char *fmt, ...) - __attribute__((alias("bpf_verifier_log_write"))); +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + log_write(env, fmt, args); + va_end(args); +} +EXPORT_SYMBOL_GPL(bpf_verifier_log_write); + +__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + log_write(private_data, fmt, args); + va_end(args); +} static bool type_is_pkt_pointer(enum bpf_reg_type type) { @@ -4929,10 +4938,11 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level) { const struct bpf_insn_cbs cbs = { .cb_print = verbose, + .private_data = env, }; verbose(env, "%d: ", env->insn_idx); - print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } if (bpf_prog_is_dev_bound(env->prog->aux)) { From df7d5ed67324a62a362bf5c4791f2827c63cd82f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 24 Mar 2018 11:44:22 -0700 Subject: [PATCH 0198/1640] UPSTREAM: bpf: Rename bpf_verifer_log bpf_verifer_log => bpf_verifier_log Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 6 +++--- kernel/bpf/verifier.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 166097378e22..faba2764a74d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -169,7 +169,7 @@ struct bpf_insn_aux_data { #define BPF_VERIFIER_TMP_LOG_SIZE 1024 -struct bpf_verifer_log { +struct bpf_verifier_log { u32 level; char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; char __user *ubuf; @@ -177,7 +177,7 @@ struct bpf_verifer_log { u32 len_total; }; -static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log) +static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) { return log->len_used >= log->len_total - 1; } @@ -203,7 +203,7 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ - struct bpf_verifer_log log; + struct bpf_verifier_log log; u32 subprog_starts[BPF_MAX_SUBPROGS]; /* computes the stack depth of each bpf function */ u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 
b5cef9a2e4bf..ceaf52059c51 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -171,7 +171,7 @@ static DEFINE_MUTEX(bpf_verifier_lock); static void log_write(struct bpf_verifier_env *env, const char *fmt, va_list args) { - struct bpf_verifer_log *log = &env->log; + struct bpf_verifier_log *log = &env->log; unsigned int n; if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) @@ -6020,7 +6020,7 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { struct bpf_verifier_env *env; - struct bpf_verifer_log *log; + struct bpf_verifier_log *log; int ret = -EINVAL; /* no program is valid */ From edd1612a8c782b52093337f5cecb083fcecb2030 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 24 Mar 2018 11:44:23 -0700 Subject: [PATCH 0199/1640] UPSTREAM: bpf: Add bpf_verifier_vlog() and bpf_verifier_log_needed() The BTF (BPF Type Format) verifier needs to reuse the current BPF verifier log. Hence, it requires the following changes: (1) Expose log_write() in verifier.c for other users. Its name is renamed to bpf_verifier_vlog(). (2) The BTF verifier also needs to check 'log->level && log->ubuf && !bpf_verifier_log_full(log);' independently outside of the current log_write(). It is because the BTF verifier will do one-check before making multiple calls to btf_verifier_vlog to log the details of a type. Hence, this check is also re-factored to a new function bpf_verifier_log_needed(). Since it is re-factored, we can check it before va_start() in the current bpf_verifier_log_write() and verbose(). Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 7 +++++++ kernel/bpf/verifier.c | 19 +++++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index faba2764a74d..07db43e120f4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -182,6 +182,11 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) return log->len_used >= log->len_total - 1; } +static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) +{ + return log->level && log->ubuf && !bpf_verifier_log_full(log); +} + #define BPF_MAX_SUBPROGS 256 /* single container for all structs @@ -210,6 +215,8 @@ struct bpf_verifier_env { u32 subprog_cnt; }; +void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, + va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ceaf52059c51..fa775b7bded7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -168,15 +168,11 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); -static void log_write(struct bpf_verifier_env *env, const char *fmt, - va_list args) +void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, + va_list args) { - struct bpf_verifier_log *log = &env->log; unsigned int n; - if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) - return; - n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, @@ -200,18 +196,25 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, { va_list args; + if (!bpf_verifier_log_needed(&env->log)) + return; + va_start(args, fmt); - log_write(env, fmt, args); + bpf_verifier_vlog(&env->log, fmt, args); 
va_end(args); } EXPORT_SYMBOL_GPL(bpf_verifier_log_write); __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) { + struct bpf_verifier_env *env = private_data; va_list args; + if (!bpf_verifier_log_needed(&env->log)) + return; + va_start(args, fmt); - log_write(private_data, fmt, args); + bpf_verifier_vlog(&env->log, fmt, args); va_end(args); }
From df00360c0d96ec441ed913a9676805d93e31587f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 27 Mar 2018 11:53:21 -0700 Subject: [PATCH 0200/1640] UPSTREAM: bpf: follow idr code convention
Generally we do a preload before doing idr allocation. This also helps improve the allocation success rate under memory pressure.
Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Shaohua Li Signed-off-by: Daniel Borkmann
--- kernel/bpf/syscall.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dd7e11606995..95e957f9e226 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -204,11 +204,13 @@ static int bpf_map_alloc_id(struct bpf_map *map) { int id; + idr_preload(GFP_KERNEL); spin_lock_bh(&map_idr_lock); id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); if (id > 0) map->id = id; spin_unlock_bh(&map_idr_lock); + idr_preload_end(); if (WARN_ON_ONCE(!id)) return -ENOSPC; @@ -957,11 +959,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) { int id; + idr_preload(GFP_KERNEL); spin_lock_bh(&prog_idr_lock); id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); if (id > 0) prog->aux->id = id; spin_unlock_bh(&prog_idr_lock); + idr_preload_end(); /* id is in [1, INT_MAX) */ if (WARN_ON_ONCE(!id))
From 5871ec5ff5baf7e3f6c134d648fd017b53022cc8 Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Mon, 26 Mar 2018 08:36:57 -0700 Subject: [PATCH 0201/1640] UPSTREAM: bpf: Add sock_ops R/W access to ipv4 tos
Sample usage for tos ... bpf_getsockopt(skops, SOL_IP, IP_TOS, &v, sizeof(v)) ... where skops is a pointer to the ctx (struct bpf_sock_ops).
Signed-off-by: Nikita V. Shirokov Signed-off-by: Daniel Borkmann
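Expanded into a runnable sketch, the sample usage could look as follows. This is an assumption-laden illustration, not part of the patch: SEC() and the helper wrappers come from the usual samples/selftests conventions, BPF_SOCK_OPS_TCP_CONNECT_CB is just one convenient callback to hook, and 0x10 (IPTOS_LOWDELAY) is an arbitrary example value.

#include <linux/bpf.h>
#include <linux/in.h> /* IP_TOS */
#include "bpf_helpers.h" /* assumed: SEC() and helper wrappers as in samples/bpf */

#ifndef SOL_IP
#define SOL_IP 0
#endif

SEC("sockops")
int set_tos(struct bpf_sock_ops *skops)
{
	int tos = 0x10; /* example value: IPTOS_LOWDELAY */
	int v = 0;

	if (skops->op == BPF_SOCK_OPS_TCP_CONNECT_CB) {
		/* write the new tos value on the underlying socket ... */
		bpf_setsockopt(skops, SOL_IP, IP_TOS, &tos, sizeof(tos));
		/* ... and read it back, as in the commit message */
		bpf_getsockopt(skops, SOL_IP, IP_TOS, &v, sizeof(v));
	}
	return 1;
}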
--- net/core/filter.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+)
diff --git a/net/core/filter.c b/net/core/filter.c index cc46f2d22338..215ab1940ef8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3517,6 +3517,27 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } #ifdef CONFIG_INET + } else if (level == SOL_IP) { + if (optlen != sizeof(int) || sk->sk_family != AF_INET) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case IP_TOS: + if (val < -1 || val > 0xff) { + ret = -EINVAL; + } else { + struct inet_sock *inet = inet_sk(sk); + + if (val == -1) + val = 0; + inet->tos = val; + } + break; + default: + ret = -EINVAL; + } #if IS_ENABLED(CONFIG_IPV6) } else if (level == SOL_IPV6) { if (optlen != sizeof(int) || sk->sk_family != AF_INET6) @@ -3616,6 +3637,20 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } + } else if (level == SOL_IP) { + struct inet_sock *inet = inet_sk(sk); + + if (optlen != sizeof(int) || sk->sk_family != AF_INET) + goto err_clear; + + /* Only some options are supported */ + switch (optname) { + case IP_TOS: + *((int *)optval) = (int)inet->tos; + break; + default: + goto err_clear; + } #if IS_ENABLED(CONFIG_IPV6) } else if (level == SOL_IPV6) { struct ipv6_pinfo *np = inet6_sk(sk);
From 923cb2f5c01901219d6ee5fc8979487a1e676436 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 20 Mar 2018 11:19:17 -0700 Subject: [PATCH 0202/1640] UPSTREAM: trace/bpf: remove helper bpf_perf_prog_read_value from tracepoint type programs
Commit 4bebdc7a85aa ("bpf: add helper bpf_perf_prog_read_value") added the helper bpf_perf_prog_read_value so that perf_event type programs can read the event counter and enabled/running time. This commit, however, introduced a bug which allows this helper for tracepoint type programs. This is incorrect, as bpf_perf_prog_read_value needs to access the perf_event through its bpf_perf_event_data_kern type context, which is not available for tracepoint type programs. This patch fixes the issue by separating the bpf_func_proto between tracepoint and perf_event type programs and removing bpf_perf_prog_read_value from the tracepoint func prototype.
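For contrast, the helper remains valid from perf_event type programs, which do carry a bpf_perf_event_data_kern context. A hedged sketch (SEC() and bpf_helpers.h are samples/selftests conventions and are not part of this patch):

#include <linux/bpf.h>
#include <linux/bpf_perf_event.h> /* struct bpf_perf_event_data */
#include "bpf_helpers.h" /* assumed: SEC() and helper wrappers as in samples/bpf */

SEC("perf_event")
int perf_prog(struct bpf_perf_event_data *ctx)
{
	struct bpf_perf_event_value value = {};

	/* Legal here; the same call from a tracepoint program is
	 * exactly what this patch rejects.
	 */
	if (bpf_perf_prog_read_value(ctx, &value, sizeof(value)))
		return 0;

	/* value.counter, value.enabled and value.running are now set. */
	return 0;
}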
Fixes: 4bebdc7a85aa ("bpf: add helper bpf_perf_prog_read_value") Reported-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 70 +++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d4c909a8dfcc..0e754a1d0e6a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -666,32 +666,6 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, - struct bpf_perf_event_value *, buf, u32, size) -{ - int err = -EINVAL; - - if (unlikely(size != sizeof(struct bpf_perf_event_value))) - goto clear; - err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, - &buf->running); - if (unlikely(err)) - goto clear; - return 0; -clear: - memset(buf, 0, size); - return err; -} - -static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { - .func = bpf_perf_prog_read_value_tp, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_UNINIT_MEM, - .arg3_type = ARG_CONST_SIZE, -}; - static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -699,8 +673,6 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; - case BPF_FUNC_perf_prog_read_value: - return &bpf_perf_prog_read_value_proto_tp; default: return tracing_func_proto(func_id); } @@ -728,6 +700,46 @@ const struct bpf_verifier_ops tracepoint_verifier_ops = { const struct bpf_prog_ops tracepoint_prog_ops = { }; +BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { + .func = bpf_perf_prog_read_value, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_tp; + case BPF_FUNC_perf_prog_read_value: + return &bpf_perf_prog_read_value_proto; + default: + return tracing_func_proto(func_id); + } +} + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -784,7 +796,7 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, } const struct bpf_verifier_ops perf_event_verifier_ops = { - .get_func_proto = tp_prog_func_proto, + .get_func_proto = pe_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; From beba6490afbdb53de0c973ade469ac1c6409a09b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 28 Mar 2018 12:05:37 -0700 Subject: [PATCH 0203/1640] UPSTREAM: bpf: introduce BPF_RAW_TRACEPOINT Introduce 
BPF_PROG_TYPE_RAW_TRACEPOINT bpf program type to access kernel internal arguments of the tracepoints in their raw form.

From the bpf program's point of view, access to the arguments looks like:

struct bpf_raw_tracepoint_args {
	__u64 args[0];
};

int bpf_prog(struct bpf_raw_tracepoint_args *ctx)
{
	// program can read args[N] where N depends on tracepoint
	// and is statically verified at program load+attach time
}

The kprobe+bpf infrastructure allows programs to access function arguments. This feature allows programs to access raw tracepoint arguments.

As with the proposed 'dynamic ftrace events', there are no ABI guarantees as to what the tracepoint arguments are or what they mean. The program needs to cast args properly and use the bpf_probe_read() helper to access struct fields when an argument is a pointer.

For every tracepoint a __bpf_trace_##call function is prepared. In assembler it looks like:

(gdb) disassemble __bpf_trace_xdp_exception
Dump of assembler code for function __bpf_trace_xdp_exception:
   0xffffffff81132080 <+0>:	mov    %ecx,%ecx
   0xffffffff81132082 <+2>:	jmpq   0xffffffff811231f0

where

TRACE_EVENT(xdp_exception,
	TP_PROTO(const struct net_device *dev,
		 const struct bpf_prog *xdp, u32 act),

The above assembler snippet casts the 32-bit 'act' field to 'u64' to pass into bpf_trace_run3(), while the 'dev' and 'xdp' args are passed as-is. All ~500 __bpf_trace_*() functions are only 5-10 bytes long, and in total this approach adds 7k bytes to .text.

This approach gives the lowest possible overhead while calling trace_xdp_exception() from kernel C code and transitioning into bpf land. Since tracepoint+bpf is used at speeds of 1M+ events per second, this is a valuable optimization.

A new BPF_RAW_TRACEPOINT_OPEN sys_bpf command is introduced that returns an anon_inode FD for a 'bpf-raw-tracepoint' object. The user space usage looks like:

// load bpf prog with BPF_PROG_TYPE_RAW_TRACEPOINT type
prog_fd = bpf_prog_load(...);
// receive anon_inode fd for given bpf_raw_tracepoint with prog attached
raw_tp_fd = bpf_raw_tracepoint_open("xdp_exception", prog_fd);

Ctrl-C of a tracing daemon or cmdline tool that uses this feature will automatically detach the bpf program, unload it and unregister the tracepoint probe.

On the kernel side, the __bpf_raw_tp_map section of pointers to the tracepoint definition and to the __bpf_trace_*() probe function is used to find the tracepoint named "xdp_exception" and the corresponding __bpf_trace_xdp_exception() probe function, which are passed to tracepoint_probe_register() to connect the probe with the tracepoint.

Addition of bpf_raw_tracepoint doesn't interfere with ftrace and perf tracepoint mechanisms. perf_event_open() can be used in parallel on the same tracepoint. Multiple bpf_raw_tracepoint_open("xdp_exception", prog_fd) calls are permitted, each with its own bpf program. The kernel will execute all tracepoint probes and all attached bpf programs.

In the future, bpf_raw_tracepoints can be extended with query/introspection logic.
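Concretely, the new command can be driven straight through the bpf(2) syscall; a hypothetical user-space wrapper (error handling elided) might look like:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int bpf_raw_tracepoint_open(const char *name, int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	/* only the two raw_tracepoint fields are defined by this patch */
	attr.raw_tracepoint.name = (__u64)(unsigned long)name;
	attr.raw_tracepoint.prog_fd = prog_fd;

	/* returns an anon_inode fd; close() detaches the program and
	 * unregisters the tracepoint probe */
	return syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
}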
__bpf_raw_tp_map section logic was contributed by Steven Rostedt Signed-off-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) Acked-by: Steven Rostedt (VMware) Signed-off-by: Daniel Borkmann --- include/asm-generic/vmlinux.lds.h | 10 ++ include/linux/bpf_types.h | 1 + include/linux/trace_events.h | 42 +++++++ include/linux/tracepoint-defs.h | 6 + include/trace/define_trace.h | 1 + include/uapi/linux/bpf.h | 11 ++ kernel/bpf/syscall.c | 78 +++++++++++++ kernel/trace/bpf_trace.c | 183 ++++++++++++++++++++++++++++++ 8 files changed, 332 insertions(+) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 6fd37e99c07d..851da9de20fe 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -181,6 +181,15 @@ #define TRACE_SYSCALLS() #endif +#ifdef CONFIG_BPF_EVENTS +#define BPF_RAW_TP() STRUCT_ALIGN(); \ + VMLINUX_SYMBOL(__start__bpf_raw_tp) = .; \ + KEEP(*(__bpf_raw_tp_map)) \ + VMLINUX_SYMBOL(__stop__bpf_raw_tp) = .; +#else +#define BPF_RAW_TP() +#endif + #ifdef CONFIG_SERIAL_EARLYCON #define EARLYCON_TABLE() . = ALIGN(8); \ VMLINUX_SYMBOL(__earlycon_table) = .; \ @@ -249,6 +258,7 @@ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ + BPF_RAW_TP() \ TRACEPOINT_STR() /* diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 5e2e8a49fb21..6d7243bfb0ff 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -19,6 +19,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index cbd6b89a18c0..7e185267c993 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -465,6 +465,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); +struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -484,6 +487,18 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info) { return -EOPNOTSUPP; } +static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p) +{ + return -EOPNOTSUPP; +} +static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p) +{ + return -EOPNOTSUPP; +} +static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +{ + return NULL; +} #endif enum { @@ -543,6 +558,33 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); +void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); +void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); +void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3); +void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2, + 
u64 arg3, u64 arg4); +void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5); +void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6); +void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); +void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8); +void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9); +void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10); +void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10, u64 arg11); +void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12); void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 64ed7064f1fa..22c5a46e9693 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -35,4 +35,10 @@ struct tracepoint { struct tracepoint_func __rcu *funcs; }; +struct bpf_raw_event_map { + struct tracepoint *tp; + void *bpf_func; + u32 num_args; +} __aligned(32); + #endif diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index d9e3d4aa3f6e..cb30c5532144 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -95,6 +95,7 @@ #ifdef TRACEPOINTS_ENABLED #include #include +#include #endif #undef TRACE_EVENT diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ad74f526eb80..10085062a8a8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -94,6 +94,7 @@ enum bpf_cmd { BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, BPF_PROG_QUERY, + BPF_RAW_TRACEPOINT_OPEN, }; enum bpf_map_type { @@ -134,6 +135,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, BPF_PROG_TYPE_SK_MSG, + BPF_PROG_TYPE_RAW_TRACEPOINT, }; enum bpf_attach_type { @@ -344,6 +346,11 @@ union bpf_attr { __aligned_u64 prog_ids; __u32 prog_cnt; } query; + + struct { + __u64 name; + __u32 prog_fd; + } raw_tracepoint; } __attribute__((aligned(8))); /* BPF helper function descriptions: @@ -1244,4 +1251,8 @@ struct bpf_cgroup_dev_ctx { __u32 minor; }; +struct bpf_raw_tracepoint_args { + __u64 args[0]; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 95e957f9e226..02802a8f63ac 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1338,6 +1338,81 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } +struct bpf_raw_tracepoint { + struct bpf_raw_event_map *btp; + struct bpf_prog *prog; +}; + +static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) +{ + struct bpf_raw_tracepoint *raw_tp = filp->private_data; + + if (raw_tp->prog) { + bpf_probe_unregister(raw_tp->btp, raw_tp->prog); + bpf_prog_put(raw_tp->prog); + } + kfree(raw_tp); + return 0; +} + +static const struct file_operations bpf_raw_tp_fops = { + .release = bpf_raw_tracepoint_release, + .read = bpf_dummy_read, + .write = 
bpf_dummy_write, +}; + +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd + +static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +{ + struct bpf_raw_tracepoint *raw_tp; + struct bpf_raw_event_map *btp; + struct bpf_prog *prog; + char tp_name[128]; + int tp_fd, err; + + if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name), + sizeof(tp_name) - 1) < 0) + return -EFAULT; + tp_name[sizeof(tp_name) - 1] = 0; + + btp = bpf_find_raw_tracepoint(tp_name); + if (!btp) + return -ENOENT; + + raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); + if (!raw_tp) + return -ENOMEM; + raw_tp->btp = btp; + + prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, + BPF_PROG_TYPE_RAW_TRACEPOINT); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto out_free_tp; + } + + err = bpf_probe_register(raw_tp->btp, prog); + if (err) + goto out_put_prog; + + raw_tp->prog = prog; + tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, + O_CLOEXEC); + if (tp_fd < 0) { + bpf_probe_unregister(raw_tp->btp, prog); + err = tp_fd; + goto out_put_prog; + } + return tp_fd; + +out_put_prog: + bpf_prog_put(prog); +out_free_tp: + kfree(raw_tp); + return err; +} + #ifdef CONFIG_CGROUP_BPF #define BPF_PROG_ATTACH_LAST_FIELD attach_flags @@ -1951,6 +2026,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET_INFO_BY_FD: err = bpf_obj_get_info_by_fd(&attr, uattr); break; + case BPF_RAW_TRACEPOINT_OPEN: + err = bpf_raw_tracepoint_open(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0e754a1d0e6a..104039354f54 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -740,6 +740,86 @@ static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) } } +/* + * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp + * to avoid potential recursive reuse issue when/if tracepoints are added + * inside bpf_*_event_output and/or bpf_get_stack_id + */ +static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); +BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + return ____bpf_perf_event_output(regs, map, flags, data, size); +} + +static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { + .func = bpf_perf_event_output_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); +} + +static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { + .func = bpf_get_stackid_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_raw_tp; + 
case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_raw_tp; + default: + return tracing_func_proto(func_id); + } +} + +static bool raw_tp_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + /* largest tracepoint in the kernel has 12 args */ + if (off < 0 || off >= sizeof(__u64) * 12) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + return true; +} + +const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -904,3 +984,106 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) return ret; } + +extern struct bpf_raw_event_map __start__bpf_raw_tp[]; +extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; + +struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +{ + struct bpf_raw_event_map *btp = __start__bpf_raw_tp; + + for (; btp < __stop__bpf_raw_tp; btp++) { + if (!strcmp(btp->tp->name, name)) + return btp; + } + return NULL; +} + +static __always_inline +void __bpf_trace_run(struct bpf_prog *prog, u64 *args) +{ + rcu_read_lock(); + preempt_disable(); + (void) BPF_PROG_RUN(prog, args); + preempt_enable(); + rcu_read_unlock(); +} + +#define UNPACK(...) __VA_ARGS__ +#define REPEAT_1(FN, DL, X, ...) FN(X) +#define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__) +#define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__) +#define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__) +#define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__) +#define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__) +#define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__) +#define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__) +#define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__) +#define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__) +#define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__) +#define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__) +#define REPEAT(X, FN, DL, ...) 
REPEAT_##X(FN, DL, __VA_ARGS__) + +#define SARG(X) u64 arg##X +#define COPY(X) args[X] = arg##X + +#define __DL_COM (,) +#define __DL_SEM (;) + +#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + +#define BPF_TRACE_DEFN_x(x) \ + void bpf_trace_run##x(struct bpf_prog *prog, \ + REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ + { \ + u64 args[x]; \ + REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ + __bpf_trace_run(prog, args); \ + } \ + EXPORT_SYMBOL_GPL(bpf_trace_run##x) +BPF_TRACE_DEFN_x(1); +BPF_TRACE_DEFN_x(2); +BPF_TRACE_DEFN_x(3); +BPF_TRACE_DEFN_x(4); +BPF_TRACE_DEFN_x(5); +BPF_TRACE_DEFN_x(6); +BPF_TRACE_DEFN_x(7); +BPF_TRACE_DEFN_x(8); +BPF_TRACE_DEFN_x(9); +BPF_TRACE_DEFN_x(10); +BPF_TRACE_DEFN_x(11); +BPF_TRACE_DEFN_x(12); + +static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + struct tracepoint *tp = btp->tp; + + /* + * check that program doesn't access arguments beyond what's + * available in this tracepoint + */ + if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) + return -EINVAL; + + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); +} + +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + int err; + + mutex_lock(&bpf_event_mutex); + err = __bpf_probe_register(btp, prog); + mutex_unlock(&bpf_event_mutex); + return err; +} + +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + int err; + + mutex_lock(&bpf_event_mutex); + err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); + mutex_unlock(&bpf_event_mutex); + return err; +} From fdb22f25a3e71f6529a10b2f8151fa1c8ed9256b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 28 Mar 2018 12:49:15 -0700 Subject: [PATCH 0204/1640] BACKPORT: bpf: sockmap redirect ingress support Add support for the BPF_F_INGRESS flag in sk_msg redirect helper. To do this add a scatterlist ring for receiving socks to check before calling into regular recvmsg call path. Additionally, because the poll wakeup logic only checked the skb recv queue we need to add a hook in TCP stack (similar to write side) so that we have a way to wake up polling socks when a scatterlist is redirected to that sock. After this all that is needed is for the redirect helper to push the scatterlist into the psock receive queue. 
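To make the new flag concrete, here is a sketch of an sk_msg verdict program that redirects into the ingress queue of a sockmap peer (map layout and key are hypothetical; section conventions follow libbpf):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct bpf_map_def SEC("maps") sock_map = {
	.type		= BPF_MAP_TYPE_SOCKMAP,
	.key_size	= sizeof(int),
	.value_size	= sizeof(int),
	.max_entries	= 2,
};

SEC("sk_msg")
int msg_redir_ingress(struct sk_msg_md *msg)
{
	/* BPF_F_INGRESS queues the data on the receive side of the
	 * socket at key 0, to be drained by bpf_tcp_recvmsg() above,
	 * instead of being pushed out through the sendmsg path. */
	return bpf_msg_redirect_map(msg, &sock_map, 0, BPF_F_INGRESS);
}

char _license[] SEC("license") = "GPL";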
Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/net/sock.h | 1 + kernel/bpf/sockmap.c | 198 ++++++++++++++++++++++++++++++++++++++++- net/core/filter.c | 2 +- net/ipv4/tcp.c | 10 ++- 5 files changed, 207 insertions(+), 5 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 262f22202560..a3f50593b21b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -601,6 +601,7 @@ struct sk_msg_buff { __u32 key; __u32 flags; struct bpf_map *map; + struct list_head list; }; /* compute the linear packet data range [data, data_end) which diff --git a/include/net/sock.h b/include/net/sock.h index 58749b227a79..214cd0877624 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1129,6 +1129,7 @@ struct proto { #endif bool (*stream_memory_free)(const struct sock *sk); + bool (*stream_memory_read)(const struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 8a5e251a57e8..9415f27d3d63 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -82,6 +84,7 @@ struct smap_psock { int sg_size; int eval; struct sk_msg_buff *cork; + struct list_head ingress; struct strparser strp; struct bpf_prog *bpf_tx_msg; @@ -103,6 +106,8 @@ struct smap_psock { }; static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len); static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); @@ -112,6 +117,21 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +static bool bpf_tcp_stream_read(const struct sock *sk) +{ + struct smap_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + empty = list_empty(&psock->ingress); +out: + rcu_read_unlock(); + return !empty; +} + static struct proto tcp_bpf_proto; static int bpf_tcp_init(struct sock *sk) { @@ -135,6 +155,8 @@ static int bpf_tcp_init(struct sock *sk) if (psock->bpf_tx_msg) { tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; tcp_bpf_proto.sendpage = bpf_tcp_sendpage; + tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; + tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; } sk->sk_prot = &tcp_bpf_proto; @@ -170,6 +192,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e, *tmp; + struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; struct sock *osk; @@ -188,6 +211,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { + list_del(&md->list); + free_start_sg(psock->sock, md); + kfree(md); + } + list_for_each_entry_safe(e, tmp, &psock->maps, list) { osk = cmpxchg(e->entry, sk, NULL); if (osk == sk) { @@ -468,6 +497,72 @@ verdict: return _rc; } +static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, + struct smap_psock *psock, + struct sk_msg_buff *md, int flags) +{ + bool apply = apply_bytes; + 
size_t size, copied = 0; + struct sk_msg_buff *r; + int err = 0, i; + + r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); + if (unlikely(!r)) + return -ENOMEM; + + lock_sock(sk); + r->sg_start = md->sg_start; + i = md->sg_start; + + do { + r->sg_data[i] = md->sg_data[i]; + + size = (apply && apply_bytes < md->sg_data[i].length) ? + apply_bytes : md->sg_data[i].length; + + if (!sk_wmem_schedule(sk, size)) { + if (!copied) + err = -ENOMEM; + break; + } + + sk_mem_charge(sk, size); + r->sg_data[i].length = size; + md->sg_data[i].length -= size; + md->sg_data[i].offset += size; + copied += size; + + if (md->sg_data[i].length) { + get_page(sg_page(&r->sg_data[i])); + r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1; + } else { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + r->sg_end = i; + } + + if (apply) { + apply_bytes -= size; + if (!apply_bytes) + break; + } + } while (i != md->sg_end); + + md->sg_start = i; + + if (!err) { + list_add_tail(&r->list, &psock->ingress); + sk->sk_data_ready(sk); + } else { + free_start_sg(sk, r); + kfree(r); + } + + release_sock(sk); + return err; +} + static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct sk_msg_buff *md, int flags) @@ -475,6 +570,7 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct smap_psock *psock; struct scatterlist *sg; int i, err, free = 0; + bool ingress = !!(md->flags & BPF_F_INGRESS); sg = md->sg_data; @@ -487,9 +583,14 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, goto out_rcu; rcu_read_unlock(); - lock_sock(sk); - err = bpf_tcp_push(sk, send, md, flags, false); - release_sock(sk); + + if (ingress) { + err = bpf_tcp_ingress(sk, send, psock, md, flags); + } else { + lock_sock(sk); + err = bpf_tcp_push(sk, send, md, flags, false); + release_sock(sk); + } smap_release_sock(psock, sk); if (unlikely(err)) goto out; @@ -623,6 +724,89 @@ out_err: return err; } +static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct iov_iter *iter = &msg->msg_iter; + struct smap_psock *psock; + int copied = 0; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + + if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) + goto out; + rcu_read_unlock(); + + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); + while (copied != len) { + struct scatterlist *sg; + struct sk_msg_buff *md; + int i; + + md = list_first_entry_or_null(&psock->ingress, + struct sk_msg_buff, list); + if (unlikely(!md)) + break; + i = md->sg_start; + do { + struct page *page; + int n, copy; + + sg = &md->sg_data[i]; + copy = sg->length; + page = sg_page(sg); + + if (copied + copy > len) + copy = len - copied; + + n = copy_page_to_iter(page, sg->offset, copy, iter); + if (n != copy) { + md->sg_start = i; + release_sock(sk); + smap_release_sock(psock, sk); + return -EFAULT; + } + + copied += copy; + sg->offset += copy; + sg->length -= copy; + sk_mem_uncharge(sk, copy); + + if (!sg->length) { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + put_page(page); + } + if (copied == len) + break; + } while (i != md->sg_end); + md->sg_start = i; + + if (!sg->length && md->sg_start == md->sg_end) { + list_del(&md->list); + kfree(md); + } + } + + release_sock(sk); + smap_release_sock(psock, sk); + return copied; +out: + rcu_read_unlock(); + return 
tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); +} + + static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; @@ -1110,6 +1294,7 @@ static void sock_map_remove_complete(struct bpf_stab *stab) static void smap_gc_work(struct work_struct *w) { struct smap_psock_map_entry *e, *tmp; + struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; psock = container_of(w, struct smap_psock, gc_work); @@ -1134,6 +1319,12 @@ static void smap_gc_work(struct work_struct *w) kfree(psock->cork); } + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { + list_del(&md->list); + free_start_sg(psock->sock, md); + kfree(md); + } + list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_del(&e->list); kfree(e); @@ -1163,6 +1354,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); INIT_LIST_HEAD(&psock->maps); + INIT_LIST_HEAD(&psock->ingress); refcount_set(&psock->refcnt, 1); rcu_assign_sk_user_data(sock, psock); diff --git a/net/core/filter.c b/net/core/filter.c index 215ab1940ef8..6f3cc1cb8f0e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1947,7 +1947,7 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, struct bpf_map *, map, u32, key, u64, flags) { /* If user passes invalid input drop the packet. */ - if (unlikely(flags)) + if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; msg->key = key; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1aa2c467f85..a6b05eebabc9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -488,6 +488,14 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) } } +static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, + int target, struct sock *sk) +{ + return (tp->rcv_nxt - tp->copied_seq >= target) || + (sk->sk_prot->stream_memory_read ? + sk->sk_prot->stream_memory_read(sk) : false); +} + /* * Wait for a TCP event. * @@ -559,7 +567,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - if (tp->rcv_nxt - tp->copied_seq >= target) + if (tcp_stream_is_readable(tp, target, sk)) mask |= POLLIN | POLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { From 1e43c30824b1adb4e214d0423194835750f1c1cd Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 28 Mar 2018 12:49:25 -0700 Subject: [PATCH 0205/1640] UPSTREAM: bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT: Add support for the BPF_F_INGRESS flag in skb redirect helper. To do this convert skb into a scatterlist and push into ingress queue. This is the same logic that is used in the sk_msg redirect helper so it should feel familiar. 
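Analogously to the sk_msg case, a hypothetical stream-verdict program can now redirect an skb to a peer's ingress queue (map and key again illustrative; the section name follows current libbpf conventions):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct bpf_map_def SEC("maps") sock_map_rx = {
	.type		= BPF_MAP_TYPE_SOCKMAP,
	.key_size	= sizeof(int),
	.value_size	= sizeof(int),
	.max_entries	= 2,
};

SEC("sk_skb/stream_verdict")
int skb_redir_ingress(struct __sk_buff *skb)
{
	/* With BPF_F_INGRESS the skb is converted to a scatterlist by
	 * smap_do_ingress() and queued on the peer's psock->ingress list
	 * rather than sent out via skb_send_sock_locked(). */
	return bpf_sk_redirect_map(skb, &sock_map_rx, 0, BPF_F_INGRESS);
}

char _license[] SEC("license") = "GPL";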
Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + kernel/bpf/sockmap.c | 92 ++++++++++++++++++++++++++++++++++-------- net/core/filter.c | 2 +- 3 files changed, 77 insertions(+), 18 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index a3f50593b21b..4d84b812cfcb 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -601,6 +601,7 @@ struct sk_msg_buff { __u32 key; __u32 flags; struct bpf_map *map; + struct sk_buff *skb; struct list_head list; }; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 9415f27d3d63..61737e1790fc 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -785,7 +785,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, i++; if (i == MAX_SKB_FRAGS) i = 0; - put_page(page); + if (!md->skb) + put_page(page); } if (copied == len) break; @@ -794,6 +795,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (!sg->length && md->sg_start == md->sg_end) { list_del(&md->list); + if (md->skb) + consume_skb(md->skb); kfree(md); } } @@ -1045,27 +1048,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) __SK_DROP; } +static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb) +{ + struct sock *sk = psock->sock; + int copied = 0, num_sg; + struct sk_msg_buff *r; + + r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC); + if (unlikely(!r)) + return -EAGAIN; + + if (!sk_rmem_schedule(sk, skb, skb->len)) { + kfree(r); + return -EAGAIN; + } + + sg_init_table(r->sg_data, MAX_SKB_FRAGS); + num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len); + if (unlikely(num_sg < 0)) { + kfree(r); + return num_sg; + } + sk_mem_charge(sk, skb->len); + copied = skb->len; + r->sg_start = 0; + r->sg_end = num_sg == MAX_SKB_FRAGS ? 
0 : num_sg; + r->skb = skb; + list_add_tail(&r->list, &psock->ingress); + sk->sk_data_ready(sk); + return copied; +} + static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) { + struct smap_psock *peer; struct sock *sk; + __u32 in; int rc; rc = smap_verdict_func(psock, skb); switch (rc) { case __SK_REDIRECT: sk = do_sk_redirect_map(skb); - if (likely(sk)) { - struct smap_psock *peer = smap_psock_sk(sk); + if (!sk) { + kfree_skb(skb); + break; + } - if (likely(peer && - test_bit(SMAP_TX_RUNNING, &peer->state) && - !sock_flag(sk, SOCK_DEAD) && - sock_writeable(sk))) { - skb_set_owner_w(skb, sk); - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } + peer = smap_psock_sk(sk); + in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; + + if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) || + !test_bit(SMAP_TX_RUNNING, &peer->state))) { + kfree_skb(skb); + break; + } + + if (!in && sock_writeable(sk)) { + skb_set_owner_w(skb, sk); + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; + } else if (in && + atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; } /* Fall through and free skb otherwise */ case __SK_DROP: @@ -1127,15 +1175,23 @@ static void smap_tx_work(struct work_struct *w) } while ((skb = skb_dequeue(&psock->rxqueue))) { + __u32 flags; + rem = skb->len; off = 0; start: + flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; do { - if (likely(psock->sock->sk_socket)) - n = skb_send_sock_locked(psock->sock, - skb, off, rem); - else + if (likely(psock->sock->sk_socket)) { + if (flags) + n = smap_do_ingress(psock, skb); + else + n = skb_send_sock_locked(psock->sock, + skb, off, rem); + } else { n = -EINVAL; + } + if (n <= 0) { if (n == -EAGAIN) { /* Retry when space is available */ @@ -1153,7 +1209,9 @@ start: rem -= n; off += n; } while (rem); - kfree_skb(skb); + + if (!flags) + kfree_skb(skb); } out: release_sock(psock->sock); diff --git a/net/core/filter.c b/net/core/filter.c index 6f3cc1cb8f0e..17db0eb74e4d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1908,7 +1908,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); /* If user passes invalid input drop the packet. */ - if (unlikely(flags)) + if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; tcb->bpf.key = key; From 9ca74ae86c46a41ece88934613b6a628c6d104d8 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Fri, 30 Mar 2018 09:21:00 +0900 Subject: [PATCH 0206/1640] UPSTREAM: bpf: sockmap: initialize sg table entries properly When CONFIG_DEBUG_SG is set, sg->sg_magic is initialized in sg_init_table() and it is verified in sg api while navigating. We hit BUG_ON when magic check is failed. In functions sg_tcp_sendpage and sg_tcp_sendmsg, the struct containing the scatterlist is already zeroed out. So to avoid extra memset, we use sg_init_marker() to initialize sg_magic. Fixed following things: - In bpf_tcp_sendpage: initialize sg using sg_init_marker - In bpf_tcp_sendmsg: Replace sg_init_table with sg_init_marker - In bpf_tcp_push: Replace memset with sg_init_table where consumed sg entry needs to be re-initialized. 
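The distinction between the two initializers, roughly (a sketch, not patch code):

#include <linux/scatterlist.h>

#define NR_ENTS 17	/* stand-in for MAX_SKB_FRAGS */

static void sg_init_example(void)
{
	struct scatterlist sg[NR_ENTS] = {};	/* already zeroed */

	/* full init: memset to zero, set sg_magic under CONFIG_DEBUG_SG,
	 * and mark the last entry as the end of the list */
	sg_init_table(sg, NR_ENTS);

	/* marker-only init: sets sg_magic and the end marker but skips
	 * the memset, for memory the caller has already zeroed */
	sg_init_marker(sg, NR_ENTS);
}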
Signed-off-by: Prashant Bhole Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 61737e1790fc..e93bda3787cf 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -341,7 +341,7 @@ retry: md->sg_start++; if (md->sg_start == MAX_SKB_FRAGS) md->sg_start = 0; - memset(sg, 0, sizeof(*sg)); + sg_init_table(sg, 1); if (md->sg_start == md->sg_end) break; @@ -843,7 +843,7 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } sg = md.sg_data; - sg_init_table(sg, MAX_SKB_FRAGS); + sg_init_marker(sg, MAX_SKB_FRAGS); rcu_read_unlock(); lock_sock(sk); @@ -950,10 +950,14 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page *page, lock_sock(sk); - if (psock->cork_bytes) + if (psock->cork_bytes) { m = psock->cork; - else + sg = &m->sg_data[m->sg_end]; + } else { m = &md; + sg = m->sg_data; + sg_init_marker(sg, MAX_SKB_FRAGS); + } /* Catch case where ring is full and sendpage is stalled. */ if (unlikely(m->sg_end == m->sg_start && @@ -961,7 +965,6 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page *page, goto out_err; psock->sg_size += size; - sg = &m->sg_data[m->sg_end]; sg_set_page(sg, page, size, offset); get_page(page); m->sg_copy[m->sg_end] = true; From eb9a2d21327bc3624fc029db9d83bda6cba42d4f Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:00 -0700 Subject: [PATCH 0207/1640] BACKPORT: bpf: Check attach type at prog load time == The problem == There are use-cases when a program of some type can be attached to multiple attach points and those attach points must have different permissions to access context or to call helpers. E.g. context structure may have fields for both IPv4 and IPv6 but it doesn't make sense to read from / write to IPv6 field when attach point is somewhere in IPv4 stack. Same applies to BPF-helpers: it may make sense to call some helper from some attach point, but not from other for same prog type. == The solution == Introduce `expected_attach_type` field in in `struct bpf_attr` for `BPF_PROG_LOAD` command. If scenario described in "The problem" section is the case for some prog type, the field will be checked twice: 1) At load time prog type is checked to see if attach type for it must be known to validate program permissions correctly. Prog will be rejected with EINVAL if it's the case and `expected_attach_type` is not specified or has invalid value. 2) At attach time `attach_type` is compared with `expected_attach_type`, if prog type requires to have one, and, if they differ, attach will be rejected with EINVAL. The `expected_attach_type` is now available as part of `struct bpf_prog` in both `bpf_verifier_ops->is_valid_access()` and `bpf_verifier_ops->get_func_proto()` () and can be used to check context accesses and calls to helpers correspondingly. 
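In user space this surfaces as one extra field in bpf_attr at load time; a hypothetical loader sketch follows (no prog type in this patch requires the field yet; the cgroup sock_addr/bind types used here arrive in the next patch):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_with_attach_type(const struct bpf_insn *insns, __u32 insn_cnt)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; /* next patch */
	attr.expected_attach_type = BPF_CGROUP_INET4_BIND;
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = insn_cnt;
	attr.license = (__u64)(unsigned long)"GPL";

	/* -EINVAL if the prog type demands an attach type and this field
	 * is missing or invalid; BPF_PROG_ATTACH later requires the given
	 * attach_type to match this value exactly. */
	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}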
Initially the idea was discussed by Alexei Starovoitov and Daniel Borkmann here: https://marc.info/?l=linux-netdev&m=152107378717201&w=2 Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 5 ++++- include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/cgroup.c | 3 ++- kernel/bpf/syscall.c | 31 ++++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 6 +++--- kernel/trace/bpf_trace.c | 27 ++++++++++++++++++--------- net/core/filter.c | 39 +++++++++++++++++++++++++-------------- 8 files changed, 88 insertions(+), 29 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a774cb63a94f..9390c9a70e22 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -214,12 +214,15 @@ struct bpf_prog_ops { struct bpf_verifier_ops { /* return eBPF function prototype for verification */ - const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); + const struct bpf_func_proto * + (*get_func_proto)(enum bpf_func_id func_id, + const struct bpf_prog *prog); /* return true if 'size' wide access at offset 'off' within bpf_context * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); diff --git a/include/linux/filter.h b/include/linux/filter.h index 4d84b812cfcb..f07045c17de2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -494,6 +494,7 @@ struct bpf_prog { is_func:1, /* program is a bpf function */ kprobe_override:1; /* Do we override a kprobe? */ enum bpf_prog_type type; /* Type of BPF program */ + enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 10085062a8a8..b37a76403cc1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -296,6 +296,11 @@ union bpf_attr { __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc). 
+ */ + __u32 expected_attach_type; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 83baf471ec00..676b8d80811a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -544,7 +544,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id) +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -565,6 +565,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id) static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 02802a8f63ac..eddd69e1771c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1188,8 +1188,27 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); +static int +bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, + enum bpf_attach_type expected_attach_type) +{ + /* There are currently no prog types that require specifying + * attach_type at load time. + */ + return 0; +} + +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + /* There are currently no prog types that require specifying + * attach_type at load time. + */ + return 0; +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex +#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type static int bpf_prog_load(union bpf_attr *attr) { @@ -1226,11 +1245,16 @@ static int bpf_prog_load(union bpf_attr *attr) !capable(CAP_SYS_ADMIN)) return -EPERM; + if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; + prog->expected_attach_type = attr->expected_attach_type; + prog->aux->offload_requested = !!attr->prog_ifindex; err = security_bpf_prog_alloc(prog->aux); @@ -1497,6 +1521,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); + if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { + bpf_prog_put(prog); + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) { bpf_prog_put(prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fa775b7bded7..450b0ae2e245 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1380,7 +1380,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, }; if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, &info)) { + env->ops->is_valid_access(off, size, t, env->prog, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -2370,7 +2370,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } if (env->ops->get_func_proto) - fn = env->ops->get_func_proto(func_id); + fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), @@ -5981,7 +5981,7 @@ 
static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = env->ops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 104039354f54..3ac3bcd48f47 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -527,7 +527,8 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -573,7 +574,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) } } -static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -587,12 +589,13 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_override_return_proto; #endif default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) @@ -666,7 +669,8 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -674,11 +678,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) @@ -726,7 +731,8 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -736,7 +742,7 @@ static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } @@ -786,7 +792,8 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case 
BPF_FUNC_perf_event_output: @@ -794,12 +801,13 @@ static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } static bool raw_tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { /* largest tracepoint in the kernel has 12 args */ @@ -821,6 +829,7 @@ const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, diff --git a/net/core/filter.c b/net/core/filter.c index 17db0eb74e4d..1661519ffc0d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3742,7 +3742,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -sock_filter_func_proto(enum bpf_func_id func_id) +sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { /* inet and inet6 sockets are created in a process @@ -3756,7 +3756,7 @@ sock_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -sk_filter_func_proto(enum bpf_func_id func_id) +sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3773,7 +3773,7 @@ sk_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -tc_cls_act_func_proto(enum bpf_func_id func_id) +tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3844,7 +3844,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -xdp_func_proto(enum bpf_func_id func_id) +xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -3867,7 +3867,7 @@ xdp_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id) +lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3894,7 +3894,7 @@ lwt_inout_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * - sock_ops_func_proto(enum bpf_func_id func_id) +sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: @@ -3910,7 +3910,8 @@ static const struct bpf_func_proto * } } -static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_msg_redirect_map: @@ -3926,7 +3927,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) } } -static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3951,7 +3953,7 @@ static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -lwt_xmit_func_proto(enum bpf_func_id func_id) +lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog 
*prog) { switch (func_id) { case BPF_FUNC_skb_get_tunnel_key: @@ -3981,11 +3983,12 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; default: - return lwt_inout_func_proto(func_id); + return lwt_inout_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -4029,6 +4032,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -4049,11 +4053,12 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -4083,11 +4088,12 @@ static bool lwt_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { @@ -4159,6 +4165,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { @@ -4188,7 +4195,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, return false; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool __is_valid_xdp_access(int off, int size) @@ -4205,6 +4212,7 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) @@ -4237,6 +4245,7 @@ EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -4283,6 +4292,7 @@ static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -4312,11 +4322,12 @@ static bool sk_skb_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sk_msg_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) From 492f3d37c286cfea0321c991a5a312be03b9d04d Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:02 -0700 Subject: [PATCH 0208/1640] UPSTREAM: bpf: Hooks for sys_bind == The problem == There is a use-case when all processes inside a cgroup should use one single IP address on a host that has multiple IP configured. 
Those processes should use the IP for both ingress and egress, for TCP and UDP traffic. So TCP/UDP servers should be bound to that IP to accept incoming connections on it, and TCP/UDP clients should make outgoing connections from that IP. It should not require changing application code since it's often not possible. Currently it's solved by intercepting glibc wrappers around syscalls such as `bind(2)` and `connect(2)`. It's done by a shared library that is preloaded for every process in a cgroup so that whenever TCP/UDP server calls `bind(2)`, the library replaces IP in sockaddr before passing arguments to syscall. When application calls `connect(2)` the library transparently binds the local end of connection to that IP (`bind(2)` with `IP_BIND_ADDRESS_NO_PORT` to avoid performance penalty). Shared library approach is fragile though, e.g.: * some applications clear env vars (incl. `LD_PRELOAD`); * `/etc/ld.so.preload` doesn't help since some applications are linked with option `-z nodefaultlib`; * other applications don't use glibc and there is nothing to intercept. == The solution == The patch provides much more reliable in-kernel solution for the 1st part of the problem: binding TCP/UDP servers on desired IP. It does not depend on application environment and implementation details (whether glibc is used or not). It adds new eBPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` and attach types `BPF_CGROUP_INET4_BIND` and `BPF_CGROUP_INET6_BIND` (similar to already existing `BPF_CGROUP_INET_SOCK_CREATE`). The new program type is intended to be used with sockets (`struct sock`) in a cgroup and provided by user `struct sockaddr`. Pointers to both of them are parts of the context passed to programs of newly added types. The new attach types provides hooks in `bind(2)` system call for both IPv4 and IPv6 so that one can write a program to override IP addresses and ports user program tries to bind to and apply such a program for whole cgroup. == Implementation notes == [1] Separate attach types for `AF_INET` and `AF_INET6` are added intentionally to prevent reading/writing to offsets that don't make sense for corresponding socket family. E.g. if user passes `sockaddr_in` it doesn't make sense to read from / write to `user_ip6[]` context fields. [2] The write access to `struct bpf_sock_addr_kern` is implemented using special field as an additional "register". There are just two registers in `sock_addr_convert_ctx_access`: `src` with value to write and `dst` with pointer to context that can't be changed not to break later instructions. But the fields, allowed to write to, are not available directly and to access them address of corresponding pointer has to be loaded first. To get additional register the 1st not used by `src` and `dst` one is taken, its content is saved to `bpf_sock_addr_kern.tmp_reg`, then the register is used to load address of pointer field, and finally the register's content is restored from the temporary field after writing `src` value. 
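A sketch of what such a program could look like for the IPv4 hook (the address is a placeholder, the section name follows libbpf conventions, and the return semantics match __cgroup_bpf_run_filter_sock_addr() below):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("cgroup/bind4")
int bind_v4_rewrite(struct bpf_sock_addr *ctx)
{
	/* user_family mirrors the sockaddr passed to bind(2) */
	if (ctx->user_family != 2 /* AF_INET */)
		return 1;

	/* placeholder: rewrite every bind in the cgroup to 127.0.0.1;
	 * user_ip4 is stored in network byte order */
	ctx->user_ip4 = bpf_htonl(0x7f000001);

	return 1;	/* returning != 1 makes bind(2) fail with -EPERM */
}

char _license[] SEC("license") = "GPL";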
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 21 ++++ include/linux/bpf_types.h | 1 + include/linux/filter.h | 10 ++ include/uapi/linux/bpf.h | 23 ++++ kernel/bpf/cgroup.c | 36 ++++++ kernel/bpf/syscall.c | 36 ++++-- kernel/bpf/verifier.c | 1 + net/core/filter.c | 232 +++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 7 ++ net/ipv6/af_inet6.c | 7 ++ 10 files changed, 366 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8a4566691c8f..67dc4a6471ad 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -6,6 +6,7 @@ #include struct sock; +struct sockaddr; struct cgroup; struct sk_buff; struct bpf_sock_ops_kern; @@ -63,6 +64,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, + enum bpf_attach_type type); + int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum bpf_attach_type type); @@ -103,6 +108,20 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) +#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) + +#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -135,6 +154,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 6d7243bfb0ff..2b28fcf6f6ae 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -8,6 +8,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) diff --git a/include/linux/filter.h b/include/linux/filter.h index f07045c17de2..3c0d718a4058 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1102,6 +1102,16 @@ static inline int bpf_tell_extensions(void) return SKF_AD_MAX; } +struct bpf_sock_addr_kern { + struct sock *sk; + struct sockaddr *uaddr; + /* Temporary "register" to make indirect stores to nested structures + * defined above. 
We need three registers to make such a store, but + * only two (src and dst) are available at convert_ctx_access time + */ + u64 tmp_reg; +}; + struct bpf_sock_ops_kern { struct sock *sk; u32 op; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b37a76403cc1..44404216c058 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -136,6 +136,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_DEVICE, BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, }; enum bpf_attach_type { @@ -147,6 +148,8 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -1102,6 +1105,26 @@ struct bpf_map_info { __u64 netns_ino; } __attribute__((aligned(8))); +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 4-byte read and write. + * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ +}; + /* User bpf_sock_ops struct to access socket values and specify request ops * and their replies. * Some of this fields are in network (bigendian) byte order and may need diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 676b8d80811a..1203b064198f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -493,6 +493,42 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); +/** + * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and + * the sockaddr provided by user + * @sk: sock struct that will use sockaddr + * @uaddr: sockaddr struct provided by user + * @type: The type of program to be executed + * + * socket is expected to be of type INET or INET6. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, + enum bpf_attach_type type) +{ + struct bpf_sock_addr_kern ctx = { + .sk = sk, + .uaddr = uaddr, + }; + struct cgroup *cgrp; + int ret; + + /* Check socket family since not all sockets represent a network + * endpoint (e.g. AF_UNIX). + */ + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + + return ret == 1 ? 0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); + /** * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock * @sk: socket to get cgroup from diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index eddd69e1771c..400c2b90b6af 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1192,19 +1192,29 @@ static int bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type) { - /* There are currently no prog types that require specifying - * attach_type at load time.
- */ - return 0; + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + switch (expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + return 0; + default: + return -EINVAL; + } + default: + return 0; + } } static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { - /* There are currently no prog types that require specifying - * attach_type at load time. - */ - return 0; + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + default: + return 0; + } } /* last field in 'union bpf_attr' used by this command */ @@ -1502,6 +1512,10 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; @@ -1564,6 +1578,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; @@ -1613,6 +1631,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 450b0ae2e245..9f24c039495c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4206,6 +4206,7 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: break; diff --git a/net/core/filter.c b/net/core/filter.c index 1661519ffc0d..653b5d89103b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3755,6 +3755,20 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + /* inet and inet6 sockets are created in a process + * context so there is always a valid uid/gid + */ + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -4243,6 +4257,69 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static bool sock_addr_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sock_addr)) + return false; + if (off % size != 0) + return false; + + /* Disallow access to IPv6 fields from IPv4 context and vice + * versa.
+ */ + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + break; + default: + return false; + } + break; + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET6_BIND: + break; + default: + return false; + } + break; + } + + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + /* Only narrow read access allowed for now. */ + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + } else { + if (size != size_default) + return false; + } + break; + case bpf_ctx_range(struct bpf_sock_addr, user_port): + if (size != size_default) + return false; + break; + default: + if (type == BPF_READ) { + if (size != size_default) + return false; + } else { + return false; + } + } + + return true; +} + static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -4787,6 +4864,152 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of + * context Structure, F is Field in context structure that contains a pointer + * to Nested Structure of type NS that has the field NF. + * + * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make + * sure that SIZE is not greater than actual size of S.F.NF. + * + * If offset OFF is provided, the load happens from that offset relative to + * offset of NF. + */ +#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ + do { \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ + si->src_reg, offsetof(S, F)); \ + *insn++ = BPF_LDX_MEM( \ + SIZE, si->dst_reg, si->dst_reg, \ + bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + target_size) \ + + OFF); \ + } while (0) + +#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ + BPF_FIELD_SIZEOF(NS, NF), 0) + +/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to + * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. + * + * It doesn't support SIZE argument though since narrow stores are not + * supported for now. + * + * In addition it uses Temporary Field TF (member of struct S) as the 3rd + * "register" since two registers available in convert_ctx_access are not + * enough: we can't override neither SRC, since it contains value to store, nor + * DST since it contains pointer to context that may be used by later + * instructions. But we need a temporary place to save pointer to nested + * structure whose field we want to store to. 
+ */ +#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \ + do { \ + int tmp_reg = BPF_REG_9; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ + offsetof(S, TF)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ + si->dst_reg, offsetof(S, F)); \ + *insn++ = BPF_STX_MEM( \ + BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \ + bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + target_size) \ + + OFF); \ + *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ + offsetof(S, TF)); \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ + TF) \ + do { \ + if (type == BPF_WRITE) { \ + SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \ + TF); \ + } else { \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, SIZE, OFF); \ + } \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) + +static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock_addr, user_family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr, uaddr, sa_family); + break; + + case offsetof(struct bpf_sock_addr, user_ip4): + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, + sin_addr, BPF_SIZE(si->code), 0, tmp_reg); + break; + + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock_addr, user_ip6[0]); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, + tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, user_port): + /* To get port we need to know sa_family first and then treat + * sockaddr as either sockaddr_in or sockaddr_in6. + * Though we can simplify since port field has same offset and + * size in both structures. + * Here we check this invariant and use just one of the + * structures if it's true. 
+ */ + BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != + offsetof(struct sockaddr_in6, sin6_port)); + BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) != + FIELD_SIZEOF(struct sockaddr_in6, sin6_port)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr_in6, uaddr, + sin6_port, tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_family); + break; + + case offsetof(struct bpf_sock_addr, type): + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sock, sk, + __sk_flags_offset, BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + break; + + case offsetof(struct bpf_sock_addr, protocol): + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sock, sk, + __sk_flags_offset, BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + break; + } + + return insn - insn_buf; +} + static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -5244,6 +5467,15 @@ const struct bpf_verifier_ops cg_sock_verifier_ops = { const struct bpf_prog_ops cg_sock_prog_ops = { }; +const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { + .get_func_proto = sock_addr_func_proto, + .is_valid_access = sock_addr_is_valid_access, + .convert_ctx_access = sock_addr_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sock_addr_prog_ops = { +}; + const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f6b67140fa9d..27408c82bcf6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -449,6 +449,13 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < sizeof(struct sockaddr_in)) goto out; + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + if (err) + goto out; + if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) * only if s_addr is INADDR_ANY. diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 888650b0fae8..a43885614f85 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -304,6 +304,13 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + if (err) + return err; + if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; From 56418157f135009709c96b7a20d5b10dbc9b52fa Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:05 -0700 Subject: [PATCH 0209/1640] BACKPORT: bpf: Hooks for sys_connect

== The problem ==

See the description of the problem in the initial patch of this patch set.

== The solution ==

The patch provides a much more reliable in-kernel solution for the 2nd part of the problem: making outgoing connections from a desired IP.
It adds new attach types `BPF_CGROUP_INET4_CONNECT` and `BPF_CGROUP_INET6_CONNECT` for the program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that can be used to override both the source and the destination of a connection at connect(2) time.

The local end of a connection can be bound to a desired IP using the newly introduced BPF helper `bpf_bind()`. It only allows binding to an IP though, and doesn't support binding to a port, i.e. it leverages the `IP_BIND_ADDRESS_NO_PORT` socket option. There are two reasons for this:

* looking for a free port is expensive and can affect performance significantly;
* there is no use-case for binding to a specific port.

As for the remote end (the `struct sockaddr *` passed by user), both parts of it can be overridden: the remote IP and the remote port. It's useful if an application inside a cgroup wants to connect to another application inside the same cgroup, or to itself, but knows nothing about the IP assigned to the cgroup.

Support is added for IPv4 and IPv6, for TCP and UDP.

IPv4 and IPv6 have separate attach types for the same reason as the sys_bind hooks, i.e. to prevent reading from / writing to e.g. the user_ip6 fields when the user passes a sockaddr_in, since that would be out of bounds.

== Implementation notes ==

The patch introduces a new field in `struct proto`: `pre_connect`, a pointer to a function with the same signature as `connect` that is called before it. The reason is that in some cases BPF hooks should be called way before control is passed to `sk->sk_prot->connect`. Specifically, `inet_dgram_connect` autobinds the socket before calling `sk->sk_prot->connect`, and there is no way to call `bpf_bind()` from hooks in e.g. `ip4_datagram_connect` or `ip6_datagram_connect`, since that would cause a double bind. On the other hand, `proto.pre_connect` provides a flexible way to add BPF hooks for connect only for the protocols that need them, and to call the hooks at the desired time before `connect`. Since `bpf_bind()` is allowed to bind only to an IP, and the autobind in `inet_dgram_connect` binds only a port, there is no chance of a double bind.

bpf_bind() sets `force_bind_address_no_port` to bind to IP only, regardless of the value of the `bind_address_no_port` socket field. bpf_bind() sets `with_lock` to `false` when calling __inet_bind() and __inet6_bind(), since all call-sites where bpf_bind() is called already hold the socket lock.
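For illustration (editorial, not part of the patch), a connect-side program could use the new pieces roughly as in the sketch below. `SRC_IP4`, `DST_IP4` and `DST_PORT` are hypothetical placeholders, and the `SEC()` name follows the later libbpf convention for `BPF_CGROUP_INET4_CONNECT`:

#include <sys/socket.h>
#include <linux/bpf.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#define SRC_IP4	 0x0a000001	/* hypothetical cgroup-local 10.0.0.1 */
#define DST_IP4	 0x0a000002	/* hypothetical server IP 10.0.0.2 */
#define DST_PORT 4040		/* hypothetical server port */

SEC("cgroup/connect4")
int connect_v4_prog(struct bpf_sock_addr *ctx)
{
	struct sockaddr_in sa = {};

	/* Bind the local end to the desired source IP. sin_port must stay
	 * 0: bpf_bind() refuses to bind to a port and relies on
	 * IP_BIND_ADDRESS_NO_PORT semantics instead.
	 */
	sa.sin_family = AF_INET;
	sa.sin_addr.s_addr = bpf_htonl(SRC_IP4);
	if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)))
		return 0;	/* makes connect(2) fail with -EPERM */

	/* Both parts of the remote end may be overridden as well */
	ctx->user_ip4 = bpf_htonl(DST_IP4);
	ctx->user_port = bpf_htons(DST_PORT);
	return 1;
}

char _license[] SEC("license") = "GPL";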
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 31 +++++++++++++++++++++ include/net/addrconf.h | 7 +++++ include/net/sock.h | 3 ++ include/net/udp.h | 1 + include/uapi/linux/bpf.h | 10 +++++++ kernel/bpf/syscall.c | 8 ++++++ net/core/filter.c | 57 ++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 13 +++++++++ net/ipv4/tcp_ipv4.c | 16 +++++++++++ net/ipv4/udp.c | 14 ++++++++++ net/ipv6/af_inet6.c | 5 ++++ net/ipv6/tcp_ipv6.c | 16 +++++++++++ net/ipv6/udp.c | 20 +++++++++++++ 13 files changed, 201 insertions(+) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 67dc4a6471ad..c6ab295e6dcb 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -116,12 +116,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) { \ + lock_sock(sk); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + release_sock(sk); \ + } \ + __ret; \ +}) + #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ + sk->sk_prot->pre_connect) + +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -151,11 +177,16 @@ struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 9e10ad495233..d06ff0183766 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -235,6 +235,13 @@ struct ipv6_stub { }; extern const struct ipv6_stub *ipv6_stub __read_mostly; +/* A stub used by bpf helpers. 
Similarly ugly as ipv6_stub */ +struct ipv6_bpf_stub { + int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); +}; +extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; + /* * identify MLD packets for MLD filter exceptions */ diff --git a/include/net/sock.h b/include/net/sock.h index 214cd0877624..5741ff62cbe2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1070,6 +1070,9 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size) struct proto { void (*close)(struct sock *sk, long timeout); + int (*pre_connect)(struct sock *sk, + struct sockaddr *uaddr, + int addr_len); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); diff --git a/include/net/udp.h b/include/net/udp.h index c3d9ad972763..8f95d2234443 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -283,6 +283,7 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_init_sock(struct sock *sk); +int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 44404216c058..a813d0472238 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -150,6 +150,8 @@ enum bpf_attach_type { BPF_SK_MSG_VERDICT, BPF_CGROUP_INET4_BIND, BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, __MAX_BPF_ATTACH_TYPE }; @@ -745,6 +747,14 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_PASS * + * int bpf_bind(ctx, addr, addr_len) + * Bind socket to address. Only binding to IP is supported, no port can be + * set in addr. 
+ * @ctx: pointer to context of type bpf_sock_addr + * @addr: pointer to struct sockaddr to bind socket to + * @addr_len: length of sockaddr structure + * Return: 0 on success or negative error code + * * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 400c2b90b6af..137fc898a6b3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1197,6 +1197,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, switch (expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: return 0; default: return -EINVAL; @@ -1514,6 +1516,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) break; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1580,6 +1584,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) break; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1633,6 +1639,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; diff --git a/net/core/filter.c b/net/core/filter.c index 653b5d89103b..ef1d8a034adb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -3711,6 +3712,52 @@ static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { .arg2_type = ARG_ANYTHING, }; +const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; +EXPORT_SYMBOL_GPL(ipv6_bpf_stub); + +BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, + int, addr_len) +{ +#ifdef CONFIG_INET + struct sock *sk = ctx->sk; + int err; + + /* Binding to port can be expensive so it's prohibited in the helper. + * Only binding to IP is supported. 
+ */ + err = -EINVAL; + if (addr->sa_family == AF_INET) { + if (addr_len < sizeof(struct sockaddr_in)) + return err; + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) + return err; + return __inet_bind(sk, addr, addr_len, true, false); +#if IS_ENABLED(CONFIG_IPV6) + } else if (addr->sa_family == AF_INET6) { + if (addr_len < SIN6_LEN_RFC2133) + return err; + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) + return err; + /* ipv6_bpf_stub cannot be NULL, since it's called from + * bpf_cgroup_inet6_connect hook and ipv6 is already loaded + */ + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); +#endif /* CONFIG_IPV6 */ + } +#endif /* CONFIG_INET */ + + return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_bind_proto = { + .func = bpf_bind, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3764,6 +3811,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) */ case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_bind: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_bind_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id); } @@ -4276,6 +4331,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, user_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET4_CONNECT: break; default: return false; @@ -4284,6 +4340,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET6_CONNECT: break; default: return false; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 27408c82bcf6..fa79fea33fab 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -538,6 +538,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, { struct sock *sk = sock->sk; const struct proto *prot; + int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; @@ -548,6 +549,12 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, if (uaddr->sa_family == AF_UNSPEC) return prot->disconnect(sk, flags); + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + return err; + } + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; return prot->connect(sk, uaddr, addr_len); @@ -628,6 +635,12 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state != TCP_CLOSE) goto out; + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + goto out; + } + err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 35a363a18b94..b4908f96aa84 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -138,6 +138,21 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) } EXPORT_SYMBOL_GPL(tcp_twsk_unique); +static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v4_connect() and intended to + * prevent BPF program called below from accessing bytes 
that are out + * of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); +} + /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -2442,6 +2457,7 @@ struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v4_pre_connect, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cb128fcdb182..cc5201ce82e3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1764,6 +1764,19 @@ csum_copy_err: goto try_again; } +int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + /* This check is replicated from __ip4_datagram_connect() and + * intended to prevent BPF program called below from accessing bytes + * that are out of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr); +} +EXPORT_SYMBOL(udp_pre_connect); + int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2865,6 +2878,7 @@ struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, .close = udp_lib_close, + .pre_connect = udp_pre_connect, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a43885614f85..3a97d33e4fcf 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -905,6 +905,10 @@ static const struct ipv6_stub ipv6_stub_impl = { .nd_tbl = &nd_tbl, }; +static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { + .inet6_bind = __inet6_bind, +}; + static int __init inet6_init(void) { struct list_head *r; @@ -1061,6 +1065,7 @@ static int __init inet6_init(void) /* ensure that ipv6 stubs are visible only after ipv6 is ready */ wmb(); ipv6_stub = &ipv6_stub_impl; + ipv6_bpf_stub = &ipv6_bpf_stub_impl; out: return err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 756280071544..4b9d5e509075 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -115,6 +115,21 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) ipv6_hdr(skb)->saddr.s6_addr32); } +static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v6_connect() and intended to + * prevent BPF program called below from accessing bytes that are out + * of the bound specified by user in addr_len. 
+ */ + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr); +} + static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -1964,6 +1979,7 @@ struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e22e15b06dcb..c519adff6244 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1032,6 +1032,25 @@ static void udp_v6_flush_pending_frames(struct sock *sk) } } +static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* The following checks are replicated from __ip6_datagram_connect() + * and intended to prevent BPF program called below from accessing + * bytes that are out of the bound specified by user in addr_len. + */ + if (uaddr->sa_family == AF_INET) { + if (__ipv6_only_sock(sk)) + return -EAFNOSUPPORT; + return udp_pre_connect(sk, uaddr, addr_len); + } + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); +} + /** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on @@ -1610,6 +1629,7 @@ struct proto udpv6_prot = { .name = "UDPv6", .owner = THIS_MODULE, .close = udp_lib_close, + .pre_connect = udpv6_pre_connect, .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, From 3dc5c8c51753cd064b4770711a0c7942e3174b59 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:04 -0700 Subject: [PATCH 0210/1640] BACKPORT: net: Introduce __inet_bind() and __inet6_bind

Refactor the `bind()` code to make it ready to be called from the BPF helper function `bpf_bind()` (will be added soon). The implementation of `inet_bind()` and `inet6_bind()` is separated into `__inet_bind()` and `__inet6_bind()` correspondingly. These functions can be used from both `sk_prot->bind` and `bpf_bind()` contexts.

The new functions have two additional arguments.

`force_bind_address_no_port` forces binding to IP only w/o checking the `inet_sock.bind_address_no_port` field. It'll allow binding the local end of a connection to a desired IP in `bpf_bind()` w/o changing the `bind_address_no_port` field of a socket. It's useful since `bpf_bind()` can return an error, and we'd need to restore the original value of `bind_address_no_port` in that case if we had changed it before calling the helper.

`with_lock` specifies whether to lock the socket when working with `struct sk` or not. The argument is set to `true` for `sk_prot->bind`, i.e. the old behavior is preserved. But it will be set to `false` for the `bpf_bind()` use-case. The reason is that all call-sites where `bpf_bind()` will be called already hold the socket lock.
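To make the flag semantics concrete, here is an editorial sketch (condensed from the diffs in this series, not taken from them verbatim) of how the two callers end up invoking the refactored function:

/* bind(2) path, e.g. inet_bind(): keep the old behavior -- honor the
 * socket's bind_address_no_port setting and take the socket lock.
 */
err = __inet_bind(sk, uaddr, addr_len,
		  false,	/* force_bind_address_no_port */
		  true);	/* with_lock */

/* bpf_bind() helper path: force an IP-only bind regardless of
 * bind_address_no_port, and skip locking since every call-site of the
 * helper already holds the socket lock.
 */
err = __inet_bind(sk, addr, addr_len,
		  true,		/* force_bind_address_no_port */
		  false);	/* with_lock */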
Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/net/inet_common.h | 2 ++ include/net/ipv6.h | 2 ++ net/ipv4/af_inet.c | 39 ++++++++++++++++++++++++--------------- net/ipv6/af_inet6.c | 37 ++++++++++++++++++++++++------------- 4 files changed, 52 insertions(+), 28 deletions(-) diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 5a54c9570977..0c6b3274925e 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -32,6 +32,8 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index b3ad33761cce..384e71baa1b0 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -945,6 +945,8 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); +int __inet6_bind(struct sock *sock, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fa79fea33fab..94d35c034567 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -431,30 +431,37 @@ EXPORT_SYMBOL(inet_release); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; - struct inet_sock *inet = inet_sk(sk); - struct net *net = sock_net(sk); - unsigned short snum; - int chk_addr_ret; - u32 tb_id = RT_TABLE_LOCAL; int err; /* If the socket has its own bind function then use it. (RAW) */ if (sk->sk_prot->bind) { - err = sk->sk_prot->bind(sk, uaddr, addr_len); - goto out; + return sk->sk_prot->bind(sk, uaddr, addr_len); } - err = -EINVAL; if (addr_len < sizeof(struct sockaddr_in)) - goto out; + return -EINVAL; /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); if (err) - goto out; + return err; + + return __inet_bind(sk, uaddr, addr_len, false, true); +} +EXPORT_SYMBOL(inet_bind); + +int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); + unsigned short snum; + int chk_addr_ret; + u32 tb_id = RT_TABLE_LOCAL; + int err; if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) @@ -498,7 +505,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ - lock_sock(sk); + if (with_lock) + lock_sock(sk); /* Check these errors (active socket, double bind). 
*/ err = -EINVAL; @@ -510,7 +518,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if ((snum || !inet->bind_address_no_port) && + if ((snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) && sk->sk_prot->get_port(sk, snum)) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; @@ -527,11 +536,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sk_dst_reset(sk); err = 0; out_release_sock: - release_sock(sk); + if (with_lock) + release_sock(sk); out: return err; } -EXPORT_SYMBOL(inet_bind); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 3a97d33e4fcf..daa626b97969 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -283,16 +283,8 @@ out_rcu_unlock: /* bind for INET6 API */ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; const struct proto *prot; - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct net *net = sock_net(sk); - __be32 v4addr = 0; - unsigned short snum; - bool saved_ipv6only; - int addr_type = 0; int err = 0; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ @@ -311,11 +303,28 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; + return __inet6_bind(sk, uaddr, addr_len, false, true); +} +EXPORT_SYMBOL(inet6_bind); + +int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock) +{ + struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + __be32 v4addr = 0; + unsigned short snum; + bool saved_ipv6only; + int addr_type = 0; + int err = 0; + if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; addr_type = ipv6_addr_type(&addr->sin6_addr); - if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) + if ((addr_type & IPV6_ADDR_MULTICAST) && sk->sk_type == SOCK_STREAM) return -EINVAL; snum = ntohs(addr->sin6_port); @@ -323,7 +332,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; - lock_sock(sk); + if (with_lock) + lock_sock(sk); /* Check these errors (active socket, double bind). */ if (sk->sk_state != TCP_CLOSE || inet->inet_num) { @@ -426,7 +436,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. 
*/ - if ((snum || !inet->bind_address_no_port) && + if ((snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) && sk->sk_prot->get_port(sk, snum)) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); err = -EADDRINUSE; @@ -442,13 +453,13 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_dport = 0; inet->inet_daddr = 0; out: - release_sock(sk); + if (with_lock) + release_sock(sk); return err; out_unlock: rcu_read_unlock(); goto out; } -EXPORT_SYMBOL(inet6_bind); int inet6_release(struct socket *sock) { From 02388ed22526515696d338b1f6bcaf5768df77b8 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:07 -0700 Subject: [PATCH 0211/1640] UPSTREAM: bpf: Post-hooks for sys_bind

"Post-hooks" are hooks that are called right before returning from sys_bind. At this time the IP and port are already allocated and no further changes to `struct sock` can happen before returning from sys_bind, but the BPF program has a chance to inspect the socket and change the sys_bind result. Specifically it can e.g. inspect what port was allocated, and if it doesn't satisfy some policy, the BPF program can force sys_bind to fail and return EPERM to the user.

Another example of usage is recording the IP:port pair in a map to use it in later calls to sys_connect. E.g. if some TCP server inside a cgroup was bound to some IP:port_n, it can be recorded in a map. And later when some TCP client inside the same cgroup is trying to connect to 127.0.0.1:port_n, the BPF hook for sys_connect can override the destination and connect the application to IP:port_n instead of 127.0.0.1:port_n. That helps force all applications inside a cgroup to use the desired IP without breaking applications that e.g. use localhost to communicate with each other.

== Implementation details ==

Post-hooks are implemented as two new attach types `BPF_CGROUP_INET4_POST_BIND` and `BPF_CGROUP_INET6_POST_BIND` for the existing prog type `BPF_PROG_TYPE_CGROUP_SOCK`. Separate attach types for IPv4 and IPv6 are introduced to avoid accessing the IPv6 fields in `struct sock` from `inet_bind()` and the IPv4 fields from `inet6_bind()`, since those fields might not make sense in such cases.
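A post-hook program could look like the editorial sketch below (hypothetical policy; the `SEC()` name follows the later libbpf convention for `BPF_CGROUP_INET4_POST_BIND`). Note that the program type is the existing `BPF_PROG_TYPE_CGROUP_SOCK`, so the context is `struct bpf_sock` rather than `struct bpf_sock_addr`:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/post_bind4")
int post_bind4_prog(struct bpf_sock *ctx)
{
	/* src_ip4 is stored in network byte order, src_port in host byte
	 * order, per the uapi comments in this patch.
	 */
	if (ctx->src_port < 1024)
		return 0;	/* policy violated: sys_bind fails with -EPERM */
	return 1;
}

char _license[] SEC("license") = "GPL";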
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 16 ++++- include/uapi/linux/bpf.h | 11 ++++ kernel/bpf/syscall.c | 43 ++++++++++++++ net/core/filter.c | 118 ++++++++++++++++++++++++++++++++----- net/ipv4/af_inet.c | 18 ++++-- net/ipv6/af_inet6.c | 21 ++++--- 6 files changed, 196 insertions(+), 31 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c6ab295e6dcb..30d15e64b993 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -98,16 +98,24 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ +#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, \ - BPF_CGROUP_INET_SOCK_CREATE); \ + __ret = __cgroup_bpf_run_filter_sk(sk, type); \ } \ __ret; \ }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) + +#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) + +#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) + #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ int __ret = 0; \ @@ -183,6 +191,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a813d0472238..bc32efcfe67b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -152,6 +152,8 @@ enum bpf_attach_type { BPF_CGROUP_INET6_BIND, BPF_CGROUP_INET4_CONNECT, BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -1040,6 +1042,15 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; + __u32 src_ip4; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_ip6[4]; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_port; /* Allows 4-byte read. + * Stored in host byte order + */ }; #define XDP_PACKET_HEADROOM 256 diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 137fc898a6b3..c70950cc4200 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1188,11 +1188,46 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); +/* Initially all BPF programs could be loaded w/o specifying + * expected_attach_type. Later for some of them specifying expected_attach_type + * at load time became required so that program could be validated properly. + * Programs of types that are allowed to be loaded both w/ and w/o (for + * backward compatibility) expected_attach_type, should have the default attach + * type assigned to expected_attach_type for the latter case, so that it can be + * validated later at attach time. 
+ * + * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if + * prog type requires it but has some attach types that have to be backward + * compatible. + */ +static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) +{ + switch (attr->prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't + * exist so checking for non-zero is the way to go here. + */ + if (!attr->expected_attach_type) + attr->expected_attach_type = + BPF_CGROUP_INET_SOCK_CREATE; + break; + } +} + static int bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type) { switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + switch (expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + return 0; + default: + return -EINVAL; + } case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: switch (expected_attach_type) { case BPF_CGROUP_INET4_BIND: @@ -1212,6 +1247,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; default: @@ -1257,6 +1293,7 @@ static int bpf_prog_load(union bpf_attr *attr) !capable(CAP_SYS_ADMIN)) return -EPERM; + bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) return -EINVAL; @@ -1512,6 +1549,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; case BPF_CGROUP_INET4_BIND: @@ -1580,6 +1619,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; case BPF_CGROUP_INET4_BIND: @@ -1639,6 +1680,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_SOCK_OPS: diff --git a/net/core/filter.c b/net/core/filter.c index ef1d8a034adb..56c11d1c1dbd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4160,30 +4160,80 @@ static bool lwt_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } + +/* Attach type specific accesses */ +static bool __sock_filter_check_attach_type(int off, + enum bpf_access_type access_type, + enum bpf_attach_type attach_type) +{ + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + case offsetof(struct bpf_sock, mark): + case offsetof(struct bpf_sock, priority): + switch (attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + goto full_access; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_ip4): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + switch (attach_type) { + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_port): + switch 
(attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; + default: + return false; + } + } +read_only: + return access_type == BPF_READ; +full_access: + return true; +} + +static bool __sock_filter_check_size(int off, int size, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + switch (off) { + case bpf_ctx_range(struct bpf_sock, src_ip4): + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } + + return size == size_default; +} + static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - if (type == BPF_WRITE) { - switch (off) { - case offsetof(struct bpf_sock, bound_dev_if): - case offsetof(struct bpf_sock, mark): - case offsetof(struct bpf_sock, priority): - break; - default: - return false; - } - } - - if (off < 0 || off + size > sizeof(struct bpf_sock)) + if (off < 0 || off >= sizeof(struct bpf_sock)) return false; - /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) + if (!__sock_filter_check_attach_type(off, type, + prog->expected_attach_type)) + return false; + if (!__sock_filter_check_size(off, size, info)) return false; - return true; } @@ -4791,6 +4841,7 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): @@ -4846,6 +4897,43 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; + + case offsetof(struct bpf_sock, src_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_rcv_saddr, + FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr), + target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, src_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off( + struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0], + FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]), + target_size) + off); +#else + (void)off; + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock, src_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_num), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_num, + FIELD_SIZEOF(struct sock_common, + skc_num), + target_size)); + break; } return insn - insn_buf; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 94d35c034567..f16a6a765267 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -518,12 +518,18 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. 
*/ - if ((snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) && - sk->sk_prot->get_port(sk, snum)) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - err = -EADDRINUSE; - goto out_release_sock; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + err = -EADDRINUSE; + goto out_release_sock; + } + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } if (inet->inet_rcv_saddr) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index daa626b97969..afb20eb37f54 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -436,13 +436,20 @@ int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. */ - if ((snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) && - sk->sk_prot->get_port(sk, snum)) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - err = -EADDRINUSE; - goto out; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + err = -EADDRINUSE; + goto out; + } + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } if (addr_type != IPV6_ADDR_ANY) From 0d8dc18204436fca547024ad06ab295d339b5fbb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 2 Apr 2018 12:50:46 -0700 Subject: [PATCH 0212/1640] UPSTREAM: bpf: sockmap, free memory on sock close with cork data

If a socket with pending cork data is closed, we do not return the memory to the socket until the garbage collector frees the psock structure. The garbage collector, though, can run after the sock has completed its close operation. If this ordering happens, the sock code will throw a WARN_ON because there is still outstanding memory accounted to the sock. To resolve this, ensure we return memory to the sock when the socket is closed.

Signed-off-by: John Fastabend Fixes: 91843d540a13 ("bpf: sockmap, add msg_cork_bytes() helper") Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index e93bda3787cf..680c6333da10 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -211,6 +211,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; write_lock_bh(&sk->sk_callback_lock); + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; + } + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); free_start_sg(psock->sock, md); From 4112935ac24989064b8d8cb5a4acae2892e756f3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 2 Apr 2018 12:50:52 -0700 Subject: [PATCH 0213/1640] UPSTREAM: bpf: sockmap, duplicates release calls may NULL sk_prot

It is possible to have multiple ULP tcp_release call paths in flight if a sock is closed and simultaneously being removed from the sockmap control path. The result would be setting the sk_prot to the saved values on the first iteration and then, on the second iteration, setting the value to NULL. This patch resolves this by ensuring we only reset the sk_prot pointer if we have a valid saved state to set.
Fixes: 4f738adba30a7 ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 680c6333da10..83261ce7e9e9 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -182,8 +182,10 @@ static void bpf_tcp_release(struct sock *sk) psock->cork = NULL; } - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; + if (psock->sk_proto) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; + } out: rcu_read_unlock(); } From 5eccb439ebad392fee85a1db4b23e990d0b6e943 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Tue, 3 Apr 2018 14:09:47 +0200 Subject: [PATCH 0214/1640] UPSTREAM: kernel/bpf/syscall: fix warning defined but not used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There will be a build warning -Wunused-function if CONFIG_CGROUP_BPF isn't defined, since the only user is inside #ifdef CONFIG_CGROUP_BPF: kernel/bpf/syscall.c:1229:12: warning: ‘bpf_prog_attach_check_attach_type’ defined but not used [-Wunused-function] static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Current code moves function bpf_prog_attach_check_attach_type inside ifdef CONFIG_CGROUP_BPF. Fixes: 5e43f899b03a ("bpf: Check attach type at prog load time") Signed-off-by: Anders Roxell Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c70950cc4200..cebe1539c1c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1243,18 +1243,6 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, } } -static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, - enum bpf_attach_type attach_type) -{ - switch (prog->type) { - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - return attach_type == prog->expected_attach_type ? 0 : -EINVAL; - default: - return 0; - } -} - /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD expected_attach_type @@ -1488,6 +1476,18 @@ out_free_tp: #ifdef CONFIG_CGROUP_BPF +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + default: + return 0; + } +} + #define BPF_PROG_ATTACH_LAST_FIELD attach_flags static int sockmap_get_from_fd(const union bpf_attr *attr, From 13e5fae1ef1504f283b7ae48d5ff069f31cb8c47 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 10 Apr 2018 09:37:32 -0700 Subject: [PATCH 0215/1640] UPSTREAM: bpf/tracing: fix a deadlock in perf_event_detach_bpf_prog syzbot reported a possible deadlock in perf_event_detach_bpf_prog. 
The error details:

======================================================
WARNING: possible circular locking dependency detected
4.16.0-rc7+ #3 Not tainted
------------------------------------------------------
syz-executor7/24531 is trying to acquire lock:
 (bpf_event_mutex){+.+.}, at: [<000000008a849b07>] perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854

but task is already holding lock:
 (&mm->mmap_sem){++++}, at: [<0000000038768f87>] vm_mmap_pgoff+0x198/0x280 mm/util.c:353

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&mm->mmap_sem){++++}:
     __might_fault+0x13a/0x1d0 mm/memory.c:4571
     _copy_to_user+0x2c/0xc0 lib/usercopy.c:25
     copy_to_user include/linux/uaccess.h:155 [inline]
     bpf_prog_array_copy_info+0xf2/0x1c0 kernel/bpf/core.c:1694
     perf_event_query_prog_array+0x1c7/0x2c0 kernel/trace/bpf_trace.c:891
     _perf_ioctl kernel/events/core.c:4750 [inline]
     perf_ioctl+0x3e1/0x1480 kernel/events/core.c:4770
     vfs_ioctl fs/ioctl.c:46 [inline]
     do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:686
     SYSC_ioctl fs/ioctl.c:701 [inline]
     SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
     do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
     entry_SYSCALL_64_after_hwframe+0x42/0xb7

-> #0 (bpf_event_mutex){+.+.}:
     lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920
     __mutex_lock_common kernel/locking/mutex.c:756 [inline]
     __mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893
     mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
     perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854
     perf_event_free_bpf_prog kernel/events/core.c:8147 [inline]
     _free_event+0xbdb/0x10f0 kernel/events/core.c:4116
     put_event+0x24/0x30 kernel/events/core.c:4204
     perf_mmap_close+0x60d/0x1010 kernel/events/core.c:5172
     remove_vma+0xb4/0x1b0 mm/mmap.c:172
     remove_vma_list mm/mmap.c:2490 [inline]
     do_munmap+0x82a/0xdf0 mm/mmap.c:2731
     mmap_region+0x59e/0x15a0 mm/mmap.c:1646
     do_mmap+0x6c0/0xe00 mm/mmap.c:1483
     do_mmap_pgoff include/linux/mm.h:2223 [inline]
     vm_mmap_pgoff+0x1de/0x280 mm/util.c:355
     SYSC_mmap_pgoff mm/mmap.c:1533 [inline]
     SyS_mmap_pgoff+0x462/0x5f0 mm/mmap.c:1491
     SYSC_mmap arch/x86/kernel/sys_x86_64.c:100 [inline]
     SyS_mmap+0x16/0x20 arch/x86/kernel/sys_x86_64.c:91
     do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
     entry_SYSCALL_64_after_hwframe+0x42/0xb7

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&mm->mmap_sem);
                               lock(bpf_event_mutex);
                               lock(&mm->mmap_sem);
  lock(bpf_event_mutex);

 *** DEADLOCK ***
======================================================

The bug was introduced by commit f371b304f12e ("bpf/tracing: allow
user space to query prog array on the same tp"), where copy_to_user,
which requires mm->mmap_sem, is called inside the bpf_event_mutex
lock. At the same time, during perf_event file descriptor close,
mm->mmap_sem is held first and then the subsequent
perf_event_detach_bpf_prog needs the bpf_event_mutex lock. Such a
scenario causes a deadlock.

As suggested by Daniel, moving copy_to_user out of the bpf_event_mutex
lock should fix the problem.
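The fix strategy is a general lock-ordering pattern, sketched below in a
small self-contained userspace C program (all names invented for
illustration; this is not the kernel code): data is staged into a local
buffer while the mutex is held, and the fault-prone copy to the user
happens only after the unlock, so the mutex can never nest inside a
page-fault/mmap_sem path.

#include <pthread.h>
#include <string.h>
#include <stdio.h>

static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned int prog_ids_src[4] = { 11, 22, 33, 44 };

static int query_prog_ids(unsigned int *user_buf, unsigned int cnt)
{
	unsigned int tmp[4];
	unsigned int n;

	pthread_mutex_lock(&event_mutex);
	n = cnt < 4 ? cnt : 4;
	memcpy(tmp, prog_ids_src, n * sizeof(tmp[0])); /* safe under lock */
	pthread_mutex_unlock(&event_mutex);

	/* stands in for copy_to_user(): runs with no lock held */
	memcpy(user_buf, tmp, n * sizeof(tmp[0]));
	return (int)n;
}

int main(void)
{
	unsigned int out[4];
	int n = query_prog_ids(out, 4);

	for (int i = 0; i < n; i++)
		printf("prog id %u\n", out[i]);
	return 0;
}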
Fixes: f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") Reported-by: syzbot+dc5ca0e4c9bfafaf2bae@syzkaller.appspotmail.com Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 4 ++-- kernel/bpf/core.c | 45 ++++++++++++++++++++++++++-------------- kernel/trace/bpf_trace.c | 25 ++++++++++++++++++---- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9390c9a70e22..1e3d7d2712fe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -345,8 +345,8 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, struct bpf_prog *old_prog); int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, - __u32 __user *prog_ids, u32 request_cnt, - __u32 __user *prog_cnt); + u32 *prog_ids, u32 request_cnt, + u32 *prog_cnt); int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7d9bbf925e07..bf66092e2093 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1711,13 +1711,32 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) return cnt; } +static bool bpf_prog_array_copy_core(struct bpf_prog **prog, + u32 *prog_ids, + u32 request_cnt) +{ + int i = 0; + + for (; *prog; prog++) { + if (*prog == &dummy_bpf_prog.prog) + continue; + prog_ids[i] = (*prog)->aux->id; + if (++i == request_cnt) { + prog++; + break; + } + } + + return !!(*prog); +} + int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, __u32 __user *prog_ids, u32 cnt) { struct bpf_prog **prog; unsigned long err = 0; - u32 i = 0, *ids; bool nospc; + u32 *ids; /* users of this function are doing: * cnt = bpf_prog_array_length(); @@ -1734,16 +1753,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return -ENOMEM; rcu_read_lock(); prog = rcu_dereference(progs)->progs; - for (; *prog; prog++) { - if (*prog == &dummy_bpf_prog.prog) - continue; - ids[i] = (*prog)->aux->id; - if (++i == cnt) { - prog++; - break; - } - } - nospc = !!(*prog); + nospc = bpf_prog_array_copy_core(prog, ids, cnt); rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); @@ -1822,22 +1832,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, } int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, - __u32 __user *prog_ids, u32 request_cnt, - __u32 __user *prog_cnt) + u32 *prog_ids, u32 request_cnt, + u32 *prog_cnt) { + struct bpf_prog **prog; u32 cnt = 0; if (array) cnt = bpf_prog_array_length(array); - if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) - return -EFAULT; + *prog_cnt = cnt; /* return early if user requested only program count or nothing to copy */ if (!request_cnt || !cnt) return 0; - return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); + /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ + prog = rcu_dereference_check(array, 1)->progs; + return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? 
-ENOSPC
+								     : 0;
 }

 static void bpf_prog_free_deferred(struct work_struct *work)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3ac3bcd48f47..8e53df3e3ca3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -973,6 +973,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 {
 	struct perf_event_query_bpf __user *uquery = info;
 	struct perf_event_query_bpf query = {};
+	u32 *ids, prog_cnt, ids_len;
 	int ret;

 	if (!capable(CAP_SYS_ADMIN))
@@ -981,16 +982,32 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 		return -EINVAL;
 	if (copy_from_user(&query, uquery, sizeof(query)))
 		return -EFAULT;
-	if (query.ids_len > BPF_TRACE_MAX_PROGS)
+
+	ids_len = query.ids_len;
+	if (ids_len > BPF_TRACE_MAX_PROGS)
 		return -E2BIG;
+	ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN);
+	if (!ids)
+		return -ENOMEM;
+	/*
+	 * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which
+	 * is required when user only wants to check for uquery->prog_cnt.
+	 * There is no need to check for it since the case is handled
+	 * gracefully in bpf_prog_array_copy_info.
+	 */

 	mutex_lock(&bpf_event_mutex);
 	ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
-				       uquery->ids,
-				       query.ids_len,
-				       &uquery->prog_cnt);
+				       ids,
+				       ids_len,
+				       &prog_cnt);
 	mutex_unlock(&bpf_event_mutex);

+	if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
+	    copy_to_user(uquery->ids, ids, ids_len * sizeof(u32)))
+		ret = -EFAULT;
+
+	kfree(ids);
 	return ret;
 }

From 9078a6d0af36c08293925c0ced84293ed19608e3 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Wed, 3 Jan 2018 11:25:34 +0100
Subject: [PATCH 0216/1640] UPSTREAM: xdp/qede: setup xdp_rxq_info and intro xdp_rxq_info_is_reg

The driver code qede_free_fp_array() depends on kfree() being safe to
call with a NULL pointer. This stems from the qede_alloc_fp_array()
function, which either (kz)allocs memory for fp->txq or fp->rxq. This
also simplifies error handling code in case of memory allocation
failures, but xdp_rxq_info_unreg needs to know the difference.

Introduce xdp_rxq_info_is_reg() to handle the case where a memory
allocation fails: the failure path is detected by seeing that the
xdp_rxq_info was not registered yet, which first happens after
successful allocation in qede_init_fp().

Driver hook points for xdp_rxq_info:
 * reg  : qede_init_fp
 * unreg: qede_free_fp_array

Tested on actual hardware with samples/bpf program.

V2: The driver has no proper error path for a failed XDP RX-queue info
reg, as qede_init_fp() is a void function.
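The registration-state guard can be sketched in a few lines of plain C.
This is a toy model with invented stub types, not the kernel API: teardown
asks whether registration ever happened before trying to unregister, which
is exactly what lets a free path like qede_free_fp_array() run safely on
half-initialized queues.

#include <stdbool.h>
#include <stdio.h>

enum { REG_STATE_NEW, REG_STATE_REGISTERED, REG_STATE_UNREGISTERED };

struct xdp_rxq_info_stub { int reg_state; };

static bool rxq_info_is_reg(const struct xdp_rxq_info_stub *rxq)
{
	return rxq->reg_state == REG_STATE_REGISTERED;
}

static void rxq_info_unreg(struct xdp_rxq_info_stub *rxq)
{
	rxq->reg_state = REG_STATE_UNREGISTERED;
}

/* Teardown that may also run on a half-initialized queue: only
 * unregister when registration actually happened.
 */
static void free_rx_queue(struct xdp_rxq_info_stub *rxq)
{
	if (rxq && rxq_info_is_reg(rxq))
		rxq_info_unreg(rxq);
	/* in the kernel, kfree(rxq) would follow; kfree(NULL) is a no-op */
}

int main(void)
{
	struct xdp_rxq_info_stub ok = { REG_STATE_REGISTERED };
	struct xdp_rxq_info_stub half = { REG_STATE_NEW }; /* alloc failed before reg */

	free_rx_queue(&ok);
	free_rx_queue(&half);
	printf("teardown handled registered and unregistered queues\n");
	return 0;
}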
Cc: everest-linux-l2@cavium.com Cc: Ariel Elior Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/qlogic/qede/qede.h | 2 ++ drivers/net/ethernet/qlogic/qede/qede_fp.c | 1 + drivers/net/ethernet/qlogic/qede/qede_main.c | 10 ++++++++++ include/net/xdp.h | 1 + net/core/xdp.c | 6 ++++++ 5 files changed, 20 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index 4cc9af175a76..b3cb1d89f1e9 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_RFS_ACCEL @@ -347,6 +348,7 @@ struct qede_rx_queue { u64 xdp_no_pass; void *handle; + struct xdp_rxq_info xdp_rxq; }; union db_prod { diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index c044156b9ba6..85b71cdb78e8 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, xdp.data = xdp.data_hard_start + *data_offset; xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; + xdp.rxq = &rxq->xdp_rxq; /* Queues always have a full reset currently, so for the time * being until there's atomic program replace just mark read diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index a2da52362d09..f6bf8efd9fe9 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -769,6 +769,12 @@ static void qede_free_fp_array(struct qede_dev *edev) fp = &edev->fp_array[i]; kfree(fp->sb_info); + /* Handle mem alloc failure case where qede_init_fp + * didn't register xdp_rxq_info yet. 
+	 * Implicit only (fp->type & QEDE_FASTPATH_RX)
+	 */
+	if (fp->rxq && xdp_rxq_info_is_reg(&fp->rxq->xdp_rxq))
+		xdp_rxq_info_unreg(&fp->rxq->xdp_rxq);
 	kfree(fp->rxq);
 	kfree(fp->xdp_tx);
 	kfree(fp->txq);
@@ -1517,6 +1523,10 @@ static void qede_init_fp(struct qede_dev *edev)
 			else
 				fp->rxq->data_direction = DMA_FROM_DEVICE;
 			fp->rxq->dev = &edev->pdev->dev;
+
+			/* Driver have no error path from here */
+			WARN_ON(xdp_rxq_info_reg(&fp->rxq->xdp_rxq, edev->ndev,
+						 fp->rxq->rxq_id) < 0);
 		}

 		if (fp->type & QEDE_FASTPATH_TX) {
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 86c41631a908..b2362ddfa694 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -43,5 +43,6 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev,
 		     u32 queue_index);
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
 void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
+bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);

 #endif /* __LINUX_NET_XDP_H__ */
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 229bc5a0ee04..097a0f74e004 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -65,3 +65,9 @@ void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
 	xdp_rxq->reg_state = REG_STATE_UNUSED;
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);
+
+bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
+{
+	return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);

From 4cdecc44f6c2470ebe8f4e35c243fae81aab84e9 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Tue, 17 Apr 2018 16:45:26 +0200
Subject: [PATCH 0217/1640] UPSTREAM: xdp: introduce xdp_return_frame API and use in cpumap

Introduce an xdp_return_frame API, and convert over cpumap as the
first user, given it has a queued XDP frame structure to leverage.

V3: Cleanup and remove C99 style comments, pointed out by Alex Duyck.
V6: Remove comment that id will be added later (Req by Alex Duyck)
V8: Rename enum mem_type to xdp_mem_type (found by kbuild test robot)

Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: David S. Miller
---
 include/net/xdp.h   | 27 ++++++++++++++++++++
 kernel/bpf/cpumap.c | 60 +++++++++++++++++++++++++++------------------
 net/core/xdp.c      | 18 ++++++++++++
 3 files changed, 81 insertions(+), 24 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index b2362ddfa694..e4207699c410 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -33,16 +33,43 @@
  * also mandatory during RX-ring setup.
*/ +enum xdp_mem_type { + MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ + MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ + MEM_TYPE_MAX, +}; + +struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type */ +}; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; + struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ + +static inline +void xdp_return_frame(void *data, struct xdp_mem_info *mem) +{ + if (mem->type == MEM_TYPE_PAGE_SHARED) + page_frag_free(data); + + if (mem->type == MEM_TYPE_PAGE_ORDER0) { + struct page *page = virt_to_page(data); /* Assumes order0 page*/ + + put_page(page); + } +} + int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator); #endif /* __LINUX_NET_XDP_H__ */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a4bb0b34375a..3e4bbcbe3e86 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -137,27 +138,6 @@ free_cmap: return ERR_PTR(err); } -static void __cpu_map_queue_destructor(void *ptr) -{ - /* The tear-down procedure should have made sure that queue is - * empty. See __cpu_map_entry_replace() and work-queue - * invoked cpu_map_kthread_stop(). Catch any broken behaviour - * gracefully and warn once. - */ - if (WARN_ON_ONCE(ptr)) - page_frag_free(ptr); -} - -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - if (atomic_dec_and_test(&rcpu->refcnt)) { - /* The queue should be empty at this point */ - ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); - kfree(rcpu->queue); - kfree(rcpu); - } -} - static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { atomic_inc(&rcpu->refcnt); @@ -188,6 +168,10 @@ struct xdp_pkt { u16 len; u16 headroom; u16 metasize; + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, + * while mem info is valid on remote CPU. + */ + struct xdp_mem_info mem; struct net_device *dev_rx; }; @@ -213,6 +197,9 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); xdp_pkt->metasize = metasize; + /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ + xdp_pkt->mem = xdp->rxq->mem; + return xdp_pkt; } @@ -265,6 +252,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, return skb; } +static void __cpu_map_ring_cleanup(struct ptr_ring *ring) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once. 
+	 */
+	struct xdp_pkt *xdp_pkt;
+
+	while ((xdp_pkt = ptr_ring_consume(ring)))
+		if (WARN_ON_ONCE(xdp_pkt))
+			xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		/* The queue should be empty at this point */
+		__cpu_map_ring_cleanup(rcpu->queue);
+		ptr_ring_cleanup(rcpu->queue, NULL);
+		kfree(rcpu->queue);
+		kfree(rcpu);
+	}
+}
+
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
@@ -307,7 +319,7 @@ static int cpu_map_kthread_run(void *data)
 			skb = cpu_map_build_skb(rcpu, xdp_pkt);
 			if (!skb) {
-				page_frag_free(xdp_pkt);
+				xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
 				continue;
 			}
@@ -604,13 +616,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	spin_lock(&q->producer_lock);
 	for (i = 0; i < bq->count; i++) {
-		void *xdp_pkt = bq->q[i];
+		struct xdp_pkt *xdp_pkt = bq->q[i];
 		int err;

 		err = __ptr_ring_produce(q, xdp_pkt);
 		if (err) {
 			drops++;
-			page_frag_free(xdp_pkt); /* Free xdp_pkt */
+			xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem);
 		}
 		processed++;
 	}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 097a0f74e004..7e6b3545277d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -71,3 +71,21 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
 	return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+			       enum xdp_mem_type type, void *allocator)
+{
+	if (type >= MEM_TYPE_MAX)
+		return -EINVAL;
+
+	xdp_rxq->mem.type = type;
+
+	if (allocator)
+		return -EOPNOTSUPP;
+
+	/* TODO: Allocate an ID that maps to allocator pointer
+	 * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);

From 252cc0ee5a589fa002235b8cb88cd4fbec0a96bb Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Tue, 13 Feb 2018 14:15:36 +0100
Subject: [PATCH 0218/1640] UPSTREAM: net: avoid including xdp.h in filter.h

A forward declaration of struct xdp_rxq_info in linux/filter.h is
sufficient, and avoids including net/xdp.h. This was originally
suggested by John Fastabend during the review phase, but wasn't
included in the final patchset revision. Thus, this followup.

Suggested-by: John Fastabend
Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: Alexei Starovoitov
---
 include/linux/filter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 3c0d718a4058..fb9eb49cc822 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -20,7 +20,6 @@
 #include
 #include
-#include
 #include
 #include

@@ -30,6 +29,7 @@ struct sk_buff;
 struct sock;
 struct seccomp_data;
 struct bpf_prog_aux;
+struct xdp_rxq_info;

 /* ArgX, context and stack frame pointer register positions. Note,
  * Arg1, Arg2, Arg3, etc are used as argument mappings of function

From 361a4af282756e678e4924fca2097d8a3906f3e4 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Tue, 17 Apr 2018 16:45:37 +0200
Subject: [PATCH 0219/1640] UPSTREAM: xdp: move struct xdp_buff from filter.h to xdp.h

This is done to prepare for the next patch, and it is also nice to
move this XDP related struct out of filter.h.

Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: David S.
Miller --- include/linux/filter.h | 24 +----------------------- include/net/xdp.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index fb9eb49cc822..49306e1d27aa 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -30,6 +30,7 @@ struct sock; struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; +struct xdp_buff; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -580,14 +581,6 @@ struct bpf_skb_data_end { void *data_end; }; -struct xdp_buff { - void *data; - void *data_end; - void *data_meta; - void *data_hard_start; - struct xdp_rxq_info *rxq; -}; - struct sk_msg_buff { void *data; void *data_end; @@ -849,21 +842,6 @@ int xdp_do_redirect(struct net_device *dev, struct bpf_prog *prog); void xdp_do_flush_map(void); -/* Drivers not supporting XDP metadata can use this helper, which - * rejects any room expansion for metadata as a result. - */ -static __always_inline void -xdp_set_data_meta_invalid(struct xdp_buff *xdp) -{ - xdp->data_meta = xdp->data + 1; -} - -static __always_inline bool -xdp_data_meta_unsupported(const struct xdp_buff *xdp) -{ - return unlikely(xdp->data_meta > xdp->data); -} - void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); diff --git a/include/net/xdp.h b/include/net/xdp.h index e4207699c410..15f8ade008b5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -50,6 +50,13 @@ struct xdp_rxq_info { struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_buff { + void *data; + void *data_end; + void *data_meta; + void *data_hard_start; + struct xdp_rxq_info *rxq; +}; static inline void xdp_return_frame(void *data, struct xdp_mem_info *mem) @@ -72,4 +79,19 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); +/* Drivers not supporting XDP metadata can use this helper, which + * rejects any room expansion for metadata as a result. + */ +static __always_inline void +xdp_set_data_meta_invalid(struct xdp_buff *xdp) +{ + xdp->data_meta = xdp->data + 1; +} + +static __always_inline bool +xdp_data_meta_unsupported(const struct xdp_buff *xdp) +{ + return unlikely(xdp->data_meta > xdp->data); +} + #endif /* __LINUX_NET_XDP_H__ */ From fa742e3fe3497e091a5bf83762a0b69b7f81afc6 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:42 +0200 Subject: [PATCH 0220/1640] UPSTREAM: xdp: introduce a new xdp_frame type This is needed to convert drivers tuntap and virtio_net. This is a generalization of what is done inside cpumap, which will be converted later. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/include/net/xdp.h b/include/net/xdp.h index 15f8ade008b5..756c42811e78 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -58,6 +58,46 @@ struct xdp_buff { struct xdp_rxq_info *rxq; }; +struct xdp_frame { + void *data; + u16 len; + u16 headroom; + u16 metasize; + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, + * while mem info is valid on remote CPU. 
+ */ + struct xdp_mem_info mem; +}; + +/* Convert xdp_buff to xdp_frame */ +static inline +struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) +{ + struct xdp_frame *xdp_frame; + int metasize; + int headroom; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? metasize : 0; + if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) + return NULL; + + /* Store info in top of packet */ + xdp_frame = xdp->data_hard_start; + + xdp_frame->data = xdp->data; + xdp_frame->len = xdp->data_end - xdp->data; + xdp_frame->headroom = headroom - sizeof(*xdp_frame); + xdp_frame->metasize = metasize; + + /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ + xdp_frame->mem = xdp->rxq->mem; + + return xdp_frame; +} + static inline void xdp_return_frame(void *data, struct xdp_mem_info *mem) { From 38d0abfbe3f32826edfabbe16618749c99e44678 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:57 +0200 Subject: [PATCH 0221/1640] UPSTREAM: bpf: cpumap convert to use generic xdp_frame The generic xdp_frame format, was inspired by the cpumap own internal xdp_pkt format. It is now time to convert it over to the generic xdp_frame format. The cpumap needs one extra field dev_rx. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 1 + kernel/bpf/cpumap.c | 100 +++++++++++++------------------------------- 2 files changed, 29 insertions(+), 72 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 756c42811e78..ea3773f94f65 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -67,6 +67,7 @@ struct xdp_frame { * while mem info is valid on remote CPU. */ struct xdp_mem_info mem; + struct net_device *dev_rx; /* used by cpumap */ }; /* Convert xdp_buff to xdp_frame */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3e4bbcbe3e86..bcdc4dea5ce7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -159,52 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -/* For now, xdp_pkt is a cpumap internal data structure, with info - * carried between enqueue to dequeue. It is mapped into the top - * headroom of the packet, to avoid allocating separate mem. - */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; - u16 metasize; - /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, - * while mem info is valid on remote CPU. - */ - struct xdp_mem_info mem; - struct net_device *dev_rx; -}; - -/* Convert xdp_buff to xdp_pkt */ -static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) -{ - struct xdp_pkt *xdp_pkt; - int metasize; - int headroom; - - /* Assure headroom is available for storing info */ - headroom = xdp->data - xdp->data_hard_start; - metasize = xdp->data - xdp->data_meta; - metasize = metasize > 0 ? 
metasize : 0; - if (unlikely((headroom - metasize) < sizeof(*xdp_pkt))) - return NULL; - - /* Store info in top of packet */ - xdp_pkt = xdp->data_hard_start; - - xdp_pkt->data = xdp->data; - xdp_pkt->len = xdp->data_end - xdp->data; - xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); - xdp_pkt->metasize = metasize; - - /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ - xdp_pkt->mem = xdp->rxq->mem; - - return xdp_pkt; -} - static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) + struct xdp_frame *xdpf) { unsigned int frame_size; void *pkt_data_start; @@ -219,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * would be preferred to set frame_size to 2048 or 4096 * depending on the driver. * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_pkt); + * frame_len = frame_size - sizeof(*xdp_frame); * * Instead, with info avail, skb_shared_info in placed after * packet len. This, unfortunately fakes the truesize. @@ -227,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. */ - frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + pkt_data_start = xdpf->data - xdpf->headroom; skb = build_skb(pkt_data_start, frame_size); if (!skb) return NULL; - skb_reserve(skb, xdp_pkt->headroom); - __skb_put(skb, xdp_pkt->len); - if (xdp_pkt->metasize) - skb_metadata_set(skb, xdp_pkt->metasize); + skb_reserve(skb, xdpf->headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + skb->protocol = eth_type_trans(skb, xdpf->dev_rx); /* Optional SKB info, currently missing: * - HW checksum info (skb->ip_summed) @@ -259,11 +215,11 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) * invoked cpu_map_kthread_stop(). Catch any broken behaviour * gracefully and warn once. */ - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; - while ((xdp_pkt = ptr_ring_consume(ring))) - if (WARN_ON_ONCE(xdp_pkt)) - xdp_return_frame(xdp_pkt, &xdp_pkt->mem); + while ((xdpf = ptr_ring_consume(ring))) + if (WARN_ON_ONCE(xdpf)) + xdp_return_frame(xdpf->data, &xdpf->mem); } static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) @@ -290,7 +246,7 @@ static int cpu_map_kthread_run(void *data) */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -313,13 +269,13 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. 
 		 */
-		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+		while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
 			struct sk_buff *skb;
 			int ret;

-			skb = cpu_map_build_skb(rcpu, xdp_pkt);
+			skb = cpu_map_build_skb(rcpu, xdpf);
 			if (!skb) {
-				xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
+				xdp_return_frame(xdpf->data, &xdpf->mem);
 				continue;
 			}
@@ -616,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	spin_lock(&q->producer_lock);
 	for (i = 0; i < bq->count; i++) {
-		struct xdp_pkt *xdp_pkt = bq->q[i];
+		struct xdp_frame *xdpf = bq->q[i];
 		int err;

-		err = __ptr_ring_produce(q, xdp_pkt);
+		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem);
+			xdp_return_frame(xdpf->data, &xdpf->mem);
 		}
 		processed++;
 	}
@@ -637,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
@@ -648,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
 	 * driver to code invoking us to finished, due to driver
 	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
 	 *
-	 * Thus, incoming xdp_pkt is always queued here (else we race
+	 * Thus, incoming xdp_frame is always queued here (else we race
 	 * with another CPU on page-refcnt and remaining driver code).
 	 * Queue time is very short, as driver will invoke flush
 	 * operation, when completing napi->poll call.
 	 */
-	bq->q[bq->count++] = xdp_pkt;
+	bq->q[bq->count++] = xdpf;
 	return 0;
 }

 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
-	struct xdp_pkt *xdp_pkt;
+	struct xdp_frame *xdpf;

-	xdp_pkt = convert_to_xdp_pkt(xdp);
-	if (unlikely(!xdp_pkt))
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
 		return -EOVERFLOW;

 	/* Info needed when constructing SKB on remote CPU */
-	xdp_pkt->dev_rx = dev_rx;
+	xdpf->dev_rx = dev_rx;

-	bq_enqueue(rcpu, xdp_pkt);
+	bq_enqueue(rcpu, xdpf);
 	return 0;
 }

From 4024e84e472927bfd6d8308d0364b73fa474f78f Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Tue, 17 Apr 2018 16:46:12 +0200
Subject: [PATCH 0222/1640] BACKPORT: xdp: rhashtable with allocator ID to pointer mapping

Use the IDA infrastructure for getting a cyclic increasing ID number,
that is used for keeping track of each registered allocator per
RX-queue xdp_rxq_info. Instead of using the IDR infrastructure, which
uses a radix tree, use a dynamic rhashtable for creating an ID to
pointer lookup table, because this is faster.

The problem being solved here is that the xdp_rxq_info pointer
(stored in xdp_buff) cannot be used directly, as the guaranteed
lifetime is too short. The info is needed on a (potentially) remote
CPU during DMA-TX completion time. In an xdp_frame the xdp_mem_info
is stored, when it got converted from an xdp_buff, which is sufficient
for the simple page refcnt based recycle schemes.

For more advanced allocators there is a need to store a pointer to
the registered allocator. Thus, there is a need to guard the lifetime
or validity of the allocator pointer, which is done through this
rhashtable ID map to pointer. The removal and validity of the
allocator and helper struct xdp_mem_allocator is guarded by RCU.
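A rough userspace model of this indirection is shown below. A plain array
stands in for the rhashtable, no RCU is modeled, and all names are invented
for illustration: the frame carries only a small integer ID, and the return
path resolves that ID to an allocator pointer whose table entry outlives any
in-flight frame.

#include <stdio.h>

struct allocator_stub { const char *name; };

#define MAX_ID 8
static struct allocator_stub *id_table[MAX_ID]; /* stands in for the rhashtable */

static struct allocator_stub *lookup_allocator(unsigned int id)
{
	return (id < MAX_ID) ? id_table[id] : NULL;
}

struct frame_stub {
	unsigned int mem_id; /* travels with the frame to the remote CPU */
};

static void return_frame(struct frame_stub *f)
{
	struct allocator_stub *a = lookup_allocator(f->mem_id);

	if (a)
		printf("returning frame via allocator '%s'\n", a->name);
	else
		printf("allocator gone, falling back to plain free\n");
}

int main(void)
{
	struct allocator_stub pool = { "page_pool#1" };
	struct frame_stub f = { 1 };

	id_table[1] = &pool;	/* register allocator under ID 1 */
	return_frame(&f);	/* resolves ID 1 -> pool */
	id_table[1] = NULL;	/* unregister (kernel: RCU-deferred free) */
	return_frame(&f);	/* safe fallback path */
	return 0;
}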
The allocator will be created by the driver, and registered with xdp_rxq_info_reg_mem_model(). It is up-to debate who is responsible for freeing the allocator pointer or invoking the allocator destructor function. In any case, this must happen via RCU freeing. Use the IDA infrastructure for getting a cyclic increasing ID number, that is used for keeping track of each registered allocator per RX-queue xdp_rxq_info. V4: Per req of Jason Wang - Use xdp_rxq_info_reg_mem_model() in all drivers implementing XDP_REDIRECT, even-though it's not strictly necessary when allocator==NULL for type MEM_TYPE_PAGE_SHARED (given it's zero). V6: Per req of Alex Duyck - Introduce rhashtable_lookup() call in later patch V8: Address sparse should be static warnings (from kbuild test robot) Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 14 +-- net/core/xdp.c | 223 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 220 insertions(+), 17 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index ea3773f94f65..5f67c62540aa 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -41,6 +41,7 @@ enum xdp_mem_type { struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ + u32 id; }; struct xdp_rxq_info { @@ -99,18 +100,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) return xdp_frame; } -static inline -void xdp_return_frame(void *data, struct xdp_mem_info *mem) -{ - if (mem->type == MEM_TYPE_PAGE_SHARED) - page_frag_free(data); - - if (mem->type == MEM_TYPE_PAGE_ORDER0) { - struct page *page = virt_to_page(data); /* Assumes order0 page*/ - - put_page(page); - } -} +void xdp_return_frame(void *data, struct xdp_mem_info *mem); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); diff --git a/net/core/xdp.c b/net/core/xdp.c index 7e6b3545277d..8b2cb79b5de0 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -5,6 +5,9 @@ */ #include #include +#include +#include +#include #include @@ -13,6 +16,99 @@ #define REG_STATE_UNREGISTERED 0x2 #define REG_STATE_UNUSED 0x3 +static DEFINE_IDA(mem_id_pool); +static DEFINE_MUTEX(mem_id_lock); +#define MEM_ID_MAX 0xFFFE +#define MEM_ID_MIN 1 +static int mem_id_next = MEM_ID_MIN; + +static bool mem_id_init; /* false */ +static struct rhashtable *mem_id_ht; + +struct xdp_mem_allocator { + struct xdp_mem_info mem; + void *allocator; + struct rhash_head node; + struct rcu_head rcu; +}; + +static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) +{ + const u32 *k = data; + const u32 key = *k; + + BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id) + != sizeof(u32)); + + /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */ + return key << RHT_HASH_RESERVED_SPACE; +} + +static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct xdp_mem_allocator *xa = ptr; + u32 mem_id = *(u32 *)arg->key; + + return xa->mem.id != mem_id; +} + +static const struct rhashtable_params mem_id_rht_params = { + .nelem_hint = 64, + .head_offset = offsetof(struct xdp_mem_allocator, node), + .key_offset = offsetof(struct xdp_mem_allocator, mem.id), + .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id), + .max_size = MEM_ID_MAX, + .min_size = 8, + .automatic_shrinking = true, + .hashfn = xdp_mem_id_hashfn, + .obj_cmpfn = xdp_mem_id_cmp, +}; + +static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) +{ + struct xdp_mem_allocator *xa; + + xa = container_of(rcu, struct xdp_mem_allocator, 
rcu); + + /* Allow this ID to be reused */ + ida_simple_remove(&mem_id_pool, xa->mem.id); + + /* TODO: Depending on allocator type/pointer free resources */ + + /* Poison memory */ + xa->mem.id = 0xFFFF; + xa->mem.type = 0xF0F0; + xa->allocator = (void *)0xDEAD9001; + + kfree(xa); +} + +static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + struct xdp_mem_allocator *xa; + int id = xdp_rxq->mem.id; + int err; + + if (id == 0) + return; + + mutex_lock(&mem_id_lock); + + xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + return; + } + + err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params); + WARN_ON(err); + + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + + mutex_unlock(&mem_id_lock); +} + void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -21,8 +117,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); + __xdp_rxq_info_unreg_mem_model(xdp_rxq); + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; + + /* Reset mem info to defaults */ + xdp_rxq->mem.id = 0; + xdp_rxq->mem.type = 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); @@ -72,20 +174,131 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) } EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); +static int __mem_id_init_hash_table(void) +{ + struct rhashtable *rht; + int ret; + + if (unlikely(mem_id_init)) + return 0; + + rht = kzalloc(sizeof(*rht), GFP_KERNEL); + if (!rht) + return -ENOMEM; + + ret = rhashtable_init(rht, &mem_id_rht_params); + if (ret < 0) { + kfree(rht); + return ret; + } + mem_id_ht = rht; + smp_mb(); /* mutex lock should provide enough pairing */ + mem_id_init = true; + + return 0; +} + +/* Allocate a cyclic ID that maps to allocator pointer. + * See: https://www.kernel.org/doc/html/latest/core-api/idr.html + * + * Caller must lock mem_id_lock. 
+ */ +static int __mem_id_cyclic_get(gfp_t gfp) +{ + int retries = 1; + int id; + +again: + id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp); + if (id < 0) { + if (id == -ENOSPC) { + /* Cyclic allocator, reset next id */ + if (retries--) { + mem_id_next = MEM_ID_MIN; + goto again; + } + } + return id; /* errno */ + } + mem_id_next = id + 1; + + return id; +} + int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator) { + struct xdp_mem_allocator *xdp_alloc; + gfp_t gfp = GFP_KERNEL; + int id, errno, ret; + void *ptr; + + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return -EFAULT; + } + if (type >= MEM_TYPE_MAX) return -EINVAL; xdp_rxq->mem.type = type; - if (allocator) - return -EOPNOTSUPP; + if (!allocator) + return 0; + + /* Delay init of rhashtable to save memory if feature isn't used */ + if (!mem_id_init) { + mutex_lock(&mem_id_lock); + ret = __mem_id_init_hash_table(); + mutex_unlock(&mem_id_lock); + if (ret < 0) { + WARN_ON(1); + return ret; + } + } + + xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); + if (!xdp_alloc) + return -ENOMEM; + + mutex_lock(&mem_id_lock); + id = __mem_id_cyclic_get(gfp); + if (id < 0) { + errno = id; + goto err; + } + xdp_rxq->mem.id = id; + xdp_alloc->mem = xdp_rxq->mem; + xdp_alloc->allocator = allocator; + + /* Insert allocator into ID lookup table */ + ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); + if (IS_ERR(ptr)) { + errno = PTR_ERR(ptr); + goto err; + } + + mutex_unlock(&mem_id_lock); - /* TODO: Allocate an ID that maps to allocator pointer - * See: https://www.kernel.org/doc/html/latest/core-api/idr.html - */ return 0; +err: + mutex_unlock(&mem_id_lock); + kfree(xdp_alloc); + return errno; } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); + +void xdp_return_frame(void *data, struct xdp_mem_info *mem) +{ + if (mem->type == MEM_TYPE_PAGE_SHARED) { + page_frag_free(data); + return; + } + + if (mem->type == MEM_TYPE_PAGE_ORDER0) { + struct page *page = virt_to_page(data); /* Assumes order0 page*/ + + put_page(page); + } +} +EXPORT_SYMBOL_GPL(xdp_return_frame); From 36ce5068fbda04085db8e88111937624a32996ec Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:17 +0200 Subject: [PATCH 0223/1640] BACKPORT: page_pool: refurbish version of page_pool code Need a fast page recycle mechanism for ndo_xdp_xmit API for returning pages on DMA-TX completion time, which have good cross CPU performance, given DMA-TX completion time can happen on a remote CPU. Refurbish my page_pool code, that was presented[1] at MM-summit 2016. Adapted page_pool code to not depend the page allocator and integration into struct page. The DMA mapping feature is kept, even-though it will not be activated/used in this patchset. [1] http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf V2: Adjustments requested by Tariq - Changed page_pool_create return codes, don't return NULL, only ERR_PTR, as this simplifies err handling in drivers. V4: many small improvements and cleanups - Add DOC comment section, that can be used by kernel-doc - Improve fallback mode, to work better with refcnt based recycling e.g. remove a WARN as pointed out by Tariq e.g. quicker fallback if ptr_ring is empty. 
V5: Fixed SPDX license as pointed out by Alexei V6: Adjustments requested by Eric Dumazet - Adjust ____cacheline_aligned_in_smp usage/placement - Move rcu_head in struct page_pool - Free pages quicker on destroy, minimize resources delayed an RCU period - Remove code for forward/backward compat ABI interface V8: Issues found by kbuild test robot - Address sparse should be static warnings - Only compile+link when a driver use/select page_pool, mlx5 selects CONFIG_PAGE_POOL, although its first used in two patches Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/Kconfig | 1 + include/net/page_pool.h | 129 +++++++ net/Kconfig | 3 + net/core/Makefile | 1 + net/core/page_pool.c | 317 ++++++++++++++++++ 5 files changed, 451 insertions(+) create mode 100644 include/net/page_pool.h create mode 100644 net/core/page_pool.c diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 576b61c119bb..69c4b98bd2f3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -30,6 +30,7 @@ config MLX5_CORE_EN depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE depends on IPV6=y || IPV6=n || MLX5_CORE=m imply PTP_1588_CLOCK + select PAGE_POOL default n ---help--- Ethernet support in Mellanox Technologies ConnectX-4 NIC. diff --git a/include/net/page_pool.h b/include/net/page_pool.h new file mode 100644 index 000000000000..1fe77db59518 --- /dev/null +++ b/include/net/page_pool.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.h + * Author: Jesper Dangaard Brouer + * Copyright (C) 2016 Red Hat, Inc. + */ + +/** + * DOC: page_pool allocator + * + * This page_pool allocator is optimized for the XDP mode that + * uses one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * Basic use involve replacing alloc_pages() calls with the + * page_pool_alloc_pages() call. Drivers should likely use + * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). + * + * If page_pool handles DMA mapping (use page->private), then API user + * is responsible for invoking page_pool_put_page() once. In-case of + * elevated refcnt, the DMA state is released, assuming other users of + * the page will eventually call put_page(). + * + * If no DMA mapping is done, then it can act as shim-layer that + * fall-through to alloc_page. As no state is kept on the page, the + * regular put_page() call is sufficient. + */ +#ifndef _NET_PAGE_POOL_H +#define _NET_PAGE_POOL_H + +#include /* Needed by ptr_ring */ +#include +#include + +#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */ +#define PP_FLAG_ALL PP_FLAG_DMA_MAP + +/* + * Fast allocation side cache array/stack + * + * The cache size and refill watermark is related to the network + * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX + * ring is usually refilled and the max consumed elements will be 64, + * thus a natural max size of objects needed in the cache. + * + * Keeping room for more objects, is due to XDP_DROP use-case. As + * XDP_DROP allows the opportunity to recycle objects directly into + * this array, as it shares the same softirq/NAPI protection. If + * cache is already full (or partly full) then the XDP_DROP recycles + * would have to take a slower code path. 
+ */ +#define PP_ALLOC_CACHE_SIZE 128 +#define PP_ALLOC_CACHE_REFILL 64 +struct pp_alloc_cache { + u32 count; + void *cache[PP_ALLOC_CACHE_SIZE]; +}; + +struct page_pool_params { + unsigned int flags; + unsigned int order; + unsigned int pool_size; + int nid; /* Numa node id to allocate from pages from */ + struct device *dev; /* device, for DMA pre-mapping purposes */ + enum dma_data_direction dma_dir; /* DMA mapping direction */ +}; + +struct page_pool { + struct rcu_head rcu; + struct page_pool_params p; + + /* + * Data structure for allocation side + * + * Drivers allocation side usually already perform some kind + * of resource protection. Piggyback on this protection, and + * require driver to protect allocation side. + * + * For NIC drivers this means, allocate a page_pool per + * RX-queue. As the RX-queue is already protected by + * Softirq/BH scheduling and napi_schedule. NAPI schedule + * guarantee that a single napi_struct will only be scheduled + * on a single CPU (see napi_schedule). + */ + struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; + + /* Data structure for storing recycled pages. + * + * Returning/freeing pages is more complicated synchronization + * wise, because free's can happen on remote CPUs, with no + * association with allocation resource. + * + * Use ptr_ring, as it separates consumer and producer + * effeciently, it a way that doesn't bounce cache-lines. + * + * TODO: Implement bulk return pages into this structure. + */ + struct ptr_ring ring; +}; + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); + +static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_pages(pool, gfp); +} + +struct page_pool *page_pool_create(const struct page_pool_params *params); + +void page_pool_destroy(struct page_pool *pool); + +/* Never call this directly, use helpers below */ +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct); + +static inline void page_pool_put_page(struct page_pool *pool, struct page *page) +{ + __page_pool_put_page(pool, page, false); +} +/* Very limited use-cases allow recycle direct */ +static inline void page_pool_recycle_direct(struct page_pool *pool, + struct page *page) +{ + __page_pool_put_page(pool, page, true); +} + +#endif /* _NET_PAGE_POOL_H */ diff --git a/net/Kconfig b/net/Kconfig index 7806b964776c..21f4031e486d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -450,6 +450,9 @@ config MAY_USE_DEVLINK on MAY_USE_DEVLINK to ensure they do not cause link errors when devlink is a loadable module and the driver using it is built-in. +config PAGE_POOL + bool + endif # if NET # Used by archs to tell that they support BPF JIT compiler plus which flavour. diff --git a/net/core/Makefile b/net/core/Makefile index 4a98df0efc61..95a78f92b636 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -14,6 +14,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ fib_notifier.o net_ipc_log.o dev_monitor.o xdp.o obj-y += net-sysfs.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o diff --git a/net/core/page_pool.c b/net/core/page_pool.c new file mode 100644 index 000000000000..68bf07206744 --- /dev/null +++ b/net/core/page_pool.c @@ -0,0 +1,317 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.c + * Author: Jesper Dangaard Brouer + * Copyright (C) 2016 Red Hat, Inc. 
+ */ +#include +#include +#include + +#include +#include +#include +#include +#include /* for __put_page() */ + +static int page_pool_init(struct page_pool *pool, + const struct page_pool_params *params) +{ + unsigned int ring_qsize = 1024; /* Default */ + + memcpy(&pool->p, params, sizeof(pool->p)); + + /* Validate only known flags were used */ + if (pool->p.flags & ~(PP_FLAG_ALL)) + return -EINVAL; + + if (pool->p.pool_size) + ring_qsize = pool->p.pool_size; + + /* Sanity limit mem that can be pinned down */ + if (ring_qsize > 32768) + return -E2BIG; + + /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. + * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, + * which is the XDP_TX use-case. + */ + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && + (pool->p.dma_dir != DMA_BIDIRECTIONAL)) + return -EINVAL; + + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) + return -ENOMEM; + + return 0; +} + +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + struct page_pool *pool; + int err = 0; + + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); + if (!pool) + return ERR_PTR(-ENOMEM); + + err = page_pool_init(pool, params); + if (err < 0) { + pr_warn("%s() gave up with errno %d\n", __func__, err); + kfree(pool); + return ERR_PTR(err); + } + return pool; +} +EXPORT_SYMBOL(page_pool_create); + +/* fast path */ +static struct page *__page_pool_get_cached(struct page_pool *pool) +{ + struct ptr_ring *r = &pool->ring; + struct page *page; + + /* Quicker fallback, avoid locks when ring is empty */ + if (__ptr_ring_empty(r)) + return NULL; + + /* Test for safe-context, caller should provide this guarantee */ + if (likely(in_serving_softirq())) { + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + return page; + } + /* Slower-path: Alloc array empty, time to refill + * + * Open-coded bulk ptr_ring consumer. + * + * Discussion: the ring consumer lock is not really + * needed due to the softirq/NAPI protection, but + * later need the ability to reclaim pages on the + * ring. Thus, keeping the locks. + */ + spin_lock(&r->consumer_lock); + while ((page = __ptr_ring_consume(r))) { + if (pool->alloc.count == PP_ALLOC_CACHE_REFILL) + break; + pool->alloc.cache[pool->alloc.count++] = page; + } + spin_unlock(&r->consumer_lock); + return page; + } + + /* Slow-path: Get page from locked ring queue */ + page = ptr_ring_consume(&pool->ring); + return page; +} + +/* slow path */ +noinline +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, + gfp_t _gfp) +{ + struct page *page; + gfp_t gfp = _gfp; + dma_addr_t dma; + + /* We could always set __GFP_COMP, and avoid this branch, as + * prep_new_page() can handle order-0 with __GFP_COMP. + */ + if (pool->p.order) + gfp |= __GFP_COMP; + + /* FUTURE development: + * + * Current slow-path essentially falls back to single page + * allocations, which doesn't improve performance. This code + * need bulk allocation support from the page allocator code. + */ + + /* Cache was empty, do real allocation */ + page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); + if (!page) + return NULL; + + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + goto skip_dma_map; + + /* Setup DMA mapping: use page->private for DMA-addr + * This mapping is kept for lifetime of page, until leaving pool. 
+ */ + dma = dma_map_page(pool->p.dev, page, 0, + (PAGE_SIZE << pool->p.order), + pool->p.dma_dir); + if (dma_mapping_error(pool->p.dev, dma)) { + put_page(page); + return NULL; + } + set_page_private(page, dma); /* page->private = dma; */ + +skip_dma_map: + /* When page just alloc'ed is should/must have refcnt 1. */ + return page; +} + +/* For using page_pool replace: alloc_pages() API calls, but provide + * synchronization guarantee for allocation side. + */ +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ + struct page *page; + + /* Fast-path: Get a page from cache */ + page = __page_pool_get_cached(pool); + if (page) + return page; + + /* Slow-path: cache empty, do real allocation */ + page = __page_pool_alloc_pages_slow(pool, gfp); + return page; +} +EXPORT_SYMBOL(page_pool_alloc_pages); + +/* Cleanup page_pool state from page */ +static void __page_pool_clean_page(struct page_pool *pool, + struct page *page) +{ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return; + + /* DMA unmap */ + dma_unmap_page(pool->p.dev, page_private(page), + PAGE_SIZE << pool->p.order, pool->p.dma_dir); + set_page_private(page, 0); +} + +/* Return a page to the page allocator, cleaning up our state */ +static void __page_pool_return_page(struct page_pool *pool, struct page *page) +{ + __page_pool_clean_page(pool, page); + put_page(page); + /* An optimization would be to call __free_pages(page, pool->p.order) + * knowing page is not part of page-cache (thus avoiding a + * __page_cache_release() call). + */ +} + +static bool __page_pool_recycle_into_ring(struct page_pool *pool, + struct page *page) +{ + int ret; + /* BH protection not needed if current is serving softirq */ + if (in_serving_softirq()) + ret = ptr_ring_produce(&pool->ring, page); + else + ret = ptr_ring_produce_bh(&pool->ring, page); + + return (ret == 0) ? true : false; +} + +/* Only allow direct recycling in special circumstances, into the + * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. + * + * Caller must provide appropriate safe context. + */ +static bool __page_pool_recycle_direct(struct page *page, + struct page_pool *pool) +{ + if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) + return false; + + /* Caller MUST have verified/know (page_ref_count(page) == 1) */ + pool->alloc.cache[pool->alloc.count++] = page; + return true; +} + +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + /* This allocator is optimized for the XDP mode that uses + * one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * refcnt == 1 means page_pool owns page, and can recycle it. + */ + if (likely(page_ref_count(page) == 1)) { + /* Read barrier done in page_ref_count / READ_ONCE */ + + if (allow_direct && in_serving_softirq()) + if (__page_pool_recycle_direct(page, pool)) + return; + + if (!__page_pool_recycle_into_ring(pool, page)) { + /* Cache full, fallback to free pages */ + __page_pool_return_page(pool, page); + } + return; + } + /* Fallback/non-XDP mode: API user have elevated refcnt. + * + * Many drivers split up the page into fragments, and some + * want to keep doing this to save memory and do refcnt based + * recycling. Support this use case too, to ease drivers + * switching between XDP/non-XDP. + * + * In-case page_pool maintains the DMA mapping, API user must + * call page_pool_put_page once. 
In this elevated refcnt + * case, the DMA is unmapped/released, as driver is likely + * doing refcnt based recycle tricks, meaning another process + * will be invoking put_page. + */ + __page_pool_clean_page(pool, page); + put_page(page); +} +EXPORT_SYMBOL(__page_pool_put_page); + +static void __page_pool_empty_ring(struct page_pool *pool) +{ + struct page *page; + + /* Empty recycle ring */ + while ((page = ptr_ring_consume(&pool->ring))) { + /* Verify the refcnt invariant of cached pages */ + if (!(page_ref_count(page) == 1)) + pr_crit("%s() page_pool refcnt %d violation\n", + __func__, page_ref_count(page)); + + __page_pool_return_page(pool, page); + } +} + +static void __page_pool_destroy_rcu(struct rcu_head *rcu) +{ + struct page_pool *pool; + + pool = container_of(rcu, struct page_pool, rcu); + + WARN(pool->alloc.count, "API usage violation"); + + __page_pool_empty_ring(pool); + ptr_ring_cleanup(&pool->ring, NULL); + kfree(pool); +} + +/* Cleanup and release resources */ +void page_pool_destroy(struct page_pool *pool) +{ + struct page *page; + + /* Empty alloc cache, assume caller made sure this is + * no-longer in use, and page_pool_alloc_pages() cannot be + * call concurrently. + */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + __page_pool_return_page(pool, page); + } + + /* No more consumers should exist, but producers could still + * be in-flight. + */ + __page_pool_empty_ring(pool); + + /* An xdp_mem_allocator can still ref page_pool pointer */ + call_rcu(&pool->rcu, __page_pool_destroy_rcu); +} +EXPORT_SYMBOL(page_pool_destroy); From 778437ddb4e673ceb35a489b8dcfdbc962f7b034 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:22 +0200 Subject: [PATCH 0224/1640] UPSTREAM: xdp: allow page_pool as an allocator type in xdp_return_frame New allocator type MEM_TYPE_PAGE_POOL for page_pool usage. The registered allocator page_pool pointer is not available directly from xdp_rxq_info, but it could be (if needed). For now, the driver should keep separate track of the page_pool pointer, which it should use for RX-ring page allocation. As suggested by Saeed, to maintain a symmetric API it is the drivers responsibility to allocate/create and free/destroy the page_pool. Thus, after the driver have called xdp_rxq_info_unreg(), it is drivers responsibility to free the page_pool, but with a RCU free call. This is done easily via the page_pool helper page_pool_destroy() (which avoids touching any driver code during the RCU callback, which could happen after the driver have been unloaded). V8: address issues found by kbuild test robot - Address sparse should be static warnings - Allow xdp.o to be compiled without page_pool.o V9: Remove inline from .c file, compiler knows best Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/page_pool.h | 14 ++++++++++ include/net/xdp.h | 3 +++ net/core/xdp.c | 60 ++++++++++++++++++++++++++++++++--------- 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 1fe77db59518..c79087153148 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -117,7 +117,12 @@ void __page_pool_put_page(struct page_pool *pool, static inline void page_pool_put_page(struct page_pool *pool, struct page *page) { + /* When page_pool isn't compiled-in, net/core/xdp.c doesn't + * allow registering MEM_TYPE_PAGE_POOL, but shield linker. 
+ */ +#ifdef CONFIG_PAGE_POOL __page_pool_put_page(pool, page, false); +#endif } /* Very limited use-cases allow recycle direct */ static inline void page_pool_recycle_direct(struct page_pool *pool, @@ -126,4 +131,13 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, __page_pool_put_page(pool, page, true); } +static inline bool is_page_pool_compiled_in(void) +{ +#ifdef CONFIG_PAGE_POOL + return true; +#else + return false; +#endif +} + #endif /* _NET_PAGE_POOL_H */ diff --git a/include/net/xdp.h b/include/net/xdp.h index 5f67c62540aa..d0ee437753dc 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -36,6 +36,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ + MEM_TYPE_PAGE_POOL, MEM_TYPE_MAX, }; @@ -44,6 +45,8 @@ struct xdp_mem_info { u32 id; }; +struct page_pool; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; diff --git a/net/core/xdp.c b/net/core/xdp.c index 8b2cb79b5de0..33e382afbd95 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -27,7 +28,10 @@ static struct rhashtable *mem_id_ht; struct xdp_mem_allocator { struct xdp_mem_info mem; - void *allocator; + union { + void *allocator; + struct page_pool *page_pool; + }; struct rhash_head node; struct rcu_head rcu; }; @@ -74,7 +78,9 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) /* Allow this ID to be reused */ ida_simple_remove(&mem_id_pool, xa->mem.id); - /* TODO: Depending on allocator type/pointer free resources */ + /* Notice, driver is expected to free the *allocator, + * e.g. page_pool, and MUST also use RCU free. + */ /* Poison memory */ xa->mem.id = 0xFFFF; @@ -225,6 +231,17 @@ again: return id; } +static bool __is_supported_mem_type(enum xdp_mem_type type) +{ + if (type == MEM_TYPE_PAGE_POOL) + return is_page_pool_compiled_in(); + + if (type >= MEM_TYPE_MAX) + return false; + + return true; +} + int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator) { @@ -238,13 +255,16 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, return -EFAULT; } - if (type >= MEM_TYPE_MAX) - return -EINVAL; + if (!__is_supported_mem_type(type)) + return -EOPNOTSUPP; xdp_rxq->mem.type = type; - if (!allocator) + if (!allocator) { + if (type == MEM_TYPE_PAGE_POOL) + return -EINVAL; /* Setup time check page_pool req */ return 0; + } /* Delay init of rhashtable to save memory if feature isn't used */ if (!mem_id_init) { @@ -290,15 +310,31 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); void xdp_return_frame(void *data, struct xdp_mem_info *mem) { - if (mem->type == MEM_TYPE_PAGE_SHARED) { + struct xdp_mem_allocator *xa; + struct page *page; + + switch (mem->type) { + case MEM_TYPE_PAGE_POOL: + rcu_read_lock(); + /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); + if (xa) + page_pool_put_page(xa->page_pool, page); + else + put_page(page); + rcu_read_unlock(); + break; + case MEM_TYPE_PAGE_SHARED: page_frag_free(data); - return; - } - - if (mem->type == MEM_TYPE_PAGE_ORDER0) { - struct page *page = virt_to_page(data); /* Assumes order0 page*/ - + break; + case MEM_TYPE_PAGE_ORDER0: + page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); + break; + default: + /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + break; } } 
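/* A usage sketch of how the pieces above fit together for a driver
 * (page_pool_create() and its params struct are assumed from the
 * page_pool patch earlier in this series; error handling elided):
 *
 *	struct page_pool *pp = page_pool_create(&pp_params);
 *
 *	xdp_rxq_info_reg(&rxq->xdp_rxq, netdev, queue_index);
 *	xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq, MEM_TYPE_PAGE_POOL, pp);
 *
 * RX pages then come from page_pool_alloc_pages(pp, gfp), and returned
 * frames funnel through the rhashtable lookup above back into
 * page_pool_put_page().
 */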
EXPORT_SYMBOL_GPL(xdp_return_frame);

From 98709af5fd22ca88c6c792e4090e5deb97098516 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Tue, 17 Apr 2018 16:46:32 +0200
Subject: [PATCH 0225/1640] BACKPORT: xdp: transition into using xdp_frame for
 return API

Changing the xdp_return_frame() API to take a struct xdp_frame as its
argument seems like a natural choice, but there are some subtle
performance details here that need extra care, and the trade-offs below
are deliberate.

De-referencing the xdp_frame on a remote CPU during DMA-TX completion
changes the cache-line to "Shared" state. Later, when the page is
reused for RX, this xdp_frame cache-line is written, which changes the
state to "Modified".

This situation already happens (naturally) for virtio_net, tun and
cpumap, as the xdp_frame pointer is the queued object. In tun and
cpumap, the ptr_ring is used for efficiently transferring cache-lines
(with pointers) between CPUs. Thus, the only option is to de-reference
the xdp_frame.

It is only the ixgbe driver that had an optimization that could avoid
de-referencing the xdp_frame. The driver already has a TX-ring queue,
which (in case of remote DMA-TX completion) has to be transferred
between CPUs anyhow. In this data area, we stored a struct xdp_mem_info
and a data pointer, which allowed us to avoid de-referencing the
xdp_frame. To compensate, a prefetchw is used to tell the cache
coherency protocol about our access pattern. My benchmarks show that
this prefetchw is enough to compensate for the ixgbe driver.

V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT")
V8: Adjust for commit bd658dda4237 ("net/mlx5e: Separate dma base address
    and offset in dma_sync call")

Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: David S.
Miller
---
 include/net/xdp.h   | 2 +-
 kernel/bpf/cpumap.c | 6 +++---
 net/core/xdp.c      | 4 +++-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index d0ee437753dc..137ad5f9f40f 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -103,7 +103,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 	return xdp_frame;
 }
 
-void xdp_return_frame(void *data, struct xdp_mem_info *mem);
+void xdp_return_frame(struct xdp_frame *xdpf);
 
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev,
 		     u32 queue_index);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index bcdc4dea5ce7..c95b04ec103e 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -219,7 +219,7 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 
 	while ((xdpf = ptr_ring_consume(ring)))
 		if (WARN_ON_ONCE(xdpf))
-			xdp_return_frame(xdpf->data, &xdpf->mem);
+			xdp_return_frame(xdpf);
 }
 
 static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
@@ -275,7 +275,7 @@ static int cpu_map_kthread_run(void *data)
 
 		skb = cpu_map_build_skb(rcpu, xdpf);
 		if (!skb) {
-			xdp_return_frame(xdpf->data, &xdpf->mem);
+			xdp_return_frame(xdpf);
 			continue;
 		}
 
@@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			xdp_return_frame(xdpf->data, &xdpf->mem);
+			xdp_return_frame(xdpf);
 		}
 		processed++;
 	}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 33e382afbd95..0c86b53a3a63 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -308,9 +308,11 @@ err:
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 
-void xdp_return_frame(void *data, struct xdp_mem_info *mem)
+void xdp_return_frame(struct xdp_frame *xdpf)
 {
+	struct xdp_mem_info *mem = &xdpf->mem;
 	struct xdp_mem_allocator *xa;
+	void *data = xdpf->data;
 	struct page *page;
 
 	switch (mem->type) {

From 968a52facec1d83ab4182809e779f1f3fcd0f59d Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Tue, 17 Apr 2018 16:46:37 +0200
Subject: [PATCH 0226/1640] BACKPORT: xdp: transition into using xdp_frame for
 ndo_xdp_xmit

Change the ndo_xdp_xmit API to take a struct xdp_frame instead of a
struct xdp_buff. This brings xdp_return_frame and ndo_xdp_xmit in sync.

This builds towards changing the API further to become a bulk API,
because an xdp_buff is not a queue-able object while an xdp_frame is.

V4: Adjust for commit 59655a5b6c83 ("tuntap: XDP_TX can use native XDP")
V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT")

Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h |  4 ++--
 net/core/filter.c         | 17 +++++++++++++++--
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 17fc9b202008..6c973b71a5c4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1188,7 +1188,7 @@ struct macsec_ops {
 *	This function is used to set or query state related to XDP on the
 *	netdevice and manage BPF offload. See definition of
 *	enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp);
+ * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
 *	This function is used to submit a XDP packet for transmit on a
 *	netdevice.
* void (*ndo_xdp_flush)(struct net_device *dev);
@@ -1378,7 +1378,7 @@ struct net_device_ops {
 	int			(*ndo_bpf)(struct net_device *dev,
 					   struct netdev_bpf *bpf);
 	int			(*ndo_xdp_xmit)(struct net_device *dev,
-						struct xdp_buff *xdp);
+						struct xdp_frame *xdp);
 	void			(*ndo_xdp_flush)(struct net_device *dev);
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 56c11d1c1dbd..22b0a3ddcb1d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2799,13 +2799,18 @@ static int __bpf_tx_xdp(struct net_device *dev,
 			struct xdp_buff *xdp,
 			u32 index)
 {
+	struct xdp_frame *xdpf;
 	int err;
 
 	if (!dev->netdev_ops->ndo_xdp_xmit) {
 		return -EOPNOTSUPP;
 	}
 
-	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
 	if (err)
 		return err;
 	dev->netdev_ops->ndo_xdp_flush(dev);
@@ -2821,11 +2826,19 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
 		struct net_device *dev = fwd;
+		struct xdp_frame *xdpf;
 
 		if (!dev->netdev_ops->ndo_xdp_xmit)
 			return -EOPNOTSUPP;
 
-		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+		xdpf = convert_to_xdp_frame(xdp);
+		if (unlikely(!xdpf))
+			return -EOVERFLOW;
+
+		/* TODO: move to inside map code instead, for bulk support
+		 * err = dev_map_enqueue(dev, xdp);
+		 */
+		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
 		if (err)
 			return err;
 		__dev_map_insert_ctx(map, index);

From e926012d366fb326b4ca3affa96b8e9d2f09f93d Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Tue, 17 Apr 2018 16:46:43 +0200
Subject: [PATCH 0227/1640] UPSTREAM: xdp: avoid leaking info stored in frame
 data on page reuse

The bpf infrastructure and verifier go to great lengths to avoid bpf
progs leaking kernel (pointer) info.

For queueing an xdp_buff via XDP_REDIRECT, the xdp_frame info is stored
in the top part of the frame data (xdp->data_hard_start), including
kernel pointers. Checks are in place to assure enough headroom is
available for this.

This info is not cleared, and if the frame is reused, then a malicious
user could use the bpf_xdp_adjust_head helper to move xdp->data into
this area, making it readable. This is not super critical, as XDP progs
require root or CAP_SYS_ADMIN, which are privileged enough for such
info. An effort is underway towards moving networking bpf hooks to the
lesser privileged mode CAP_NET_ADMIN, where leaking such info should be
avoided. Thus, this patch clears the info when needed.

Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: David S.
Miller
---
 net/core/filter.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 22b0a3ddcb1d..6b165f5786bf 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2742,6 +2742,7 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
 
 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 {
+	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
 	unsigned long metalen = xdp_get_metalen(xdp);
 	void *data_start = xdp->data_hard_start + metalen;
 	void *data = xdp->data + offset;
@@ -2750,6 +2751,13 @@ BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 		     data > xdp->data_end - ETH_HLEN))
 		return -EINVAL;
 
+	/* Avoid info leak, when reusing area prev used by xdp_frame */
+	if (data < xdp_frame_end) {
+		unsigned long clearlen = xdp_frame_end - data;
+
+		memset(data, 0, clearlen);
+	}
+
 	if (metalen)
 		memmove(xdp->data_meta + offset,
 			xdp->data_meta, metalen);

From d25b883ef6ce1acc470ab39cf076a8473baa88fc Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov"
Date: Tue, 17 Apr 2018 21:42:13 -0700
Subject: [PATCH 0228/1640] BACKPORT: bpf: adding bpf_xdp_adjust_tail helper

Adding a new bpf helper which allows us to manipulate xdp's data_end
pointer, and thereby to reduce the packet's size.

Intended use case: generating ICMP messages from XDP context, where
such a message would contain the truncated original packet.

Signed-off-by: Nikita V. Shirokov
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h |  7 +++++++
 net/core/filter.c        | 29 ++++++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bc32efcfe67b..4b762f4d5cb6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -757,6 +757,13 @@ union bpf_attr {
 *     @addr_len: length of sockaddr structure
 *     Return: 0 on success or negative error code
 *
+ * int bpf_xdp_adjust_tail(xdp_md, delta)
+ *     Adjust the xdp_md.data_end by delta. Only shrinking of packet's
+ *     size is supported.
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: A negative integer to be added to xdp_md.data_end
+ *     Return: 0 on success or negative on error
+ *
 * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
 *	Description
 *		This helper is similar to **bpf_skb_load_bytes**\ () in that
diff --git a/net/core/filter.c b/net/core/filter.c
index 6b165f5786bf..1470b9dc714b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2775,6 +2775,30 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
+{
+	void *data_end = xdp->data_end + offset;
+
+	/* only shrinking is allowed for now.
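+	 * A zero or positive offset would grow the packet beyond the
+	 * memory the driver sized for the frame, hence the offset >= 0
+	 * rejection just below.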
 */
+	if (unlikely(offset >= 0))
+		return -EINVAL;
+
+	if (unlikely(data_end < xdp->data + ETH_HLEN))
+		return -EINVAL;
+
+	xdp->data_end = data_end;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
+	.func		= bpf_xdp_adjust_tail,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
 {
 	void *meta = xdp->data_meta + offset;
@@ -3124,7 +3148,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_l4_csum_replace ||
 	    func == bpf_xdp_adjust_head ||
 	    func == bpf_xdp_adjust_meta ||
-	    func == bpf_msg_pull_data)
+	    func == bpf_msg_pull_data ||
+	    func == bpf_xdp_adjust_tail)
 		return true;
 
 	return false;
@@ -3951,6 +3976,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_redirect_proto;
 	case BPF_FUNC_redirect_map:
 		return &bpf_xdp_redirect_map_proto;
+	case BPF_FUNC_xdp_adjust_tail:
+		return &bpf_xdp_adjust_tail_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}

From 9af1ff41786f52a4543fcf53e3d2313e73bb13c4 Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov"
Date: Tue, 17 Apr 2018 21:42:21 -0700
Subject: [PATCH 0229/1640] UPSTREAM: bpf: making bpf_prog_test_run aware of
 possible data_end ptr change

After the introduction of the bpf_xdp_adjust_tail helper, the packet
length can change not only when the xdp->data pointer moves but when
xdp->data_end does as well. Make bpf_prog_test_run aware of this
possibility.

Signed-off-by: Nikita V. Shirokov
Signed-off-by: Daniel Borkmann
---
 net/bpf/test_run.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 489e3c64bdae..e8e4ba86c6ab 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -182,7 +182,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 	xdp.rxq = &rxqueue->xdp_rxq;
 
 	retval = bpf_test_run(prog, &xdp, repeat, &duration);
-	if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN)
+	if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN ||
+	    xdp.data_end != xdp.data + size)
 		size = xdp.data_end - xdp.data;
 	ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
 	kfree(data);

From de210e410566762cd119658a58facf166ac7b8fa Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Thu, 19 Apr 2018 16:17:12 +0200
Subject: [PATCH 0230/1640] UPSTREAM: bpf: reserve xdp_frame size in xdp
 headroom

Commit 6dfb970d3dbd ("xdp: avoid leaking info stored in frame data on
page reuse") tried to allow the user/bpf_prog to (re)use the area used
by the xdp_frame (stored in the frame headroom), by memset-clearing the
area when bpf_xdp_adjust_head gives the bpf_prog access to the headroom
area.

The mentioned commit had two bugs. (1) It didn't take
bpf_xdp_adjust_meta into account. (2) A combination of
bpf_xdp_adjust_head calls, where xdp->data is moved into the xdp_frame
section, could clear the xdp_frame area again for an area previously
granted to the bpf_prog.

After discussions with Daniel, we chose to implement a simpler solution
to the problem, which is to reserve the headroom used by the xdp_frame
info. This also avoids the situation where the bpf_prog is allowed to
adjust/add headers, and then XDP_REDIRECT later drops the packet due to
lack of headroom for the xdp_frame. This would likely confuse the
end-user.
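To make the reserve concrete, here is a minimal userspace sketch of the
bounds check this patch leaves in bpf_xdp_adjust_head() (assuming no
metadata, i.e. metalen == 0; the sizes and the -1 error value are
illustrative stand-ins, not kernel values):

#include <stdio.h>

#define HEADROOM 256	/* stand-in for XDP_PACKET_HEADROOM */
#define FRAME_SZ 32	/* stand-in for sizeof(struct xdp_frame) */
#define ETH_HLEN 14

static unsigned char buf[HEADROOM + 1500];

/* Model of the check: data may only move down to hard_start + FRAME_SZ,
 * so the bytes holding the xdp_frame info are never exposed. */
static int adjust_head(unsigned char **data, unsigned char *data_end,
		       unsigned char *hard_start, int offset)
{
	unsigned char *frame_end = hard_start + FRAME_SZ;
	unsigned char *new_data = *data + offset;

	if (new_data < frame_end || new_data > data_end - ETH_HLEN)
		return -1;	/* the kernel returns -EINVAL here */
	*data = new_data;
	return 0;
}

int main(void)
{
	unsigned char *data = buf + HEADROOM;
	unsigned char *data_end = buf + sizeof(buf);

	/* Pulling 64 bytes of headroom succeeds. */
	printf("pull 64:  %d\n", adjust_head(&data, data_end, buf, -64));
	/* Pulling into the reserved xdp_frame area is rejected. */
	printf("pull all: %d\n", adjust_head(&data, data_end, buf, -192));
	return 0;
}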
Fixes: 6dfb970d3dbd ("xdp: avoid leaking info stored in frame data on page reuse")
Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: Daniel Borkmann
---
 net/core/filter.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 1470b9dc714b..4a666e68f235 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2744,20 +2744,13 @@ BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 {
 	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
 	unsigned long metalen = xdp_get_metalen(xdp);
-	void *data_start = xdp->data_hard_start + metalen;
+	void *data_start = xdp_frame_end + metalen;
 	void *data = xdp->data + offset;
 
 	if (unlikely(data < data_start ||
 		     data > xdp->data_end - ETH_HLEN))
 		return -EINVAL;
 
-	/* Avoid info leak, when reusing area prev used by xdp_frame */
-	if (data < xdp_frame_end) {
-		unsigned long clearlen = xdp_frame_end - data;
-
-		memset(data, 0, clearlen);
-	}
-
 	if (metalen)
 		memmove(xdp->data_meta + offset,
 			xdp->data_meta, metalen);
@@ -2801,12 +2794,13 @@ static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
 
 BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
 {
+	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
 	void *meta = xdp->data_meta + offset;
 	unsigned long metalen = xdp->data - meta;
 
 	if (xdp_data_meta_unsupported(xdp))
 		return -ENOTSUPP;
-	if (unlikely(meta < xdp->data_hard_start ||
+	if (unlikely(meta < xdp_frame_end ||
 		     meta > xdp->data))
 		return -EINVAL;
 	if (unlikely((metalen & (sizeof(__u32) - 1)) ||

From 58cc1de3653f2b8fc7a6a57afd204c31773964b7 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 18 Apr 2018 15:55:57 -0700
Subject: [PATCH 0231/1640] UPSTREAM: bpf: btf: Introduce BPF Type Format (BTF)

This patch introduces BPF Type Format (BTF).

BTF (BPF Type Format) is the metadata format which describes the data
types of BPF program/map. Hence, it basically focuses on the C
programming language, which modern BPF primarily uses.

The first use case is to provide a generic pretty print capability for
a BPF map.

BTF has its roots in CTF (Compact C-Type Format). To simplify the
handling of BTF data, BTF removes the differences between small and
big type/struct-members. Hence, BTF consistently uses u32 instead of
supporting both "one u16" and "two u32 (+padding)" in describing types
and struct members. It also raises the limit on the number of types
(and functions) from 0x7fff to 0x7fffffff.

Due to the above changes, the format is not compatible with CTF. Hence,
BTF starts with a new BTF_MAGIC and version number.

This patch does the first verification pass over the BTF. The first
pass checks:

1. meta-data size (e.g. it does not go beyond the total btf's size)
2. name_offset is valid
3. Each BTF_KIND (e.g. int, enum, struct....) does its own check of
   its meta-data.

Some other checks, like checking that a struct member refers to a
valid type, can only be done in the second pass. The second
verification pass will be implemented in the next patch.
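As a worked example of the format just described, the following
self-contained userspace sketch lays out a tiny BTF section containing
a single 32-bit signed BTF_KIND_INT named "int". The struct layouts and
constants mirror the uapi header added by this patch; the fixed-size
blob and the printf are scaffolding for the example only:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct btf_header {		/* as defined in include/uapi/linux/btf.h */
	uint16_t magic;
	uint8_t  version;
	uint8_t  flags;
	uint32_t parent_label;
	uint32_t parent_name;
	uint32_t label_off;
	uint32_t object_off;
	uint32_t func_off;
	uint32_t type_off;
	uint32_t str_off;
	uint32_t str_len;
};

struct btf_type {
	uint32_t name;
	uint32_t info;		/* bits 24-28: kind, bits 0-15: vlen */
	uint32_t size;
};

int main(void)
{
	const char strs[] = "\0int";		/* strings: "" and "int" */
	uint32_t int_data = (0x1u << 24) | 32;	/* BTF_INT_SIGNED, 32 bits */
	struct btf_type t = {
		.name = 1,			/* offset of "int" in strs */
		.info = 1u << 24,		/* BTF_KIND_INT, vlen == 0 */
		.size = 4,
	};
	struct btf_header hdr = {
		.magic    = 0xeB9F,
		.version  = 1,
		/* section offsets are relative to the end of the header */
		.type_off = 0,
		.str_off  = sizeof(t) + sizeof(int_data),
		.str_len  = sizeof(strs),	/* includes both '\0' bytes */
	};
	unsigned char blob[64];
	size_t off = 0;

	memcpy(blob + off, &hdr, sizeof(hdr));		 off += sizeof(hdr);
	memcpy(blob + off, &t, sizeof(t));		 off += sizeof(t);
	memcpy(blob + off, &int_data, sizeof(int_data)); off += sizeof(int_data);
	memcpy(blob + off, strs, sizeof(strs));		 off += sizeof(strs);

	printf("BTF blob: %zu bytes, magic 0x%x\n", off, hdr.magic);
	return 0;
}

The verifier below would log such a blob roughly as "[1] INT int size=4
bits_offset=0 nr_bits=32 encoding=SIGNED", per the log helpers added in
this patch.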
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 130 ++++++ kernel/bpf/Makefile | 1 + kernel/bpf/btf.c | 915 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 1046 insertions(+) create mode 100644 include/uapi/linux/btf.h create mode 100644 kernel/bpf/btf.c diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h new file mode 100644 index 000000000000..74a30b1090df --- /dev/null +++ b/include/uapi/linux/btf.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2018 Facebook */ +#ifndef _UAPI__LINUX_BTF_H__ +#define _UAPI__LINUX_BTF_H__ + +#include + +#define BTF_MAGIC 0xeB9F +#define BTF_MAGIC_SWAP 0x9FeB +#define BTF_VERSION 1 +#define BTF_FLAGS_COMPR 0x01 + +struct btf_header { + __u16 magic; + __u8 version; + __u8 flags; + + __u32 parent_label; + __u32 parent_name; + + /* All offsets are in bytes relative to the end of this header */ + __u32 label_off; /* offset of label section */ + __u32 object_off; /* offset of data object section*/ + __u32 func_off; /* offset of function section */ + __u32 type_off; /* offset of type section */ + __u32 str_off; /* offset of string section */ + __u32 str_len; /* length of string section */ +}; + +/* Max # of type identifier */ +#define BTF_MAX_TYPE 0x7fffffff +/* Max offset into the string section */ +#define BTF_MAX_NAME_OFFSET 0x7fffffff +/* Max # of struct/union/enum members or func args */ +#define BTF_MAX_VLEN 0xffff + +/* The type id is referring to a parent BTF */ +#define BTF_TYPE_PARENT(id) (((id) >> 31) & 0x1) +#define BTF_TYPE_ID(id) ((id) & BTF_MAX_TYPE) + +/* String is in the ELF string section */ +#define BTF_STR_TBL_ELF_ID(ref) (((ref) >> 31) & 0x1) +#define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) + +struct btf_type { + __u32 name; + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused + * bits 31: root + */ + __u32 info; + /* "size" is used by INT, ENUM, STRUCT and UNION. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT. + * "type" is a type_id referring to another type. + */ + union { + __u32 size; + __u32 type; + }; +}; + +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) +#define BTF_INFO_ISROOT(info) (!!(((info) >> 24) & 0x80)) +#define BTF_INFO_VLEN(info) ((info) & 0xffff) + +#define BTF_KIND_UNKN 0 /* Unknown */ +#define BTF_KIND_INT 1 /* Integer */ +#define BTF_KIND_PTR 2 /* Pointer */ +#define BTF_KIND_ARRAY 3 /* Array */ +#define BTF_KIND_STRUCT 4 /* Struct */ +#define BTF_KIND_UNION 5 /* Union */ +#define BTF_KIND_ENUM 6 /* Enumeration */ +#define BTF_KIND_FWD 7 /* Forward */ +#define BTF_KIND_TYPEDEF 8 /* Typedef */ +#define BTF_KIND_VOLATILE 9 /* Volatile */ +#define BTF_KIND_CONST 10 /* Const */ +#define BTF_KIND_RESTRICT 11 /* Restrict */ +#define BTF_KIND_MAX 11 +#define NR_BTF_KINDS 12 + +/* For some specific BTF_KIND, "struct btf_type" is immediately + * followed by extra data. 
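+ * For example, BTF_KIND_INT is followed by one extra __u32 describing
+ * its encoding, and BTF_KIND_ARRAY by one struct btf_array; see below.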
+ */ + +/* BTF_KIND_INT is followed by a u32 and the following + * is the 32 bits arrangement: + */ +#define BTF_INT_ENCODING(VAL) (((VAL) & 0xff000000) >> 24) +#define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) +#define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) + +/* Attributes stored in the BTF_INT_ENCODING */ +#define BTF_INT_SIGNED 0x1 +#define BTF_INT_CHAR 0x2 +#define BTF_INT_BOOL 0x4 +#define BTF_INT_VARARGS 0x8 + +/* BTF_KIND_ENUM is followed by multiple "struct btf_enum". + * The exact number of btf_enum is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_enum { + __u32 name; + __s32 val; +}; + +/* BTF_KIND_ARRAY is followed by one "struct btf_array" */ +struct btf_array { + __u32 type; + __u32 index_type; + __u32 nelems; +}; + +/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed + * by multiple "struct btf_member". The exact number + * of btf_member is stored in the vlen (of the info in + * "struct btf_type"). + */ +struct btf_member { + __u32 name; + __u32 type; + __u32 offset; /* offset in bits */ +}; + +#endif /* _UAPI__LINUX_BTF_H__ */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index d7fe790c70b2..2d12e26f751e 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,6 +5,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c new file mode 100644 index 000000000000..26e9ed7cea5f --- /dev/null +++ b/kernel/bpf/btf.c @@ -0,0 +1,915 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* BTF (BPF Type Format) is the meta data format which describes + * the data types of BPF program/map. Hence, it basically focus + * on the C programming language which the modern BPF is primary + * using. + * + * ELF Section: + * ~~~~~~~~~~~ + * The BTF data is stored under the ".BTF" ELF section + * + * struct btf_type: + * ~~~~~~~~~~~~~~~ + * Each 'struct btf_type' object describes a C data type. + * Depending on the type it is describing, a 'struct btf_type' + * object may be followed by more data. F.e. + * To describe an array, 'struct btf_type' is followed by + * 'struct btf_array'. + * + * 'struct btf_type' and any extra data following it are + * 4 bytes aligned. + * + * Type section: + * ~~~~~~~~~~~~~ + * The BTF type section contains a list of 'struct btf_type' objects. + * Each one describes a C type. Recall from the above section + * that a 'struct btf_type' object could be immediately followed by extra + * data in order to desribe some particular C types. + * + * type_id: + * ~~~~~~~ + * Each btf_type object is identified by a type_id. The type_id + * is implicitly implied by the location of the btf_type object in + * the BTF type section. The first one has type_id 1. The second + * one has type_id 2...etc. Hence, an earlier btf_type has + * a smaller type_id. + * + * A btf_type object may refer to another btf_type object by using + * type_id (i.e. the "type" in the "struct btf_type"). + * + * NOTE that we cannot assume any reference-order. 
+ * A btf_type object can refer to an earlier btf_type object + * but it can also refer to a later btf_type object. + * + * For example, to describe "const void *". A btf_type + * object describing "const" may refer to another btf_type + * object describing "void *". This type-reference is done + * by specifying type_id: + * + * [1] CONST (anon) type_id=2 + * [2] PTR (anon) type_id=0 + * + * The above is the btf_verifier debug log: + * - Each line started with "[?]" is a btf_type object + * - [?] is the type_id of the btf_type object. + * - CONST/PTR is the BTF_KIND_XXX + * - "(anon)" is the name of the type. It just + * happens that CONST and PTR has no name. + * - type_id=XXX is the 'u32 type' in btf_type + * + * NOTE: "void" has type_id 0 + * + * String section: + * ~~~~~~~~~~~~~~ + * The BTF string section contains the names used by the type section. + * Each string is referred by an "offset" from the beginning of the + * string section. + * + * Each string is '\0' terminated. + * + * The first character in the string section must be '\0' + * which is used to mean 'anonymous'. Some btf_type may not + * have a name. + */ + +/* BTF verification: + * + * To verify BTF data, two passes are needed. + * + * Pass #1 + * ~~~~~~~ + * The first pass is to collect all btf_type objects to + * an array: "btf->types". + * + * Depending on the C type that a btf_type is describing, + * a btf_type may be followed by extra data. We don't know + * how many btf_type is there, and more importantly we don't + * know where each btf_type is located in the type section. + * + * Without knowing the location of each type_id, most verifications + * cannot be done. e.g. an earlier btf_type may refer to a later + * btf_type (recall the "const void *" above), so we cannot + * check this type-reference in the first pass. + * + * In the first pass, it still does some verifications (e.g. + * checking the name is a valid offset to the string section). + */ + +#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) +#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) +#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) +#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) +#define BITS_ROUNDUP_BYTES(bits) \ + (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) + +/* 16MB for 64k structs and each has 16 members and + * a few MB spaces for the string section. + * The hard limit is S32_MAX. + */ +#define BTF_MAX_SIZE (16 * 1024 * 1024) +/* 64k. We can raise it later. The hard limit is S32_MAX. 
*/ +#define BTF_MAX_NR_TYPES 65535 + +#define for_each_member(i, struct_type, member) \ + for (i = 0, member = btf_type_member(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +struct btf { + union { + struct btf_header *hdr; + void *data; + }; + struct btf_type **types; + const char *strings; + void *nohdr_data; + u32 nr_types; + u32 types_size; + u32 data_size; +}; + +struct btf_verifier_env { + struct btf *btf; + struct bpf_verifier_log log; + u32 log_type_id; +}; + +static const char * const btf_kind_str[NR_BTF_KINDS] = { + [BTF_KIND_UNKN] = "UNKNOWN", + [BTF_KIND_INT] = "INT", + [BTF_KIND_PTR] = "PTR", + [BTF_KIND_ARRAY] = "ARRAY", + [BTF_KIND_STRUCT] = "STRUCT", + [BTF_KIND_UNION] = "UNION", + [BTF_KIND_ENUM] = "ENUM", + [BTF_KIND_FWD] = "FWD", + [BTF_KIND_TYPEDEF] = "TYPEDEF", + [BTF_KIND_VOLATILE] = "VOLATILE", + [BTF_KIND_CONST] = "CONST", + [BTF_KIND_RESTRICT] = "RESTRICT", +}; + +struct btf_kind_operations { + s32 (*check_meta)(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left); + void (*log_details)(struct btf_verifier_env *env, + const struct btf_type *t); +}; + +static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; +static struct btf_type btf_void; + +static const char *btf_int_encoding_str(u8 encoding) +{ + if (encoding == 0) + return "(none)"; + else if (encoding == BTF_INT_SIGNED) + return "SIGNED"; + else if (encoding == BTF_INT_CHAR) + return "CHAR"; + else if (encoding == BTF_INT_BOOL) + return "BOOL"; + else if (encoding == BTF_INT_VARARGS) + return "VARARGS"; + else + return "UNKN"; +} + +static u16 btf_type_vlen(const struct btf_type *t) +{ + return BTF_INFO_VLEN(t->info); +} + +static u32 btf_type_int(const struct btf_type *t) +{ + return *(u32 *)(t + 1); +} + +static const struct btf_array *btf_type_array(const struct btf_type *t) +{ + return (const struct btf_array *)(t + 1); +} + +static const struct btf_member *btf_type_member(const struct btf_type *t) +{ + return (const struct btf_member *)(t + 1); +} + +static const struct btf_enum *btf_type_enum(const struct btf_type *t) +{ + return (const struct btf_enum *)(t + 1); +} + +static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) +{ + return kind_ops[BTF_INFO_KIND(t->info)]; +} + +static bool btf_name_offset_valid(const struct btf *btf, u32 offset) +{ + return !BTF_STR_TBL_ELF_ID(offset) && + BTF_STR_OFFSET(offset) < btf->hdr->str_len; +} + +static const char *btf_name_by_offset(const struct btf *btf, u32 offset) +{ + if (!BTF_STR_OFFSET(offset)) + return "(anon)"; + else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len) + return &btf->strings[BTF_STR_OFFSET(offset)]; + else + return "(invalid-name-offset)"; +} + +__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} + +__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} + +__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, + const struct btf_type *t, + bool log_details, + const char *fmt, ...) 
+{ + struct bpf_verifier_log *log = &env->log; + u8 kind = BTF_INFO_KIND(t->info); + struct btf *btf = env->btf; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + __btf_verifier_log(log, "[%u] %s %s%s", + env->log_type_id, + btf_kind_str[kind], + btf_name_by_offset(btf, t->name), + log_details ? " " : ""); + + if (log_details) + btf_type_ops(t)->log_details(env, t); + + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + +#define btf_verifier_log_type(env, t, ...) \ + __btf_verifier_log_type((env), (t), true, __VA_ARGS__) +#define btf_verifier_log_basic(env, t, ...) \ + __btf_verifier_log_type((env), (t), false, __VA_ARGS__) + +__printf(4, 5) +static void btf_verifier_log_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + struct btf *btf = env->btf; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", + btf_name_by_offset(btf, member->name), + member->type, member->offset); + + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + +static void btf_verifier_log_hdr(struct btf_verifier_env *env) +{ + struct bpf_verifier_log *log = &env->log; + const struct btf *btf = env->btf; + const struct btf_header *hdr; + + if (!bpf_verifier_log_needed(log)) + return; + + hdr = btf->hdr; + __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); + __btf_verifier_log(log, "version: %u\n", hdr->version); + __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); + __btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label); + __btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name); + __btf_verifier_log(log, "label_off: %u\n", hdr->label_off); + __btf_verifier_log(log, "object_off: %u\n", hdr->object_off); + __btf_verifier_log(log, "func_off: %u\n", hdr->func_off); + __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); + __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); + __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); + __btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size); +} + +static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) +{ + struct btf *btf = env->btf; + + /* < 2 because +1 for btf_void which is always in btf->types[0]. + * btf_void is not accounted in btf->nr_types because btf_void + * does not come from the BTF file. 
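+	 * In other words, the array effectively holds nr_types + 1 entries
+	 * (slot 0 being btf_void), so appending one more type requires
+	 * types_size >= nr_types + 2.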
+ */ + if (btf->types_size - btf->nr_types < 2) { + /* Expand 'types' array */ + + struct btf_type **new_types; + u32 expand_by, new_size; + + if (btf->types_size == BTF_MAX_NR_TYPES) { + btf_verifier_log(env, "Exceeded max num of types"); + return -E2BIG; + } + + expand_by = max_t(u32, btf->types_size >> 2, 16); + new_size = min_t(u32, BTF_MAX_NR_TYPES, + btf->types_size + expand_by); + + new_types = kvzalloc(new_size * sizeof(*new_types), + GFP_KERNEL | __GFP_NOWARN); + if (!new_types) + return -ENOMEM; + + if (btf->nr_types == 0) + new_types[0] = &btf_void; + else + memcpy(new_types, btf->types, + sizeof(*btf->types) * (btf->nr_types + 1)); + + kvfree(btf->types); + btf->types = new_types; + btf->types_size = new_size; + } + + btf->types[++(btf->nr_types)] = t; + + return 0; +} + +static void btf_free(struct btf *btf) +{ + kvfree(btf->types); + kvfree(btf->data); + kfree(btf); +} + +static void btf_verifier_env_free(struct btf_verifier_env *env) +{ + kfree(env); +} + +static s32 btf_int_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 int_data, nr_bits, meta_needed = sizeof(int_data); + u16 encoding; + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + int_data = btf_type_int(t); + nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); + + if (nr_bits > BITS_PER_U64) { + btf_verifier_log_type(env, t, "nr_bits exceeds %zu", + BITS_PER_U64); + return -EINVAL; + } + + if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) { + btf_verifier_log_type(env, t, "nr_bits exceeds type_size"); + return -EINVAL; + } + + encoding = BTF_INT_ENCODING(int_data); + if (encoding && + encoding != BTF_INT_SIGNED && + encoding != BTF_INT_CHAR && + encoding != BTF_INT_BOOL && + encoding != BTF_INT_VARARGS) { + btf_verifier_log_type(env, t, "Unsupported encoding"); + return -ENOTSUPP; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_int_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + int int_data = btf_type_int(t); + + btf_verifier_log(env, + "size=%u bits_offset=%u nr_bits=%u encoding=%s", + t->size, BTF_INT_OFFSET(int_data), + BTF_INT_BITS(int_data), + btf_int_encoding_str(BTF_INT_ENCODING(int_data))); +} + +static const struct btf_kind_operations int_ops = { + .check_meta = btf_int_check_meta, + .log_details = btf_int_log, +}; + +static int btf_ref_type_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (BTF_TYPE_PARENT(t->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static void btf_ref_type_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "type_id=%u", t->type); +} + +static struct btf_kind_operations modifier_ops = { + .check_meta = btf_ref_type_check_meta, + .log_details = btf_ref_type_log, +}; + +static struct btf_kind_operations ptr_ops = { + .check_meta = btf_ref_type_check_meta, + .log_details = btf_ref_type_log, +}; + +static struct btf_kind_operations fwd_ops = { + .check_meta = btf_ref_type_check_meta, + .log_details = btf_ref_type_log, +}; + +static s32 btf_array_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, 
+ u32 meta_left) +{ + const struct btf_array *array = btf_type_array(t); + u32 meta_needed = sizeof(*array); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + /* We are a little forgiving on array->index_type since + * the kernel is not using it. + */ + /* Array elem cannot be in type void, + * so !array->type is not allowed. + */ + if (!array->type || BTF_TYPE_PARENT(array->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_array_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_array *array = btf_type_array(t); + + btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u", + array->type, array->index_type, array->nelems); +} + +static struct btf_kind_operations array_ops = { + .check_meta = btf_array_check_meta, + .log_details = btf_array_log, +}; + +static s32 btf_struct_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; + const struct btf_member *member; + struct btf *btf = env->btf; + u32 struct_size = t->size; + u32 meta_needed; + u16 i; + + meta_needed = btf_type_vlen(t) * sizeof(*member); + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for_each_member(i, t, member) { + if (!btf_name_offset_valid(btf, member->name)) { + btf_verifier_log_member(env, t, member, + "Invalid member name_offset:%u", + member->name); + return -EINVAL; + } + + /* A member cannot be in type void */ + if (!member->type || BTF_TYPE_PARENT(member->type)) { + btf_verifier_log_member(env, t, member, + "Invalid type_id"); + return -EINVAL; + } + + if (is_union && member->offset) { + btf_verifier_log_member(env, t, member, + "Invalid member bits_offset"); + return -EINVAL; + } + + if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { + btf_verifier_log_member(env, t, member, + "Memmber bits_offset exceeds its struct size"); + return -EINVAL; + } + + btf_verifier_log_member(env, t, member, NULL); + } + + return meta_needed; +} + +static void btf_struct_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static struct btf_kind_operations struct_ops = { + .check_meta = btf_struct_check_meta, + .log_details = btf_struct_log, +}; + +static s32 btf_enum_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_enum *enums = btf_type_enum(t); + struct btf *btf = env->btf; + u16 i, nr_enums; + u32 meta_needed; + + nr_enums = btf_type_vlen(t); + meta_needed = nr_enums * sizeof(*enums); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->size != sizeof(int)) { + btf_verifier_log_type(env, t, "Expected size:%zu", + sizeof(int)); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for (i = 0; i < nr_enums; i++) { + if (!btf_name_offset_valid(btf, enums[i].name)) { + btf_verifier_log(env, "\tInvalid name_offset:%u", + enums[i].name); + return -EINVAL; + } + + 
btf_verifier_log(env, "\t%s val=%d\n", + btf_name_by_offset(btf, enums[i].name), + enums[i].val); + } + + return meta_needed; +} + +static void btf_enum_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static struct btf_kind_operations enum_ops = { + .check_meta = btf_enum_check_meta, + .log_details = btf_enum_log, +}; + +static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { + [BTF_KIND_INT] = &int_ops, + [BTF_KIND_PTR] = &ptr_ops, + [BTF_KIND_ARRAY] = &array_ops, + [BTF_KIND_STRUCT] = &struct_ops, + [BTF_KIND_UNION] = &struct_ops, + [BTF_KIND_ENUM] = &enum_ops, + [BTF_KIND_FWD] = &fwd_ops, + [BTF_KIND_TYPEDEF] = &modifier_ops, + [BTF_KIND_VOLATILE] = &modifier_ops, + [BTF_KIND_CONST] = &modifier_ops, + [BTF_KIND_RESTRICT] = &modifier_ops, +}; + +static s32 btf_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 saved_meta_left = meta_left; + s32 var_meta_size; + + if (meta_left < sizeof(*t)) { + btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu", + env->log_type_id, meta_left, sizeof(*t)); + return -EINVAL; + } + meta_left -= sizeof(*t); + + if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || + BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { + btf_verifier_log(env, "[%u] Invalid kind:%u", + env->log_type_id, BTF_INFO_KIND(t->info)); + return -EINVAL; + } + + if (!btf_name_offset_valid(env->btf, t->name)) { + btf_verifier_log(env, "[%u] Invalid name_offset:%u", + env->log_type_id, t->name); + return -EINVAL; + } + + var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left); + if (var_meta_size < 0) + return var_meta_size; + + meta_left -= var_meta_size; + + return saved_meta_left - meta_left; +} + +static int btf_check_all_metas(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + struct btf_header *hdr; + void *cur, *end; + + hdr = btf->hdr; + cur = btf->nohdr_data + hdr->type_off; + end = btf->nohdr_data + hdr->str_off; + + env->log_type_id = 1; + while (cur < end) { + struct btf_type *t = cur; + s32 meta_size; + + meta_size = btf_check_meta(env, t, end - cur); + if (meta_size < 0) + return meta_size; + + btf_add_type(env, t); + cur += meta_size; + env->log_type_id++; + } + + return 0; +} + +static int btf_parse_type_sec(struct btf_verifier_env *env) +{ + return btf_check_all_metas(env); +} + +static int btf_parse_str_sec(struct btf_verifier_env *env) +{ + const struct btf_header *hdr; + struct btf *btf = env->btf; + const char *start, *end; + + hdr = btf->hdr; + start = btf->nohdr_data + hdr->str_off; + end = start + hdr->str_len; + + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || + start[0] || end[-1]) { + btf_verifier_log(env, "Invalid string section"); + return -EINVAL; + } + + btf->strings = start; + + return 0; +} + +static int btf_parse_hdr(struct btf_verifier_env *env) +{ + const struct btf_header *hdr; + struct btf *btf = env->btf; + u32 meta_left; + + if (btf->data_size < sizeof(*hdr)) { + btf_verifier_log(env, "btf_header not found"); + return -EINVAL; + } + + btf_verifier_log_hdr(env); + + hdr = btf->hdr; + if (hdr->magic != BTF_MAGIC) { + btf_verifier_log(env, "Invalid magic"); + return -EINVAL; + } + + if (hdr->version != BTF_VERSION) { + btf_verifier_log(env, "Unsupported version"); + return -ENOTSUPP; + } + + if (hdr->flags) { + btf_verifier_log(env, "Unsupported flags"); + return -ENOTSUPP; + } + + meta_left = btf->data_size - sizeof(*hdr); + if (!meta_left) { + btf_verifier_log(env, "No 
data"); + return -EINVAL; + } + + if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off || + /* Type section must align to 4 bytes */ + hdr->type_off & (sizeof(u32) - 1)) { + btf_verifier_log(env, "Invalid type_off"); + return -EINVAL; + } + + if (meta_left < hdr->str_off || + meta_left - hdr->str_off < hdr->str_len) { + btf_verifier_log(env, "Invalid str_off or str_len"); + return -EINVAL; + } + + btf->nohdr_data = btf->hdr + 1; + + return 0; +} + +static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, + u32 log_level, char __user *log_ubuf, u32 log_size) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf = NULL; + u8 *data; + int err; + + if (btf_data_size > BTF_MAX_SIZE) + return ERR_PTR(-E2BIG); + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + if (log_level || log_ubuf || log_size) { + /* user requested verbose verifier output + * and supplied buffer to store the verification trace + */ + log->level = log_level; + log->ubuf = log_ubuf; + log->len_total = log_size; + + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) { + err = -EINVAL; + goto errout; + } + } + + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); + if (!btf) { + err = -ENOMEM; + goto errout; + } + + data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); + if (!data) { + err = -ENOMEM; + goto errout; + } + + btf->data = data; + btf->data_size = btf_data_size; + + if (copy_from_user(data, btf_data, btf_data_size)) { + err = -EFAULT; + goto errout; + } + + env->btf = btf; + + err = btf_parse_hdr(env); + if (err) + goto errout; + + err = btf_parse_str_sec(env); + if (err) + goto errout; + + err = btf_parse_type_sec(env); + if (err) + goto errout; + + if (!err && log->level && bpf_verifier_log_full(log)) { + err = -ENOSPC; + goto errout; + } + + if (!err) { + btf_verifier_env_free(env); + return btf; + } + +errout: + btf_verifier_env_free(env); + if (btf) + btf_free(btf); + return ERR_PTR(err); +} From 2d9345b2949068e677361357ef9682071854c8a6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:55:58 -0700 Subject: [PATCH 0232/1640] UPSTREAM: bpf: btf: Validate type reference After collecting all btf_type in the first pass in an earlier patch, the second pass (in this patch) can validate the reference types (e.g. the referring type does exist and it does not refer to itself). While checking the reference type, it also gathers other information (e.g. the size of an array). This info will be useful in checking the struct's members in a later patch. They will also be useful in doing pretty print later. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 37 +++ kernel/bpf/btf.c | 666 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 702 insertions(+), 1 deletion(-) create mode 100644 include/linux/btf.h diff --git a/include/linux/btf.h b/include/linux/btf.h new file mode 100644 index 000000000000..f14b60368753 --- /dev/null +++ b/include/linux/btf.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018 Facebook */ + +#ifndef _LINUX_BTF_H +#define _LINUX_BTF_H 1 + +#include + +struct btf; +struct btf_type; + +/* Figure out the size of a type_id. If type_id is a modifier + * (e.g. const), it will be resolved to find out the type with size. 
+ * + * For example: + * In describing "const void *", type_id is "const" and "const" + * refers to "void *". The return type will be "void *". + * + * If type_id is a simple "int", then return type will be "int". + * + * @btf: struct btf object + * @type_id: Find out the size of type_id. The type_id of the return + * type is set to *type_id. + * @ret_size: It can be NULL. If not NULL, the size of the return + * type is set to *ret_size. + * Return: The btf_type (resolved to another type with size info if needed). + * NULL is returned if type_id itself does not have size info + * (e.g. void) or it cannot be resolved to another type that + * has size info. + * *type_id and *ret_size will not be changed in the + * NULL return case. + */ +const struct btf_type *btf_type_id_size(const struct btf *btf, + u32 *type_id, + u32 *ret_size); + +#endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 26e9ed7cea5f..18bf266ceeda 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -105,6 +105,50 @@ * * In the first pass, it still does some verifications (e.g. * checking the name is a valid offset to the string section). + * + * Pass #2 + * ~~~~~~~ + * The main focus is to resolve a btf_type that is referring + * to another type. + * + * We have to ensure the referring type: + * 1) does exist in the BTF (i.e. in btf->types[]) + * 2) does not cause a loop: + * struct A { + * struct B b; + * }; + * + * struct B { + * struct A a; + * }; + * + * btf_type_needs_resolve() decides if a btf_type needs + * to be resolved. + * + * The needs_resolve type implements the "resolve()" ops which + * essentially does a DFS and detects backedge. + * + * During resolve (or DFS), different C types have different + * "RESOLVED" conditions. + * + * When resolving a BTF_KIND_STRUCT, we need to resolve all its + * members because a member is always referring to another + * type. A struct's member can be treated as "RESOLVED" if + * it is referring to a BTF_KIND_PTR. Otherwise, the + * following valid C struct would be rejected: + * + * struct A { + * int m; + * struct A *a; + * }; + * + * When resolving a BTF_KIND_PTR, it needs to keep resolving if + * it is referring to another BTF_KIND_PTR. 
Otherwise, we cannot + * detect a pointer loop, e.g.: + * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR + + * ^ | + * +-----------------------------------------+ + * */ #define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) @@ -127,12 +171,19 @@ i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_member_from(i, from, struct_type, member) \ + for (i = from, member = btf_type_member(struct_type) + from; \ + i < btf_type_vlen(struct_type); \ + i++, member++) + struct btf { union { struct btf_header *hdr; void *data; }; struct btf_type **types; + u32 *resolved_ids; + u32 *resolved_sizes; const char *strings; void *nohdr_data; u32 nr_types; @@ -140,10 +191,42 @@ struct btf { u32 data_size; }; +enum verifier_phase { + CHECK_META, + CHECK_TYPE, +}; + +struct resolve_vertex { + const struct btf_type *t; + u32 type_id; + u16 next_member; +}; + +enum visit_state { + NOT_VISITED, + VISITED, + RESOLVED, +}; + +enum resolve_mode { + RESOLVE_TBD, /* To Be Determined */ + RESOLVE_PTR, /* Resolving for Pointer */ + RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union + * or array + */ +}; + +#define MAX_RESOLVE_DEPTH 32 + struct btf_verifier_env { struct btf *btf; + u8 *visit_states; + struct resolve_vertex stack[MAX_RESOLVE_DEPTH]; struct bpf_verifier_log log; u32 log_type_id; + u32 top_stack; + enum verifier_phase phase; + enum resolve_mode resolve_mode; }; static const char * const btf_kind_str[NR_BTF_KINDS] = { @@ -165,6 +248,8 @@ struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left); + int (*resolve)(struct btf_verifier_env *env, + const struct resolve_vertex *v); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); }; @@ -172,6 +257,101 @@ struct btf_kind_operations { static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; static struct btf_type btf_void; +static bool btf_type_is_modifier(const struct btf_type *t) +{ + /* Some of them is not strictly a C modifier + * but they are grouped into the same bucket + * for BTF concern: + * A type (t) that refers to another + * type through t->type AND its size cannot + * be determined without following the t->type. + * + * ptr does not fall into this bucket + * because its size is always sizeof(void *). + */ + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + return true; + } + + return false; +} + +static bool btf_type_is_void(const struct btf_type *t) +{ + /* void => no type and size info. + * Hence, FWD is also treated as void. + */ + return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD; +} + +static bool btf_type_is_void_or_null(const struct btf_type *t) +{ + return !t || btf_type_is_void(t); +} + +/* union is only a special case of struct: + * all its offsetof(member) == 0 + */ +static bool btf_type_is_struct(const struct btf_type *t) +{ + u8 kind = BTF_INFO_KIND(t->info); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + +static bool btf_type_is_array(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; +} + +static bool btf_type_is_ptr(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; +} + +static bool btf_type_is_int(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_INT; +} + +/* What types need to be resolved? + * + * btf_type_is_modifier() is an obvious one. 
+ * + * btf_type_is_struct() because its member refers to + * another type (through member->type). + + * btf_type_is_array() because its element (array->type) + * refers to another type. Array can be thought of a + * special case of struct while array just has the same + * member-type repeated by array->nelems of times. + */ +static bool btf_type_needs_resolve(const struct btf_type *t) +{ + return btf_type_is_modifier(t) || + btf_type_is_ptr(t) || + btf_type_is_struct(t) || + btf_type_is_array(t); +} + +/* t->size can be used */ +static bool btf_type_has_size(const struct btf_type *t) +{ + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_INT: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + return true; + } + + return false; +} + static const char *btf_int_encoding_str(u8 encoding) { if (encoding == 0) @@ -234,6 +414,14 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset) return "(invalid-name-offset)"; } +static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) +{ + if (type_id > btf->nr_types) + return NULL; + + return btf->types[type_id]; +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { @@ -308,6 +496,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; + /* The CHECK_META phase already did a btf dump. + * + * If member is logged again, it must hit an error in + * parsing this member. It is useful to print out which + * struct this member belongs to. + */ + if (env->phase != CHECK_META) + btf_verifier_log_type(env, struct_type, NULL); + __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", btf_name_by_offset(btf, member->name), member->type, member->offset); @@ -393,15 +590,183 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) static void btf_free(struct btf *btf) { kvfree(btf->types); + kvfree(btf->resolved_sizes); + kvfree(btf->resolved_ids); kvfree(btf->data); kfree(btf); } +static int env_resolve_init(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + u32 nr_types = btf->nr_types; + u32 *resolved_sizes = NULL; + u32 *resolved_ids = NULL; + u8 *visit_states = NULL; + + /* +1 for btf_void */ + resolved_sizes = kvzalloc((nr_types + 1) * sizeof(*resolved_sizes), + GFP_KERNEL | __GFP_NOWARN); + if (!resolved_sizes) + goto nomem; + + resolved_ids = kvzalloc((nr_types + 1) * sizeof(*resolved_ids), + GFP_KERNEL | __GFP_NOWARN); + if (!resolved_ids) + goto nomem; + + visit_states = kvzalloc((nr_types + 1) * sizeof(*visit_states), + GFP_KERNEL | __GFP_NOWARN); + if (!visit_states) + goto nomem; + + btf->resolved_sizes = resolved_sizes; + btf->resolved_ids = resolved_ids; + env->visit_states = visit_states; + + return 0; + +nomem: + kvfree(resolved_sizes); + kvfree(resolved_ids); + kvfree(visit_states); + return -ENOMEM; +} + static void btf_verifier_env_free(struct btf_verifier_env *env) { + kvfree(env->visit_states); kfree(env); } +static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, + const struct btf_type *next_type) +{ + switch (env->resolve_mode) { + case RESOLVE_TBD: + /* int, enum or void is a sink */ + return !btf_type_needs_resolve(next_type); + case RESOLVE_PTR: + /* int, enum, void, struct or array is a sink for ptr */ + return !btf_type_is_modifier(next_type) && + !btf_type_is_ptr(next_type); + case RESOLVE_STRUCT_OR_ARRAY: + /* int, enum, void or ptr is a sink for struct and array */ + return !btf_type_is_modifier(next_type) && + 
!btf_type_is_array(next_type) && + !btf_type_is_struct(next_type); + default: + BUG_ON(1); + } +} + +static bool env_type_is_resolved(const struct btf_verifier_env *env, + u32 type_id) +{ + return env->visit_states[type_id] == RESOLVED; +} + +static int env_stack_push(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + struct resolve_vertex *v; + + if (env->top_stack == MAX_RESOLVE_DEPTH) + return -E2BIG; + + if (env->visit_states[type_id] != NOT_VISITED) + return -EEXIST; + + env->visit_states[type_id] = VISITED; + + v = &env->stack[env->top_stack++]; + v->t = t; + v->type_id = type_id; + v->next_member = 0; + + if (env->resolve_mode == RESOLVE_TBD) { + if (btf_type_is_ptr(t)) + env->resolve_mode = RESOLVE_PTR; + else if (btf_type_is_struct(t) || btf_type_is_array(t)) + env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY; + } + + return 0; +} + +static void env_stack_set_next_member(struct btf_verifier_env *env, + u16 next_member) +{ + env->stack[env->top_stack - 1].next_member = next_member; +} + +static void env_stack_pop_resolved(struct btf_verifier_env *env, + u32 resolved_type_id, + u32 resolved_size) +{ + u32 type_id = env->stack[--(env->top_stack)].type_id; + struct btf *btf = env->btf; + + btf->resolved_sizes[type_id] = resolved_size; + btf->resolved_ids[type_id] = resolved_type_id; + env->visit_states[type_id] = RESOLVED; +} + +static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) +{ + return env->top_stack ? &env->stack[env->top_stack - 1] : NULL; +} + +/* The input param "type_id" must point to a needs_resolve type */ +static const struct btf_type *btf_type_id_resolve(const struct btf *btf, + u32 *type_id) +{ + *type_id = btf->resolved_ids[*type_id]; + return btf_type_by_id(btf, *type_id); +} + +const struct btf_type *btf_type_id_size(const struct btf *btf, + u32 *type_id, u32 *ret_size) +{ + const struct btf_type *size_type; + u32 size_type_id = *type_id; + u32 size = 0; + + size_type = btf_type_by_id(btf, size_type_id); + if (btf_type_is_void_or_null(size_type)) + return NULL; + + if (btf_type_has_size(size_type)) { + size = size_type->size; + } else if (btf_type_is_array(size_type)) { + size = btf->resolved_sizes[size_type_id]; + } else if (btf_type_is_ptr(size_type)) { + size = sizeof(void *); + } else { + if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) + return NULL; + + size = btf->resolved_sizes[size_type_id]; + size_type_id = btf->resolved_ids[size_type_id]; + size_type = btf_type_by_id(btf, size_type_id); + if (btf_type_is_void(size_type)) + return NULL; + } + + *type_id = size_type_id; + if (ret_size) + *ret_size = size; + + return size_type; +} + +static int btf_df_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + btf_verifier_log_basic(env, v->t, "Unsupported resolve"); + return -EINVAL; +} + static s32 btf_int_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -464,6 +829,7 @@ static void btf_int_log(struct btf_verifier_env *env, static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, + .resolve = btf_df_resolve, .log_details = btf_int_log, }; @@ -486,6 +852,104 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return 0; } +static int btf_modifier_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *t = v->t; + const struct btf_type *next_type; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size = 0; + + next_type = btf_type_by_id(btf, 
next_type_id);
+	if (!next_type) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	/* "typedef void new_void", "const void"...etc */
+	if (btf_type_is_void(next_type))
+		goto resolved;
+
+	if (!env_type_is_resolve_sink(env, next_type) &&
+	    !env_type_is_resolved(env, next_type_id))
+		return env_stack_push(env, next_type, next_type_id);
+
+	/* Figure out the resolved next_type_id with size.
+	 * They will be stored in the current modifier's
+	 * resolved_ids and resolved_sizes such that it can
+	 * save us a few type-following steps when we use it later
+	 * (e.g. in pretty print).
+	 */
+	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+	    !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+resolved:
+	env_stack_pop_resolved(env, next_type_id, next_type_size);
+
+	return 0;
+}
+
+static int btf_ptr_resolve(struct btf_verifier_env *env,
+			   const struct resolve_vertex *v)
+{
+	const struct btf_type *next_type;
+	const struct btf_type *t = v->t;
+	u32 next_type_id = t->type;
+	struct btf *btf = env->btf;
+	u32 next_type_size = 0;
+
+	next_type = btf_type_by_id(btf, next_type_id);
+	if (!next_type) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	/* "void *" */
+	if (btf_type_is_void(next_type))
+		goto resolved;
+
+	if (!env_type_is_resolve_sink(env, next_type) &&
+	    !env_type_is_resolved(env, next_type_id))
+		return env_stack_push(env, next_type, next_type_id);
+
+	/* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY,
+	 * the modifier may have stopped resolving when it was resolved
+	 * to a ptr (last-resolved-ptr).
+	 *
+	 * We now need to continue from the last-resolved-ptr to
+	 * ensure the last-resolved-ptr will not refer back to
+	 * the current ptr (t).
+ */ + if (btf_type_is_modifier(next_type)) { + const struct btf_type *resolved_type; + u32 resolved_type_id; + + resolved_type_id = next_type_id; + resolved_type = btf_type_id_resolve(btf, &resolved_type_id); + + if (btf_type_is_ptr(resolved_type) && + !env_type_is_resolve_sink(env, resolved_type) && + !env_type_is_resolved(env, resolved_type_id)) + return env_stack_push(env, resolved_type, + resolved_type_id); + } + + if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && + !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + +resolved: + env_stack_pop_resolved(env, next_type_id, 0); + + return 0; +} + static void btf_ref_type_log(struct btf_verifier_env *env, const struct btf_type *t) { @@ -494,16 +958,19 @@ static void btf_ref_type_log(struct btf_verifier_env *env, static struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, + .resolve = btf_modifier_resolve, .log_details = btf_ref_type_log, }; static struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, + .resolve = btf_ptr_resolve, .log_details = btf_ref_type_log, }; static struct btf_kind_operations fwd_ops = { .check_meta = btf_ref_type_check_meta, + .resolve = btf_df_resolve, .log_details = btf_ref_type_log, }; @@ -542,6 +1009,61 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return meta_needed; } +static int btf_array_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_array *array = btf_type_array(v->t); + const struct btf_type *elem_type; + u32 elem_type_id = array->type; + struct btf *btf = env->btf; + u32 elem_size; + + elem_type = btf_type_by_id(btf, elem_type_id); + if (btf_type_is_void_or_null(elem_type)) { + btf_verifier_log_type(env, v->t, + "Invalid elem"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, elem_type) && + !env_type_is_resolved(env, elem_type_id)) + return env_stack_push(env, elem_type, elem_type_id); + + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + if (!elem_type) { + btf_verifier_log_type(env, v->t, "Invalid elem"); + return -EINVAL; + } + + if (btf_type_is_int(elem_type)) { + int int_type_data = btf_type_int(elem_type); + u16 nr_bits = BTF_INT_BITS(int_type_data); + u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + + /* Put more restriction on array of int. The int cannot + * be a bit field and it must be either u8/u16/u32/u64. 
+ */ + if (BITS_PER_BYTE_MASKED(nr_bits) || + BTF_INT_OFFSET(int_type_data) || + (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { + btf_verifier_log_type(env, v->t, + "Invalid array of int"); + return -EINVAL; + } + } + + if (array->nelems && elem_size > U32_MAX / array->nelems) { + btf_verifier_log_type(env, v->t, + "Array size overflows U32_MAX"); + return -EINVAL; + } + + env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems); + + return 0; +} + static void btf_array_log(struct btf_verifier_env *env, const struct btf_type *t) { @@ -553,6 +1075,7 @@ static void btf_array_log(struct btf_verifier_env *env, static struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, + .resolve = btf_array_resolve, .log_details = btf_array_log, }; @@ -610,6 +1133,50 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return meta_needed; } +static int btf_struct_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_member *member; + u16 i; + + /* Before continue resolving the next_member, + * ensure the last member is indeed resolved to a + * type with size info. + */ + if (v->next_member) { + const struct btf_member *last_member; + u16 last_member_type_id; + + last_member = btf_type_member(v->t) + v->next_member - 1; + last_member_type_id = last_member->type; + if (WARN_ON_ONCE(!env_type_is_resolved(env, + last_member_type_id))) + return -EINVAL; + } + + for_each_member_from(i, v->next_member, v->t, member) { + u32 member_type_id = member->type; + const struct btf_type *member_type = btf_type_by_id(env->btf, + member_type_id); + + if (btf_type_is_void_or_null(member_type)) { + btf_verifier_log_member(env, v->t, member, + "Invalid member"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, member_type) && + !env_type_is_resolved(env, member_type_id)) { + env_stack_set_next_member(env, i + 1); + return env_stack_push(env, member_type, member_type_id); + } + } + + env_stack_pop_resolved(env, 0, 0); + + return 0; +} + static void btf_struct_log(struct btf_verifier_env *env, const struct btf_type *t) { @@ -618,6 +1185,7 @@ static void btf_struct_log(struct btf_verifier_env *env, static struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, + .resolve = btf_struct_resolve, .log_details = btf_struct_log, }; @@ -671,6 +1239,7 @@ static void btf_enum_log(struct btf_verifier_env *env, static struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, + .resolve = btf_df_resolve, .log_details = btf_enum_log, }; @@ -751,9 +1320,104 @@ static int btf_check_all_metas(struct btf_verifier_env *env) return 0; } +static int btf_resolve(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + const struct resolve_vertex *v; + int err = 0; + + env->resolve_mode = RESOLVE_TBD; + env_stack_push(env, t, type_id); + while (!err && (v = env_stack_peak(env))) { + env->log_type_id = v->type_id; + err = btf_type_ops(v->t)->resolve(env, v); + } + + env->log_type_id = type_id; + if (err == -E2BIG) + btf_verifier_log_type(env, t, + "Exceeded max resolving depth:%u", + MAX_RESOLVE_DEPTH); + else if (err == -EEXIST) + btf_verifier_log_type(env, t, "Loop detected"); + + return err; +} + +static bool btf_resolve_valid(struct btf_verifier_env *env, + const struct btf_type *t, + u32 type_id) +{ + struct btf *btf = env->btf; + + if (!env_type_is_resolved(env, type_id)) + return false; + + if (btf_type_is_struct(t)) + return 
!btf->resolved_ids[type_id] &&
+		       !btf->resolved_sizes[type_id];
+
+	if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) {
+		t = btf_type_id_resolve(btf, &type_id);
+		return t && !btf_type_is_modifier(t);
+	}
+
+	if (btf_type_is_array(t)) {
+		const struct btf_array *array = btf_type_array(t);
+		const struct btf_type *elem_type;
+		u32 elem_type_id = array->type;
+		u32 elem_size;
+
+		elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+		return elem_type && !btf_type_is_modifier(elem_type) &&
+			(array->nelems * elem_size ==
+			 btf->resolved_sizes[type_id]);
+	}
+
+	return false;
+}
+
+static int btf_check_all_types(struct btf_verifier_env *env)
+{
+	struct btf *btf = env->btf;
+	u32 type_id;
+	int err;
+
+	err = env_resolve_init(env);
+	if (err)
+		return err;
+
+	env->phase++;
+	for (type_id = 1; type_id <= btf->nr_types; type_id++) {
+		const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+		env->log_type_id = type_id;
+		if (btf_type_needs_resolve(t) &&
+		    !env_type_is_resolved(env, type_id)) {
+			err = btf_resolve(env, t, type_id);
+			if (err)
+				return err;
+		}
+
+		if (btf_type_needs_resolve(t) &&
+		    !btf_resolve_valid(env, t, type_id)) {
+			btf_verifier_log_type(env, t, "Invalid resolve state");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int btf_parse_type_sec(struct btf_verifier_env *env)
 {
-	return btf_check_all_metas(env);
+	int err;
+
+	err = btf_check_all_metas(env);
+	if (err)
+		return err;
+
+	return btf_check_all_types(env);
 }

 static int btf_parse_str_sec(struct btf_verifier_env *env)

From abfb30d5c017341e121f14b7e0d5adfc8fc3d56b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 18 Apr 2018 15:55:59 -0700
Subject: [PATCH 0233/1640] UPSTREAM: bpf: btf: Check members of struct/union

This patch checks a few things about struct/union members:

1) Each member has a valid size (e.g. a "const void" member is invalid)
2) A member's size (plus the member's offset) does not exceed the
   containing struct's size.
3) The member's offset satisfies the alignment requirement

These checks can only be done after a needs_resolve member's type has
been resolved, so they are performed together in btf_struct_resolve().

Each possible member type (e.g. int, enum, modifier...) implements the
check_member() op, which is called from btf_struct_resolve().
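To make check 2) concrete, here is a minimal standalone sketch of the
containment rule with hypothetical names, at byte granularity only (the
in-kernel check_member() ops work on bit offsets and also cover the
bitfield case):

#include <stdbool.h>
#include <stdint.h>

/* Sketch: a member fits iff its offset lies inside the struct and the
 * remaining space can hold its resolved size. Comparing against the
 * remaining space avoids an offset + size overflow.
 */
static bool member_fits(uint32_t struct_size, uint32_t member_offset,
			uint32_t member_size)
{
	if (member_offset > struct_size)
		return false;
	return struct_size - member_offset >= member_size;
}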
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 18bf266ceeda..4e31249f6c61 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -250,6 +250,10 @@ struct btf_kind_operations { u32 meta_left); int (*resolve)(struct btf_verifier_env *env, const struct resolve_vertex *v); + int (*check_member)(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); }; @@ -760,6 +764,16 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, return size_type; } +static int btf_df_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + btf_verifier_log_basic(env, struct_type, + "Unsupported check_member"); + return -EINVAL; +} + static int btf_df_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { @@ -767,6 +781,44 @@ static int btf_df_resolve(struct btf_verifier_env *env, return -EINVAL; } +static int btf_int_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 int_data = btf_type_int(member_type); + u32 struct_bits_off = member->offset; + u32 struct_size = struct_type->size; + u32 nr_copy_bits; + u32 bytes_offset; + + if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) { + btf_verifier_log_member(env, struct_type, member, + "bits_offset exceeds U32_MAX"); + return -EINVAL; + } + + struct_bits_off += BTF_INT_OFFSET(int_data); + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + nr_copy_bits = BTF_INT_BITS(int_data) + + BITS_PER_BYTE_MASKED(struct_bits_off); + + if (nr_copy_bits > BITS_PER_U64) { + btf_verifier_log_member(env, struct_type, member, + "nr_copy_bits exceeds 64"); + return -EINVAL; + } + + if (struct_size < bytes_offset || + struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + static s32 btf_int_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -830,9 +882,61 @@ static void btf_int_log(struct btf_verifier_env *env, static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, .resolve = btf_df_resolve, + .check_member = btf_int_check_member, .log_details = btf_int_log, }; +static int btf_modifier_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + const struct btf_type *resolved_type; + u32 resolved_type_id = member->type; + struct btf_member resolved_member; + struct btf *btf = env->btf; + + resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); + if (!resolved_type) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member"); + return -EINVAL; + } + + resolved_member = *member; + resolved_member.type = resolved_type_id; + + return btf_type_ops(resolved_type)->check_member(env, struct_type, + &resolved_member, + resolved_type); +} + +static int btf_ptr_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member 
*member, + const struct btf_type *member_type) +{ + u32 struct_size, struct_bits_off, bytes_offset; + + struct_size = struct_type->size; + struct_bits_off = member->offset; + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + if (struct_size - bytes_offset < sizeof(void *)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + static int btf_ref_type_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -959,21 +1063,53 @@ static void btf_ref_type_log(struct btf_verifier_env *env, static struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_modifier_resolve, + .check_member = btf_modifier_check_member, .log_details = btf_ref_type_log, }; static struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_ptr_resolve, + .check_member = btf_ptr_check_member, .log_details = btf_ref_type_log, }; static struct btf_kind_operations fwd_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_df_resolve, + .check_member = btf_df_check_member, .log_details = btf_ref_type_log, }; +static int btf_array_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off = member->offset; + u32 struct_size, bytes_offset; + u32 array_type_id, array_size; + struct btf *btf = env->btf; + + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + array_type_id = member->type; + btf_type_id_size(btf, &array_type_id, &array_size); + struct_size = struct_type->size; + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + if (struct_size - bytes_offset < array_size) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + static s32 btf_array_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -1076,9 +1212,35 @@ static void btf_array_log(struct btf_verifier_env *env, static struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, + .check_member = btf_array_check_member, .log_details = btf_array_log, }; +static int btf_struct_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off = member->offset; + u32 struct_size, bytes_offset; + + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + struct_size = struct_type->size; + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + if (struct_size - bytes_offset < member_type->size) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + static s32 btf_struct_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -1137,6 +1299,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_member *member; + int err; u16 i; /* Before continue resolving the next_member, @@ -1144,6 +1307,7 @@ 
static int btf_struct_resolve(struct btf_verifier_env *env, * type with size info. */ if (v->next_member) { + const struct btf_type *last_member_type; const struct btf_member *last_member; u16 last_member_type_id; @@ -1152,6 +1316,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env, if (WARN_ON_ONCE(!env_type_is_resolved(env, last_member_type_id))) return -EINVAL; + + last_member_type = btf_type_by_id(env->btf, + last_member_type_id); + err = btf_type_ops(last_member_type)->check_member(env, v->t, + last_member, + last_member_type); + if (err) + return err; } for_each_member_from(i, v->next_member, v->t, member) { @@ -1170,6 +1342,12 @@ static int btf_struct_resolve(struct btf_verifier_env *env, env_stack_set_next_member(env, i + 1); return env_stack_push(env, member_type, member_type_id); } + + err = btf_type_ops(member_type)->check_member(env, v->t, + member, + member_type); + if (err) + return err; } env_stack_pop_resolved(env, 0, 0); @@ -1186,9 +1364,35 @@ static void btf_struct_log(struct btf_verifier_env *env, static struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, + .check_member = btf_struct_check_member, .log_details = btf_struct_log, }; +static int btf_enum_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off = member->offset; + u32 struct_size, bytes_offset; + + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + struct_size = struct_type->size; + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + if (struct_size - bytes_offset < sizeof(int)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + static s32 btf_enum_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -1240,6 +1444,7 @@ static void btf_enum_log(struct btf_verifier_env *env, static struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, + .check_member = btf_enum_check_member, .log_details = btf_enum_log, }; From 6ec355176ca7a769a3265d858cd281b40817fb83 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:56:00 -0700 Subject: [PATCH 0234/1640] UPSTREAM: bpf: btf: Add pretty print capability for data with BTF type info This patch adds pretty print capability for data with BTF type info. The current usage is to allow pretty print for a BPF map. The next few patches will allow a read() on a pinned map with BTF type info for its key and value. This patch uses the seq_printf() infra. 
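For the bitfield case, the printer first has to extract an arbitrary
bit range before anything can be printed. A standalone model of that
step (a sketch only, assuming a little-endian host and
nr_bits + bits_offset <= 64, which btf_int_check_member() enforces for
BTF members):

#include <stdint.h>
#include <string.h>

/* Model of reading an int member that occupies nr_bits starting
 * bits_offset bits into data[]; data already points at the first
 * byte that contains the field, so bits_offset < 8.
 */
static uint64_t extract_bits(const uint8_t *data, uint8_t bits_offset,
			     uint8_t nr_bits)
{
	uint16_t nr_copy_bits = nr_bits + bits_offset;
	uint16_t nr_copy_bytes = (nr_copy_bits + 7) / 8;
	uint64_t num = 0;

	memcpy(&num, data, nr_copy_bytes);	/* at most 8 bytes */
	num >>= bits_offset;			/* drop the leading bits */
	if (nr_bits < 64)
		num &= (1ULL << nr_bits) - 1;	/* drop the trailing bits */
	return num;
}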
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 2 + kernel/bpf/btf.c | 198 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+) diff --git a/include/linux/btf.h b/include/linux/btf.h index f14b60368753..d8bdab0280ba 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -33,5 +33,7 @@ struct btf_type; const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size); +void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m); #endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4e31249f6c61..10ee41589da2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -256,6 +257,9 @@ struct btf_kind_operations { const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); + void (*seq_show)(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct seq_file *m); }; static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; @@ -781,6 +785,13 @@ static int btf_df_resolve(struct btf_verifier_env *env, return -EINVAL; } +static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct seq_file *m) +{ + seq_printf(m, "", BTF_INFO_KIND(t->info)); +} + static int btf_int_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, @@ -879,11 +890,96 @@ static void btf_int_log(struct btf_verifier_env *env, btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } +static void btf_int_bits_seq_show(const struct btf *btf, + const struct btf_type *t, + void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u16 nr_bits = BTF_INT_BITS(int_data); + u16 total_bits_offset; + u16 nr_copy_bytes; + u16 nr_copy_bits; + u8 nr_upper_bits; + union { + u64 u64_num; + u8 u8_nums[8]; + } print_num; + + total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); + data += BITS_ROUNDDOWN_BYTES(total_bits_offset); + bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + nr_copy_bits = nr_bits + bits_offset; + nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); + + print_num.u64_num = 0; + memcpy(&print_num.u64_num, data, nr_copy_bytes); + + /* Ditch the higher order bits */ + nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits); + if (nr_upper_bits) { + /* We need to mask out some bits of the upper byte. 
*/ + u8 mask = (1 << nr_upper_bits) - 1; + + print_num.u8_nums[nr_copy_bytes - 1] &= mask; + } + + print_num.u64_num >>= bits_offset; + + seq_printf(m, "0x%llx", print_num.u64_num); +} + +static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u8 encoding = BTF_INT_ENCODING(int_data); + bool sign = encoding & BTF_INT_SIGNED; + u32 nr_bits = BTF_INT_BITS(int_data); + + if (bits_offset || BTF_INT_OFFSET(int_data) || + BITS_PER_BYTE_MASKED(nr_bits)) { + btf_int_bits_seq_show(btf, t, data, bits_offset, m); + return; + } + + switch (nr_bits) { + case 64: + if (sign) + seq_printf(m, "%lld", *(s64 *)data); + else + seq_printf(m, "%llu", *(u64 *)data); + break; + case 32: + if (sign) + seq_printf(m, "%d", *(s32 *)data); + else + seq_printf(m, "%u", *(u32 *)data); + break; + case 16: + if (sign) + seq_printf(m, "%d", *(s16 *)data); + else + seq_printf(m, "%u", *(u16 *)data); + break; + case 8: + if (sign) + seq_printf(m, "%d", *(s8 *)data); + else + seq_printf(m, "%u", *(u8 *)data); + break; + default: + btf_int_bits_seq_show(btf, t, data, bits_offset, m); + } +} + static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, .resolve = btf_df_resolve, .check_member = btf_int_check_member, .log_details = btf_int_log, + .seq_show = btf_int_seq_show, }; static int btf_modifier_check_member(struct btf_verifier_env *env, @@ -1054,6 +1150,24 @@ resolved: return 0; } +static void btf_modifier_seq_show(const struct btf *btf, + const struct btf_type *t, + u32 type_id, void *data, + u8 bits_offset, struct seq_file *m) +{ + t = btf_type_id_resolve(btf, &type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +} + +static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + /* It is a hashed value */ + seq_printf(m, "%p", *(void **)data); +} + static void btf_ref_type_log(struct btf_verifier_env *env, const struct btf_type *t) { @@ -1065,6 +1179,7 @@ static struct btf_kind_operations modifier_ops = { .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, .log_details = btf_ref_type_log, + .seq_show = btf_modifier_seq_show, }; static struct btf_kind_operations ptr_ops = { @@ -1072,6 +1187,7 @@ static struct btf_kind_operations ptr_ops = { .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, .log_details = btf_ref_type_log, + .seq_show = btf_ptr_seq_show, }; static struct btf_kind_operations fwd_ops = { @@ -1079,6 +1195,7 @@ static struct btf_kind_operations fwd_ops = { .resolve = btf_df_resolve, .check_member = btf_df_check_member, .log_details = btf_ref_type_log, + .seq_show = btf_df_seq_show, }; static int btf_array_check_member(struct btf_verifier_env *env, @@ -1209,11 +1326,36 @@ static void btf_array_log(struct btf_verifier_env *env, array->type, array->index_type, array->nelems); } +static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_array *array = btf_type_array(t); + const struct btf_kind_operations *elem_ops; + const struct btf_type *elem_type; + u32 i, elem_size, elem_type_id; + + elem_type_id = array->type; + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + elem_ops = btf_type_ops(elem_type); + seq_puts(m, "["); + for (i = 0; i < array->nelems; i++) { + if (i) + seq_puts(m, ","); + 
+ elem_ops->seq_show(btf, elem_type, elem_type_id, data, + bits_offset, m); + data += elem_size; + } + seq_puts(m, "]"); +} + static struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, .log_details = btf_array_log, + .seq_show = btf_array_seq_show, }; static int btf_struct_check_member(struct btf_verifier_env *env, @@ -1361,11 +1503,39 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ","; + const struct btf_member *member; + u32 i; + + seq_puts(m, "{"); + for_each_member(i, t, member) { + const struct btf_type *member_type = btf_type_by_id(btf, + member->type); + u32 member_offset = member->offset; + u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset); + const struct btf_kind_operations *ops; + + if (i) + seq_puts(m, seq); + + ops = btf_type_ops(member_type); + ops->seq_show(btf, member_type, member->type, + data + bytes_offset, bits8_offset, m); + } + seq_puts(m, "}"); +} + static struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, .log_details = btf_struct_log, + .seq_show = btf_struct_seq_show, }; static int btf_enum_check_member(struct btf_verifier_env *env, @@ -1441,11 +1611,31 @@ static void btf_enum_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_enum *enums = btf_type_enum(t); + u32 i, nr_enums = btf_type_vlen(t); + int v = *(int *)data; + + for (i = 0; i < nr_enums; i++) { + if (v == enums[i].val) { + seq_printf(m, "%s", + btf_name_by_offset(btf, enums[i].name)); + return; + } + } + + seq_printf(m, "%d", v); +} + static struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, .log_details = btf_enum_log, + .seq_show = btf_enum_seq_show, }; static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { @@ -1782,3 +1972,11 @@ errout: btf_free(btf); return ERR_PTR(err); } + +void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m) +{ + const struct btf_type *t = btf_type_by_id(btf, type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); +} From 250623c0a426b190c9e6d474fd08acef13291c9c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:56:01 -0700 Subject: [PATCH 0235/1640] UPSTREAM: bpf: btf: Add BPF_BTF_LOAD command This patch adds a BPF_BTF_LOAD command which 1) loads and verifies the BTF (implemented in earlier patches) 2) returns a BTF fd to userspace. In the next patch, the BTF fd can be specified during BPF_MAP_CREATE. It currently limits to CAP_SYS_ADMIN. 
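As a rough sketch of the userspace side (a raw bpf(2) call with no
helper library; the attr field names are the ones added to union
bpf_attr above, so this assumes uapi headers that already contain this
patch):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: load raw BTF bytes and get back a BTF fd.
 * Error handling trimmed for brevity.
 */
static int btf_load(const void *btf_data, __u32 btf_size,
		    char *log_buf, __u32 log_size)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.btf = (__u64)(unsigned long)btf_data;
	attr.btf_size = btf_size;
	attr.btf_log_buf = (__u64)(unsigned long)log_buf;
	attr.btf_log_size = log_size;
	attr.btf_log_level = log_buf ? 1 : 0;

	return syscall(__NR_bpf, BPF_BTF_LOAD, &attr, sizeof(attr));
}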
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 4 +++ include/uapi/linux/bpf.h | 9 ++++++ kernel/bpf/btf.c | 67 ++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 17 ++++++++++ 4 files changed, 97 insertions(+) diff --git a/include/linux/btf.h b/include/linux/btf.h index d8bdab0280ba..a7c7072535ea 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -8,7 +8,11 @@ struct btf; struct btf_type; +union bpf_attr; +void btf_put(struct btf *btf); +int btf_new_fd(const union bpf_attr *attr); +struct btf *btf_get_by_fd(int fd); /* Figure out the size of a type_id. If type_id is a modifier * (e.g. const), it will be resolved to find out the type with size. * diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4b762f4d5cb6..6ebbb93015c3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -95,6 +95,7 @@ enum bpf_cmd { BPF_OBJ_GET_INFO_BY_FD, BPF_PROG_QUERY, BPF_RAW_TRACEPOINT_OPEN, + BPF_BTF_LOAD, }; enum bpf_map_type { @@ -363,6 +364,14 @@ union bpf_attr { __u64 name; __u32 prog_fd; } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __aligned_u64 btf; + __aligned_u64 btf_log_buf; + __u32 btf_size; + __u32 btf_log_size; + __u32 btf_log_level; + }; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 10ee41589da2..2322340694cf 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -190,6 +192,7 @@ struct btf { u32 nr_types; u32 types_size; u32 data_size; + refcount_t refcnt; }; enum verifier_phase { @@ -604,6 +607,17 @@ static void btf_free(struct btf *btf) kfree(btf); } +static void btf_get(struct btf *btf) +{ + refcount_inc(&btf->refcnt); +} + +void btf_put(struct btf *btf) +{ + if (btf && refcount_dec_and_test(&btf->refcnt)) + btf_free(btf); +} + static int env_resolve_init(struct btf_verifier_env *env) { struct btf *btf = env->btf; @@ -1963,6 +1977,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, if (!err) { btf_verifier_env_free(env); + btf_get(btf); return btf; } @@ -1980,3 +1995,55 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); } + +static int btf_release(struct inode *inode, struct file *filp) +{ + btf_put(filp->private_data); + return 0; +} + +static const struct file_operations btf_fops = { + .release = btf_release, +}; + +int btf_new_fd(const union bpf_attr *attr) +{ + struct btf *btf; + int fd; + + btf = btf_parse(u64_to_user_ptr(attr->btf), + attr->btf_size, attr->btf_log_level, + u64_to_user_ptr(attr->btf_log_buf), + attr->btf_log_size); + if (IS_ERR(btf)) + return PTR_ERR(btf); + + fd = anon_inode_getfd("btf", &btf_fops, btf, + O_RDONLY | O_CLOEXEC); + if (fd < 0) + btf_put(btf); + + return fd; +} + +struct btf *btf_get_by_fd(int fd) +{ + struct btf *btf; + struct fd f; + + f = fdget(fd); + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &btf_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + btf = f.file->private_data; + btf_get(btf); + fdput(f); + + return btf; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cebe1539c1c7..339909467c84 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include #include @@ -2048,6 +2049,19 @@ static int bpf_obj_get_info_by_fd(const 
union bpf_attr *attr,
 	return err;
 }

+#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+
+static int bpf_btf_load(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_BTF_LOAD))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btf_new_fd(attr);
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr;
@@ -2129,6 +2143,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_RAW_TRACEPOINT_OPEN:
 		err = bpf_raw_tracepoint_open(&attr);
 		break;
+	case BPF_BTF_LOAD:
+		err = bpf_btf_load(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;

From 355d43a943ca3569ec032614c75781b48a3a608a Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 18 Apr 2018 15:56:02 -0700
Subject: [PATCH 0236/1640] UPSTREAM: bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd

This patch adds BPF_OBJ_GET_INFO_BY_FD support to BTF fds. The original
BTF data, which was used to create the BTF fd during the earlier
BPF_BTF_LOAD call, will be returned.

Userspace is expected to allocate a buffer for info.info and set
info.info_len to the buffer size before calling BPF_OBJ_GET_INFO_BY_FD.
The original BTF data is copied to the userspace buffer (info.info);
only up to the user-specified info.info_len bytes are copied.

The kernel sets info.info_len to the original BTF data size. Userspace
needs to check whether that is bigger than its allocated buffer size;
if it is, userspace should realloc with the kernel-returned
info.info_len and call BPF_OBJ_GET_INFO_BY_FD again.

Signed-off-by: Martin KaFai Lau
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/linux/btf.h  |  5 +++++
 kernel/bpf/btf.c     | 17 ++++++++++++++++-
 kernel/bpf/syscall.c |  2 ++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/include/linux/btf.h b/include/linux/btf.h
index a7c7072535ea..a966dc6d61ee 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -10,9 +10,14 @@ struct btf;
 struct btf_type;
 union bpf_attr;

+extern const struct file_operations btf_fops;
+
 void btf_put(struct btf *btf);
 int btf_new_fd(const union bpf_attr *attr);
 struct btf *btf_get_by_fd(int fd);
+int btf_get_info_by_fd(const struct btf *btf,
+		       const union bpf_attr *attr,
+		       union bpf_attr __user *uattr);

 /* Figure out the size of a type_id. If type_id is a modifier
  * (e.g. const), it will be resolved to find out the type with size.
 *
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2322340694cf..eb56ac760547 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2002,7 +2002,7 @@ static int btf_release(struct inode *inode, struct file *filp)
 	return 0;
 }

-static const struct file_operations btf_fops = {
+const struct file_operations btf_fops = {
 	.release = btf_release,
 };

@@ -2047,3 +2047,18 @@ struct btf *btf_get_by_fd(int fd)

 	return btf;
 }
+
+int btf_get_info_by_fd(const struct btf *btf,
+		       const union bpf_attr *attr,
+		       union bpf_attr __user *uattr)
+{
+	void __user *udata = u64_to_user_ptr(attr->info.info);
+	u32 copy_len = min_t(u32, btf->data_size,
+			     attr->info.info_len);
+
+	if (copy_to_user(udata, btf->data, copy_len) ||
+	    put_user(btf->data_size, &uattr->info.info_len))
+		return -EFAULT;
+
+	return 0;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 339909467c84..91de63dc661c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2042,6 +2042,8 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	else if (f.file->f_op == &bpf_map_fops)
 		err = bpf_map_get_info_by_fd(f.file->private_data, attr,
 					     uattr);
+	else if (f.file->f_op == &btf_fops)
+		err = btf_get_info_by_fd(f.file->private_data, attr, uattr);
 	else
 		err = -EINVAL;

From a469002199e469008f9854692d6d940f8d8abb35 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 18 Apr 2018 15:56:03 -0700
Subject: [PATCH 0237/1640] UPSTREAM: bpf: btf: Add pretty print support to the basic arraymap

This patch adds pretty print support to the basic arraymap. Support
for other bpf maps can be added later.

This patch adds new attrs to the BPF_MAP_CREATE command to allow
specifying the btf_fd, btf_key_id and btf_value_id. BPF_MAP_CREATE can
then associate the BTF with the map if the map being created supports
BTF.

A BTF-supported map needs to implement two new map ops,
map_seq_show_elem() and map_check_btf(). This patch implements these
new map ops for the basic arraymap.

It also adds file_operations, bpffs_map_fops, to the pinned map such
that the pinned map can be opened and read. After that, the user has an
intuitive way to do "cat bpffs/pathto/a-pinned-map" instead of getting
an error.

bpffs_map_fops should not be extended further to support other
operations. Other operations (e.g. write/key-lookup...) should be
realized by the userspace tools (e.g. bpftool) through
BPF_OBJ_GET_INFO_BY_FD, the map's lookup/update interface, etc.

Follow-up patches will allow userspace to obtain the BTF from a map fd.

Here is a sample output when reading a pinned arraymap with the
following map value:

struct map_value {
	int count_a;
	int count_b;
};

cat /sys/fs/bpf/pinned_array_map:

0: {1,2}
1: {3,4}
2: {5,6}
...
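A sketch of the matching map-creation call under the same assumptions
as the earlier BPF_BTF_LOAD sketch (btf_fd comes from a prior
BPF_BTF_LOAD; the two type ids are whatever ids the loaded BTF assigned
to u32 and struct map_value):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: create an arraymap whose key/value types are described by
 * BTF, using the btf_fd/btf_key_id/btf_value_id attrs added above.
 */
static int create_btf_array(int btf_fd, __u32 key_type_id,
			    __u32 value_type_id, __u32 value_size,
			    __u32 max_entries)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_ARRAY;
	attr.key_size = 4;	/* arraymap keys must be a 4-byte u32 */
	attr.value_size = value_size;
	attr.max_entries = max_entries;
	attr.btf_fd = btf_fd;
	attr.btf_key_id = key_type_id;
	attr.btf_value_id = value_type_id;

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

Pinning the returned fd with BPF_OBJ_PIN is what makes the "cat" shown
above possible.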
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 20 ++++- include/uapi/linux/bpf.h | 3 + kernel/bpf/arraymap.c | 50 +++++++++++++ kernel/bpf/inode.c | 156 ++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 32 +++++++- 5 files changed, 254 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1e3d7d2712fe..0a3f2eb225ce 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -22,6 +22,8 @@ struct perf_event; struct bpf_prog; struct bpf_map; struct sock; +struct seq_file; +struct btf; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { @@ -49,10 +51,14 @@ struct bpf_map_ops { void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); + void (*map_seq_show_elem)(struct bpf_map *map, void *key, + struct seq_file *m); + int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, + u32 key_type_id, u32 value_type_id); }; struct bpf_map { - /* 1st cacheline with read-mostly members of which some + /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). */ const struct bpf_map_ops *ops ____cacheline_aligned; @@ -68,10 +74,13 @@ struct bpf_map { u32 pages; u32 id; int numa_node; + u32 btf_key_id; + u32 btf_value_id; + struct btf *btf; bool unpriv_array; - /* 7 bytes hole */ + /* 55 bytes hole */ - /* 2nd cacheline with misc members to avoid false sharing + /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ struct user_struct *user ____cacheline_aligned; @@ -106,6 +115,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) return container_of(map, struct bpf_offloaded_map, map); } +static inline bool bpf_map_support_seq_show(const struct bpf_map *map) +{ + return map->ops->map_seq_show_elem && map->ops->map_check_btf; +} + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6ebbb93015c3..8697581ed1da 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -280,6 +280,9 @@ union bpf_attr { */ char map_name[BPF_OBJ_NAME_LEN]; __u32 map_ifindex; /* ifindex of netdev to create on */ + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_id; /* BTF type_id of the key */ + __u32 btf_value_id; /* BTF type_id of the value */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d4b8ee947670..7fa9205844d7 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -11,11 +11,13 @@ * General Public License for more details. 
*/ #include +#include #include #include #include #include #include +#include #include "map_in_map.h" @@ -336,6 +338,52 @@ static void array_map_free(struct bpf_map *map) bpf_map_area_free(array); } +static void array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + rcu_read_lock(); + + value = array_map_lookup_elem(map, key); + if (!value) { + rcu_read_unlock(); + return; + } + + seq_printf(m, "%u: ", *(u32 *)key); + btf_type_seq_show(map->btf, map->btf_value_id, value, m); + seq_puts(m, "\n"); + + rcu_read_unlock(); +} + +static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + u32 int_data; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; + + int_data = *(u32 *)(key_type + 1); + /* bpf array can only take a u32 key. This check makes + * sure that the btf matches the attr used during map_create. + */ + if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || + BTF_INT_OFFSET(int_data)) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size > map->value_size) + return -EINVAL; + + return 0; +} + const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -345,6 +393,8 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_seq_show_elem = array_map_seq_show_elem, + .map_check_btf = array_map_check_btf, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index e98e2702ae4e..107de9ae2715 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,8 +150,154 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } +struct map_iter { + void *key; + bool done; +}; + +static struct map_iter *map_iter(struct seq_file *m) +{ + return m->private; +} + +static struct bpf_map *seq_file_to_map(struct seq_file *m) +{ + return file_inode(m->file)->i_private; +} + +static void map_iter_free(struct map_iter *iter) +{ + if (iter) { + kfree(iter->key); + kfree(iter); + } +} + +static struct map_iter *map_iter_alloc(struct bpf_map *map) +{ + struct map_iter *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN); + if (!iter) + goto error; + + iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN); + if (!iter->key) + goto error; + + return iter; + +error: + map_iter_free(iter); + return NULL; +} + +static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct bpf_map *map = seq_file_to_map(m); + void *key = map_iter(m)->key; + + if (map_iter(m)->done) + return NULL; + + if (unlikely(v == SEQ_START_TOKEN)) + goto done; + + if (map->ops->map_get_next_key(map, key, key)) { + map_iter(m)->done = true; + return NULL; + } + +done: + ++(*pos); + return key; +} + +static void *map_seq_start(struct seq_file *m, loff_t *pos) +{ + if (map_iter(m)->done) + return NULL; + + return *pos ? 
map_iter(m)->key : SEQ_START_TOKEN; +} + +static void map_seq_stop(struct seq_file *m, void *v) +{ +} + +static int map_seq_show(struct seq_file *m, void *v) +{ + struct bpf_map *map = seq_file_to_map(m); + void *key = map_iter(m)->key; + + if (unlikely(v == SEQ_START_TOKEN)) { + seq_puts(m, "# WARNING!! The output is for debug purpose only\n"); + seq_puts(m, "# WARNING!! The output format will change\n"); + } else { + map->ops->map_seq_show_elem(map, key, m); + } + + return 0; +} + +static const struct seq_operations bpffs_map_seq_ops = { + .start = map_seq_start, + .next = map_seq_next, + .show = map_seq_show, + .stop = map_seq_stop, +}; + +static int bpffs_map_open(struct inode *inode, struct file *file) +{ + struct bpf_map *map = inode->i_private; + struct map_iter *iter; + struct seq_file *m; + int err; + + iter = map_iter_alloc(map); + if (!iter) + return -ENOMEM; + + err = seq_open(file, &bpffs_map_seq_ops); + if (err) { + map_iter_free(iter); + return err; + } + + m = file->private_data; + m->private = iter; + + return 0; +} + +static int bpffs_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + map_iter_free(map_iter(m)); + + return seq_release(inode, file); +} + +/* bpffs_map_fops should only implement the basic + * read operation for a BPF map. The purpose is to + * provide a simple user intuitive way to do + * "cat bpffs/pathto/a-pinned-map". + * + * Other operations (e.g. write, lookup...) should be realized by + * the userspace tools (e.g. bpftool) through the + * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update + * interface. + */ +static const struct file_operations bpffs_map_fops = { + .open = bpffs_map_open, + .read = seq_read, + .release = bpffs_map_release, +}; + static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, - const struct inode_operations *iops) + const struct inode_operations *iops, + const struct file_operations *fops) { struct inode *dir = dentry->d_parent->d_inode; struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); @@ -159,6 +305,7 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, return PTR_ERR(inode); inode->i_op = iops; + inode->i_fop = fops; inode->i_private = raw; bpf_dentry_finalize(dentry, inode, dir); @@ -167,12 +314,15 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops); + return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL); } static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops); + struct bpf_map *map = arg; + + return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, + map->btf ? 
&bpffs_map_fops : NULL); } static struct dentry * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 91de63dc661c..04460d07e5f7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -252,6 +253,7 @@ static void bpf_map_free_deferred(struct work_struct *work) bpf_map_uncharge_memlock(map); security_bpf_map_free(map); + btf_put(map->btf); /* implementation dependent freeing */ map->ops->map_free(map); } @@ -417,7 +419,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD map_ifindex +#define BPF_MAP_CREATE_LAST_FIELD btf_value_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -451,6 +453,33 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); + if (bpf_map_support_seq_show(map) && + (attr->btf_key_id || attr->btf_value_id)) { + struct btf *btf; + + if (!attr->btf_key_id || !attr->btf_value_id) { + err = -EINVAL; + goto free_map_nouncharge; + } + + btf = btf_get_by_fd(attr->btf_fd); + if (IS_ERR(btf)) { + err = PTR_ERR(btf); + goto free_map_nouncharge; + } + + err = map->ops->map_check_btf(map, btf, attr->btf_key_id, + attr->btf_value_id); + if (err) { + btf_put(btf); + goto free_map_nouncharge; + } + + map->btf = btf; + map->btf_key_id = attr->btf_key_id; + map->btf_value_id = attr->btf_value_id; + } + err = security_bpf_map_alloc(map); if (err) goto free_map_nouncharge; @@ -483,6 +512,7 @@ free_map: free_map_sec: security_bpf_map_free(map); free_map_nouncharge: + btf_put(map->btf); map->ops->map_free(map); return err; } From 3032ffc9465e1eaf558c30a27ee5d6f3e4f81302 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 20 Apr 2018 18:16:30 +0200 Subject: [PATCH 0238/1640] UPSTREAM: bpf: sockmap remove dead check Remove dead code that bails on `attr->value_size > KMALLOC_MAX_SIZE` - the previous check already bails on `attr->value_size != 4`. Signed-off-by: Jann Horn Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 83261ce7e9e9..28d0d06ff28a 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1445,9 +1445,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - if (attr->value_size > KMALLOC_MAX_SIZE) - return ERR_PTR(-E2BIG); - err = bpf_tcp_ulp_register(); if (err && err != -EEXIST) return ERR_PTR(err); From 6628d2f44067f4a59b432b69f2c9487fd3fc0117 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 21 Apr 2018 09:48:23 -0700 Subject: [PATCH 0239/1640] BACKPORT: bpf: btf: Clean up btf.h in uapi This patch cleans up btf.h in uapi: 1) Rename "name" to "name_off" to better reflect it is an offset to the string section instead of a char array. 
2) Remove unused value BTF_FLAGS_COMPR and BTF_MAGIC_SWAP Suggested-by: Daniel Borkmann Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 8 +++----- kernel/bpf/btf.c | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 74a30b1090df..bcb56ee47014 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -6,9 +6,7 @@ #include #define BTF_MAGIC 0xeB9F -#define BTF_MAGIC_SWAP 0x9FeB #define BTF_VERSION 1 -#define BTF_FLAGS_COMPR 0x01 struct btf_header { __u16 magic; @@ -43,7 +41,7 @@ struct btf_header { #define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) struct btf_type { - __u32 name; + __u32 name_off; /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused @@ -105,7 +103,7 @@ struct btf_type { * info in "struct btf_type"). */ struct btf_enum { - __u32 name; + __u32 name_off; __s32 val; }; @@ -122,7 +120,7 @@ struct btf_array { * "struct btf_type"). */ struct btf_member { - __u32 name; + __u32 name_off; __u32 type; __u32 offset; /* offset in bits */ }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index eb56ac760547..22e1046a1a86 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -473,7 +473,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, btf_kind_str[kind], - btf_name_by_offset(btf, t->name), + btf_name_by_offset(btf, t->name_off), log_details ? " " : ""); if (log_details) @@ -517,7 +517,7 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, btf_verifier_log_type(env, struct_type, NULL); __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", - btf_name_by_offset(btf, member->name), + btf_name_by_offset(btf, member->name_off), member->type, member->offset); if (fmt && *fmt) { @@ -1419,10 +1419,10 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, NULL); for_each_member(i, t, member) { - if (!btf_name_offset_valid(btf, member->name)) { + if (!btf_name_offset_valid(btf, member->name_off)) { btf_verifier_log_member(env, t, member, "Invalid member name_offset:%u", - member->name); + member->name_off); return -EINVAL; } @@ -1605,14 +1605,14 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { - if (!btf_name_offset_valid(btf, enums[i].name)) { + if (!btf_name_offset_valid(btf, enums[i].name_off)) { btf_verifier_log(env, "\tInvalid name_offset:%u", - enums[i].name); + enums[i].name_off); return -EINVAL; } btf_verifier_log(env, "\t%s val=%d\n", - btf_name_by_offset(btf, enums[i].name), + btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -1636,7 +1636,7 @@ static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, for (i = 0; i < nr_enums; i++) { if (v == enums[i].val) { seq_printf(m, "%s", - btf_name_by_offset(btf, enums[i].name)); + btf_name_by_offset(btf, enums[i].name_off)); return; } } @@ -1687,9 +1687,9 @@ static s32 btf_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (!btf_name_offset_valid(env->btf, t->name)) { + if (!btf_name_offset_valid(env->btf, t->name_off)) { btf_verifier_log(env, "[%u] Invalid name_offset:%u", - env->log_type_id, t->name); + env->log_type_id, t->name_off); return -EINVAL; } From 6c88b3ea3ca08307c70c792647a098f9c6ca6031 Mon Sep 17 00:00:00 2001 From: 
Roman Gushchin
Date: Mon, 23 Apr 2018 18:09:21 +0100
Subject: [PATCH 0240/1640] UPSTREAM: bpf: disable and restore preemption in __BPF_PROG_RUN_ARRAY

Running bpf programs requires disabled preemption; however, at least
some* of the BPF_PROG_RUN_ARRAY users do not follow this rule.

To fix this bug, and also to make it not happen in the future, let's
add explicit preemption disabling/re-enabling to the
__BPF_PROG_RUN_ARRAY code.

* for example:
[ 17.624472] RIP: 0010:__cgroup_bpf_run_filter_sk+0x1c4/0x1d0
...
[ 17.640890] inet6_create+0x3eb/0x520
[ 17.641405] __sock_create+0x242/0x340
[ 17.641939] __sys_socket+0x57/0xe0
[ 17.642370] ? trace_hardirqs_off_thunk+0x1a/0x1c
[ 17.642944] SyS_socket+0xa/0x10
[ 17.643357] do_syscall_64+0x79/0x220
[ 17.643879] entry_SYSCALL_64_after_hwframe+0x42/0xb7

Signed-off-by: Roman Gushchin
Cc: Alexei Starovoitov
Cc: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0a3f2eb225ce..47d7f8b2bf52 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -371,6 +371,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 		struct bpf_prog **_prog, *__prog;	\
 		struct bpf_prog_array *_array;		\
 		u32 _ret = 1;				\
+		preempt_disable();			\
 		rcu_read_lock();			\
 		_array = rcu_dereference(array);	\
 		if (unlikely(check_non_null && !_array))\
@@ -382,6 +383,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 		}					\
 _out:							\
 		rcu_read_unlock();			\
+		preempt_enable_no_resched();		\
 		_ret;					\
 	 })

From 7c64006e595027ede3ed93c15a38c07754244d2f Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Mon, 23 Apr 2018 15:39:28 -0700
Subject: [PATCH 0241/1640] UPSTREAM: bpf: sockmap, sk_wait_event needed to handle blocking cases

In the recvmsg handler we need to add a wait event to support the
blocking use cases. Without this, we return zero and may confuse user
applications. In the wait event, any data received on the sk, either
via sk_receive_queue or the psock ingress list, will wake up the sock.
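For illustration only (not part of the patch), a minimal user-space
sketch of the blocking read this enables, assuming sock_fd is a
connected TCP socket that has been added to a sockmap; all names here
are hypothetical:

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/socket.h>

	/* Blocking read on a sockmap TCP socket. Before this patch the
	 * call could return 0 with no data and no peer shutdown; with
	 * the wait event it sleeps until data lands on sk_receive_queue
	 * or on the psock ingress list, or the receive timeout expires.
	 */
	static ssize_t read_verdict_data(int sock_fd, char *buf, size_t len)
	{
		ssize_t n = recv(sock_fd, buf, len, 0); /* no MSG_DONTWAIT */

		if (n < 0)
			perror("recv");
		return n;
	}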
Fixes: fa246693a111 ("bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT")
Signed-off-by: John Fastabend
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/sockmap.c | 44 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 28d0d06ff28a..45bb1e8ecc1e 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -43,6 +43,7 @@
 #include
 #include
 #include
+#include

 #define SOCK_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -732,6 +733,26 @@ out_err:
 	return err;
 }

+static int bpf_wait_data(struct sock *sk,
+			 struct smap_psock *psk, int flags,
+			 long timeo, int *err)
+{
+	int rc;
+
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	rc = sk_wait_event(sk, &timeo,
+			   !list_empty(&psk->ingress) ||
+			   !skb_queue_empty(&sk->sk_receive_queue),
+			   &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	return rc;
+}
+
 static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			   int nonblock, int flags, int *addr_len)
 {
@@ -755,6 +776,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);

 	lock_sock(sk);
+bytes_ready:
 	while (copied != len) {
 		struct scatterlist *sg;
 		struct sk_msg_buff *md;
@@ -809,6 +831,28 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		}
 	}

+	if (!copied) {
+		long timeo;
+		int data;
+		int err = 0;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = bpf_wait_data(sk, psock, flags, timeo, &err);
+
+		if (data) {
+			if (!skb_queue_empty(&sk->sk_receive_queue)) {
+				release_sock(sk);
+				smap_release_sock(psock, sk);
+				copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+				return copied;
+			}
+			goto bytes_ready;
+		}
+
+		if (err)
+			copied = err;
+	}
+
 	release_sock(sk);
 	smap_release_sock(psock, sk);
 	return copied;

From bc345c60dce24801054c4f03a5e3ab600dd6c892 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Mon, 23 Apr 2018 15:39:33 -0700
Subject: [PATCH 0242/1640] UPSTREAM: bpf: sockmap, fix double page_put on ENOMEM error in redirect path

In the case where the socket memory boundary is hit, the redirect path
returns an ENOMEM error. However, before checking for this condition
the redirect scatterlist buffer is set up with a valid page and length.

This is never unwound, so when the buffers are released later in the
error path we do a put_page() and clear the scatterlist fields. But,
because the initial error happens before completing the scatterlist
buffer, we end up with both the original buffer and the redirect
buffer pointing to the same page, resulting in duplicate put_page()
calls.

To fix this, simply move the initial configuration of the redirect
scatterlist buffer below the sock memory check.

Found this while running TCP_STREAM test with netperf using Cilium.
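Reduced to a standalone sketch (illustrative types and names, not the
kernel code), the ordering problem and the fix look like this:

	#include <errno.h>

	struct page_ref { int refcnt; };	/* stands in for struct page */
	struct buf { struct page_ref *page; int len; };

	static int charge_ok(int budget) { return budget > 0; }

	static int setup_redirect(struct buf *dst, struct buf *src, int budget)
	{
		/* Buggy order (before this patch):
		 *
		 *	dst->page = src->page;		alias the page first...
		 *	if (!charge_ok(budget))
		 *		return -ENOMEM;		...then fail, so the error
		 *					path drops the same page
		 *					reference once per buffer.
		 *
		 * Fixed order: fail before dst ever aliases the page.
		 */
		if (!charge_ok(budget))
			return -ENOMEM;
		dst->page = src->page;
		dst->len = src->len;
		return 0;
	}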
Fixes: fa246693a111 ("bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT")
Signed-off-by: John Fastabend
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/sockmap.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 45bb1e8ecc1e..99f7be8a065f 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -524,8 +524,6 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
 	i = md->sg_start;

 	do {
-		r->sg_data[i] = md->sg_data[i];
-
 		size = (apply && apply_bytes < md->sg_data[i].length) ?
 			apply_bytes : md->sg_data[i].length;

@@ -536,6 +534,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
 		}

 		sk_mem_charge(sk, size);
+		r->sg_data[i] = md->sg_data[i];
 		r->sg_data[i].length = size;
 		md->sg_data[i].length -= size;
 		md->sg_data[i].offset += size;

From 5c901e4070b3e9a51accaeb0162487071d6d07e5 Mon Sep 17 00:00:00 2001
From: Eyal Birger
Date: Tue, 24 Apr 2018 17:50:29 +0300
Subject: [PATCH 0243/1640] BACKPORT: bpf: add helper for getting xfrm states

This commit introduces a helper which allows fetching xfrm state
parameters by eBPF programs attached to TC.

Prototype:
bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags)

skb: pointer to skb
index: the index in the skb xfrm_state secpath array
xfrm_state: pointer to 'struct bpf_xfrm_state'
size: size of 'struct bpf_xfrm_state'
flags: reserved for future extensions

The helper returns 0 on success, and a non-zero value if no xfrm state
is found at the given index, or if none exists at all.

struct bpf_xfrm_state currently includes the SPI, peer IPv4/IPv6
address and the reqid; it can be further extended by adding elements
to its end - indicating the populated fields by the 'size' argument -
keeping backwards compatibility.

Typical usage:

struct bpf_xfrm_state x = {};
bpf_skb_get_xfrm_state(skb, 0, &x, sizeof(x), 0);
...

Signed-off-by: Eyal Birger
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h | 22 ++++++++++++++++++
 net/core/filter.c        | 48 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8697581ed1da..bd13cfdfc113 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -776,6 +776,15 @@ union bpf_attr {
 *	@delta: A negative integer to be added to xdp_md.data_end
 *	Return: 0 on success or negative on error
 *
+ * int bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags)
+ *	retrieve XFRM state
+ *	@skb: pointer to skb
+ *	@index: index of the xfrm state in the secpath
+ *	@key: pointer to 'struct bpf_xfrm_state'
+ *	@size: size of 'struct bpf_xfrm_state'
+ *	@flags: room for future extensions
+ *	Return: 0 on success or negative error
+ *
 * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
 *	Description
 *		This helper is similar to **bpf_skb_load_bytes**\ () in that
@@ -1038,6 +1047,19 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };

+/* user accessible mirror of in-kernel xfrm_state.
+ * new fields can only be added to the end of this structure
+ */
+struct bpf_xfrm_state {
+	__u32 reqid;
+	__u32 spi;	/* Stored in network byte order */
+	__u16 family;
+	union {
+		__u32 remote_ipv4;	/* Stored in network byte order */
+		__u32 remote_ipv6[4];	/* Stored in network byte order */
+	};
+};
+
 /* Generic BPF return codes which all BPF program types may support.
 * The values are binary compatible with their TC_ACT_* counter-part to
 * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT

diff --git a/net/core/filter.c b/net/core/filter.c
index 4a666e68f235..2d561f3fa575 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -57,6 +57,7 @@
 #include
 #include
 #include
+#include
 #include

 /**
@@ -3798,6 +3799,49 @@ static const struct bpf_func_proto bpf_bind_proto = {
 	.arg3_type	= ARG_CONST_SIZE,
 };

+#ifdef CONFIG_XFRM
+BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
+	   struct bpf_xfrm_state *, to, u32, size, u64, flags)
+{
+	const struct sec_path *sp = skb_sec_path(skb);
+	const struct xfrm_state *x;
+
+	if (!sp || unlikely(index >= sp->len || flags))
+		goto err_clear;
+
+	x = sp->xvec[index];
+
+	if (unlikely(size != sizeof(struct bpf_xfrm_state)))
+		goto err_clear;
+
+	to->reqid = x->props.reqid;
+	to->spi = x->id.spi;
+	to->family = x->props.family;
+	if (to->family == AF_INET6) {
+		memcpy(to->remote_ipv6, x->props.saddr.a6,
+		       sizeof(to->remote_ipv6));
+	} else {
+		to->remote_ipv4 = x->props.saddr.a4;
+	}
+
+	return 0;
+err_clear:
+	memset(to, 0, size);
+	return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
+	.func		= bpf_skb_get_xfrm_state,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_ANYTHING,
+};
+#endif
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3947,6 +3991,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
 		return &bpf_get_socket_uid_proto;
+#ifdef CONFIG_XFRM
+	case BPF_FUNC_skb_get_xfrm_state:
+		return &bpf_skb_get_xfrm_state_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}

From 4f853b4c278f5cc286a322a8b9e783e196bd5c62 Mon Sep 17 00:00:00 2001
From: William Tu
Date: Tue, 24 Apr 2018 23:46:59 -0700
Subject: [PATCH 0244/1640] UPSTREAM: bpf: clear the ip_tunnel_info.

The percpu metadata_dst might carry a stale ip_tunnel_info and cause
incorrect behavior. When mixing tests using ipv4/ipv6 bpf vxlan and
geneve tunnels, the ipv6 tunnel info incorrectly uses ipv4's src ip
addr as its ipv6 src address, because the previous tunnel info is not
cleaned up. The patch zeros the fields in ip_tunnel_info.

Signed-off-by: William Tu
Reported-by: Yifeng Sun
Signed-off-by: Daniel Borkmann
---
 net/core/filter.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 2d561f3fa575..f1f9574a38ac 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3331,6 +3331,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 	skb_dst_set(skb, (struct dst_entry *) md);

 	info = &md->u.tun_info;
+	memset(info, 0, sizeof(*info));
 	info->mode = IP_TUNNEL_INFO_TX;

 	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;

From 58f7bec2c9df8a40142dbf727d8b03b741836d65 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Wed, 25 Apr 2018 19:41:06 +0200
Subject: [PATCH 0245/1640] UPSTREAM: bpf: Add gpl_compatible flag to struct bpf_prog_info

Adding the gpl_compatible flag to struct bpf_prog_info so it can be
dumped via bpf_prog_get_info_by_fd and displayed via bpftool progs
dump.

Alexei noticed a 4-byte hole in struct bpf_prog_info, so we put the
u32 flags field in there, and we can keep adding bit fields in there
without breaking user space.
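As a sketch of why this is ABI-safe (simplified struct tail with
illustrative names; offsets assume a 64-bit ABI):

	#include <stdint.h>

	struct info_before {			/* tail of bpf_prog_info, old */
		uint32_t ifindex;
		/* 4-byte hole: the next field is 8-byte aligned */
		uint64_t netns_dev;
	};

	struct info_after {			/* tail of bpf_prog_info, new */
		uint32_t ifindex;
		uint32_t gpl_compatible:1;	/* fills the former hole */
		uint64_t netns_dev;
	};

	_Static_assert(sizeof(struct info_before) == sizeof(struct info_after),
		       "the new bit field must not change size or offsets");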
Signed-off-by: Jiri Olsa
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h | 1 +
 kernel/bpf/syscall.c     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bd13cfdfc113..4abdd2a266b5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1150,6 +1150,7 @@ struct bpf_prog_info {
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
+	__u32 gpl_compatible:1;
 	__u64 netns_dev;
 	__u64 netns_ino;
 } __attribute__((aligned(8)));
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 04460d07e5f7..b5b3126b598a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1938,6 +1938,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	info.load_time = prog->aux->load_time;
 	info.created_by_uid = from_kuid_munged(current_user_ns(),
 					       prog->aux->user->uid);
+	info.gpl_compatible = prog->gpl_compatible;
 	memcpy(info.tag, prog->tag, sizeof(prog->tag));
 	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));

From b9b47187e2db5b317e64885b5b221a43836f5f4e Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:52 +0100
Subject: [PATCH 0246/1640] BACKPORT: bpf: add script and prepare bpf.h for new helpers documentation

Remove the previous "overview" of eBPF helpers from the user bpf.h
header. Replace it with a comment explaining how to process the new
documentation (to come in following patches) with a Python script to
produce RST, then man page documentation.

Also add the aforementioned Python script under scripts/. It is used
to process include/uapi/linux/bpf.h and to extract helper
descriptions, turning them into an RST document that can further be
processed with rst2man to produce a man page.

The script takes one "--filename " option. If the script is launched
from scripts/ in the kernel root directory, it should be able to find
the location of the header to parse, and "--filename " is then
optional. If it cannot find the file, then the option becomes
mandatory. RST-formatted documentation is printed to standard output.

Typical workflow for producing the final man page would be:

    $ ./scripts/bpf_helpers_doc.py \
        --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
    $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
    $ man /tmp/bpf-helpers.7

Note that the tool kernel-doc cannot be used to document eBPF helpers,
whose signatures are not available directly in the header files
(pre-processor directives are used to produce them at the beginning of
the compilation process). A schematic example of the comment layout
the script expects is shown after the changelog below.

v4:
- Also remove overviews for newly added bpf_xdp_adjust_tail() and
  bpf_skb_get_xfrm_state().
- Remove vague statement about what helpers are restricted to GPL
  programs in "LICENSE" section for man page footer.
- Replace license boilerplate with SPDX tag for Python script.

v3:
- Change license for man page.
- Remove "for safety reasons" from man page header text.
- Change "packets metadata" to "packets" in man page header text.
- Move and fix comment on helpers introducing no overhead.
- Remove "NOTES" section from man page footer.
- Add "LICENSE" section to man page footer.
- Edit description of file include/uapi/linux/bpf.h in man page footer.
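For reference, a schematic doc comment in bpf.h that the script will
parse looks like the following (the helper name is made up; per the
script's regular expressions, "Description" and "Return" are preceded
by one tab, and their bodies by two tabs):

	 * int bpf_example_helper(struct sk_buff *skb, u32 len)
	 * 	Description
	 * 		Free-form description of the helper, possibly
	 * 		spanning several paragraphs separated by blank
	 * 		" *" lines.
	 * 	Return
	 * 		0 on success, or a negative error in case of failure.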
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 447 ++----------------------------------- scripts/bpf_helpers_doc.py | 421 ++++++++++++++++++++++++++++++++++ 2 files changed, 437 insertions(+), 431 deletions(-) create mode 100755 scripts/bpf_helpers_doc.py diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4abdd2a266b5..437f705b7fb7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -377,437 +377,22 @@ union bpf_attr { }; } __attribute__((aligned(8))); -/* BPF helper function descriptions: - * - * void *bpf_map_lookup_elem(&map, &key) - * Return: Map value or NULL - * - * int bpf_map_update_elem(&map, &key, &value, flags) - * Return: 0 on success or negative error - * - * int bpf_map_delete_elem(&map, &key) - * Return: 0 on success or negative error - * - * int bpf_probe_read(void *dst, int size, void *src) - * Return: 0 on success or negative error - * - * u64 bpf_ktime_get_ns(void) - * Return: current ktime - * - * int bpf_trace_printk(const char *fmt, int fmt_size, ...) - * Return: length of buffer written or negative error - * - * u32 bpf_prandom_u32(void) - * Return: random value - * - * u32 bpf_raw_smp_processor_id(void) - * Return: SMP processor ID - * - * int bpf_skb_store_bytes(skb, offset, from, len, flags) - * store bytes into packet - * @skb: pointer to skb - * @offset: offset within packet from skb->mac_header - * @from: pointer where to copy bytes from - * @len: number of bytes to store into packet - * @flags: bit 0 - if true, recompute skb->csum - * other bits - reserved - * Return: 0 on success or negative error - * - * int bpf_l3_csum_replace(skb, offset, from, to, flags) - * recompute IP checksum - * @skb: pointer to skb - * @offset: offset within packet where IP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * other bits - reserved - * Return: 0 on success or negative error - * - * int bpf_l4_csum_replace(skb, offset, from, to, flags) - * recompute TCP/UDP checksum - * @skb: pointer to skb - * @offset: offset within packet where TCP/UDP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * bit 4 - is pseudo header - * other bits - reserved - * Return: 0 on success or negative error - * - * int bpf_tail_call(ctx, prog_array_map, index) - * jump into another BPF program - * @ctx: context pointer passed to next program - * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY - * @index: 32-bit index inside array that selects specific program to run - * Return: 0 on success or negative error - * - * int bpf_clone_redirect(skb, ifindex, flags) - * redirect to another netdev - * @skb: pointer to skb - * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: 0 on success or negative error - * - * u64 bpf_get_current_pid_tgid(void) - * Return: current->tgid << 32 | current->pid - * - * u64 bpf_get_current_uid_gid(void) - * Return: current_gid << 32 | current_uid - * - * int bpf_get_current_comm(char *buf, int size_of_buf) - * stores current->comm into buf - * Return: 0 on success or negative error - * - * u32 bpf_get_cgroup_classid(skb) - * retrieve a proc's classid - * @skb: pointer to skb - * Return: classid if != 0 - * - * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) - * Return: 0 on success or negative error - * - 
* int bpf_skb_vlan_pop(skb) - * Return: 0 on success or negative error - * - * int bpf_skb_get_tunnel_key(skb, key, size, flags) - * int bpf_skb_set_tunnel_key(skb, key, size, flags) - * retrieve or populate tunnel metadata - * @skb: pointer to skb - * @key: pointer to 'struct bpf_tunnel_key' - * @size: size of 'struct bpf_tunnel_key' - * @flags: room for future extensions - * Return: 0 on success or negative error - * - * u64 bpf_perf_event_read(map, flags) - * read perf event counter value - * @map: pointer to perf_event_array map - * @flags: index of event in the map or bitmask flags - * Return: value of perf event counter read or error code - * - * int bpf_redirect(ifindex, flags) - * redirect to another netdev - * @ifindex: ifindex of the net device - * @flags: - * cls_bpf: - * bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * xdp_bpf: - * all bits - reserved - * Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error - * xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error - * int bpf_redirect_map(map, key, flags) - * redirect to endpoint in map - * @map: pointer to dev map - * @key: index in map to lookup - * @flags: -- - * Return: XDP_REDIRECT on success or XDP_ABORT on error - * - * u32 bpf_get_route_realm(skb) - * retrieve a dst's tclassid - * @skb: pointer to skb - * Return: realm if != 0 - * - * int bpf_perf_event_output(ctx, map, flags, data, size) - * output perf raw sample - * @ctx: struct pt_regs* - * @map: pointer to perf_event_array map - * @flags: index of event in the map or bitmask flags - * @data: data on stack to be output as raw data - * @size: size of data - * Return: 0 on success or negative error - * - * int bpf_get_stackid(ctx, map, flags) - * walk user or kernel stack and return id - * @ctx: struct pt_regs* - * @map: pointer to stack_trace map - * @flags: bits 0-7 - numer of stack frames to skip - * bit 8 - collect user stack instead of kernel - * bit 9 - compare stacks by hash only - * bit 10 - if two different stacks hash into the same stackid - * discard old - * other bits - reserved - * Return: >= 0 stackid on success or negative error - * - * s64 bpf_csum_diff(from, from_size, to, to_size, seed) - * calculate csum diff - * @from: raw from buffer - * @from_size: length of from buffer - * @to: raw to buffer - * @to_size: length of to buffer - * @seed: optional seed - * Return: csum result or negative error code - * - * int bpf_skb_get_tunnel_opt(skb, opt, size) - * retrieve tunnel options metadata - * @skb: pointer to skb - * @opt: pointer to raw tunnel option data - * @size: size of @opt - * Return: option size - * - * int bpf_skb_set_tunnel_opt(skb, opt, size) - * populate tunnel options metadata - * @skb: pointer to skb - * @opt: pointer to raw tunnel option data - * @size: size of @opt - * Return: 0 on success or negative error - * - * int bpf_skb_change_proto(skb, proto, flags) - * Change protocol of the skb. Currently supported is v4 -> v6, - * v6 -> v4 transitions. The helper will also resize the skb. eBPF - * program is expected to fill the new headers via skb_store_bytes - * and lX_csum_replace. - * @skb: pointer to skb - * @proto: new skb->protocol type - * @flags: reserved - * Return: 0 on success or negative error - * - * int bpf_skb_change_type(skb, type) - * Change packet type of skb. 
- * @skb: pointer to skb - * @type: new skb->pkt_type type - * Return: 0 on success or negative error - * - * int bpf_skb_under_cgroup(skb, map, index) - * Check cgroup2 membership of skb - * @skb: pointer to skb - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 skb failed the cgroup2 descendant test - * == 1 skb succeeded the cgroup2 descendant test - * < 0 error - * - * u32 bpf_get_hash_recalc(skb) - * Retrieve and possibly recalculate skb->hash. - * @skb: pointer to skb - * Return: hash - * - * u64 bpf_get_current_task(void) - * Returns current task_struct - * Return: current - * - * int bpf_probe_write_user(void *dst, void *src, int len) - * safely attempt to write to a location - * @dst: destination address in userspace - * @src: source address on stack - * @len: number of bytes to copy - * Return: 0 on success or negative error - * - * int bpf_current_task_under_cgroup(map, index) - * Check cgroup2 membership of current task - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 current failed the cgroup2 descendant test - * == 1 current succeeded the cgroup2 descendant test - * < 0 error - * - * int bpf_skb_change_tail(skb, len, flags) - * The helper will resize the skb to the given new size, to be used f.e. - * with control messages. - * @skb: pointer to skb - * @len: new skb length - * @flags: reserved - * Return: 0 on success or negative error - * - * int bpf_skb_pull_data(skb, len) - * The helper will pull in non-linear data in case the skb is non-linear - * and not all of len are part of the linear section. Only needed for - * read/write with direct packet access. - * @skb: pointer to skb - * @len: len to make read/writeable - * Return: 0 on success or negative error - * - * s64 bpf_csum_update(skb, csum) - * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. - * @skb: pointer to skb - * @csum: csum to add - * Return: csum on success or negative error - * - * void bpf_set_hash_invalid(skb) - * Invalidate current skb->hash. - * @skb: pointer to skb - * - * int bpf_get_numa_node_id() - * Return: Id of current NUMA node. - * - * int bpf_skb_change_head() - * Grows headroom of skb and adjusts MAC header offset accordingly. - * Will extends/reallocae as required automatically. - * May change skb data pointer and will thus invalidate any check - * performed for direct packet access. - * @skb: pointer to skb - * @len: length of header to be pushed in front - * @flags: Flags (unused for now) - * Return: 0 on success or negative error - * - * int bpf_xdp_adjust_head(xdp_md, delta) - * Adjust the xdp_md.data by delta - * @xdp_md: pointer to xdp_md - * @delta: An positive/negative integer to be added to xdp_md.data - * Return: 0 on success or negative on error - * - * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) - * Copy a NUL terminated string from unsafe address. In case the string - * length is smaller than size, the target is not padded with further NUL - * bytes. In case the string length is larger than size, just count-1 - * bytes are copied and the last byte is set to NUL. - * @dst: destination address - * @size: maximum number of bytes to copy, including the trailing NUL - * @unsafe_ptr: unsafe address - * Return: - * > 0 length of the string including the trailing NUL on success - * < 0 error - * - * u64 bpf_get_socket_cookie(skb) - * Get the cookie for the socket stored inside sk_buff. 
- * @skb: pointer to skb - * Return: 8 Bytes non-decreasing number on success or 0 if the socket - * field is missing inside sk_buff - * - * u32 bpf_get_socket_uid(skb) - * Get the owner uid of the socket stored inside sk_buff. - * @skb: pointer to skb - * Return: uid of the socket owner on success or overflowuid if failed. - * - * u32 bpf_set_hash(skb, hash) - * Set full skb->hash. - * @skb: pointer to skb - * @hash: hash to set - * - * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) - * Calls setsockopt. Not all opts are available, only those with - * integer optvals plus TCP_CONGESTION. - * Supported levels: SOL_SOCKET and IPPROTO_TCP - * @bpf_socket: pointer to bpf_socket - * @level: SOL_SOCKET or IPPROTO_TCP - * @optname: option name - * @optval: pointer to option value - * @optlen: length of optval in bytes - * Return: 0 or negative error - * - * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen) - * Calls getsockopt. Not all opts are available. - * Supported levels: IPPROTO_TCP - * @bpf_socket: pointer to bpf_socket - * @level: IPPROTO_TCP - * @optname: option name - * @optval: pointer to option value - * @optlen: length of optval in bytes - * Return: 0 or negative error - * - * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags) - * Set callback flags for sock_ops - * @bpf_sock_ops: pointer to bpf_sock_ops_kern struct - * @flags: flags value - * Return: 0 for no error - * -EINVAL if there is no full tcp socket - * bits in flags that are not supported by current kernel - * - * int bpf_skb_adjust_room(skb, len_diff, mode, flags) - * Grow or shrink room in sk_buff. - * @skb: pointer to skb - * @len_diff: (signed) amount of room to grow/shrink - * @mode: operation mode (enum bpf_adj_room_mode) - * @flags: reserved for future use - * Return: 0 on success or negative error code - * - * int bpf_sk_redirect_map(map, key, flags) - * Redirect skb to a sock in map using key as a lookup key for the - * sock in map. - * @map: pointer to sockmap - * @key: key to lookup sock in map - * @flags: reserved for future use - * Return: SK_PASS - * - * int bpf_sock_map_update(skops, map, key, flags) - * @skops: pointer to bpf_sock_ops - * @map: pointer to sockmap to update - * @key: key to insert/update sock in map - * @flags: same flags as map update elem - * - * int bpf_xdp_adjust_meta(xdp_md, delta) - * Adjust the xdp_md.data_meta by delta - * @xdp_md: pointer to xdp_md - * @delta: An positive/negative integer to be added to xdp_md.data_meta - * Return: 0 on success or negative on error - * - * int bpf_perf_event_read_value(map, flags, buf, buf_size) - * read perf event counter value and perf event enabled/running time - * @map: pointer to perf_event_array map - * @flags: index of event in the map or bitmask flags - * @buf: buf to fill - * @buf_size: size of the buf - * Return: 0 on success or negative error code - * - * int bpf_perf_prog_read_value(ctx, buf, buf_size) - * read perf prog attached perf event counter and enabled/running time - * @ctx: pointer to ctx - * @buf: buf to fill - * @buf_size: size of the buf - * Return : 0 on success or negative error code - * - * int bpf_override_return(pt_regs, rc) - * @pt_regs: pointer to struct pt_regs - * @rc: the return value to set - * - * - * int bpf_msg_redirect_map(map, key, flags) - * Redirect msg to a sock in map using key as a lookup key for the - * sock in map. 
- * @map: pointer to sockmap - * @key: key to lookup sock in map - * @flags: reserved for future use - * Return: SK_PASS - * - * int bpf_bind(ctx, addr, addr_len) - * Bind socket to address. Only binding to IP is supported, no port can be - * set in addr. - * @ctx: pointer to context of type bpf_sock_addr - * @addr: pointer to struct sockaddr to bind socket to - * @addr_len: length of sockaddr structure - * Return: 0 on success or negative error code - * - * int bpf_xdp_adjust_tail(xdp_md, delta) - * Adjust the xdp_md.data_end by delta. Only shrinking of packet's - * size is supported. - * @xdp_md: pointer to xdp_md - * @delta: A negative integer to be added to xdp_md.data_end - * Return: 0 on success or negative on error - * - * int bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags) - * retrieve XFRM state - * @skb: pointer to skb - * @index: index of the xfrm state in the secpath - * @key: pointer to 'struct bpf_xfrm_state' - * @size: size of 'struct bpf_xfrm_state' - * @flags: room for future extensions - * Return: 0 on success or negative error - * - * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) - * Description - * This helper is similar to **bpf_skb_load_bytes**\ () in that - * it provides an easy way to load *len* bytes from *offset* - * from the packet associated to *skb*, into the buffer pointed - * by *to*. The difference to **bpf_skb_load_bytes**\ () is that - * a fifth argument *start_header* exists in order to select a - * base offset to start from. *start_header* can be one of: - * - * **BPF_HDR_START_MAC** - * Base offset to load data from is *skb*'s mac header. - * **BPF_HDR_START_NET** - * Base offset to load data from is *skb*'s network header. - * - * In general, "direct packet access" is the preferred method to - * access packet data, however, this helper is in particular useful - * in socket filters where *skb*\ **->data** does not always point - * to the start of the mac header and where "direct packet access" - * is not available. - * - * Return - * 0 on success, or a negative error in case of failure. - * +/* The description below is an attempt at providing documentation to eBPF + * developers about the multiple available eBPF helper functions. It can be + * parsed and used to produce a manual page. The workflow is the following, + * and requires the rst2man utility: + * + * $ ./scripts/bpf_helpers_doc.py \ + * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst + * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 + * $ man /tmp/bpf-helpers.7 + * + * Note that in order to produce this external documentation, some RST + * formatting is used in the descriptions to get "bold" and "italics" in + * manual pages. Also note that the few trailing white spaces are + * intentional, removing them would break paragraphs for rst2man. + * + * Start of BPF helper function descriptions: */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py new file mode 100755 index 000000000000..30ba0fee36e4 --- /dev/null +++ b/scripts/bpf_helpers_doc.py @@ -0,0 +1,421 @@ +#!/usr/bin/python3 +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright (C) 2018 Netronome Systems, Inc. + +# In case user attempts to run with Python 2. 
+from __future__ import print_function + +import argparse +import re +import sys, os + +class NoHelperFound(BaseException): + pass + +class ParsingError(BaseException): + def __init__(self, line='', reader=None): + if reader: + BaseException.__init__(self, + 'Error at file offset %d, parsing line: %s' % + (reader.tell(), line)) + else: + BaseException.__init__(self, 'Error parsing line: %s' % line) + +class Helper(object): + """ + An object representing the description of an eBPF helper function. + @proto: function prototype of the helper function + @desc: textual description of the helper function + @ret: description of the return value of the helper function + """ + def __init__(self, proto='', desc='', ret=''): + self.proto = proto + self.desc = desc + self.ret = ret + + def proto_break_down(self): + """ + Break down helper function protocol into smaller chunks: return type, + name, distincts arguments. + """ + arg_re = re.compile('^((const )?(struct )?(\w+|...))( (\**)(\w+))?$') + res = {} + proto_re = re.compile('^(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$') + + capture = proto_re.match(self.proto) + res['ret_type'] = capture.group(1) + res['ret_star'] = capture.group(2) + res['name'] = capture.group(3) + res['args'] = [] + + args = capture.group(4).split(', ') + for a in args: + capture = arg_re.match(a) + res['args'].append({ + 'type' : capture.group(1), + 'star' : capture.group(6), + 'name' : capture.group(7) + }) + + return res + +class HeaderParser(object): + """ + An object used to parse a file in order to extract the documentation of a + list of eBPF helper functions. All the helpers that can be retrieved are + stored as Helper object, in the self.helpers() array. + @filename: name of file to parse, usually include/uapi/linux/bpf.h in the + kernel tree + """ + def __init__(self, filename): + self.reader = open(filename, 'r') + self.line = '' + self.helpers = [] + + def parse_helper(self): + proto = self.parse_proto() + desc = self.parse_desc() + ret = self.parse_ret() + return Helper(proto=proto, desc=desc, ret=ret) + + def parse_proto(self): + # Argument can be of shape: + # - "void" + # - "type name" + # - "type *name" + # - Same as above, with "const" and/or "struct" in front of type + # - "..." (undefined number of arguments, for bpf_trace_printk()) + # There is at least one term ("void"), and at most five arguments. + p = re.compile('^ \* ((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$') + capture = p.match(self.line) + if not capture: + raise NoHelperFound + self.line = self.reader.readline() + return capture.group(1) + + def parse_desc(self): + p = re.compile('^ \* \tDescription$') + capture = p.match(self.line) + if not capture: + # Helper can have empty description and we might be parsing another + # attribute: return but do not consume. + return '' + # Description can be several lines, some of them possibly empty, and it + # stops when another subsection title is met. + desc = '' + while True: + self.line = self.reader.readline() + if self.line == ' *\n': + desc += '\n' + else: + p = re.compile('^ \* \t\t(.*)') + capture = p.match(self.line) + if capture: + desc += capture.group(1) + '\n' + else: + break + return desc + + def parse_ret(self): + p = re.compile('^ \* \tReturn$') + capture = p.match(self.line) + if not capture: + # Helper can have empty retval and we might be parsing another + # attribute: return but do not consume. 
+ return '' + # Return value description can be several lines, some of them possibly + # empty, and it stops when another subsection title is met. + ret = '' + while True: + self.line = self.reader.readline() + if self.line == ' *\n': + ret += '\n' + else: + p = re.compile('^ \* \t\t(.*)') + capture = p.match(self.line) + if capture: + ret += capture.group(1) + '\n' + else: + break + return ret + + def run(self): + # Advance to start of helper function descriptions. + offset = self.reader.read().find('* Start of BPF helper function descriptions:') + if offset == -1: + raise Exception('Could not find start of eBPF helper descriptions list') + self.reader.seek(offset) + self.reader.readline() + self.reader.readline() + self.line = self.reader.readline() + + while True: + try: + helper = self.parse_helper() + self.helpers.append(helper) + except NoHelperFound: + break + + self.reader.close() + print('Parsed description of %d helper function(s)' % len(self.helpers), + file=sys.stderr) + +############################################################################### + +class Printer(object): + """ + A generic class for printers. Printers should be created with an array of + Helper objects, and implement a way to print them in the desired fashion. + @helpers: array of Helper objects to print to standard output + """ + def __init__(self, helpers): + self.helpers = helpers + + def print_header(self): + pass + + def print_footer(self): + pass + + def print_one(self, helper): + pass + + def print_all(self): + self.print_header() + for helper in self.helpers: + self.print_one(helper) + self.print_footer() + +class PrinterRST(Printer): + """ + A printer for dumping collected information about helpers as a ReStructured + Text page compatible with the rst2man program, which can be used to + generate a manual page for the helpers. + @helpers: array of Helper objects to print to standard output + """ + def print_header(self): + header = '''\ +.. Copyright (C) All BPF authors and contributors from 2014 to present. +.. See git log include/uapi/linux/bpf.h in kernel tree for details. +.. +.. %%%LICENSE_START(VERBATIM) +.. Permission is granted to make and distribute verbatim copies of this +.. manual provided the copyright notice and this permission notice are +.. preserved on all copies. +.. +.. Permission is granted to copy and distribute modified versions of this +.. manual under the conditions for verbatim copying, provided that the +.. entire resulting derived work is distributed under the terms of a +.. permission notice identical to this one. +.. +.. Since the Linux kernel and libraries are constantly changing, this +.. manual page may be incorrect or out-of-date. The author(s) assume no +.. responsibility for errors or omissions, or for damages resulting from +.. the use of the information contained herein. The author(s) may not +.. have taken the same level of care in the production of this manual, +.. which is licensed free of charge, as they might when working +.. professionally. +.. +.. Formatted or processed versions of this manual, if unaccompanied by +.. the source, must acknowledge the copyright and authors of this work. +.. %%%LICENSE_END +.. +.. Please do not edit this file. It was generated from the documentation +.. located in file include/uapi/linux/bpf.h of the Linux kernel sources +.. (helpers description), and from scripts/bpf_helpers_doc.py in the same +.. repository (header and footer). 
+ +=========== +BPF-HELPERS +=========== +------------------------------------------------------------------------------- +list of eBPF helper functions +------------------------------------------------------------------------------- + +:Manual section: 7 + +DESCRIPTION +=========== + +The extended Berkeley Packet Filter (eBPF) subsystem consists in programs +written in a pseudo-assembly language, then attached to one of the several +kernel hooks and run in reaction of specific events. This framework differs +from the older, "classic" BPF (or "cBPF") in several aspects, one of them being +the ability to call special functions (or "helpers") from within a program. +These functions are restricted to a white-list of helpers defined in the +kernel. + +These helpers are used by eBPF programs to interact with the system, or with +the context in which they work. For instance, they can be used to print +debugging messages, to get the time since the system was booted, to interact +with eBPF maps, or to manipulate network packets. Since there are several eBPF +program types, and that they do not run in the same context, each program type +can only call a subset of those helpers. + +Due to eBPF conventions, a helper can not have more than five arguments. + +Internally, eBPF programs call directly into the compiled helper functions +without requiring any foreign-function interface. As a result, calling helpers +introduces no overhead, thus offering excellent performance. + +This document is an attempt to list and document the helpers available to eBPF +developers. They are sorted by chronological order (the oldest helpers in the +kernel at the top). + +HELPERS +======= +''' + print(header) + + def print_footer(self): + footer = ''' +EXAMPLES +======== + +Example usage for most of the eBPF helpers listed in this manual page are +available within the Linux kernel sources, at the following locations: + +* *samples/bpf/* +* *tools/testing/selftests/bpf/* + +LICENSE +======= + +eBPF programs can have an associated license, passed along with the bytecode +instructions to the kernel when the programs are loaded. The format for that +string is identical to the one in use for kernel modules (Dual licenses, such +as "Dual BSD/GPL", may be used). Some helper functions are only accessible to +programs that are compatible with the GNU Privacy License (GPL). + +In order to use such helpers, the eBPF program must be loaded with the correct +license string passed (via **attr**) to the **bpf**\ () system call, and this +generally translates into the C source code of the program containing a line +similar to the following: + +:: + + char ____license[] __attribute__((section("license"), used)) = "GPL"; + +IMPLEMENTATION +============== + +This manual page is an effort to document the existing eBPF helper functions. +But as of this writing, the BPF sub-system is under heavy development. New eBPF +program or map types are added, along with new helper functions. Some helpers +are occasionally made available for additional program types. So in spite of +the efforts of the community, this page might not be up-to-date. If you want to +check by yourself what helper functions exist in your kernel, or what types of +programs they can support, here are some files among the kernel tree that you +may be interested in: + +* *include/uapi/linux/bpf.h* is the main BPF header. 
It contains the full list + of all helper functions, as well as many other BPF definitions including most + of the flags, structs or constants used by the helpers. +* *net/core/filter.c* contains the definition of most network-related helper + functions, and the list of program types from which they can be used. +* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related + helpers. +* *kernel/bpf/verifier.c* contains the functions used to check that valid types + of eBPF maps are used with a given helper function. +* *kernel/bpf/* directory contains other files in which additional helpers are + defined (for cgroups, sockmaps, etc.). + +Compatibility between helper functions and program types can generally be found +in the files where helper functions are defined. Look for the **struct +bpf_func_proto** objects and for functions returning them: these functions +contain a list of helpers that a given program type can call. Note that the +**default:** label of the **switch ... case** used to filter helpers can call +other functions, themselves allowing access to additional helpers. The +requirement for GPL license is also in those **struct bpf_func_proto**. + +Compatibility between helper functions and map types can be found in the +**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*. + +Helper functions that invalidate the checks on **data** and **data_end** +pointers for network processing are listed in function +**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*. + +SEE ALSO +======== + +**bpf**\ (2), +**cgroups**\ (7), +**ip**\ (8), +**perf_event_open**\ (2), +**sendmsg**\ (2), +**socket**\ (7), +**tc-bpf**\ (8)''' + print(footer) + + def print_proto(self, helper): + """ + Format function protocol with bold and italics markers. This makes RST + file less readable, but gives nice results in the manual page. + """ + proto = helper.proto_break_down() + + print('**%s %s%s(' % (proto['ret_type'], + proto['ret_star'].replace('*', '\\*'), + proto['name']), + end='') + + comma = '' + for a in proto['args']: + one_arg = '{}{}'.format(comma, a['type']) + if a['name']: + if a['star']: + one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*')) + else: + one_arg += '** ' + one_arg += '*{}*\\ **'.format(a['name']) + comma = ', ' + print(one_arg, end='') + + print(')**') + + def print_one(self, helper): + self.print_proto(helper) + + if (helper.desc): + print('\tDescription') + # Do not strip all newline characters: formatted code at the end of + # a section must be followed by a blank line. + for line in re.sub('\n$', '', helper.desc, count=1).split('\n'): + print('{}{}'.format('\t\t' if line else '', line)) + + if (helper.ret): + print('\tReturn') + for line in helper.ret.rstrip().split('\n'): + print('{}{}'.format('\t\t' if line else '', line)) + + print('') + +############################################################################### + +# If script is launched from scripts/ from kernel tree and can access +# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse, +# otherwise the --filename argument will be required from the command line. +script = os.path.abspath(sys.argv[0]) +linuxRoot = os.path.dirname(os.path.dirname(script)) +bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h') + +argParser = argparse.ArgumentParser(description=""" +Parse eBPF header file and generate documentation for eBPF helper functions. +The RST-formatted output produced can be turned into a manual page with the +rst2man utility. 
+""") +if (os.path.isfile(bpfh)): + argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h', + default=bpfh) +else: + argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h') +args = argParser.parse_args() + +# Parse file. +headerParser = HeaderParser(args.filename) +headerParser.run() + +# Print formatted output to standard output. +printer = PrinterRST(headerParser.helpers) +printer.print_all() From e131631b6fdefe9dc7b302066e9d6a4f89629369 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 25 Apr 2018 18:16:53 +0100 Subject: [PATCH 0247/1640] UPSTREAM: bpf: add documentation for eBPF helpers (01-11) Add documentation for eBPF helper functions to bpf.h user header file. This documentation can be parsed with the Python script provided in another commit of the patch series, in order to provide a RST document that can later be converted into a man page. The objective is to make the documentation easily understandable and accessible to all eBPF developers, including beginners. This patch contains descriptions for the following helper functions, all written by Alexei: - bpf_map_lookup_elem() - bpf_map_update_elem() - bpf_map_delete_elem() - bpf_probe_read() - bpf_ktime_get_ns() - bpf_trace_printk() - bpf_skb_store_bytes() - bpf_l3_csum_replace() - bpf_l4_csum_replace() - bpf_tail_call() - bpf_clone_redirect() v4: - bpf_map_lookup_elem(): Add "const" qualifier for key. - bpf_map_update_elem(): Add "const" qualifier for key and value. - bpf_map_lookup_elem(): Add "const" qualifier for key. - bpf_skb_store_bytes(): Clarify comment about invalidated verifier checks. - bpf_l3_csum_replace(): Mention L3 instead of just IP, and add a note about bpf_csum_diff(). - bpf_l4_csum_replace(): Mention L4 instead of just TCP/UDP, and add a note about bpf_csum_diff(). - bpf_tail_call(): Bring minor edits to description. - bpf_clone_redirect(): Add a note about the relation with bpf_redirect(). Also clarify comment about invalidated verifier checks. v3: - bpf_map_lookup_elem(): Fix description of restrictions for flags related to the existence of the entry. - bpf_trace_printk(): State that trace_pipe can be configured. Fix return value in case an unknown format specifier is met. Add a note on kernel log notice when the helper is used. Edit example. - bpf_tail_call(): Improve comment on stack inheritance. - bpf_clone_redirect(): Improve description of BPF_F_INGRESS flag. Cc: Alexei Starovoitov Signed-off-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 230 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 437f705b7fb7..a44d57728236 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -393,6 +393,236 @@ union bpf_attr { * intentional, removing them would break paragraphs for rst2man. * * Start of BPF helper function descriptions: + * + * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) + * Description + * Perform a lookup in *map* for an entry associated to *key*. + * Return + * Map value associated to *key*, or **NULL** if no entry was + * found. + * + * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * Description + * Add or update the value of the entry associated to *key* in + * *map* with *value*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. 
+ * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * Flag value **BPF_NOEXIST** cannot be used for maps of types + * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all + * elements always exist), the helper would return an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_delete_elem(struct bpf_map *map, const void *key) + * Description + * Delete entry with *key* from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read(void *dst, u32 size, const void *src) + * Description + * For tracing programs, safely attempt to read *size* bytes from + * address *src* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_ktime_get_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Return + * Current *ktime*. + * + * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * Description + * This helper is a "printk()-like" facility for debugging. It + * prints a message defined by format *fmt* (of size *fmt_size*) + * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if + * available. It can take up to three additional **u64** + * arguments (as an eBPF helpers, the total number of arguments is + * limited to five). + * + * Each time the helper is called, it appends a line to the trace. + * The format of the trace is customizable, and the exact output + * one will get depends on the options set in + * *\/sys/kernel/debug/tracing/trace_options* (see also the + * *README* file under the same directory). However, it usually + * defaults to something like: + * + * :: + * + * telnet-470 [001] .N.. 419421.045894: 0x00000001: + * + * In the above: + * + * * ``telnet`` is the name of the current task. + * * ``470`` is the PID of the current task. + * * ``001`` is the CPU number on which the task is + * running. + * * In ``.N..``, each character refers to a set of + * options (whether irqs are enabled, scheduling + * options, whether hard/softirqs are running, level of + * preempt_disabled respectively). **N** means that + * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** + * are set. + * * ``419421.045894`` is a timestamp. + * * ``0x00000001`` is a fake value used by BPF for the + * instruction pointer register. + * * ```` is the message formatted with + * *fmt*. + * + * The conversion specifiers supported by *fmt* are similar, but + * more limited than for printk(). They are **%d**, **%i**, + * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, + * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size + * of field, padding with zeroes, etc.) is available, and the + * helper will return **-EINVAL** (but print nothing) if it + * encounters an unknown specifier. + * + * Also, note that **bpf_trace_printk**\ () is slow, and should + * only be used for debugging purposes. For this reason, a notice + * bloc (spanning several lines) is printed to kernel logs and + * states that the helper should not be used "for production use" + * the first time this helper is used (or more precisely, when + * **trace_printk**\ () buffers are allocated). For passing values + * to user space, perf events should be preferred. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. 
+ * + * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. *flags* are a combination of + * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the + * checksum for the packet after storing the bytes) and + * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ + * **->swhash** and *skb*\ **->l4hash** to 0). + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * Description + * Recompute the layer 3 (e.g. IP) checksum for the packet + * associated to *skb*. Computation is incremental, so the helper + * must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored in *size*. + * Alternatively, it is possible to store the difference between + * the previous and the new values of the header field in *to*, by + * setting *from* and *size* to 0. For both methods, *offset* + * indicates the location of the IP checksum within the packet. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * Description + * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the + * packet associated to *skb*. Computation is incremental, so the + * helper must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored on the lowest + * four bits of *flags*. Alternatively, it is possible to store + * the difference between the previous and the new values of the + * header field in *to*, by setting *from* and the four lowest + * bits of *flags* to 0. For both methods, *offset* indicates the + * location of the IP checksum within the packet. In addition to + * the size of the field, *flags* can be added (bitwise OR) actual + * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left + * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and + * for updates resulting in a null checksum the value is set to + * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates + * the checksum is to be computed against a pseudo-header. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. 
+ * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * Description + * This special helper is used to trigger a "tail call", or in + * other words, to jump into another eBPF program. The same stack + * frame is used (but values on stack and in registers for the + * caller are not accessible to the callee). This mechanism allows + * for program chaining, either for raising the maximum number of + * available eBPF instructions, or to execute given programs in + * conditional blocks. For security reasons, there is an upper + * limit to the number of successive tail calls that can be + * performed. + * + * Upon call of this helper, the program attempts to jump into a + * program referenced at index *index* in *prog_array_map*, a + * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes + * *ctx*, a pointer to the context. + * + * If the call succeeds, the kernel immediately runs the first + * instruction of the new program. This is not a function call, + * and it never returns to the previous program. If the call + * fails, then the helper has no effect, and the caller continues + * to run its subsequent instructions. A call can fail if the + * destination program for the jump does not exist (i.e. *index* + * is superior to the number of entries in *prog_array_map*), or + * if the maximum number of tail calls has been reached for this + * chain of programs. This limit is defined in the kernel by the + * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), + * which is currently set to 32. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * Description + * Clone and redirect the packet associated to *skb* to another + * net device of index *ifindex*. Both ingress and egress + * interfaces can be used for redirection. The **BPF_F_INGRESS** + * value in *flags* is used to make the distinction (ingress path + * is selected if the flag is present, egress path otherwise). + * This is the only flag supported for now. + * + * In comparison with **bpf_redirect**\ () helper, + * **bpf_clone_redirect**\ () has the associated cost of + * duplicating the packet buffer, but this can be executed out of + * the eBPF program. Conversely, **bpf_redirect**\ () is more + * efficient, but it is handled through an action code where the + * redirection happens only after the eBPF program has returned. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ From ef6b372e5d903581a96fbb0378bb307cb30e40cd Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 25 Apr 2018 18:16:54 +0100 Subject: [PATCH 0248/1640] UPSTREAM: bpf: add documentation for eBPF helpers (12-22) Add documentation for eBPF helper functions to bpf.h user header file. 
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Alexei:

- bpf_get_current_pid_tgid()
- bpf_get_current_uid_gid()
- bpf_get_current_comm()
- bpf_skb_vlan_push()
- bpf_skb_vlan_pop()
- bpf_skb_get_tunnel_key()
- bpf_skb_set_tunnel_key()
- bpf_redirect()
- bpf_perf_event_output()
- bpf_get_stackid()
- bpf_get_current_task()

v4:
- bpf_redirect(): Fix typo: "XDP_ABORT" changed to "XDP_ABORTED". Add
  note on bpf_redirect_map() providing better performance. Replace
  "Save for" with "Except for".
- bpf_skb_vlan_push(): Clarify comment about invalidated verifier
  checks.
- bpf_skb_vlan_pop(): Clarify comment about invalidated verifier checks.
- bpf_skb_get_tunnel_key(): Add notes on tunnel_id, "collect metadata"
  mode, and example tunneling protocols with which it can be used.
- bpf_skb_set_tunnel_key(): Add a reference to the description of
  bpf_skb_get_tunnel_key().
- bpf_perf_event_output(): Specify that, and for what purpose, the
  helper can be used with programs attached to TC and XDP.

v3:
- bpf_skb_get_tunnel_key(): Change and improve description and example.
- bpf_redirect(): Improve description of BPF_F_INGRESS flag.
- bpf_perf_event_output(): Fix first sentence of description. Delete
  wrong statement on context being evaluated as a struct pt_reg. Remove
  the long yet incomplete example.
- bpf_get_stackid(): Add a note about PERF_MAX_STACK_DEPTH being
  configurable.

Cc: Alexei Starovoitov
Signed-off-by: Quentin Monnet
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h | 254 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 254 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a44d57728236..4e83b3a21386 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -623,6 +623,260 @@ union bpf_attr {
 *		direct packet access.
 *	Return
 *		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_get_current_pid_tgid(void)
+ *	Return
+ *		A 64-bit integer containing the current tgid and pid, and
+ *		created as such:
+ *		*current_task*\ **->tgid << 32 \|**
+ *		*current_task*\ **->pid**.
+ *
+ * u64 bpf_get_current_uid_gid(void)
+ *	Return
+ *		A 64-bit integer containing the current GID and UID, and
+ *		created as such: *current_gid* **<< 32 \|** *current_uid*.
+ *
+ * int bpf_get_current_comm(char *buf, u32 size_of_buf)
+ *	Description
+ *		Copy the **comm** attribute of the current task into *buf* of
+ *		*size_of_buf*. The **comm** attribute contains the name of
+ *		the executable (excluding the path) for the current task. The
+ *		*size_of_buf* must be strictly positive. On success, the
+ *		helper makes sure that the *buf* is NUL-terminated. On failure,
+ *		it is filled with zeroes.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+ *	Description
+ *		Push a *vlan_tci* (VLAN tag control information) of protocol
+ *		*vlan_proto* to the packet associated to *skb*, then update
+ *		the checksum. Note that if *vlan_proto* is different from
+ *		**ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
+ *		be **ETH_P_8021Q**.
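+ *
+ *		A minimal usage sketch (assuming a TC classifier, with
+ *		**bpf_htons**\ () provided by *bpf_endian.h*; the VLAN ID 42
+ *		is arbitrary):
+ *
+ *		::
+ *
+ *			if (bpf_skb_vlan_push(skb, bpf_htons(ETH_P_8021Q), 42))
+ *				return TC_ACT_SHOT;	// drop on failure
+ *			return TC_ACT_OK;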
+ *
+ *		A call to this helper is susceptible to change the underlying
+ *		packet buffer. Therefore, at load time, all checks on pointers
+ *		previously done by the verifier are invalidated and must be
+ *		performed again, if the helper is used in combination with
+ *		direct packet access.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_vlan_pop(struct sk_buff *skb)
+ *	Description
+ *		Pop a VLAN header from the packet associated to *skb*.
+ *
+ *		A call to this helper is susceptible to change the underlying
+ *		packet buffer. Therefore, at load time, all checks on pointers
+ *		previously done by the verifier are invalidated and must be
+ *		performed again, if the helper is used in combination with
+ *		direct packet access.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ *	Description
+ *		Get tunnel metadata. This helper takes a pointer *key* to an
+ *		empty **struct bpf_tunnel_key** of **size**, that will be
+ *		filled with tunnel metadata for the packet associated to *skb*.
+ *		The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
+ *		indicates that the tunnel is based on IPv6 protocol instead of
+ *		IPv4.
+ *
+ *		The **struct bpf_tunnel_key** is an object that generalizes the
+ *		principal parameters used by various tunneling protocols into a
+ *		single struct. This way, it can be used to easily make a
+ *		decision based on the contents of the encapsulation header,
+ *		"summarized" in this struct. In particular, it holds the IP
+ *		address of the remote end (IPv4 or IPv6, depending on the case)
+ *		in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also,
+ *		this struct exposes the *key*\ **->tunnel_id**, which is
+ *		generally mapped to a VNI (Virtual Network Identifier), making
+ *		it programmable together with the **bpf_skb_set_tunnel_key**\
+ *		() helper.
+ *
+ *		Let's imagine that the following code is part of a program
+ *		attached to the TC ingress interface, on one end of a GRE
+ *		tunnel, and is supposed to filter out all messages coming from
+ *		remote ends with IPv4 address other than 10.0.0.1:
+ *
+ *		::
+ *
+ *			int ret;
+ *			struct bpf_tunnel_key key = {};
+ *
+ *			ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ *			if (ret < 0)
+ *				return TC_ACT_SHOT;	// drop packet
+ *
+ *			if (key.remote_ipv4 != 0x0a000001)
+ *				return TC_ACT_SHOT;	// drop packet
+ *
+ *			return TC_ACT_OK;		// accept packet
+ *
+ *		This interface can also be used with all encapsulation devices
+ *		that can operate in "collect metadata" mode: instead of having
+ *		one network device per specific configuration, the "collect
+ *		metadata" mode only requires a single device where the
+ *		configuration can be extracted from this helper.
+ *
+ *		This can be used together with various tunnels such as VXLAN,
+ *		Geneve, GRE or IP in IP (IPIP).
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ *	Description
+ *		Populate tunnel metadata for packet associated to *skb*. The
+ *		tunnel metadata is set to the contents of *key*, of *size*. The
+ *		*flags* can be set to a combination of the following values:
+ *
+ *		**BPF_F_TUNINFO_IPV6**
+ *			Indicate that the tunnel is based on IPv6 protocol
+ *			instead of IPv4.
+ *		**BPF_F_ZERO_CSUM_TX**
+ *			For IPv4 packets, add a flag to tunnel metadata
+ *			indicating that checksum computation should be skipped
+ *			and checksum set to zeroes.
+ *		**BPF_F_DONT_FRAGMENT**
+ *			Add a flag to tunnel metadata indicating that the
+ *			packet should not be fragmented.
+ *		**BPF_F_SEQ_NUMBER**
+ *			Add a flag to tunnel metadata indicating that a
+ *			sequence number should be added to tunnel header before
+ *			sending the packet. This flag was added for GRE
+ *			encapsulation, but might be used with other protocols
+ *			as well in the future.
+ *
+ *		Here is a typical usage on the transmit path:
+ *
+ *		::
+ *
+ *			struct bpf_tunnel_key key;
+ *			     populate key ...
+ *			bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+ *			bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
+ *
+ *		See also the description of the **bpf_skb_get_tunnel_key**\ ()
+ *		helper for additional information.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_redirect(u32 ifindex, u64 flags)
+ *	Description
+ *		Redirect the packet to another net device of index *ifindex*.
+ *		This helper is somewhat similar to **bpf_clone_redirect**\
+ *		(), except that the packet is not cloned, which provides
+ *		increased performance.
+ *
+ *		Except for XDP, both ingress and egress interfaces can be used
+ *		for redirection. The **BPF_F_INGRESS** value in *flags* is used
+ *		to make the distinction (ingress path is selected if the flag
+ *		is present, egress path otherwise). Currently, XDP only
+ *		supports redirection to the egress interface, and accepts no
+ *		flag at all.
+ *
+ *		The same effect can be attained with the more generic
+ *		**bpf_redirect_map**\ (), which requires specific maps to be
+ *		used but offers better performance.
+ *	Return
+ *		For XDP, the helper returns **XDP_REDIRECT** on success or
+ *		**XDP_ABORTED** on error. For other program types, the values
+ *		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
+ *		error.
+ *
+ * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ *	Description
+ *		Write raw *data* blob into a special BPF perf event held by
+ *		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ *		event must have the following attributes: **PERF_SAMPLE_RAW**
+ *		as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ *		**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ *		The *flags* are used to indicate the index in *map* for which
+ *		the value must be put, masked with **BPF_F_INDEX_MASK**.
+ *		Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ *		to indicate that the index of the current CPU core should be
+ *		used.
+ *
+ *		The value to write, of *size*, is passed through eBPF stack and
+ *		pointed by *data*.
+ *
+ *		The context of the program *ctx* also needs to be passed to
+ *		the helper.
+ *
+ *		On user space, a program willing to read the values needs to
+ *		call **perf_event_open**\ () on the perf event (either for
+ *		one or for all CPUs) and to store the file descriptor into the
+ *		*map*. This must be done before the eBPF program can send data
+ *		into it. An example is available in file
+ *		*samples/bpf/trace_output_user.c* in the Linux kernel source
+ *		tree (the eBPF program counterpart is in
+ *		*samples/bpf/trace_output_kern.c*).
+ *
+ *		**bpf_perf_event_output**\ () achieves better performance
+ *		than **bpf_trace_printk**\ () for sharing data with user
+ *		space, and is much better suited to streaming data from eBPF
+ *		programs.
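+ *
+ *		A minimal sketch of streaming a custom struct (the *events*
+ *		map name is illustrative; the map is assumed to be declared
+ *		elsewhere as a **BPF_MAP_TYPE_PERF_EVENT_ARRAY**):
+ *
+ *		::
+ *
+ *			struct event {
+ *				u32 pid;
+ *				u64 ts;
+ *			};
+ *
+ *			struct event e = {
+ *				.pid = bpf_get_current_pid_tgid() >> 32,
+ *				.ts  = bpf_ktime_get_ns(),
+ *			};
+ *
+ *			bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
+ *					      &e, sizeof(e));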
+ *
+ *		Note that this helper is not restricted to tracing use cases
+ *		and can be used with programs attached to TC or XDP as well,
+ *		where it allows for passing data to user space listeners. Data
+ *		can be:
+ *
+ *		* Only custom structs,
+ *		* Only the packet payload, or
+ *		* A combination of both.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
+ *	Description
+ *		Walk a user or a kernel stack and return its id. To achieve
+ *		this, the helper needs *ctx*, which is a pointer to the context
+ *		on which the tracing program is executed, and a pointer to a
+ *		*map* of type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		a combination of the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_FAST_STACK_CMP**
+ *			Compare stacks by hash only.
+ *		**BPF_F_REUSE_STACKID**
+ *			If two different stacks hash into the same *stackid*,
+ *			discard the old one.
+ *
+ *		The stack id retrieved is a 32-bit integer handle which
+ *		can be further combined with other data (including other stack
+ *		ids) and used as a key into maps. This can be useful for
+ *		generating a variety of graphs (such as flame graphs or off-cpu
+ *		graphs).
+ *
+ *		For walking a stack, this helper is an improvement over
+ *		**bpf_probe_read**\ (), which can be used with unrolled loops
+ *		but is not efficient and consumes a lot of eBPF instructions.
+ *		Instead, **bpf_get_stackid**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** kernel and user frames. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *		::
+ *
+ *			# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ *	Return
+ *		The positive or null stack id on success, or a negative error
+ *		in case of failure.
+ *
+ * u64 bpf_get_current_task(void)
+ *	Return
+ *		A pointer to the current task struct.
 */
 #define __BPF_FUNC_MAPPER(FN) \
 	FN(unspec),		\

From 224f562ba23dd4024ff971dab346c0f0cf724acc Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:55 +0100
Subject: [PATCH 0249/1640] UPSTREAM: bpf: add documentation for eBPF helpers (23-32)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Daniel:

- bpf_get_prandom_u32()
- bpf_get_smp_processor_id()
- bpf_get_cgroup_classid()
- bpf_get_route_realm()
- bpf_skb_load_bytes()
- bpf_csum_diff()
- bpf_skb_get_tunnel_opt()
- bpf_skb_set_tunnel_opt()
- bpf_skb_change_proto()
- bpf_skb_change_type()

v4:
- bpf_get_prandom_u32(): Warn that the prng is not cryptographically
  secure.
- bpf_get_smp_processor_id(): Fix a typo (case).
- bpf_get_cgroup_classid(): Clarify description. Add notes on the helper
  being limited to cgroup v1, and to egress path.
- bpf_get_route_realm(): Add comparison with bpf_get_cgroup_classid().
  Add a note about usage with TC and advantage of clsact. Fix a typo in
  return value ("sdb" instead of "skb").
- bpf_skb_load_bytes(): Make explicit that loading large amounts of data
  loads them to the eBPF stack.
- bpf_csum_diff(): Add a note on seed that can be cascaded. Link to
  bpf_l3|l4_csum_replace().
- bpf_skb_get_tunnel_opt(): Add a note about usage with "collect
  metadata" mode, and example of this with Geneve.
- bpf_skb_set_tunnel_opt(): Add a link to bpf_skb_get_tunnel_opt()
  description.
- bpf_skb_change_proto(): Mention that the main use case is NAT64.
  Clarify comment about invalidated verifier checks.

v3:
- bpf_get_prandom_u32(): Fix helper name :(. Add description, including
  a note on the internal random state.
- bpf_get_smp_processor_id(): Add description, including a note on the
  processor id remaining stable during program run.
- bpf_get_cgroup_classid(): State that CONFIG_CGROUP_NET_CLASSID is
  required to use the helper. Add a reference to related documentation.
  State that placing a task in net_cls controller disables cgroup-bpf.
- bpf_get_route_realm(): State that CONFIG_CGROUP_NET_CLASSID is
  required to use this helper.
- bpf_skb_load_bytes(): Fix comment on current use cases for the helper.

Cc: Daniel Borkmann
Signed-off-by: Quentin Monnet
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h | 197 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 197 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4e83b3a21386..a71057f51656 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -495,6 +495,27 @@ union bpf_attr {
 *		The number of bytes written to the buffer, or a negative error
 *		in case of failure.
 *
+ * u32 bpf_get_prandom_u32(void)
+ *	Description
+ *		Get a pseudo-random number.
+ *
+ *		From a security point of view, this helper uses its own
+ *		pseudo-random internal state, and cannot be used to infer the
+ *		seed of other random functions in the kernel. However, it is
+ *		essential to note that the generator used by the helper is not
+ *		cryptographically secure.
+ *	Return
+ *		A random 32-bit unsigned value.
+ *
+ * u32 bpf_get_smp_processor_id(void)
+ *	Description
+ *		Get the SMP (symmetric multiprocessing) processor id. Note that
+ *		all programs run with preemption disabled, which means that the
+ *		SMP processor id is stable during all the execution of the
+ *		program.
+ *	Return
+ *		The SMP id of the processor running the program.
+ *
 * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
 *	Description
 *		Store *len* bytes from address *from* into the packet
@@ -647,6 +668,32 @@ union bpf_attr {
 *	Return
 *		0 on success, or a negative error in case of failure.
 *
+ * u32 bpf_get_cgroup_classid(struct sk_buff *skb)
+ *	Description
+ *		Retrieve the classid for the current task, i.e. for the net_cls
+ *		cgroup to which *skb* belongs.
+ *
+ *		This helper can be used on TC egress path, but not on ingress.
+ *
+ *		The net_cls cgroup provides an interface to tag network packets
+ *		based on a user-provided identifier for all traffic coming from
+ *		the tasks belonging to the related cgroup. See also the related
+ *		kernel documentation, available from the Linux sources in file
+ *		*Documentation/cgroup-v1/net_cls.txt*.
+ *
+ *		The Linux kernel has two versions for cgroups: there are
+ *		cgroups v1 and cgroups v2. Both are available to users, who can
+ *		use a mixture of them, but note that the net_cls cgroup is for
+ *		cgroup v1 only.
+ *		This makes it incompatible with BPF programs run on cgroups,
+ *		which is a cgroup-v2-only feature (a socket can only hold data
+ *		for one version of cgroups at a time).
+ *
+ *		This helper is only available if the kernel was compiled with
+ *		the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
+ *		"**y**" or to "**m**".
+ *	Return
+ *		The classid, or 0 for the default unconfigured classid.
+ *
 * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
 *	Description
 *		Push a *vlan_tci* (VLAN tag control information) of protocol
@@ -786,6 +833,30 @@ union bpf_attr {
 *		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
 *		error.
+ *
+ * u32 bpf_get_route_realm(struct sk_buff *skb)
+ *	Description
+ *		Retrieve the realm of the route, that is to say the
+ *		**tclassid** field of the destination for the *skb*. The
+ *		identifier retrieved is a user-provided tag, similar to the
+ *		one used with the net_cls cgroup (see description for
+ *		**bpf_get_cgroup_classid**\ () helper), but here this tag is
+ *		held by a route (a destination entry), not by a task.
+ *
+ *		Retrieving this identifier works with the clsact TC egress
+ *		hook (see also **tc-bpf(8)**), or alternatively on
+ *		conventional classful egress qdiscs, but not on TC ingress
+ *		path. In case of clsact TC egress hook, this has the advantage
+ *		that, internally, the destination entry has not been dropped
+ *		yet in the transmit path. Therefore, the destination entry
+ *		does not need to be artificially held via **netif_keep_dst**\
+ *		() for a classful qdisc until the *skb* is freed.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		the **CONFIG_IP_ROUTE_CLASSID** configuration option.
+ *	Return
+ *		The realm of the route for the packet associated to *skb*, or
+ *		0 if none was found.
+ *
 * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
 *	Description
 *		Write raw *data* blob into a special BPF perf event held by
@@ -831,6 +902,23 @@ union bpf_attr {
 *	Return
 *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+ *	Description
+ *		This helper was provided as an easy way to load data from a
+ *		packet. It can be used to load *len* bytes from *offset* from
+ *		the packet associated to *skb*, into the buffer pointed by
+ *		*to*.
+ *
+ *		Since Linux 4.7, usage of this helper has mostly been replaced
+ *		by "direct packet access", enabling packet data to be
+ *		manipulated with *skb*\ **->data** and *skb*\ **->data_end**
+ *		pointing respectively to the first byte of packet data and to
+ *		the byte after the last byte of packet data. However, it
+ *		remains useful if one wishes to read large quantities of data
+ *		at once from a packet into the eBPF stack.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
 * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
 *	Description
 *		Walk a user or a kernel stack and return its id. To achieve
@@ -874,6 +962,115 @@ union bpf_attr {
 *		The positive or null stack id on success, or a negative error
 *		in case of failure.
+ *
+ * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
+ *	Description
+ *		Compute a checksum difference, from the raw buffer pointed by
+ *		*from*, of length *from_size* (that must be a multiple of 4),
+ *		towards the raw buffer pointed by *to*, of size *to_size*
+ *		(same remark).
+ *		An optional *seed* can be added to the value
+ *		(this can be cascaded, the seed may come from a previous call
+ *		to the helper).
+ *
+ *		This is flexible enough to be used in several ways:
+ *
+ *		* With *from_size* == 0, *to_size* > 0 and *seed* set to
+ *		  checksum, it can be used when pushing new data.
+ *		* With *from_size* > 0, *to_size* == 0 and *seed* set to
+ *		  checksum, it can be used when removing data from a packet.
+ *		* With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
+ *		  can be used to compute a diff. Note that *from_size* and
+ *		  *to_size* do not need to be equal.
+ *
+ *		This helper can be used in combination with
+ *		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
+ *		which one can feed in the difference computed with
+ *		**bpf_csum_diff**\ ().
+ *	Return
+ *		The checksum result, or a negative error code in case of
+ *		failure.
+ *
+ * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ *	Description
+ *		Retrieve tunnel options metadata for the packet associated to
+ *		*skb*, and store the raw tunnel option data to the buffer *opt*
+ *		of *size*.
+ *
+ *		This helper can be used with encapsulation devices that can
+ *		operate in "collect metadata" mode (please refer to the related
+ *		note in the description of **bpf_skb_get_tunnel_key**\ () for
+ *		more details). A particular example where this can be used is
+ *		in combination with the Geneve encapsulation protocol, where it
+ *		allows for pushing (with **bpf_skb_set_tunnel_opt**\ () helper)
+ *		and retrieving arbitrary TLVs (Type-Length-Value headers) from
+ *		the eBPF program. This allows for full customization of these
+ *		headers.
+ *	Return
+ *		The size of the option data retrieved.
+ *
+ * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ *	Description
+ *		Set tunnel options metadata for the packet associated to *skb*
+ *		to the option data contained in the raw buffer *opt* of *size*.
+ *
+ *		See also the description of the **bpf_skb_get_tunnel_opt**\ ()
+ *		helper for additional information.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
+ *	Description
+ *		Change the protocol of the *skb* to *proto*. Currently
+ *		supported are transitions from IPv4 to IPv6, and from IPv6 to
+ *		IPv4. The helper takes care of the groundwork for the
+ *		transition, including resizing the socket buffer. The eBPF
+ *		program is expected to fill the new headers, if any, via
+ *		**bpf_skb_store_bytes**\ () and to recompute the checksums with
+ *		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
+ *		(). The main use case for this helper is to perform NAT64
+ *		operations out of an eBPF program.
+ *
+ *		Internally, the GSO type is marked as dodgy so that headers are
+ *		checked and segments are recalculated by the GSO/GRO engine.
+ *		The size for GSO target is adapted as well.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		A call to this helper is susceptible to change the underlying
+ *		packet buffer. Therefore, at load time, all checks on pointers
+ *		previously done by the verifier are invalidated and must be
+ *		performed again, if the helper is used in combination with
+ *		direct packet access.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_type(struct sk_buff *skb, u32 type)
+ *	Description
+ *		Change the packet type for the packet associated to *skb*.
+ *		This comes down to setting *skb*\ **->pkt_type** to *type*,
+ *		except the eBPF program does not have write access to *skb*\
+ *		**->pkt_type** other than through this helper. Using a helper
+ *		here allows for graceful handling of errors.
+ *
+ *		The major use case is to change incoming *skb*s to
+ *		**PACKET_HOST** in a programmatic way instead of having to
+ *		recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
+ *		example.
+ *
+ *		Note that *type* only allows certain values. At this time, they
+ *		are:
+ *
+ *		**PACKET_HOST**
+ *			Packet is for us.
+ *		**PACKET_BROADCAST**
+ *			Send packet to all.
+ *		**PACKET_MULTICAST**
+ *			Send packet to group.
+ *		**PACKET_OTHERHOST**
+ *			Send packet to someone else.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
 * u64 bpf_get_current_task(void)
 *	Return
 *		A pointer to the current task struct.

From a0af3caa4f38debe89bbab1a7329c48823b71cc6 Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:56 +0100
Subject: [PATCH 0250/1640] UPSTREAM: bpf: add documentation for eBPF helpers (33-41)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Daniel:

- bpf_get_hash_recalc()
- bpf_skb_change_tail()
- bpf_skb_pull_data()
- bpf_csum_update()
- bpf_set_hash_invalid()
- bpf_get_numa_node_id()
- bpf_set_hash()
- bpf_skb_adjust_room()
- bpf_xdp_adjust_meta()

v4:
- bpf_skb_change_tail(): Clarify comment about invalidated verifier
  checks.
- bpf_skb_pull_data(): Clarify the motivation for using this helper or
  bpf_skb_load_bytes(), on non-linear buffers. Fix RST formatting for
  *skb*. Clarify comment about invalidated verifier checks.
- bpf_csum_update(): Fix description of checksum (entire packet, not IP
  checksum). Fix a typo: "header" instead of "helper".
- bpf_set_hash_invalid(): Mention bpf_get_hash_recalc().
- bpf_get_numa_node_id(): State that the helper is not restricted to
  programs attached to sockets.
- bpf_skb_adjust_room(): Clarify comment about invalidated verifier
  checks.
- bpf_xdp_adjust_meta(): Clarify comment about invalidated verifier
  checks.

Cc: Daniel Borkmann
Signed-off-by: Quentin Monnet
Acked-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h | 164 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a71057f51656..535429ee3fcf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1071,9 +1071,173 @@ union bpf_attr {
 *	Return
 *		0 on success, or a negative error in case of failure.
 *
+ * u32 bpf_get_hash_recalc(struct sk_buff *skb)
+ *	Description
+ *		Retrieve the hash of the packet, *skb*\ **->hash**. If it is
+ *		not set, in particular if the hash was cleared due to mangling,
+ *		recompute this hash. Later accesses to the hash can be done
+ *		directly with *skb*\ **->hash**.
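+ *
+ *		A minimal sketch, assuming a TC classifier that uses the hash
+ *		to sample a subset of flows (the sampling rate is arbitrary):
+ *
+ *		::
+ *
+ *			u32 hash = bpf_get_hash_recalc(skb);
+ *
+ *			if (hash % 16 != 0)	// keep 1 flow out of 16
+ *				return TC_ACT_SHOT;
+ *			return TC_ACT_OK;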
+ *
+ *		Calling **bpf_set_hash_invalid**\ (), changing a packet
+ *		prototype with **bpf_skb_change_proto**\ (), or calling
+ *		**bpf_skb_store_bytes**\ () with the
+ *		**BPF_F_INVALIDATE_HASH** flag are actions susceptible to clear
+ *		the hash and to trigger a new computation for the next call to
+ *		**bpf_get_hash_recalc**\ ().
+ *	Return
+ *		The 32-bit hash.
+ *
 * u64 bpf_get_current_task(void)
 *	Return
 *		A pointer to the current task struct.
+ *
+ * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
+ *	Description
+ *		Resize (trim or grow) the packet associated to *skb* to the
+ *		new *len*. The *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		The basic idea is that the helper performs the needed work to
+ *		change the size of the packet, then the eBPF program rewrites
+ *		the rest via helpers like **bpf_skb_store_bytes**\ (),
+ *		**bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ ()
+ *		and others. This helper is a slow path utility intended for
+ *		replies with control messages. And because it is targeted for
+ *		slow path, the helper itself can afford to be slow: it
+ *		implicitly linearizes, unclones and drops offloads from the
+ *		*skb*.
+ *
+ *		A call to this helper is susceptible to change the underlying
+ *		packet buffer. Therefore, at load time, all checks on pointers
+ *		previously done by the verifier are invalidated and must be
+ *		performed again, if the helper is used in combination with
+ *		direct packet access.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
+ *	Description
+ *		Pull in non-linear data in case the *skb* is non-linear and not
+ *		all of *len* are part of the linear section. Make *len* bytes
+ *		from *skb* readable and writable. If a zero value is passed for
+ *		*len*, then the whole length of the *skb* is pulled.
+ *
+ *		This helper is only needed for reading and writing with direct
+ *		packet access.
+ *
+ *		For direct packet access, testing that offsets to access
+ *		are within packet boundaries (test on *skb*\ **->data_end**) is
+ *		susceptible to fail if offsets are invalid, or if the requested
+ *		data is in non-linear parts of the *skb*. On failure the
+ *		program can just bail out, or in the case of a non-linear
+ *		buffer, use a helper to make the data available. The
+ *		**bpf_skb_load_bytes**\ () helper is a first solution to access
+ *		the data. Another one consists in using **bpf_skb_pull_data**\
+ *		() to pull in the non-linear parts once, then to retest and
+ *		eventually access the data.
+ *
+ *		At the same time, this also makes sure the *skb* is uncloned,
+ *		which is a necessary condition for direct write. As this needs
+ *		to be an invariant for the write part only, the verifier
+ *		detects writes and adds a prologue that is calling
+ *		**bpf_skb_pull_data**\ () to effectively unclone the *skb* from
+ *		the very beginning in case it is indeed cloned.
+ *
+ *		A call to this helper is susceptible to change the underlying
+ *		packet buffer. Therefore, at load time, all checks on pointers
+ *		previously done by the verifier are invalidated and must be
+ *		performed again, if the helper is used in combination with
+ *		direct packet access.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum)
+ *	Description
+ *		Add the checksum *csum* into *skb*\ **->csum** in case the
+ *		driver has supplied a checksum for the entire packet into that
+ *		field.
+ *		Return an error otherwise. This helper is intended to be
+ *		used in combination with **bpf_csum_diff**\ (), in particular
+ *		when the checksum needs to be updated after data has been
+ *		written into the packet through direct packet access.
+ *	Return
+ *		The checksum on success, or a negative error code in case of
+ *		failure.
+ *
+ * void bpf_set_hash_invalid(struct sk_buff *skb)
+ *	Description
+ *		Invalidate the current *skb*\ **->hash**. It can be used after
+ *		mangling on headers through direct packet access, in order to
+ *		indicate that the hash is outdated and to trigger a
+ *		recalculation the next time the kernel tries to access this
+ *		hash or when the **bpf_get_hash_recalc**\ () helper is called.
+ *
+ * int bpf_get_numa_node_id(void)
+ *	Description
+ *		Return the id of the current NUMA node. The primary use case
+ *		for this helper is the selection of sockets for the local NUMA
+ *		node, when the program is attached to sockets using the
+ *		**SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**),
+ *		but the helper is also available to other eBPF program types,
+ *		similarly to **bpf_get_smp_processor_id**\ ().
+ *	Return
+ *		The id of the current NUMA node.
+ *
+ * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
+ *	Description
+ *		Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ *		to value *hash*.
+ *	Return
+ *		0
+ *
+ * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ *	Description
+ *		Grow or shrink the room for data in the packet associated to
+ *		*skb* by *len_diff*, and according to the selected *mode*.
+ *
+ *		There is a single supported mode at this time:
+ *
+ *		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
+ *		  (room space is added or removed below the layer 3 header).
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		A call to this helper is susceptible to change the underlying
+ *		packet buffer. Therefore, at load time, all checks on pointers
+ *		previously done by the verifier are invalidated and must be
+ *		performed again, if the helper is used in combination with
+ *		direct packet access.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
+ *	Description
+ *		Adjust the address pointed by *xdp_md*\ **->data_meta** by
+ *		*delta* (which can be positive or negative). Note that this
+ *		operation modifies the address stored in *xdp_md*\ **->data**,
+ *		so the latter must be loaded only after the helper has been
+ *		called.
+ *
+ *		The use of *xdp_md*\ **->data_meta** is optional and programs
+ *		are not required to use it. The rationale is that when the
+ *		packet is processed with XDP (e.g. as DoS filter), it is
+ *		possible to push further meta data along with it before passing
+ *		to the stack, and to give the guarantee that an ingress eBPF
+ *		program attached as a TC classifier on the same device can pick
+ *		this up for further post-processing. Since TC works with socket
+ *		buffers, it remains possible to set from XDP the **mark** or
+ *		**priority** pointers, or other pointers for the socket buffer.
+ *		Having this scratch space generic and programmable allows for
+ *		more flexibility as the user is free to store whatever meta
+ *		data they need.
+ *
+ *		A call to this helper is susceptible to change the underlying
+ *		packet buffer.
+ *		Therefore, at load time, all checks on pointers
+ *		previously done by the verifier are invalidated and must be
+ *		performed again, if the helper is used in combination with
+ *		direct packet access.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
 */
 #define __BPF_FUNC_MAPPER(FN) \
 	FN(unspec),		\

From 69188ffc7421575b898fd8402423fe4faa4bef61 Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:57 +0100
Subject: [PATCH 0251/1640] UPSTREAM: bpf: add documentation for eBPF helpers (42-50)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions:

Helper from Kaixu:
- bpf_perf_event_read()

Helpers from Martin:
- bpf_skb_under_cgroup()
- bpf_xdp_adjust_head()

Helpers from Sargun:
- bpf_probe_write_user()
- bpf_current_task_under_cgroup()

Helper from Thomas:
- bpf_skb_change_head()

Helper from Gianluca:
- bpf_probe_read_str()

Helpers from Chenbo:
- bpf_get_socket_cookie()
- bpf_get_socket_uid()

v4:
- bpf_perf_event_read(): State that bpf_perf_event_read_value() should
  be preferred over this helper.
- bpf_skb_change_head(): Clarify comment about invalidated verifier
  checks.
- bpf_xdp_adjust_head(): Clarify comment about invalidated verifier
  checks.
- bpf_probe_write_user(): Add that dst must be a valid user space
  address.
- bpf_get_socket_cookie(): Improve description by making clearer that
  the cookie belongs to the socket, and state that it remains stable for
  the life of the socket.

v3:
- bpf_perf_event_read(): Fix time of selection for perf event type in
  description. Remove occurrences of "cores" to avoid confusion with
  "CPU".

Cc: Martin KaFai Lau
Cc: Sargun Dhillon
Cc: Thomas Graf
Cc: Gianluca Borello
Cc: Chenbo Feng
Signed-off-by: Quentin Monnet
Acked-by: Alexei Starovoitov
Acked-by: Martin KaFai Lau [for bpf_skb_under_cgroup(), bpf_xdp_adjust_head()]
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h | 172 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 535429ee3fcf..6a5eebf92367 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -810,6 +810,35 @@ union bpf_attr {
 *	Return
 *		0 on success, or a negative error in case of failure.
 *
+ * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
+ *	Description
+ *		Read the value of a perf event counter. This helper relies on a
+ *		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
+ *		the perf event counter is selected when *map* is updated with
+ *		perf event file descriptors. The *map* is an array whose size
+ *		is the number of available CPUs, and each cell contains a value
+ *		relative to one CPU. The value to retrieve is indicated by
+ *		*flags*, that contains the index of the CPU to look up, masked
+ *		with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ *		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ *		current CPU should be retrieved.
+ *
+ *		Note that before Linux 4.13, only hardware perf events can be
+ *		retrieved.
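+ *
+ *		A minimal sketch (the *counters* map name is illustrative; the
+ *		map is assumed to be a **BPF_MAP_TYPE_PERF_EVENT_ARRAY**
+ *		populated from user space with perf event file descriptors):
+ *
+ *		::
+ *
+ *			u64 cnt = bpf_perf_event_read(&counters,
+ *						      BPF_F_CURRENT_CPU);
+ *			// Caution: error codes and counter values share
+ *			// the same return range, see the note below.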
+ *
+ *		Also, be aware that the newer helper
+ *		**bpf_perf_event_read_value**\ () is recommended over
+ *		**bpf_perf_event_read**\ () in general. The latter has some ABI
+ *		quirks where error and counter value are used as a return code
+ *		(which is wrong to do since ranges may overlap). This issue is
+ *		fixed with **bpf_perf_event_read_value**\ (), which at the same
+ *		time provides more features over the **bpf_perf_event_read**\
+ *		() interface. Please refer to the description of
+ *		**bpf_perf_event_read_value**\ () for details.
+ *	Return
+ *		The value of the perf event counter read from the map, or a
+ *		negative error code in case of failure.
+ *
 * int bpf_redirect(u32 ifindex, u64 flags)
 *	Description
 *		Redirect the packet to another net device of index *ifindex*.
@@ -1071,6 +1100,17 @@ union bpf_attr {
 *	Return
 *		0 on success, or a negative error in case of failure.
 *
+ * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
+ *	Description
+ *		Check whether *skb* is a descendant of the cgroup2 held by
+ *		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ *	Return
+ *		The return value depends on the result of the test, and can be:
+ *
+ *		* 0, if the *skb* failed the cgroup2 descendant test.
+ *		* 1, if the *skb* succeeded the cgroup2 descendant test.
+ *		* A negative error code, if an error occurred.
+ *
 * u32 bpf_get_hash_recalc(struct sk_buff *skb)
 *	Description
 *		Retrieve the hash of the packet, *skb*\ **->hash**. If it is
@@ -1091,6 +1131,37 @@ union bpf_attr {
 *	Return
 *		A pointer to the current task struct.
 *
+ * int bpf_probe_write_user(void *dst, const void *src, u32 len)
+ *	Description
+ *		Attempt in a safe way to write *len* bytes from the buffer
+ *		*src* to *dst* in memory. It only works for threads that are in
+ *		user context, and *dst* must be a valid user space address.
+ *
+ *		This helper should not be used to implement any kind of
+ *		security mechanism because of TOC-TOU attacks, but rather to
+ *		debug, divert, and manipulate execution of semi-cooperative
+ *		processes.
+ *
+ *		Keep in mind that this feature is meant for experiments, and it
+ *		has a risk of crashing the system and running programs.
+ *		Therefore, when an eBPF program using this helper is attached,
+ *		a warning including PID and process name is printed to kernel
+ *		logs.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
+ *	Description
+ *		Check whether the probe is being run in the context of a given
+ *		subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ *		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ *	Return
+ *		The return value depends on the result of the test, and can be:
+ *
+ *		* 0, if the current task belongs to the cgroup2.
+ *		* 1, if the current task does not belong to the cgroup2.
+ *		* A negative error code, if an error occurred.
+ *
 * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
 *	Description
 *		Resize (trim or grow) the packet associated to *skb* to the
@@ -1182,6 +1253,107 @@ union bpf_attr {
 *	Return
 *		The id of the current NUMA node.
 *
+ * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
+ *	Description
+ *		Grows headroom of packet associated to *skb* and adjusts the
+ *		offset of the MAC header accordingly, adding *len* bytes of
+ *		space. It automatically extends and reallocates memory as
+ *		required.
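+ *
+ *		A minimal sketch of growing headroom for an Ethernet header,
+ *		assuming a program type where this helper is available (e.g. a
+ *		lightweight tunnel program); the header contents are assumed
+ *		to be written separately with **bpf_skb_store_bytes**\ ():
+ *
+ *		::
+ *
+ *			if (bpf_skb_change_head(skb, ETH_HLEN, 0))
+ *				return BPF_DROP;
+ *			// now write the 14-byte Ethernet header at offset 0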
+ *
+ *		This helper can be used on a layer 3 *skb* to push a MAC header
+ *		for redirection into a layer 2 device.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		A call to this helper is susceptible to change the underlying
+ *		packet buffer. Therefore, at load time, all checks on pointers
+ *		previously done by the verifier are invalidated and must be
+ *		performed again, if the helper is used in combination with
+ *		direct packet access.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
+ *	Description
+ *		Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
+ *		it is possible to use a negative value for *delta*. This helper
+ *		can be used to prepare the packet for pushing or popping
+ *		headers.
+ *
+ *		A call to this helper is susceptible to change the underlying
+ *		packet buffer. Therefore, at load time, all checks on pointers
+ *		previously done by the verifier are invalidated and must be
+ *		performed again, if the helper is used in combination with
+ *		direct packet access.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ *	Description
+ *		Copy a NUL terminated string from an unsafe address
+ *		*unsafe_ptr* to *dst*. The *size* should include the
+ *		terminating NUL byte. In case the string length is smaller than
+ *		*size*, the target is not padded with further NUL bytes. If the
+ *		string length is larger than *size*, just *size*-1 bytes are
+ *		copied and the last byte is set to NUL.
+ *
+ *		On success, the length of the copied string is returned. This
+ *		makes this helper useful in tracing programs for reading
+ *		strings, and more importantly to get its length at runtime. See
+ *		the following snippet:
+ *
+ *		::
+ *
+ *			SEC("kprobe/sys_open")
+ *			void bpf_sys_open(struct pt_regs *ctx)
+ *			{
+ *				char buf[PATHLEN]; // PATHLEN is defined to 256
+ *				int res = bpf_probe_read_str(buf, sizeof(buf),
+ *							     ctx->di);
+ *
+ *				// Consume buf, for example push it to
+ *				// userspace via bpf_perf_event_output(); we
+ *				// can use res (the string length) as event
+ *				// size, after checking its boundaries.
+ *			}
+ *
+ *		In comparison, using the **bpf_probe_read**\ () helper here
+ *		instead to read the string would require estimating the length
+ *		at compile time, and would often result in copying more memory
+ *		than necessary.
+ *
+ *		Another useful use case is when parsing individual process
+ *		arguments or individual environment variables navigating
+ *		*current*\ **->mm->arg_start** and *current*\
+ *		**->mm->env_start**: using this helper and the return value,
+ *		one can quickly iterate at the right offset of the memory area.
+ *	Return
+ *		On success, the strictly positive length of the string,
+ *		including the trailing NUL character. On error, a negative
+ *		value.
+ *
+ * u64 bpf_get_socket_cookie(struct sk_buff *skb)
+ *	Description
+ *		If the **struct sk_buff** pointed by *skb* has a known socket,
+ *		retrieve the cookie (generated by the kernel) of this socket.
+ *		If no cookie has been set yet, generate a new cookie. Once
+ *		generated, the socket cookie remains stable for the life of the
+ *		socket. This helper can be useful for monitoring per socket
+ *		networking traffic statistics as it provides a unique socket
+ *		identifier per namespace.
+ *	Return
+ *		An 8-byte long non-decreasing number on success, or 0 if the
+ *		socket field is missing inside *skb*.
+ *
+ * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ *	Return
+ *		The owner UID of the socket associated to *skb*. If the socket
+ *		is **NULL**, or if it is not a full socket (i.e. if it is a
+ *		time-wait or a request socket instead), **overflowuid** value
+ *		is returned (note that **overflowuid** might also be the actual
+ *		UID value for the socket).
+ *
 * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
 *	Description
 *		Set the full hash for *skb* (set the field *skb*\ **->hash**)

From 968817267e38814ee6a995b7369b6b52f516852a Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:58 +0100
Subject: [PATCH 0252/1640] UPSTREAM: bpf: add documentation for eBPF helpers (51-57)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions:

Helpers from Lawrence:
- bpf_setsockopt()
- bpf_getsockopt()
- bpf_sock_ops_cb_flags_set()

Helpers from Yonghong:
- bpf_perf_event_read_value()
- bpf_perf_prog_read_value()

Helper from Josef:
- bpf_override_return()

Helper from Andrey:
- bpf_bind()

v4:
- bpf_perf_event_read_value(): State that this helper should be
  preferred over bpf_perf_event_read().

v3:
- bpf_perf_event_read_value(): Fix time of selection for perf event type
  in description. Remove occurrences of "cores" to avoid confusion with
  "CPU".
- bpf_bind(): Remove last paragraph of description, which was off topic.

Cc: Lawrence Brakmo
Cc: Yonghong Song
Cc: Josef Bacik
Cc: Andrey Ignatov
Signed-off-by: Quentin Monnet
Acked-by: Yonghong Song [for bpf_perf_event_read_value(), bpf_perf_prog_read_value()]
Acked-by: Andrey Ignatov [for bpf_bind()]
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h | 180 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6a5eebf92367..ea938705d73d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1361,6 +1361,28 @@ union bpf_attr {
 *	Return
 *		0
 *
+ * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ *	Description
+ *		Emulate a call to **setsockopt()** on the socket associated to
+ *		*bpf_socket*, which must be a full socket. The *level* at
+ *		which the option resides and the name *optname* of the option
+ *		must be specified, see **setsockopt(2)** for more information.
+ *		The option value of length *optlen* is pointed by *optval*.
+ *
+ *		This helper actually implements a subset of **setsockopt()**.
+ *		It supports the following *level*\ s:
+ *
+ *		* **SOL_SOCKET**, which supports the following *optname*\ s:
+ *		  **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
+ *		  **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
+ *		* **IPPROTO_TCP**, which supports the following *optname*\ s:
+ *		  **TCP_CONGESTION**, **TCP_BPF_IW**,
+ *		  **TCP_BPF_SNDCWND_CLAMP**.
+ *		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ *		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
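+ *
+ *		A minimal sketch, assuming a **BPF_PROG_TYPE_SOCK_OPS**
+ *		program that switches the congestion control algorithm once a
+ *		connection is established (the choice of "reno" is arbitrary):
+ *
+ *		::
+ *
+ *			char cc[] = "reno";
+ *
+ *			if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
+ *				bpf_setsockopt(skops, IPPROTO_TCP,
+ *					       TCP_CONGESTION, cc, sizeof(cc));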
+ *
 * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
 *	Description
 *		Grow or shrink the room for data in the packet associated to
@@ -1410,6 +1432,164 @@ union bpf_attr {
 *		direct packet access.
 *	Return
 *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
+ *	Description
+ *		Read the value of a perf event counter, and store it into *buf*
+ *		of size *buf_size*. This helper relies on a *map* of type
+ *		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event
+ *		counter is selected when *map* is updated with perf event file
+ *		descriptors. The *map* is an array whose size is the number of
+ *		available CPUs, and each cell contains a value relative to one
+ *		CPU. The value to retrieve is indicated by *flags*, that
+ *		contains the index of the CPU to look up, masked with
+ *		**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ *		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ *		current CPU should be retrieved.
+ *
+ *		This helper behaves in a way close to
+ *		**bpf_perf_event_read**\ () helper, save that instead of
+ *		just returning the value observed, it fills the *buf*
+ *		structure. This allows for additional data to be retrieved: in
+ *		particular, the enabled and running times (in *buf*\
+ *		**->enabled** and *buf*\ **->running**, respectively) are
+ *		copied. In general, **bpf_perf_event_read_value**\ () is
+ *		recommended over **bpf_perf_event_read**\ (), which has some
+ *		ABI issues and provides fewer functionalities.
+ *
+ *		These values are interesting, because hardware PMU (Performance
+ *		Monitoring Unit) counters are limited resources. When there are
+ *		more PMU based perf events opened than available counters,
+ *		kernel will multiplex these events so each event gets certain
+ *		percentage (but not all) of the PMU time. In case that
+ *		multiplexing happens, the number of samples or counter value
+ *		will not reflect the case compared to when no multiplexing
+ *		occurs. This makes comparison between different runs difficult.
+ *		Typically, the counter value should be normalized before
+ *		comparing to other experiments. The usual normalization is done
+ *		as follows.
+ *
+ *		::
+ *
+ *			normalized_counter = counter * t_enabled / t_running
+ *
+ *		Where t_enabled is the time enabled for the event and
+ *		t_running is the time running for the event since the last
+ *		normalization. The enabled and running times are accumulated
+ *		since the perf event open. To achieve scaling factor between
+ *		two invocations of an eBPF program, users can use CPU id as the
+ *		key (which is typical for perf array usage model) to remember
+ *		the previous value and do the calculation inside the eBPF
+ *		program.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ *	Description
+ *		For an eBPF program attached to a perf event, retrieve the
+ *		value of the event counter associated to *ctx* and store it in
+ *		the structure pointed by *buf* and of size *buf_size*. Enabled
+ *		and running times are also stored in the structure (see
+ *		description of helper **bpf_perf_event_read_value**\ () for
+ *		more details).
+ *	Return
+ *		0 on success, or a negative error in case of failure.
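+ *
+ *		A minimal sketch, assuming a program attached to a perf event:
+ *
+ *		::
+ *
+ *			struct bpf_perf_event_value v = {};
+ *
+ *			if (!bpf_perf_prog_read_value(ctx, &v, sizeof(v)) &&
+ *			    v.running > 0) {
+ *				// normalize as described above
+ *				u64 norm = v.counter * v.enabled / v.running;
+ *			}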
+ *
+ * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ *	Description
+ *		Emulate a call to **getsockopt()** on the socket associated to
+ *		*bpf_socket*, which must be a full socket. The *level* at
+ *		which the option resides and the name *optname* of the option
+ *		must be specified, see **getsockopt(2)** for more information.
+ *		The retrieved value is stored in the structure pointed by
+ *		*optval* and of length *optlen*.
+ *
+ *		This helper actually implements a subset of **getsockopt()**.
+ *		It supports the following *level*\ s:
+ *
+ *		* **IPPROTO_TCP**, which supports *optname*
+ *		  **TCP_CONGESTION**.
+ *		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ *		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_override_return(struct pt_regs *regs, u64 rc)
+ *	Description
+ *		Used for error injection, this helper uses kprobes to override
+ *		the return value of the probed function, and to set it to *rc*.
+ *		The first argument is the context *regs* on which the kprobe
+ *		works.
+ *
+ *		This helper works by setting the PC (program counter)
+ *		to an override function which is run in place of the original
+ *		probed function. This means the probed function is not run at
+ *		all. The replacement function just returns with the required
+ *		value.
+ *
+ *		This helper has security implications, and thus is subject to
+ *		restrictions. It is only available if the kernel was compiled
+ *		with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
+ *		option, and in this case it only works on functions tagged with
+ *		**ALLOW_ERROR_INJECTION** in the kernel code.
+ *
+ *		Also, the helper is only available for the architectures having
+ *		the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
+ *		x86 architecture is the only one to support this feature.
+ *	Return
+ *		0
+ *
+ * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
+ *	Description
+ *		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
+ *		for the full TCP socket associated to *bpf_sock* to
+ *		*argval*.
+ *
+ *		The primary use of this field is to determine if there should
+ *		be calls to eBPF programs of type
+ *		**BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
+ *		code. A program of the same type can change its value, per
+ *		connection and as necessary, when the connection is
+ *		established. This field is directly accessible for reading, but
+ *		this helper must be used for updates in order to return an
+ *		error if an eBPF program tries to set a callback that is not
+ *		supported in the current kernel.
+ *
+ *		The supported callback values that *argval* can combine are:
+ *
+ *		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
+ *		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
+ *		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ *
+ *		Here are some examples of where one could call such eBPF
+ *		program:
+ *
+ *		* When RTO fires.
+ *		* When a packet is retransmitted.
+ *		* When the connection terminates.
+ *		* When a packet is sent.
+ *		* When a packet is received.
+ *	Return
+ *		Code **-EINVAL** if the socket is not a full TCP socket;
+ *		otherwise, a positive number containing the bits that could not
+ *		be set is returned (which comes down to 0 if all bits were set
+ *		as required).
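+ *
+ *		A minimal usage sketch, enabling the retransmission and state
+ *		change callbacks from a **BPF_PROG_TYPE_SOCK_OPS** program:
+ *
+ *		::
+ *
+ *			bpf_sock_ops_cb_flags_set(skops,
+ *						  BPF_SOCK_OPS_RETRANS_CB_FLAG |
+ *						  BPF_SOCK_OPS_STATE_CB_FLAG);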
+ *
+ * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
+ *	Description
+ *		Bind the socket associated to *ctx* to the address pointed by
+ *		*addr*, of length *addr_len*. This allows for making outgoing
+ *		connection from the desired IP address, which can be useful for
+ *		example when all processes inside a cgroup should use a single
+ *		IP address on a host that has multiple IP addresses configured.
+ *
+ *		This helper works for IPv4 and IPv6, TCP and UDP sockets. The
+ *		domain (*addr*\ **->sa_family**) must be **AF_INET** (or
+ *		**AF_INET6**). Looking for a free port to bind to can be
+ *		expensive, therefore binding to port is not permitted by the
+ *		helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
+ *		must be set to zero.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
 */
 #define __BPF_FUNC_MAPPER(FN) \
 	FN(unspec),		\

From 67306b4bfa070a0596ba3ef4087169edf35a3bf2 Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:59 +0100
Subject: [PATCH 0253/1640] UPSTREAM: bpf: add documentation for eBPF helpers (58-64)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by John:

- bpf_redirect_map()
- bpf_sk_redirect_map()
- bpf_sock_map_update()
- bpf_msg_redirect_map()
- bpf_msg_apply_bytes()
- bpf_msg_cork_bytes()
- bpf_msg_pull_data()

v4:
- bpf_redirect_map(): Fix typos: "XDP_ABORT" changed to "XDP_ABORTED",
  "his" to "this". Also add a paragraph on performance improvement over
  bpf_redirect() helper.

v3:
- bpf_sk_redirect_map(): Improve description of BPF_F_INGRESS flag.
- bpf_msg_redirect_map(): Improve description of BPF_F_INGRESS flag.
- bpf_redirect_map(): Fix note on CPU redirection, not fully implemented
  for generic XDP but supported on native XDP.
- bpf_msg_pull_data(): Clarify comment about invalidated verifier
  checks.

Cc: Jesper Dangaard Brouer
Cc: John Fastabend
Signed-off-by: Quentin Monnet
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h | 147 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ea938705d73d..81a4f3dce4fa 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1404,6 +1404,56 @@ union bpf_attr {
 *	Return
 *		0 on success, or a negative error in case of failure.
 *
+ * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ *	Description
+ *		Redirect the packet to the endpoint referenced by *map* at
+ *		index *key*. Depending on its type, this *map* can contain
+ *		references to net devices (for forwarding packets through other
+ *		ports), or to CPUs (for redirecting XDP frames to another CPU;
+ *		but this is only implemented for native XDP (with driver
+ *		support) as of this writing).
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		When used to redirect packets to net devices, this helper
+ *		provides a significant performance increase over
+ *		**bpf_redirect**\ ().
+ * This is due to various implementation details of the underlying + * mechanisms, one of which is the fact that **bpf_redirect_map**\ + * () tries to send packets as a "bulk" to the device. + * Return + * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. + * + * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * Description + * Redirect the packet to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a *map* referencing sockets. + * *skops* is used as the new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) * Description * Adjust the address pointed by *xdp_md*\ **->data_meta** by @@ -1574,6 +1624,103 @@ union bpf_attr { * be set is returned (which comes down to 0 if all bits were set * as required). * + * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, apply the verdict of the eBPF program to + * the next *bytes* (number of bytes) of message *msg*. + * + * For example, this helper can be used in the following cases: + * + * * A single **sendmsg**\ () or **sendfile**\ () system call + * contains multiple logical messages that the eBPF program is + * supposed to read and for which it should apply a verdict. + * * An eBPF program only cares to read the first *bytes* of a + * *msg*. If the message has a large payload, then setting up + * and calling the eBPF program repeatedly for all bytes, even + * though the verdict is already known, would create unnecessary + * overhead. + * + * When called from within an eBPF program, the helper sets a + * counter internal to the BPF infrastructure, that is used to + * apply the last verdict to the next *bytes*.
If *bytes* is + * smaller than the current data being processed from a + * **sendmsg**\ () or **sendfile**\ () system call, the first + * *bytes* will be sent and the eBPF program will be re-run with + * the pointer for start of data pointing to byte number *bytes* + * **+ 1**. If *bytes* is larger than the current data being + * processed, then the eBPF verdict will be applied to multiple + * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are + * consumed. + * + * Note that if a socket closes with the internal counter holding + * a non-zero value, this is not a problem because data is not + * being buffered for *bytes* and is sent as it is received. + * Return + * 0 + * + * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, prevent the execution of the verdict eBPF + * program for message *msg* until *bytes* (number of bytes) have + * been accumulated. + * + * This can be used when one needs a specific number of bytes + * before a verdict can be assigned, even if the data spans + * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme + * case would be a user calling **sendmsg**\ () repeatedly with + * 1-byte long message segments. Obviously, this is bad for + * performance, but it is still valid. If the eBPF program needs + * *bytes* bytes to validate a header, this helper can be used to + * prevent the eBPF program from being called again until *bytes* + * have been accumulated. + * Return + * 0 + * + * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * Description + * For socket policies, pull in non-linear data from user space + * for *msg* and set pointers *msg*\ **->data** and *msg*\ + * **->data_end** to *start* and *end* byte offsets into *msg*, + * respectively. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it can only parse data that the (**data**, **data_end**) + * pointers have already consumed. For **sendmsg**\ () hooks this + * is likely the first scatterlist element. But for calls relying + * on the **sendpage** handler (e.g. **sendfile**\ ()) this will + * be the range (**0**, **0**) because the data is shared with + * user space and by default the objective is to avoid allowing + * user space to modify data while (or after) eBPF verdict is + * being decided. This helper can be used to pull in data and to + * set the start and end pointers to the given values. Data will + * be copied if necessary (i.e. if data was not linear and if start + * and end pointers do not point to the same chunk). + * + * A call to this helper may change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by From 396901d04f110c4c89a38bc4ff0e3c66c83c9042 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 25 Apr 2018 18:17:00 +0100 Subject: [PATCH 0254/1640] UPSTREAM: bpf: add documentation for eBPF helpers (65-66) Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in another commit of the patch series, in order to provide an RST document that can later be converted into a man page. The objective is to make the documentation easily understandable and accessible to all eBPF developers, including beginners. This patch contains descriptions for the following helper functions: Helper from Nikita: - bpf_xdp_adjust_tail() Helper from Eyal: - bpf_skb_get_xfrm_state() v4: - New patch (helpers did not exist yet for previous versions). Cc: Nikita V. Shirokov Cc: Eyal Birger Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 81a4f3dce4fa..7d179cb46603 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1737,6 +1737,36 @@ union bpf_attr { * must be set to zero. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is + * only possible to shrink the packet as of this writing, + * therefore *delta* must be a negative integer. + * + * A call to this helper may change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * Description + * Retrieve the XFRM state (IP transform framework, see also + * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. + * + * The retrieved value is stored in the **struct bpf_xfrm_state** + * pointed by *xfrm_state* and of length *size*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_XFRM** configuration option. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ From 71422b4ee792241e344b33ff29b72b5c06492607 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:07 -0700 Subject: [PATCH 0255/1640] UPSTREAM: bpf: change prototype for stack_map_get_build_id_offset This patch does not change any functionality. The function prototype got changed so that the same function can be reused later.
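In practice (a sketch; the destination name below is illustrative), the decoupled prototype lets a caller fill any struct bpf_stack_build_id buffer, not just a stack map bucket:

	/* before: destination hard-wired to a stack map bucket */
	stack_map_get_build_id_offset(map, bucket, ips, trace_nr, user);

	/* after: the caller chooses the destination buffer */
	struct bpf_stack_build_id *id_offs = buf;	/* 'buf' is illustrative */

	stack_map_get_build_id_offset(id_offs, ips, trace_nr, user);

The bpf_get_stack patch that follows relies on exactly this, writing build ids straight into a program-supplied buffer.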
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/stackmap.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a3b8c4c78db3..ccde5aeb90ba 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -268,16 +268,11 @@ out: return ret; } -static void stack_map_get_build_id_offset(struct bpf_map *map, - struct stack_map_bucket *bucket, +static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { int i; struct vm_area_struct *vma; - struct bpf_stack_build_id *id_offs; - - bucket->nr = trace_nr; - id_offs = (struct bpf_stack_build_id *)bucket->data; /* * We cannot do up_read() in nmi context, so build_id lookup is @@ -367,8 +362,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; - stack_map_get_build_id_offset(map, new_bucket, ips, - trace_nr, user); + new_bucket->nr = trace_nr; + stack_map_get_build_id_offset( + (struct bpf_stack_build_id *)new_bucket->data, + ips, trace_nr, user); trace_len = trace_nr * sizeof(struct bpf_stack_build_id); if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, new_bucket->data, trace_len) == 0) { From cf168d7e20469097fa5cabee783b9b8ba852bf51 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:08 -0700 Subject: [PATCH 0256/1640] BACKPORT: bpf: add bpf_get_stack helper Currently, the stackmap and the bpf_get_stackid helper are provided for bpf programs to get the stack trace. This approach has a limitation though. If two stack traces have the same hash, only one will get stored in the stackmap table, so some stack traces are missing from the user's perspective. This patch implements a new helper, bpf_get_stack, which sends stack traces directly to the bpf program. The bpf program is able to see all stack traces, and then can do in-kernel processing or send stack traces to user space through a shared map or bpf_perf_event_output. Acked-by: Alexei Starovoitov Change-Id: I7dbdcba1a8ceda4c3626a07c436b33d9f35b3c0e Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/filter.h | 3 +- include/uapi/linux/bpf.h | 39 ++++++++++++++++++++++- kernel/bpf/core.c | 6 ++++ kernel/bpf/stackmap.c | 67 ++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 19 ++++++++++++ kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++- 7 files changed, 182 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 47d7f8b2bf52..0d3ea0aa1929 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -710,6 +710,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_skb_vlan_push_proto; extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; +extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; /* Shared helpers among cBPF and eBPF. */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 49306e1d27aa..a252f71b76b9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -493,7 +493,8 @@ struct bpf_prog { dst_needed:1, /* Do we need dst entry? */ blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ - kprobe_override:1; /* Do we override a kprobe?
*/ + kprobe_override:1, /* Do we override a kprobe? */ + has_callchain_buf:1; /* callchain buffer allocated? */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7d179cb46603..d017f039f71a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1767,6 +1767,40 @@ union bpf_attr { * **CONFIG_XFRM** configuration option. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Return + * a non-negative value equal to or less than size on success, or + * a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1928,11 +1962,14 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) -/* BPF_FUNC_get_stackid flags. */ +/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ #define BPF_F_SKIP_FIELD_MASK 0xffULL #define BPF_F_USER_STACK (1ULL << 8) +/* flags used by BPF_FUNC_get_stackid only. */ #define BPF_F_FAST_STACK_CMP (1ULL << 9) #define BPF_F_REUSE_STACKID (1ULL << 10) +/* flags used by BPF_FUNC_get_stack only. */ +#define BPF_F_USER_BUILD_ID (1ULL << 11) /* BPF_FUNC_skb_set_tunnel_key flags. 
*/ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bf66092e2093..6d6e9f16c30e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -31,6 +31,8 @@ #include #include #include +#include + #ifdef CONFIG_RKP_MODULE_SUPPORT #include #endif @@ -1861,6 +1863,10 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); +#ifdef CONFIG_PERF_EVENTS + if (aux->prog->has_callchain_buf) + put_callchain_buffers(); +#endif for (i = 0; i < aux->func_cnt; i++) bpf_jit_free(aux->func[i]); if (aux->func_cnt) { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index ccde5aeb90ba..cec0f0a5d87b 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -408,6 +408,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + u32 init_nr, trace_nr, copy_len, elem_size, num_elem; + bool user_build_id = flags & BPF_F_USER_BUILD_ID; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + bool user = flags & BPF_F_USER_STACK; + struct perf_callchain_entry *trace; + bool kernel = !user; + int err = -EINVAL; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_USER_BUILD_ID))) + goto clear; + if (kernel && user_build_id) + goto clear; + + elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id) + : sizeof(u64); + if (unlikely(size % elem_size)) + goto clear; + + num_elem = size / elem_size; + if (sysctl_perf_event_max_stack < num_elem) + init_nr = 0; + else + init_nr = sysctl_perf_event_max_stack - num_elem; + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); + if (unlikely(!trace)) + goto err_fault; + + trace_nr = trace->nr - init_nr; + if (trace_nr < skip) + goto err_fault; + + trace_nr -= skip; + trace_nr = (trace_nr <= num_elem) ? 
trace_nr : num_elem; + copy_len = trace_nr * elem_size; + ips = trace->ip + skip + init_nr; + if (user && user_build_id) + stack_map_get_build_id_offset(buf, ips, trace_nr, user); + else + memcpy(buf, ips, copy_len); + + if (size > copy_len) + memset(buf + copy_len, 0, size - copy_len); + return copy_len; + +err_fault: + err = -EFAULT; +clear: + memset(buf, 0, size); + return err; +} + +const struct bpf_func_proto bpf_get_stack_proto = { + .func = bpf_get_stack, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9f24c039495c..b5c681275966 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "disasm.h" @@ -2485,6 +2486,24 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; + if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + const char *err_str; + +#ifdef CONFIG_PERF_EVENTS + err = get_callchain_buffers(sysctl_perf_event_max_stack); + err_str = "cannot get callchain buffer for func %s#%d\n"; +#else + err = -ENOTSUPP; + err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n"; +#endif + if (err) { + verbose(env, err_str, func_id_name(func_id), func_id); + return err; + } + + env->prog->has_callchain_buf = true; + } + if (changes_data) clear_all_pkt_pointers(env); return 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8e53df3e3ca3..e68cd0fe6c02 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -20,6 +20,7 @@ #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); /** * trace_call_bpf - invoke BPF program @@ -582,6 +583,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE @@ -669,6 +672,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size, + u64, flags) +{ + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + + return bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); +} + +static const struct bpf_func_proto bpf_get_stack_proto_tp = { + .func = bpf_get_stack_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -677,6 +699,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_tp; default: return tracing_func_proto(func_id, prog); } @@ -739,6 +763,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return 
&bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_tp; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; default: @@ -749,7 +775,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added * inside bpf_*_event_output and/or bpf_get_stack_id + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack */ static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, @@ -792,6 +818,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, + void *, buf, u32, size, u64, flags) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + return bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); +} + +static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { + .func = bpf_get_stack_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -800,6 +846,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_raw_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_raw_tp; default: return tracing_func_proto(func_id, prog); } From 36261b0df56d832b01b6841ac08c20210374073a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:10 -0700 Subject: [PATCH 0257/1640] UPSTREAM: bpf: remove never-hit branches in verifier adjust_scalar_min_max_vals In the verifier function adjust_scalar_min_max_vals, when src_known is false and the opcode is BPF_LSH/BPF_RSH, the function returns early. So remove the never-taken src_known == false branch from the BPF_LSH/BPF_RSH handling.
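For context, the early return in question is the guard near the top of adjust_scalar_min_max_vals(), reproduced roughly below (a sketch of that era's kernel/bpf/verifier.c, not part of this diff):

	/* Shift amounts taken from an unknown source register are
	 * rejected before the per-opcode switch, so the BPF_LSH and
	 * BPF_RSH cases can only be reached with src_known == true.
	 */
	if (!src_known &&
	    opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
		__mark_reg_unknown(dst_reg);
		return 0;
	}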
Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b5c681275966..612607ef7d0b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3236,10 +3236,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, dst_reg->umin_value <<= umin_val; dst_reg->umax_value <<= umax_val; } - if (src_known) - dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); - else - dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val); + dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); break; @@ -3267,11 +3264,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, */ dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; - if (src_known) - dst_reg->var_off = tnum_rshift(dst_reg->var_off, - umin_val); - else - dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val); + dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); dst_reg->umin_value >>= umax_val; dst_reg->umax_value >>= umin_val; /* We may learn something more from the var_off */ From 1e33434d21dc08c2c2ab68252a06e4516efc778d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:11 -0700 Subject: [PATCH 0258/1640] UPSTREAM: bpf/verifier: improve register value range tracking with ARSH When a helper like bpf_get_stack returns an int value that is later used for arithmetic computation, the LSH and ARSH operations are often required to get proper sign extension into 64-bit. For example, without this patch: 54: R0=inv(id=0,umax_value=800) 54: (bf) r8 = r0 55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800) 55: (67) r8 <<= 32 56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000)) 56: (c7) r8 s>>= 32 57: R8=inv(id=0) With this patch: 54: R0=inv(id=0,umax_value=800) 54: (bf) r8 = r0 55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800) 55: (67) r8 <<= 32 56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000)) 56: (c7) r8 s>>= 32 57: R8=inv(id=0, umax_value=800,var_off=(0x0; 0x3ff)) With the better range of "R8", later on when "R8" is added to another register, e.g., a map pointer or scalar-value register, the better register range can be derived and a verifier failure may be avoided. In our later example, ...... usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK); if (usize < 0) return 0; ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0); ...... Without improving ARSH value range tracking, the register representing "max_len - usize" will have smin_value equal to S64_MIN and will be rejected by the verifier.
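To make the new tnum operation concrete, here is a small worked example using the TNUM() initializer from kernel/bpf/tnum.c, with values matching the dump above (a sketch, not part of the patch):

	/* R8 after "r8 <<= 32": var_off = (0x0; 0x3ff00000000) */
	struct tnum t = TNUM(0, 0x3ffULL << 32);

	/* "r8 s>>= 32": value and mask both shift arithmetically */
	t = tnum_arshift(t, 32);

	/* now t.value == 0x0 and t.mask == 0x3ff, i.e. var_off =
	 * (0x0; 0x3ff), from which umax_value <= 0x3ff is rederived
	 */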
Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 4 +++- kernel/bpf/tnum.c | 10 ++++++++++ kernel/bpf/verifier.c | 23 +++++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 0d2d3da46139..c7dc2b5902c0 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -23,8 +23,10 @@ struct tnum tnum_range(u64 min, u64 max); /* Arithmetic and logical ops */ /* Shift a tnum left (by a fixed shift) */ struct tnum tnum_lshift(struct tnum a, u8 shift); -/* Shift a tnum right (by a fixed shift) */ +/* Shift (rsh) a tnum right (by a fixed shift) */ struct tnum tnum_rshift(struct tnum a, u8 shift); +/* Shift (arsh) a tnum right (by a fixed min_shift) */ +struct tnum tnum_arshift(struct tnum a, u8 min_shift); /* Add two tnums, return @a + @b */ struct tnum tnum_add(struct tnum a, struct tnum b); /* Subtract two tnums, return @a - @b */ diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 1f4bf68c12db..938d41211be7 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -43,6 +43,16 @@ struct tnum tnum_rshift(struct tnum a, u8 shift) return TNUM(a.value >> shift, a.mask >> shift); } +struct tnum tnum_arshift(struct tnum a, u8 min_shift) +{ + /* if a.value is negative, arithmetic shifting by minimum shift + * will have larger negative offset compared to more shifting. + * If a.value is nonnegative, arithmetic shifting by minimum shift + * will have larger positive offset compared to more shifting. + */ + return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); +} + struct tnum tnum_add(struct tnum a, struct tnum b) { u64 sm, sv, sigma, chi, mu; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 612607ef7d0b..9b5971f88a01 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3270,6 +3270,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); break; + case BPF_ARSH: + if (umax_val >= insn_bitness) { + /* Shifts greater than 31 or 63 are undefined. + * This includes shifts by a negative number. + */ + mark_reg_unknown(env, regs, insn->dst_reg); + break; + } + + /* Upon reaching here, src_known is true and + * umax_val is equal to umin_val. + */ + dst_reg->smin_value >>= umin_val; + dst_reg->smax_value >>= umin_val; + dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val); + + /* blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. + */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + __update_reg_bounds(dst_reg); + break; default: mark_reg_unknown(env, regs, insn->dst_reg); break; From 5a04ce9295f1ec57234ada3105af4e1195546154 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Sat, 28 Apr 2018 16:06:19 -0700 Subject: [PATCH 0259/1640] UPSTREAM: bpf: Fix helpers ctx struct types in uapi doc Helpers may operate on two types of ctx structures: user visible ones (e.g. `struct bpf_sock_ops`) when used in user programs, and kernel ones (e.g. `struct bpf_sock_ops_kern`) in kernel implementation. UAPI documentation must refer only to user-visible structures. The patch replaces references to `_kern` structures in the BPF helper descriptions with the corresponding user-visible structures.
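The distinction matters to anyone coding against the header: a program only ever declares the user-visible type. A minimal sketch (assuming the usual SEC() and helper declarations from the libbpf headers; the congestion-control value is illustrative):

	SEC("sockops")
	int set_cc(struct bpf_sock_ops *skops)	/* not bpf_sock_ops_kern */
	{
		char cc[] = "cubic";

		bpf_setsockopt(skops, IPPROTO_TCP, TCP_CONGESTION,
			       cc, sizeof(cc));
		return 1;
	}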
Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d017f039f71a..17c7c817410c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1361,7 +1361,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1435,7 +1435,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. The * *skops* is used as a new value for the entry associated to @@ -1533,7 +1533,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For an eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in @@ -1544,7 +1544,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1588,7 +1588,7 @@ union bpf_attr { * Return * 0 * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval) + * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to @@ -1721,7 +1721,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len) + * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing From 1164f3e68faf37a99a33cb1e54fcb65aa94640a2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 28 Apr 2018 19:56:37 -0700 Subject: [PATCH 0260/1640] BACKPORT: bpf: remove tracepoints from bpf core Tracepoints to bpf core were added as a way to provide introspection to bpf programs and maps, but after some time it became clear that this approach is inadequate, so prog_id, map_id and corresponding get_next_id, get_fd_by_id, get_info_by_fd, prog_query APIs were introduced and fully adopted by bpftool and other applications.
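For reference, the id-based introspection that superseded them looks roughly like this from user space (a sketch using the libbpf syscall wrappers; error handling trimmed):

	__u32 id = 0;

	while (bpf_prog_get_next_id(id, &id) == 0) {
		struct bpf_prog_info info = {};
		__u32 len = sizeof(info);
		int fd = bpf_prog_get_fd_by_id(id);

		if (fd < 0)
			continue;
		bpf_obj_get_info_by_fd(fd, &info, &len);
		/* inspect info.type, info.tag, ... */
		close(fd);
	}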
The tracepoints in bpf core started to rot and causing syzbot warnings: WARNING: CPU: 0 PID: 3008 at kernel/trace/trace_event_perf.c:274 Kernel panic - not syncing: panic_on_warn set ... perf_trace_bpf_map_keyval+0x260/0xbd0 include/trace/events/bpf.h:228 trace_bpf_map_update_elem include/trace/events/bpf.h:274 [inline] map_update_elem kernel/bpf/syscall.c:597 [inline] SYSC_bpf kernel/bpf/syscall.c:1478 [inline] Hence this patch deletes tracepoints in bpf core. Reported-by: Eric Biggers Reported-by: syzbot Signed-off-by: Alexei Starovoitov Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/linux/bpf_trace.h | 1 - include/trace/events/bpf.h | 355 ------------------------------------- kernel/bpf/core.c | 6 - kernel/bpf/inode.c | 16 +- kernel/bpf/syscall.c | 15 +- 5 files changed, 2 insertions(+), 391 deletions(-) delete mode 100644 include/trace/events/bpf.h diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h index e6fe98ae3794..ddf896abcfb6 100644 --- a/include/linux/bpf_trace.h +++ b/include/linux/bpf_trace.h @@ -2,7 +2,6 @@ #ifndef __LINUX_BPF_TRACE_H__ #define __LINUX_BPF_TRACE_H__ -#include #include #endif /* __LINUX_BPF_TRACE_H__ */ diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h deleted file mode 100644 index 150185647e6b..000000000000 --- a/include/trace/events/bpf.h +++ /dev/null @@ -1,355 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM bpf - -#if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_BPF_H - -/* These are only used within the BPF_SYSCALL code */ -#ifdef CONFIG_BPF_SYSCALL - -#include -#include -#include -#include - -#define __PROG_TYPE_MAP(FN) \ - FN(SOCKET_FILTER) \ - FN(KPROBE) \ - FN(SCHED_CLS) \ - FN(SCHED_ACT) \ - FN(TRACEPOINT) \ - FN(XDP) \ - FN(PERF_EVENT) \ - FN(CGROUP_SKB) \ - FN(CGROUP_SOCK) \ - FN(LWT_IN) \ - FN(LWT_OUT) \ - FN(LWT_XMIT) - -#define __MAP_TYPE_MAP(FN) \ - FN(HASH) \ - FN(ARRAY) \ - FN(PROG_ARRAY) \ - FN(PERF_EVENT_ARRAY) \ - FN(PERCPU_HASH) \ - FN(PERCPU_ARRAY) \ - FN(STACK_TRACE) \ - FN(CGROUP_ARRAY) \ - FN(LRU_HASH) \ - FN(LRU_PERCPU_HASH) \ - FN(LPM_TRIE) - -#define __PROG_TYPE_TP_FN(x) \ - TRACE_DEFINE_ENUM(BPF_PROG_TYPE_##x); -#define __PROG_TYPE_SYM_FN(x) \ - { BPF_PROG_TYPE_##x, #x }, -#define __PROG_TYPE_SYM_TAB \ - __PROG_TYPE_MAP(__PROG_TYPE_SYM_FN) { -1, 0 } -__PROG_TYPE_MAP(__PROG_TYPE_TP_FN) - -#define __MAP_TYPE_TP_FN(x) \ - TRACE_DEFINE_ENUM(BPF_MAP_TYPE_##x); -#define __MAP_TYPE_SYM_FN(x) \ - { BPF_MAP_TYPE_##x, #x }, -#define __MAP_TYPE_SYM_TAB \ - __MAP_TYPE_MAP(__MAP_TYPE_SYM_FN) { -1, 0 } -__MAP_TYPE_MAP(__MAP_TYPE_TP_FN) - -DECLARE_EVENT_CLASS(bpf_prog_event, - - TP_PROTO(const struct bpf_prog *prg), - - TP_ARGS(prg), - - TP_STRUCT__entry( - __array(u8, prog_tag, 8) - __field(u32, type) - ), - - TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); - memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); - __entry->type = prg->type; - ), - - TP_printk("prog=%s type=%s", - __print_hex_str(__entry->prog_tag, 8), - __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB)) -); - -DEFINE_EVENT(bpf_prog_event, bpf_prog_get_type, - - TP_PROTO(const struct bpf_prog *prg), - - TP_ARGS(prg) -); - -DEFINE_EVENT(bpf_prog_event, bpf_prog_put_rcu, - - TP_PROTO(const struct bpf_prog *prg), - - TP_ARGS(prg) -); - -TRACE_EVENT(bpf_prog_load, - - TP_PROTO(const struct bpf_prog *prg, int ufd), - - TP_ARGS(prg, ufd), - - TP_STRUCT__entry( - __array(u8, prog_tag, 8) - __field(u32, type) - __field(int, 
ufd) - ), - - TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); - memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); - __entry->type = prg->type; - __entry->ufd = ufd; - ), - - TP_printk("prog=%s type=%s ufd=%d", - __print_hex_str(__entry->prog_tag, 8), - __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB), - __entry->ufd) -); - -TRACE_EVENT(bpf_map_create, - - TP_PROTO(const struct bpf_map *map, int ufd), - - TP_ARGS(map, ufd), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, size_key) - __field(u32, size_value) - __field(u32, max_entries) - __field(u32, flags) - __field(int, ufd) - ), - - TP_fast_assign( - __entry->type = map->map_type; - __entry->size_key = map->key_size; - __entry->size_value = map->value_size; - __entry->max_entries = map->max_entries; - __entry->flags = map->map_flags; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=%u val=%u max=%u flags=%x", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, __entry->size_key, __entry->size_value, - __entry->max_entries, __entry->flags) -); - -DECLARE_EVENT_CLASS(bpf_obj_prog, - - TP_PROTO(const struct bpf_prog *prg, int ufd, - const struct filename *pname), - - TP_ARGS(prg, ufd, pname), - - TP_STRUCT__entry( - __array(u8, prog_tag, 8) - __field(int, ufd) - __string(path, pname->name) - ), - - TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); - memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); - __assign_str(path, pname->name); - __entry->ufd = ufd; - ), - - TP_printk("prog=%s path=%s ufd=%d", - __print_hex_str(__entry->prog_tag, 8), - __get_str(path), __entry->ufd) -); - -DEFINE_EVENT(bpf_obj_prog, bpf_obj_pin_prog, - - TP_PROTO(const struct bpf_prog *prg, int ufd, - const struct filename *pname), - - TP_ARGS(prg, ufd, pname) -); - -DEFINE_EVENT(bpf_obj_prog, bpf_obj_get_prog, - - TP_PROTO(const struct bpf_prog *prg, int ufd, - const struct filename *pname), - - TP_ARGS(prg, ufd, pname) -); - -DECLARE_EVENT_CLASS(bpf_obj_map, - - TP_PROTO(const struct bpf_map *map, int ufd, - const struct filename *pname), - - TP_ARGS(map, ufd, pname), - - TP_STRUCT__entry( - __field(u32, type) - __field(int, ufd) - __string(path, pname->name) - ), - - TP_fast_assign( - __assign_str(path, pname->name); - __entry->type = map->map_type; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d path=%s", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, __get_str(path)) -); - -DEFINE_EVENT(bpf_obj_map, bpf_obj_pin_map, - - TP_PROTO(const struct bpf_map *map, int ufd, - const struct filename *pname), - - TP_ARGS(map, ufd, pname) -); - -DEFINE_EVENT(bpf_obj_map, bpf_obj_get_map, - - TP_PROTO(const struct bpf_map *map, int ufd, - const struct filename *pname), - - TP_ARGS(map, ufd, pname) -); - -DECLARE_EVENT_CLASS(bpf_map_keyval, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *val), - - TP_ARGS(map, ufd, key, val), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, key_len) - __dynamic_array(u8, key, map->key_size) - __field(bool, key_trunc) - __field(u32, val_len) - __dynamic_array(u8, val, map->value_size) - __field(bool, val_trunc) - __field(int, ufd) - ), - - TP_fast_assign( - memcpy(__get_dynamic_array(key), key, map->key_size); - memcpy(__get_dynamic_array(val), val, map->value_size); - __entry->type = map->map_type; - __entry->key_len = min(map->key_size, 16U); - __entry->key_trunc = map->key_size != __entry->key_len; - __entry->val_len = min(map->value_size, 16U); - 
__entry->val_trunc = map->value_size != __entry->val_len; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=[%s%s] val=[%s%s]", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, - __print_hex(__get_dynamic_array(key), __entry->key_len), - __entry->key_trunc ? " ..." : "", - __print_hex(__get_dynamic_array(val), __entry->val_len), - __entry->val_trunc ? " ..." : "") -); - -DEFINE_EVENT(bpf_map_keyval, bpf_map_lookup_elem, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *val), - - TP_ARGS(map, ufd, key, val) -); - -DEFINE_EVENT(bpf_map_keyval, bpf_map_update_elem, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *val), - - TP_ARGS(map, ufd, key, val) -); - -TRACE_EVENT(bpf_map_delete_elem, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key), - - TP_ARGS(map, ufd, key), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, key_len) - __dynamic_array(u8, key, map->key_size) - __field(bool, key_trunc) - __field(int, ufd) - ), - - TP_fast_assign( - memcpy(__get_dynamic_array(key), key, map->key_size); - __entry->type = map->map_type; - __entry->key_len = min(map->key_size, 16U); - __entry->key_trunc = map->key_size != __entry->key_len; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=[%s%s]", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, - __print_hex(__get_dynamic_array(key), __entry->key_len), - __entry->key_trunc ? " ..." : "") -); - -TRACE_EVENT(bpf_map_next_key, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *key_next), - - TP_ARGS(map, ufd, key, key_next), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, key_len) - __dynamic_array(u8, key, map->key_size) - __dynamic_array(u8, nxt, map->key_size) - __field(bool, key_trunc) - __field(bool, key_null) - __field(int, ufd) - ), - - TP_fast_assign( - if (key) - memcpy(__get_dynamic_array(key), key, map->key_size); - __entry->key_null = !key; - memcpy(__get_dynamic_array(nxt), key_next, map->key_size); - __entry->type = map->map_type; - __entry->key_len = min(map->key_size, 16U); - __entry->key_trunc = map->key_size != __entry->key_len; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=[%s%s] next=[%s%s]", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, - __entry->key_null ? "NULL" : __print_hex(__get_dynamic_array(key), - __entry->key_len), - __entry->key_trunc && !__entry->key_null ? " ..." : "", - __print_hex(__get_dynamic_array(nxt), __entry->key_len), - __entry->key_trunc ? " ..." 
: "") -); -#endif /* CONFIG_BPF_SYSCALL */ -#endif /* _TRACE_BPF_H */ - -#include diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6d6e9f16c30e..fa8b9dba465d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1986,9 +1986,3 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, #include EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); - -/* These are only used within the BPF_SYSCALL code */ -#ifdef CONFIG_BPF_SYSCALL -EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); -EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); -#endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 107de9ae2715..1e70912c9b01 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -429,13 +429,6 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) ret = bpf_obj_do_pin(pname, raw, type); if (ret != 0) bpf_any_put(raw, type); - if ((trace_bpf_obj_pin_prog_enabled() || - trace_bpf_obj_pin_map_enabled()) && !ret) { - if (type == BPF_TYPE_PROG) - trace_bpf_obj_pin_prog(raw, ufd, pname); - if (type == BPF_TYPE_MAP) - trace_bpf_obj_pin_map(raw, ufd, pname); - } out: putname(pname); return ret; @@ -502,15 +495,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags) else goto out; - if (ret < 0) { + if (ret < 0) bpf_any_put(raw, type); - } else if (trace_bpf_obj_get_prog_enabled() || - trace_bpf_obj_get_map_enabled()) { - if (type == BPF_TYPE_PROG) - trace_bpf_obj_get_prog(raw, ret, pname); - if (type == BPF_TYPE_MAP) - trace_bpf_obj_get_map(raw, ret, pname); - } out: putname(pname); return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b5b3126b598a..7f603537973c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -504,7 +504,6 @@ static int map_create(union bpf_attr *attr) return err; } - trace_bpf_map_create(map, err); return err; free_map: @@ -667,7 +666,6 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; - trace_bpf_map_lookup_elem(map, ufd, key, value); err = 0; free_value: @@ -776,8 +774,6 @@ static int map_update_elem(union bpf_attr *attr) preempt_enable(); maybe_wait_bpf_programs(map); out: - if (!err) - trace_bpf_map_update_elem(map, ufd, key, value); free_value: kfree(value); free_key: @@ -831,8 +827,6 @@ static int map_delete_elem(union bpf_attr *attr) preempt_enable(); maybe_wait_bpf_programs(map); out: - if (!err) - trace_bpf_map_delete_elem(map, ufd, key); kfree(key); err_put: fdput(f); @@ -896,7 +890,6 @@ out: if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; - trace_bpf_map_next_key(map, ufd, key, next_key); err = 0; free_next_key: @@ -1044,7 +1037,6 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) if (atomic_dec_and_test(&prog->aux->refcnt)) { int i; - trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); @@ -1211,11 +1203,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv); - - if (!IS_ERR(prog)) - trace_bpf_prog_get_type(prog); - return prog; + return __bpf_prog_get(ufd, &type, attach_drv); } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); @@ -1392,7 +1380,6 @@ static int bpf_prog_load(union bpf_attr *attr) * be using bpf_prog_put() given the program is exposed. 
*/ bpf_prog_kallsyms_add(prog); - trace_bpf_prog_load(prog, err); err = bpf_prog_new_fd(prog); if (err < 0) From 42338ec486bff7f3a8dfdb7301f2a75c9e942c2e Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 30 Apr 2018 11:39:03 +0100 Subject: [PATCH 0261/1640] UPSTREAM: bpf: fix formatting for bpf_perf_event_read() helper doc Some edits brought to the last iteration of BPF helper functions documentation introduced an error with RST formatting. As a result, most of one paragraph is rendered in bold text when only the name of a helper should be. Fix it, and fix formatting of another function name in the same paragraph. Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 17c7c817410c..b66a229c4a7f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -828,12 +828,12 @@ union bpf_attr { * * Also, be aware that the newer helper * **bpf_perf_event_read_value**\ () is recommended over - * **bpf_perf_event_read*\ () in general. The latter has some ABI + * **bpf_perf_event_read**\ () in general. The latter has some ABI * quirks where error and counter value are used as a return code * (which is wrong to do since ranges may overlap). This issue is - * fixed with bpf_perf_event_read_value(), which at the same time - * provides more features over the **bpf_perf_event_read**\ () - * interface. Please refer to the description of + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of * **bpf_perf_event_read_value**\ () for details. * Return * The value of the perf event counter read from the map, or a From a48d0f38cd91c72cb3d965a985e13be7bcced8eb Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 30 Apr 2018 11:39:04 +0100 Subject: [PATCH 0262/1640] UPSTREAM: bpf: fix formatting for bpf_get_stack() helper doc Fix formatting (indent) for bpf_get_stack() helper documentation, so that the doc is rendered correctly with the Python script. Fixes: c195651e565a ("bpf: add bpf_get_stack helper") Cc: Yonghong Song Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 44 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b66a229c4a7f..c9dffbbaadbe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1770,33 +1770,33 @@ union bpf_attr { * * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) * Description - * Return a user or a kernel stack in bpf program provided buffer. - * To achieve this, the helper needs *ctx*, which is a pointer - * to the context on which the tracing program is executed. - * To store the stacktrace, the bpf program provides *buf* with - * a nonnegative *size*. + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. * - * The last argument, *flags*, holds the number of stack frames to - * skip (from 0 to 255), masked with - * **BPF_F_SKIP_FIELD_MASK**. 
The next bits can be used to set - the following flags: + the following flags: * - * **BPF_F_USER_STACK** - * Collect a user space stack instead of a kernel stack. - * **BPF_F_USER_BUILD_ID** - * Collect buildid+offset instead of ips for user stack, - * only valid if **BPF_F_USER_STACK** is also specified. + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. * - * **bpf_get_stack**\ () can collect up to - * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject - * to sufficient large buffer size. Note that - * this limit can be controlled with the **sysctl** program, and - * that it should be manually increased in order to profile long - * user stacks (such as stacks for Java programs). To do so, use: + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: * - * :: + * :: * - * # sysctl kernel.perf_event_max_stack= + * # sysctl kernel.perf_event_max_stack= * * Return * a non-negative value equal to or less than size on success, or * a negative error in case of failure. From f01a80c09e53e0316d7ec0f397b0d2eb8b22bda7 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 2 May 2018 13:50:19 -0700 Subject: [PATCH 0263/1640] UPSTREAM: bpf: sockmap, fix scatterlist update on error path in send with apply When the call to do_tcp_sendpage() fails to send the complete block requested we either retry if only a partial send was completed or abort if we receive an error less than or equal to zero. Before returning though we must update the scatterlist length/offset to account for any partial send completed. Before this patch we did this at the end of the retry loop, but this was buggy when used while applying a verdict to fewer bytes than in the scatterlist. When the scatterlist length was being set we forgot to account for the apply logic reducing the size variable. So the result was we chopped off some bytes in the scatterlist without doing proper cleanup on them. This results in a WARNING when the sock is torn down because the bytes have previously been charged to the socket but are never uncharged. The fix is to simply do the accounting inside the retry loop, subtracting from the absolute scatterlist values rather than trying to accumulate the totals and subtract at the end.
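A worked example of the failure mode (numbers illustrative): take a 100-byte scatterlist entry with an apply verdict covering 40 bytes, so size = 40, and suppose 30 bytes are sent before an error occurs:

	/* old epilogue, after the retry loop (wrong under apply): */
	sg->length = size;	/* = 10: the 60 bytes past the applied
				 * region are silently chopped off */
	sg->offset = offset;	/* = 30 */

	/* new accounting, inside the loop, right after ret = 30: */
	sg->offset += ret;	/* 0 -> 30 */
	sg->length -= ret;	/* 100 -> 70: the 10 unsent applied bytes
				 * plus the 60 unapplied ones stay tracked */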
Reported-by: Alexei Starovoitov Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 99f7be8a065f..c533c75699bc 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -326,6 +326,9 @@ retry: if (ret > 0) { if (apply) apply_bytes -= ret; + + sg->offset += ret; + sg->length -= ret; size -= ret; offset += ret; if (uncharge) @@ -333,8 +336,6 @@ retry: goto retry; } - sg->length = size; - sg->offset = offset; return ret; } From 6f058b03b384ff9f95ac8272b4a7558673e89c17 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 2 May 2018 13:50:24 -0700 Subject: [PATCH 0264/1640] UPSTREAM: bpf: sockmap, zero sg_size on error when buffer is released When an error occurs during a redirect we have two cases that need to be handled: (i) we have a cork'ed buffer, or (ii) we have a normal sendmsg buffer. In the cork'ed buffer case we don't currently support recovering from errors in a redirect action. So the buffer is released and the error should _not_ be pushed back to the caller of sendmsg/sendpage. The rationale here is the user will get an error that relates to old data that may have been sent by some arbitrary thread on that sock. Instead we simply consume the data and tell the user that the data has been consumed. We may add proper error recovery in the future. However, this patch fixes a bug where the bytes outstanding counter sg_size was not zeroed. This could result in a case where if the user has both a cork'ed action and apply action in progress we may incorrectly call into the BPF program when the user expected an old verdict to be applied via the apply action. I don't have a use case where using apply and cork at the same time is valid but we never explicitly reject it because it should work fine. This patch ensures the sg_size is zeroed so we don't have this case. In the normal sendmsg buffer case (no cork data) we also did not zero sg_size. Again this can confuse the apply logic when the logic calls into the BPF program when the BPF programmer expected the old verdict to remain. So ensure we set sg_size to zero here as well. And additionally, to keep the psock state in sync with the sk_msg_buff, release all the memory as well. Previously we did this before returning to the user but this left a gap where psock and sk_msg_buff states were out of sync, which seems fragile. No additional overhead is taken here except for a call to check the length and realize it's already been freed. This is in the error path as well, so in my opinion let's have robust code over optimized error paths.
Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index c533c75699bc..f45f4dda5421 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -701,15 +701,22 @@ more_data: err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); lock_sock(sk); + if (unlikely(err < 0)) { + free_start_sg(sk, m); + psock->sg_size = 0; + if (!cork) + *copied -= send; + } else { + psock->sg_size -= send; + } + if (cork) { free_start_sg(sk, m); + psock->sg_size = 0; kfree(m); m = NULL; + err = 0; } - if (unlikely(err)) - *copied -= err; - else - psock->sg_size -= send; break; case __SK_DROP: default: From 756673b4d6c075674c531638f273258d24d900a2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 2 May 2018 13:50:29 -0700 Subject: [PATCH 0265/1640] UPSTREAM: bpf: sockmap, fix error handling in redirect failures When a redirect failure happens we release the in-flight buffers without calling sk_mem_uncharge(); the uncharge is called before dropping the sock lock for the redirect. However, we missed updating the ring start index. When no apply actions are in progress this is OK because we uncharge the entire buffer before the redirect. But, when we have apply logic running it's possible that only a portion of the buffer is being redirected. In this case we only do memory accounting for the buffer slice being redirected and expect to be able to loop over the BPF program again and/or, if a sock is closed, uncharge the memory at sock destruct time. With an invalid start index however the program logic looks at the start pointer index, checks the length, and when seeing the length is zero (from the initial release and failure to update the pointer) aborts without uncharging/releasing the remaining memory. The fix for this is simply to update the start index. To avoid fixing this error in two locations we do a small refactor and remove one case where it is open-coded. Then fix it in the single function.
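The essence of the fix is the single start-index update that now sits at the end of the release loop in free_bytes_sg() (see the diff below; i is the first entry left unfreed):

	md->sg_start = i;	/* ring keeps pointing at still-charged data */

Without it, a later pass reads sg[md->sg_start].length == 0, concludes the ring is empty, and aborts without uncharging the remainder.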
Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index f45f4dda5421..7974bc4c1789 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -393,7 +393,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) } while (i != md->sg_end); } -static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +static void free_bytes_sg(struct sock *sk, int bytes, + struct sk_msg_buff *md, bool charge) { struct scatterlist *sg = md->sg_data; int i = md->sg_start, free; @@ -403,11 +404,13 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) if (bytes < free) { sg[i].length -= bytes; sg[i].offset += bytes; - sk_mem_uncharge(sk, bytes); + if (charge) + sk_mem_uncharge(sk, bytes); break; } - sk_mem_uncharge(sk, sg[i].length); + if (charge) + sk_mem_uncharge(sk, sg[i].length); put_page(sg_page(&sg[i])); bytes -= sg[i].length; sg[i].length = 0; @@ -418,6 +421,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) if (i == MAX_SKB_FRAGS) i = 0; } + md->sg_start = i; } static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) @@ -576,10 +580,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct sk_msg_buff *md, int flags) { + bool ingress = !!(md->flags & BPF_F_INGRESS); struct smap_psock *psock; struct scatterlist *sg; - int i, err, free = 0; - bool ingress = !!(md->flags & BPF_F_INGRESS); + int err = 0; sg = md->sg_data; @@ -607,16 +611,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, out_rcu: rcu_read_unlock(); out: - i = md->sg_start; - while (sg[i].length) { - free += sg[i].length; - put_page(sg_page(&sg[i])); - sg[i].length = 0; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - } - return free; + free_bytes_sg(NULL, send, md, false); + return err; } static inline void bpf_md_init(struct smap_psock *psock) @@ -720,7 +716,7 @@ more_data: break; case __SK_DROP: default: - free_bytes_sg(sk, send, m); + free_bytes_sg(sk, send, m, true); apply_bytes_dec(psock, send); *copied -= send; psock->sg_size -= send; From 73642b43cefd2bcc4ae7244c1a59b1a26923790a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:23 +0200 Subject: [PATCH 0266/1640] UPSTREAM: xsk: add user memory registration support sockopt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this commit the base structure of the AF_XDP address family is set up. Further, we introduce the ability to register a window of user memory to the kernel via the XDP_UMEM_REG setsockopt syscall. The memory window is viewed by an AF_XDP socket as a set of equally large frames. After a user memory registration, all frames are "owned" by the user application, and not the kernel. v2: More robust checks on umem creation and unaccount on error. Call set_page_dirty_lock on cleanup. Simplified xdp_umem_reg.
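A hypothetical userspace sketch of the flow this patch enables: open an AF_XDP socket and hand it a page-aligned, frame-sized memory area via XDP_UMEM_REG. The AF_XDP and SOL_XDP constants are assumptions taken from later uapi headers (this patch itself does not define them), and the xdp_umem_reg field names follow the if_xdp.h added by this series:

#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef AF_XDP
#define AF_XDP 44	/* assumption: value from later kernel headers */
#endif
#ifndef SOL_XDP
#define SOL_XDP 283	/* assumption: value from later kernel headers */
#endif

int main(void)
{
	const unsigned long frame_size = 2048, nframes = 1024;
	struct xdp_umem_reg mr;
	void *area;
	int fd;

	fd = socket(AF_XDP, SOCK_RAW, 0);
	if (fd < 0)
		return 1;

	/* xdp_umem_reg() demands a page-aligned area made of whole frames */
	if (posix_memalign(&area, (size_t)sysconf(_SC_PAGESIZE),
			   nframes * frame_size))
		return 1;

	memset(&mr, 0, sizeof(mr));
	mr.addr = (unsigned long)area;
	mr.len = nframes * frame_size;
	mr.frame_size = frame_size;	/* power of two, at least 2048 */
	mr.frame_headroom = 0;

	return setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) ? 1 : 0;
}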
Co-authored-by: Magnus Karlsson Change-Id: I9031b4b9c2ead2c12994470833ad4a8fbb33e1c1 Signed-off-by: Magnus Karlsson Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 31 +++++ include/uapi/linux/if_xdp.h | 34 +++++ net/Makefile | 1 + net/xdp/Makefile | 2 + net/xdp/xdp_umem.c | 245 ++++++++++++++++++++++++++++++++++++ net/xdp/xdp_umem.h | 45 +++++++ net/xdp/xdp_umem_props.h | 23 ++++ net/xdp/xsk.c | 215 +++++++++++++++++++++++++++++++ 8 files changed, 596 insertions(+) create mode 100644 include/net/xdp_sock.h create mode 100644 include/uapi/linux/if_xdp.h create mode 100644 net/xdp/Makefile create mode 100644 net/xdp/xdp_umem.c create mode 100644 net/xdp/xdp_umem.h create mode 100644 net/xdp/xdp_umem_props.h create mode 100644 net/xdp/xsk.c diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h new file mode 100644 index 000000000000..94785f5db13e --- /dev/null +++ b/include/net/xdp_sock.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 + * AF_XDP internal functions + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_XDP_SOCK_H +#define _LINUX_XDP_SOCK_H + +#include +#include + +struct xdp_umem; + +struct xdp_sock { + /* struct sock must be the first member of struct xdp_sock */ + struct sock sk; + struct xdp_umem *umem; + /* Protects multiple processes in the control path */ + struct mutex mutex; +}; + +#endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h new file mode 100644 index 000000000000..41252135a0fe --- /dev/null +++ b/include/uapi/linux/if_xdp.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note + * + * if_xdp: XDP socket user-space interface + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * Author(s): Björn Töpel + * Magnus Karlsson + */ + +#ifndef _LINUX_IF_XDP_H +#define _LINUX_IF_XDP_H + +#include + +/* XDP socket options */ +#define XDP_UMEM_REG 3 + +struct xdp_umem_reg { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 frame_size; /* Frame size */ + __u32 frame_headroom; /* Frame head room */ +}; + +#endif /* _LINUX_IF_XDP_H */ diff --git a/net/Makefile b/net/Makefile index 42d3dcff921b..cf01cae6495d 100644 --- a/net/Makefile +++ b/net/Makefile @@ -85,6 +85,7 @@ obj-y += l3mdev/ endif obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ +obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_RMNET_DATA) += rmnet_data/ obj-$(CONFIG_RMNET_USB) += rmnet_usb/ obj-$(CONFIG_KNOX_NCM) += ncm/ diff --git a/net/xdp/Makefile b/net/xdp/Makefile new file mode 100644 index 000000000000..a5d736640a0f --- /dev/null +++ b/net/xdp/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o + diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c new file mode 100644 index 000000000000..ec8b3552be44 --- /dev/null +++ b/net/xdp/xdp_umem.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xdp_umem.h" + +#define XDP_UMEM_MIN_FRAME_SIZE 2048 + +int xdp_umem_create(struct xdp_umem **umem) +{ + *umem = kzalloc(sizeof(**umem), GFP_KERNEL); + + if (!(*umem)) + return -ENOMEM; + + return 0; +} + +static void xdp_umem_unpin_pages(struct xdp_umem *umem) +{ + unsigned int i; + + if (umem->pgs) { + for (i = 0; i < umem->npgs; i++) { + struct page *page = umem->pgs[i]; + + set_page_dirty_lock(page); + put_page(page); + } + + kfree(umem->pgs); + umem->pgs = NULL; + } +} + +static void xdp_umem_unaccount_pages(struct xdp_umem *umem) +{ + if (umem->user) { + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); + } +} + +static void xdp_umem_release(struct xdp_umem *umem) +{ + struct task_struct *task; + struct mm_struct *mm; + + if (umem->pgs) { + xdp_umem_unpin_pages(umem); + + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + mmput(mm); + umem->pgs = NULL; + } + + xdp_umem_unaccount_pages(umem); +out: + kfree(umem); +} + +static void xdp_umem_release_deferred(struct work_struct *work) +{ + struct xdp_umem *umem = container_of(work, struct xdp_umem, work); + + xdp_umem_release(umem); +} + +void xdp_get_umem(struct xdp_umem *umem) +{ + atomic_inc(&umem->users); +} + +void xdp_put_umem(struct xdp_umem *umem) +{ + if (!umem) + return; + + if (atomic_dec_and_test(&umem->users)) { + INIT_WORK(&umem->work, xdp_umem_release_deferred); + schedule_work(&umem->work); + } +} + +static int xdp_umem_pin_pages(struct xdp_umem *umem) +{ + unsigned int gup_flags = FOLL_WRITE; + long npgs; + int err; + + umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL); + if (!umem->pgs) + 
return -ENOMEM; + + down_write(¤t->mm->mmap_sem); + npgs = get_user_pages(umem->address, umem->npgs, + gup_flags, &umem->pgs[0], NULL); + up_write(¤t->mm->mmap_sem); + + if (npgs != umem->npgs) { + if (npgs >= 0) { + umem->npgs = npgs; + err = -ENOMEM; + goto out_pin; + } + err = npgs; + goto out_pgs; + } + return 0; + +out_pin: + xdp_umem_unpin_pages(umem); +out_pgs: + kfree(umem->pgs); + umem->pgs = NULL; + return err; +} + +static int xdp_umem_account_pages(struct xdp_umem *umem) +{ + unsigned long lock_limit, new_npgs, old_npgs; + + if (capable(CAP_IPC_LOCK)) + return 0; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + umem->user = get_uid(current_user()); + + do { + old_npgs = atomic_long_read(&umem->user->locked_vm); + new_npgs = old_npgs + umem->npgs; + if (new_npgs > lock_limit) { + free_uid(umem->user); + umem->user = NULL; + return -ENOBUFS; + } + } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs, + new_npgs) != old_npgs); + return 0; +} + +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) +{ + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; + u64 addr = mr->addr, size = mr->len; + unsigned int nframes, nfpp; + int size_chk, err; + + if (!umem) + return -EINVAL; + + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { + /* Strictly speaking we could support this, if: + * - huge pages, or* + * - using an IOMMU, or + * - making sure the memory area is consecutive + * but for now, we simply say "computer says no". + */ + return -EINVAL; + } + + if (!is_power_of_2(frame_size)) + return -EINVAL; + + if (!PAGE_ALIGNED(addr)) { + /* Memory area has to be page size aligned. For + * simplicity, this might change. + */ + return -EINVAL; + } + + if ((addr + size) < addr) + return -EINVAL; + + nframes = size / frame_size; + if (nframes == 0 || nframes > UINT_MAX) + return -EINVAL; + + nfpp = PAGE_SIZE / frame_size; + if (nframes < nfpp || nframes % nfpp) + return -EINVAL; + + frame_headroom = ALIGN(frame_headroom, 64); + + size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; + if (size_chk < 0) + return -EINVAL; + + umem->pid = get_task_pid(current, PIDTYPE_PID); + umem->size = (size_t)size; + umem->address = (unsigned long)addr; + umem->props.frame_size = frame_size; + umem->props.nframes = nframes; + umem->frame_headroom = frame_headroom; + umem->npgs = size / PAGE_SIZE; + umem->pgs = NULL; + umem->user = NULL; + + umem->frame_size_log2 = ilog2(frame_size); + umem->nfpp_mask = nfpp - 1; + umem->nfpplog2 = ilog2(nfpp); + atomic_set(&umem->users, 1); + + err = xdp_umem_account_pages(umem); + if (err) + goto out; + + err = xdp_umem_pin_pages(umem); + if (err) + goto out_account; + return 0; + +out_account: + xdp_umem_unaccount_pages(umem); +out: + put_pid(umem->pid); + return err; +} diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h new file mode 100644 index 000000000000..4597ae81a221 --- /dev/null +++ b/net/xdp/xdp_umem.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef XDP_UMEM_H_ +#define XDP_UMEM_H_ + +#include +#include +#include + +#include "xdp_umem_props.h" + +struct xdp_umem { + struct page **pgs; + struct xdp_umem_props props; + u32 npgs; + u32 frame_headroom; + u32 nfpp_mask; + u32 nfpplog2; + u32 frame_size_log2; + struct user_struct *user; + struct pid *pid; + unsigned long address; + size_t size; + atomic_t users; + struct work_struct work; +}; + +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); +void xdp_get_umem(struct xdp_umem *umem); +void xdp_put_umem(struct xdp_umem *umem); +int xdp_umem_create(struct xdp_umem **umem); + +#endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h new file mode 100644 index 000000000000..77fb5daf29f3 --- /dev/null +++ b/net/xdp/xdp_umem_props.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef XDP_UMEM_PROPS_H_ +#define XDP_UMEM_PROPS_H_ + +struct xdp_umem_props { + u32 frame_size; + u32 nframes; +}; + +#endif /* XDP_UMEM_PROPS_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c new file mode 100644 index 000000000000..84e0e867febb --- /dev/null +++ b/net/xdp/xsk.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP sockets + * + * AF_XDP sockets allows a channel between XDP programs and userspace + * applications. + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * Author(s): Björn Töpel + * Magnus Karlsson + */ + +#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xdp_umem.h" + +static struct xdp_sock *xdp_sk(struct sock *sk) +{ + return (struct xdp_sock *)sk; +} + +static int xsk_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct net *net; + + if (!sk) + return 0; + + net = sock_net(sk); + + local_bh_disable(); + sock_prot_inuse_add(net, sk->sk_prot, -1); + local_bh_enable(); + + sock_orphan(sk); + sock->sk = NULL; + + sk_refcnt_debug_release(sk); + sock_put(sk); + + return 0; +} + +static int xsk_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + int err; + + if (level != SOL_XDP) + return -ENOPROTOOPT; + + switch (optname) { + case XDP_UMEM_REG: + { + struct xdp_umem_reg mr; + struct xdp_umem *umem; + + if (xs->umem) + return -EBUSY; + + if (copy_from_user(&mr, optval, sizeof(mr))) + return -EFAULT; + + mutex_lock(&xs->mutex); + err = xdp_umem_create(&umem); + + err = xdp_umem_reg(umem, &mr); + if (err) { + kfree(umem); + mutex_unlock(&xs->mutex); + return err; + } + + /* Make sure umem is ready before it can be seen by others */ + smp_wmb(); + + xs->umem = umem; + mutex_unlock(&xs->mutex); + return 0; + } + default: + break; + } + + return -ENOPROTOOPT; +} + +static struct proto xsk_proto = { + .name = "XDP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct xdp_sock), +}; + +static const struct proto_ops xsk_proto_ops = { + .family = PF_XDP, + .owner = THIS_MODULE, + .release = xsk_release, + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = sock_no_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = xsk_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = sock_no_sendmsg, + .recvmsg = sock_no_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static void xsk_destruct(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + return; + + xdp_put_umem(xs->umem); + + sk_refcnt_debug_dec(sk); +} + +static int xsk_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct xdp_sock *xs; + + if (!ns_capable(net->user_ns, CAP_NET_RAW)) + return -EPERM; + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + if (protocol) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); + if (!sk) + return -ENOBUFS; + + sock->ops = &xsk_proto_ops; + + sock_init_data(sock, sk); + + sk->sk_family = PF_XDP; + + sk->sk_destruct = xsk_destruct; + sk_refcnt_debug_inc(sk); + + xs = xdp_sk(sk); + mutex_init(&xs->mutex); + + local_bh_disable(); + sock_prot_inuse_add(net, &xsk_proto, 1); + local_bh_enable(); + + return 0; +} + +static const struct net_proto_family xsk_family_ops = { + .family = PF_XDP, + .create = xsk_create, + .owner = THIS_MODULE, +}; + +static int __init xsk_init(void) +{ + int err; + + err = proto_register(&xsk_proto, 0 /* no slab */); + if (err) + goto out; + + err = sock_register(&xsk_family_ops); + if (err) + goto out_proto; + + return 0; + +out_proto: + proto_unregister(&xsk_proto); +out: + return err; +} + +fs_initcall(xsk_init); From 
417be2b2c45022f15f9707274dc33149e7cf9cb6 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:24 +0200 Subject: [PATCH 0267/1640] UPSTREAM: xsk: add umem fill queue support and mmap Here, we add another setsockopt for registered user memory (umem) called XDP_UMEM_FILL_RING. Using this socket option, the process can ask the kernel to allocate a queue (ring buffer) and also mmap it (XDP_UMEM_PGOFF_FILL_RING) into the process. The queue is used to explicitly pass ownership of umem frames from the user process to the kernel. These frames will in a later patch be filled in with Rx packet data by the kernel. v2: Fixed potential crash in xsk_mmap. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 15 +++++++++ net/xdp/Makefile | 2 +- net/xdp/xdp_umem.c | 5 +++ net/xdp/xdp_umem.h | 2 ++ net/xdp/xsk.c | 65 ++++++++++++++++++++++++++++++++++++- net/xdp/xsk_queue.c | 58 +++++++++++++++++++++++++++++++++ net/xdp/xsk_queue.h | 38 ++++++++++++++++++++++ 7 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 net/xdp/xsk_queue.c create mode 100644 net/xdp/xsk_queue.h diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 41252135a0fe..975661e1baca 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -23,6 +23,7 @@ /* XDP socket options */ #define XDP_UMEM_REG 3 +#define XDP_UMEM_FILL_RING 4 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -31,4 +32,18 @@ struct xdp_umem_reg { __u32 frame_headroom; /* Frame head room */ }; +/* Pgoff for mmaping the rings */ +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000 + +struct xdp_ring { + __u32 producer __attribute__((aligned(64))); + __u32 consumer __attribute__((aligned(64))); +}; + +/* Used for the fill and completion queues for buffers */ +struct xdp_umem_ring { + struct xdp_ring ptrs; + __u32 desc[0] __attribute__((aligned(64))); +}; + #endif /* _LINUX_IF_XDP_H */ diff --git a/net/xdp/Makefile b/net/xdp/Makefile index a5d736640a0f..074fb2b2d51c 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,2 +1,2 @@ -obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index ec8b3552be44..e1f627d0cc1c 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -65,6 +65,11 @@ static void xdp_umem_release(struct xdp_umem *umem) struct task_struct *task; struct mm_struct *mm; + if (umem->fq) { + xskq_destroy(umem->fq); + umem->fq = NULL; + } + if (umem->pgs) { xdp_umem_unpin_pages(umem); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 4597ae81a221..25634b8a5c6f 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -19,9 +19,11 @@ #include #include +#include "xsk_queue.h" #include "xdp_umem_props.h" struct xdp_umem { + struct xsk_queue *fq; struct page **pgs; struct xdp_umem_props props; u32 npgs; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 84e0e867febb..da67a3c5c1c9 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -32,6 +32,7 @@ #include #include +#include "xsk_queue.h" #include "xdp_umem.h" static struct xdp_sock *xdp_sk(struct sock *sk) @@ -39,6 +40,21 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } +static int xsk_init_queue(u32 entries, struct xsk_queue **queue) +{ + struct xsk_queue *q; + + if (entries == 0 || *queue || !is_power_of_2(entries)) + return -EINVAL; + + q = xskq_create(entries); + if (!q) + return -ENOMEM; + + *queue = q; + return 0; +} +
static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -101,6 +117,23 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, mutex_unlock(&xs->mutex); return 0; } + case XDP_UMEM_FILL_RING: + { + struct xsk_queue **q; + int entries; + + if (!xs->umem) + return -EINVAL; + + if (copy_from_user(&entries, optval, sizeof(entries))) + return -EFAULT; + + mutex_lock(&xs->mutex); + q = &xs->umem->fq; + err = xsk_init_queue(entries, q); + mutex_unlock(&xs->mutex); + return err; + } default: break; } @@ -108,6 +141,36 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; } +static int xsk_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct xdp_sock *xs = xdp_sk(sock->sk); + struct xsk_queue *q = NULL; + unsigned long pfn; + struct page *qpg; + + if (!xs->umem) + return -EINVAL; + + if (offset == XDP_UMEM_PGOFF_FILL_RING) + q = xs->umem->fq; + else + return -EINVAL; + + if (!q) + return -EINVAL; + + qpg = virt_to_head_page(q->ring); + if (size > (PAGE_SIZE << compound_order(qpg))) + return -EINVAL; + + pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; + return remap_pfn_range(vma, vma->vm_start, pfn, + size, vma->vm_page_prot); +} + static struct proto xsk_proto = { .name = "XDP", .owner = THIS_MODULE, @@ -131,7 +194,7 @@ static const struct proto_ops xsk_proto_ops = { .getsockopt = sock_no_getsockopt, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, - .mmap = sock_no_mmap, + .mmap = xsk_mmap, .sendpage = sock_no_sendpage, }; diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c new file mode 100644 index 000000000000..23da4f29d3fb --- /dev/null +++ b/net/xdp/xsk_queue.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP user-space ring structure + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include + +#include "xsk_queue.h" + +static u32 xskq_umem_get_ring_size(struct xsk_queue *q) +{ + return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32); +} + +struct xsk_queue *xskq_create(u32 nentries) +{ + struct xsk_queue *q; + gfp_t gfp_flags; + size_t size; + + q = kzalloc(sizeof(*q), GFP_KERNEL); + if (!q) + return NULL; + + q->nentries = nentries; + q->ring_mask = nentries - 1; + + gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | + __GFP_COMP | __GFP_NORETRY; + size = xskq_umem_get_ring_size(q); + + q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags, + get_order(size)); + if (!q->ring) { + kfree(q); + return NULL; + } + + return q; +} + +void xskq_destroy(struct xsk_queue *q) +{ + if (!q) + return; + + page_frag_free(q->ring); + kfree(q); +} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h new file mode 100644 index 000000000000..7eb556bf73be --- /dev/null +++ b/net/xdp/xsk_queue.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space ring structure + * Copyright(c) 2018 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_XSK_QUEUE_H +#define _LINUX_XSK_QUEUE_H + +#include +#include + +#include "xdp_umem_props.h" + +struct xsk_queue { + struct xdp_umem_props umem_props; + u32 ring_mask; + u32 nentries; + u32 prod_head; + u32 prod_tail; + u32 cons_head; + u32 cons_tail; + struct xdp_ring *ring; + u64 invalid_descs; +}; + +struct xsk_queue *xskq_create(u32 nentries); +void xskq_destroy(struct xsk_queue *q); + +#endif /* _LINUX_XSK_QUEUE_H */ From 0e3e8e562de9957d9e845de299938f0b15842129 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:25 +0200 Subject: [PATCH 0268/1640] UPSTREAM: xsk: add Rx queue setup and mmap support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Another setsockopt (XDP_RX_RING) is added to let the process allocate a queue, where the kernel can pass completed Rx frames from the kernel to the user process. The mmapping of the queue is done using the XDP_PGOFF_RX_RING offset. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 4 ++++ include/uapi/linux/if_xdp.h | 16 +++++++++++++++ net/xdp/xsk.c | 41 +++++++++++++++++++++++++++++-------- net/xdp/xsk_queue.c | 11 ++++++++-- net/xdp/xsk_queue.h | 2 +- 5 files changed, 62 insertions(+), 12 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 94785f5db13e..db9a321de087 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -18,11 +18,15 @@ #include #include +struct net_device; +struct xsk_queue; struct xdp_umem; struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; + struct xsk_queue *rx; + struct net_device *dev; struct xdp_umem *umem; /* Protects multiple processes in the control path */ struct mutex mutex; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 975661e1baca..65324558829d 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -22,6 +22,7 @@ #include /* XDP socket options */ +#define XDP_RX_RING 1 #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 @@ -33,13 +34,28 @@ struct xdp_umem_reg { }; /* Pgoff for mmaping the rings */ +#define XDP_PGOFF_RX_RING 0 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 +struct xdp_desc { + __u32 idx; + __u32 len; + __u16 offset; + __u8 flags; + __u8 padding[5]; +}; + struct xdp_ring { + __u32 producer __attribute__((aligned(64))); + __u32 consumer __attribute__((aligned(64))); }; +/* Used for the RX and TX queues for packets */ +struct xdp_rxtx_ring { + struct xdp_ring ptrs; + struct xdp_desc desc[0] __attribute__((aligned(64))); +}; + /* Used for the fill and completion queues for buffers */ struct xdp_umem_ring { struct xdp_ring ptrs; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index da67a3c5c1c9..92bd9b7e548f 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "xsk_queue.h" #include "xdp_umem.h" @@ -40,14 +41,15 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } -static int xsk_init_queue(u32
entries, struct xsk_queue **queue) +static int xsk_init_queue(u32 entries, struct xsk_queue **queue, + bool umem_queue) { struct xsk_queue *q; if (entries == 0 || *queue || !is_power_of_2(entries)) return -EINVAL; - q = xskq_create(entries); + q = xskq_create(entries, umem_queue); if (!q) return -ENOMEM; @@ -89,6 +91,22 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; switch (optname) { + case XDP_RX_RING: + { + struct xsk_queue **q; + int entries; + + if (optlen < sizeof(entries)) + return -EINVAL; + if (copy_from_user(&entries, optval, sizeof(entries))) + return -EFAULT; + + mutex_lock(&xs->mutex); + q = &xs->rx; + err = xsk_init_queue(entries, q, false); + mutex_unlock(&xs->mutex); + return err; + } case XDP_UMEM_REG: { struct xdp_umem_reg mr; @@ -130,7 +148,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, mutex_lock(&xs->mutex); q = &xs->umem->fq; - err = xsk_init_queue(entries, q); + err = xsk_init_queue(entries, q, true); mutex_unlock(&xs->mutex); return err; } @@ -151,13 +169,17 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long pfn; struct page *qpg; - if (!xs->umem) - return -EINVAL; + if (offset == XDP_PGOFF_RX_RING) { + q = xs->rx; + } else { + if (!xs->umem) + return -EINVAL; - if (offset == XDP_UMEM_PGOFF_FILL_RING) - q = xs->umem->fq; - else - return -EINVAL; + if (offset == XDP_UMEM_PGOFF_FILL_RING) + q = xs->umem->fq; + else + return -EINVAL; + } if (!q) return -EINVAL; @@ -205,6 +227,7 @@ static void xsk_destruct(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) return; + xskq_destroy(xs->rx); xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 23da4f29d3fb..894f9f89afc7 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -21,7 +21,13 @@ static u32 xskq_umem_get_ring_size(struct xsk_queue *q) return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32); } -struct xsk_queue *xskq_create(u32 nentries) +static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) +{ + return (sizeof(struct xdp_ring) + + q->nentries * sizeof(struct xdp_desc)); +} + +struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) { struct xsk_queue *q; gfp_t gfp_flags; @@ -36,7 +42,8 @@ struct xsk_queue *xskq_create(u32 nentries) gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | __GFP_NORETRY; - size = xskq_umem_get_ring_size(q); + size = umem_queue ? xskq_umem_get_ring_size(q) : + xskq_rxtx_get_ring_size(q); q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags, get_order(size)); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 7eb556bf73be..5439fa381763 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -32,7 +32,7 @@ struct xsk_queue { u64 invalid_descs; }; -struct xsk_queue *xskq_create(u32 nentries); +struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q); #endif /* _LINUX_XSK_QUEUE_H */ From 17dee8c73298b06f8ca90fd3f3f98da8a7481a01 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:26 +0200 Subject: [PATCH 0269/1640] UPSTREAM: xsk: add support for bind for Rx Here, the bind syscall is added. Binding an AF_XDP socket means associating the socket with an umem, a netdev and a queue index. This can be done in two ways. The first way is creating a "socket from scratch". Create the umem using the XDP_UMEM_REG setsockopt and an associated fill queue with XDP_UMEM_FILL_RING. Create the Rx queue using the XDP_RX_RING setsockopt.
Call bind passing ifindex and queue index ("channel" in ethtool speak). The second way to bind a socket is to simply skip the umem/netdev/queue index and pass another, already set up, AF_XDP socket. The new socket will then have the same umem/netdev/queue index as the parent, so it will share the same umem. You must also set the flags field in the socket address to XDP_SHARED_UMEM. v2: Use PTR_ERR instead of passing error variable explicitly. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 1 + include/uapi/linux/if_xdp.h | 11 ++++ net/xdp/xdp_umem.c | 5 ++ net/xdp/xdp_umem.h | 1 + net/xdp/xsk.c | 124 +++++++++++++++++++++++++++++++++++- net/xdp/xsk_queue.c | 8 +++ net/xdp/xsk_queue.h | 1 + 7 files changed, 150 insertions(+), 1 deletion(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index db9a321de087..85d02512f59b 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -28,6 +28,7 @@ struct xdp_sock { struct xsk_queue *rx; struct net_device *dev; struct xdp_umem *umem; + u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; }; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 65324558829d..e5091881f776 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -21,6 +21,17 @@ #include +/* Options for the sxdp_flags field */ +#define XDP_SHARED_UMEM 1 + +struct sockaddr_xdp { + __u16 sxdp_family; + __u32 sxdp_ifindex; + __u32 sxdp_queue_id; + __u32 sxdp_shared_umem_fd; + __u16 sxdp_flags; +}; + /* XDP socket options */ #define XDP_RX_RING 1 #define XDP_UMEM_REG 3 diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index e1f627d0cc1c..9bac1ad570fa 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -248,3 +248,8 @@ out: put_pid(umem->pid); return err; } + +bool xdp_umem_validate_queues(struct xdp_umem *umem) +{ + return umem->fq; +} diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 25634b8a5c6f..b13133e9c501 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -39,6 +39,7 @@ struct xdp_umem { struct work_struct work; }; +bool xdp_umem_validate_queues(struct xdp_umem *umem); int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 92bd9b7e548f..bf2c97b87992 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -57,9 +57,18 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, return 0; } +static void __xsk_release(struct xdp_sock *xs) +{ + /* Wait for driver to stop using the xdp socket.
*/ + synchronize_net(); + + dev_put(xs->dev); +} + static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); struct net *net; if (!sk) @@ -71,6 +80,11 @@ static int xsk_release(struct socket *sock) sock_prot_inuse_add(net, sk->sk_prot, -1); local_bh_enable(); + if (xs->dev) { + __xsk_release(xs); + xs->dev = NULL; + } + sock_orphan(sk); sock->sk = NULL; @@ -80,6 +94,114 @@ static int xsk_release(struct socket *sock) return 0; } +static struct socket *xsk_lookup_xsk_from_fd(int fd) +{ + struct socket *sock; + int err; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return ERR_PTR(-ENOTSOCK); + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return ERR_PTR(-ENOPROTOOPT); + } + + return sock; +} + +static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; + struct sock *sk = sock->sk; + struct net_device *dev, *dev_curr; + struct xdp_sock *xs = xdp_sk(sk); + struct xdp_umem *old_umem = NULL; + int err = 0; + + if (addr_len < sizeof(struct sockaddr_xdp)) + return -EINVAL; + if (sxdp->sxdp_family != AF_XDP) + return -EINVAL; + + mutex_lock(&xs->mutex); + dev_curr = xs->dev; + dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); + if (!dev) { + err = -ENODEV; + goto out_release; + } + + if (!xs->rx) { + err = -EINVAL; + goto out_unlock; + } + + if (sxdp->sxdp_queue_id >= dev->num_rx_queues) { + err = -EINVAL; + goto out_unlock; + } + + if (sxdp->sxdp_flags & XDP_SHARED_UMEM) { + struct xdp_sock *umem_xs; + struct socket *sock; + + if (xs->umem) { + /* We have already our own. */ + err = -EINVAL; + goto out_unlock; + } + + sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); + if (IS_ERR(sock)) { + err = PTR_ERR(sock); + goto out_unlock; + } + + umem_xs = xdp_sk(sock->sk); + if (!umem_xs->umem) { + /* No umem to inherit. */ + err = -EBADF; + sockfd_put(sock); + goto out_unlock; + } else if (umem_xs->dev != dev || + umem_xs->queue_id != sxdp->sxdp_queue_id) { + err = -EINVAL; + sockfd_put(sock); + goto out_unlock; + } + + xdp_get_umem(umem_xs->umem); + old_umem = xs->umem; + xs->umem = umem_xs->umem; + sockfd_put(sock); + } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { + err = -EINVAL; + goto out_unlock; + } + + /* Rebind? 
*/ + if (dev_curr && (dev_curr != dev || + xs->queue_id != sxdp->sxdp_queue_id)) { + __xsk_release(xs); + if (old_umem) + xdp_put_umem(old_umem); + } + + xs->dev = dev; + xs->queue_id = sxdp->sxdp_queue_id; + + xskq_set_umem(xs->rx, &xs->umem->props); + +out_unlock: + if (err) + dev_put(dev); +out_release: + mutex_unlock(&xs->mutex); + return err; +} + static int xsk_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -203,7 +325,7 @@ static const struct proto_ops xsk_proto_ops = { .family = PF_XDP, .owner = THIS_MODULE, .release = xsk_release, - .bind = sock_no_bind, + .bind = xsk_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 894f9f89afc7..d012e5e23591 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -16,6 +16,14 @@ #include "xsk_queue.h" +void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props) +{ + if (!q) + return; + + q->umem_props = *umem_props; +} + static u32 xskq_umem_get_ring_size(struct xsk_queue *q) { return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32); } diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5439fa381763..9ddd2ee07a84 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -32,6 +32,7 @@ struct xsk_queue { u64 invalid_descs; }; +void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q); From 9e16851eecea6d722a1071271b00cc98c523f8e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:27 +0200 Subject: [PATCH 0270/1640] UPSTREAM: xsk: add Rx receive functions and poll support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Here the actual receive functions of AF_XDP are implemented; these will, in a later commit, be called from the XDP layers. There's one set of functions for the XDP_DRV side and another for XDP_SKB (generic). A new XDP API, xdp_return_buff, is also introduced; it is analogous to xdp_return_frame, but acts upon a struct xdp_buff. The API will be used by AF_XDP in future commits. Support for the poll syscall is also implemented. v2: xskq_validate_id did not update cons_tail. The entries variable was calculated twice in xskq_nb_avail. Squashed xdp_return_buff commit.
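A hypothetical userspace sketch of the consumer side these functions feed, after an XDP_RX_RING setsockopt and an mmap at XDP_PGOFF_RX_RING: wait with poll(), then walk the descriptors between the cached consumer index and the kernel's producer index. The struct layouts come from the if_xdp.h of this series, and the memory barriers a production consumer needs are omitted for brevity:

#include <poll.h>
#include <linux/if_xdp.h>

/* 'ring' points at the XDP_PGOFF_RX_RING mmap; nentries is a power of two */
static void drain_rx_ring(int xsk_fd, struct xdp_rxtx_ring *ring,
			  unsigned int nentries, unsigned int *cached_cons)
{
	struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN };

	if (poll(&pfd, 1, -1) <= 0)
		return;

	while (*cached_cons != ring->ptrs.producer) {
		struct xdp_desc *d = &ring->desc[*cached_cons & (nentries - 1)];

		/* d->idx names the umem frame, d->len the payload length,
		 * d->offset the headroom the kernel wrote the data after
		 */
		(void)d;
		(*cached_cons)++;
	}
	ring->ptrs.consumer = *cached_cons;	/* hand the slots back */
}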
Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 1 + include/net/xdp_sock.h | 22 ++++++++ net/core/xdp.c | 15 ++++-- net/xdp/xdp_umem.h | 18 +++++++ net/xdp/xsk.c | 73 +++++++++++++++++++++++++- net/xdp/xsk_queue.h | 114 ++++++++++++++++++++++++++++++++++++++++- 6 files changed, 238 insertions(+), 5 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 137ad5f9f40f..0b689cf561c7 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) } void xdp_return_frame(struct xdp_frame *xdpf); +void xdp_return_buff(struct xdp_buff *xdp); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 85d02512f59b..a0342dff6a4d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -31,6 +31,28 @@ struct xdp_sock { u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; + u64 rx_dropped; }; +struct xdp_buff; +#ifdef CONFIG_XDP_SOCKETS +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +void xsk_flush(struct xdp_sock *xs); +#else +static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} + +static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} + +static inline void xsk_flush(struct xdp_sock *xs) +{ +} +#endif /* CONFIG_XDP_SOCKETS */ + #endif /* _LINUX_XDP_SOCK_H */ diff --git a/net/core/xdp.c b/net/core/xdp.c index 0c86b53a3a63..bf6758f74339 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -308,11 +308,9 @@ err: } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -void xdp_return_frame(struct xdp_frame *xdpf) +static void xdp_return(void *data, struct xdp_mem_info *mem) { - struct xdp_mem_info *mem = &xdpf->mem; struct xdp_mem_allocator *xa; - void *data = xdpf->data; struct page *page; switch (mem->type) { @@ -339,4 +337,15 @@ void xdp_return_frame(struct xdp_frame *xdpf) break; } } + +void xdp_return_frame(struct xdp_frame *xdpf) +{ + xdp_return(xdpf->data, &xdpf->mem); +} EXPORT_SYMBOL_GPL(xdp_return_frame); + +void xdp_return_buff(struct xdp_buff *xdp) +{ + xdp_return(xdp->data, &xdp->rxq->mem); +} +EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index b13133e9c501..c7378a11721f 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -39,6 +39,24 @@ struct xdp_umem { struct work_struct work; }; +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx) +{ + u64 pg, off; + char *data; + + pg = idx >> umem->nfpplog2; + off = (idx & umem->nfpp_mask) << umem->frame_size_log2; + + data = page_address(umem->pgs[pg]); + return data + off; +} + +static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem, + u32 idx) +{ + return xdp_umem_get_data(umem, idx) + umem->frame_headroom; +} + bool xdp_umem_validate_queues(struct xdp_umem *umem); int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); void xdp_get_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index bf2c97b87992..4e1e6c581e1d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,6 +41,74 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + u32 *id, len = xdp->data_end - xdp->data; + void *buffer; + int err = 0; + + if 
(xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + + id = xskq_peek_id(xs->umem->fq); + if (!id) + return -ENOSPC; + + buffer = xdp_umem_get_data_with_headroom(xs->umem, *id); + memcpy(buffer, xdp->data, len); + err = xskq_produce_batch_desc(xs->rx, *id, len, + xs->umem->frame_headroom); + if (!err) + xskq_discard_id(xs->umem->fq); + + return err; +} + +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + + err = __xsk_rcv(xs, xdp); + if (likely(!err)) + xdp_return_buff(xdp); + else + xs->rx_dropped++; + + return err; +} + +void xsk_flush(struct xdp_sock *xs) +{ + xskq_produce_flush_desc(xs->rx); + xs->sk.sk_data_ready(&xs->sk); +} + +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + + err = __xsk_rcv(xs, xdp); + if (!err) + xsk_flush(xs); + else + xs->rx_dropped++; + + return err; +} + +static unsigned int xsk_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + unsigned int mask = datagram_poll(file, sock, wait); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + + if (xs->rx && !xskq_empty_desc(xs->rx)) + mask |= POLLIN | POLLRDNORM; + + return mask; +} + static int xsk_init_queue(u32 entries, struct xsk_queue **queue, bool umem_queue) { @@ -179,6 +247,9 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { err = -EINVAL; goto out_unlock; + } else { + /* This xsk has its own umem. */ + xskq_set_umem(xs->umem->fq, &xs->umem->props); } /* Rebind? */ @@ -330,7 +401,7 @@ static const struct proto_ops xsk_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = sock_no_poll, + .poll = xsk_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9ddd2ee07a84..0a9b92b4f93a 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -20,6 +20,8 @@ #include "xdp_umem_props.h" +#define RX_BATCH_SIZE 16 + struct xsk_queue { struct xdp_umem_props umem_props; u32 ring_mask; @@ -32,8 +34,118 @@ struct xsk_queue { u64 invalid_descs; }; +/* Common functions operating for both RXTX and umem queues */ + +static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) +{ + u32 entries = q->prod_tail - q->cons_tail; + + if (entries == 0) { + /* Refresh the local pointer */ + q->prod_tail = READ_ONCE(q->ring->producer); + entries = q->prod_tail - q->cons_tail; + } + + return (entries > dcnt) ? 
dcnt : entries; +} + +static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) +{ + u32 free_entries = q->nentries - (producer - q->cons_tail); + + if (free_entries >= dcnt) + return free_entries; + + /* Refresh the local tail pointer */ + q->cons_tail = READ_ONCE(q->ring->consumer); + return q->nentries - (producer - q->cons_tail); +} + +/* UMEM queue */ + +static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx) +{ + if (unlikely(idx >= q->umem_props.nframes)) { + q->invalid_descs++; + return false; + } + return true; +} + +static inline u32 *xskq_validate_id(struct xsk_queue *q) +{ + while (q->cons_tail != q->cons_head) { + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + unsigned int idx = q->cons_tail & q->ring_mask; + + if (xskq_is_valid_id(q, ring->desc[idx])) + return &ring->desc[idx]; + + q->cons_tail++; + } + + return NULL; +} + +static inline u32 *xskq_peek_id(struct xsk_queue *q) +{ + struct xdp_umem_ring *ring; + + if (q->cons_tail == q->cons_head) { + WRITE_ONCE(q->ring->consumer, q->cons_tail); + q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + + /* Order consumer and data */ + smp_rmb(); + + return xskq_validate_id(q); + } + + ring = (struct xdp_umem_ring *)q->ring; + return &ring->desc[q->cons_tail & q->ring_mask]; +} + +static inline void xskq_discard_id(struct xsk_queue *q) +{ + q->cons_tail++; + (void)xskq_validate_id(q); +} + +/* Rx queue */ + +static inline int xskq_produce_batch_desc(struct xsk_queue *q, + u32 id, u32 len, u16 offset) +{ + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + unsigned int idx; + + if (xskq_nb_free(q, q->prod_head, 1) == 0) + return -ENOSPC; + + idx = (q->prod_head++) & q->ring_mask; + ring->desc[idx].idx = id; + ring->desc[idx].len = len; + ring->desc[idx].offset = offset; + + return 0; +} + +static inline void xskq_produce_flush_desc(struct xsk_queue *q) +{ + /* Order producer and data */ + smp_wmb(); + + q->prod_tail = q->prod_head, + WRITE_ONCE(q->ring->producer, q->prod_tail); +} + +static inline bool xskq_empty_desc(struct xsk_queue *q) +{ + return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); +} + void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); -void xskq_destroy(struct xsk_queue *q); +void xskq_destroy(struct xsk_queue *q_ops); #endif /* _LINUX_XSK_QUEUE_H */ From 1589cf546088ba14d959b6d378fb645d0bad9f10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:28 +0200 Subject: [PATCH 0271/1640] UPSTREAM: bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xskmap is yet another BPF map, very much inspired by dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application adds AF_XDP sockets into the map, and by using the bpf_redirect_map helper, an XDP program can redirect XDP frames to an AF_XDP socket. Note that a socket that is bound to certain ifindex/queue index will *only* accept XDP frames from that netdev/queue index. If an XDP program tries to redirect from a netdev/queue index other than what the socket is bound to, the frame will not be received on the socket. A socket can reside in multiple maps. v3: Fixed race and simplified code. v2: Removed one indirection in map lookup. 
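A hypothetical XDP program sketch showing the BPF side of such a map, written in the bpf_map_def style of the 2018-era samples (the SEC macros and "bpf_helpers.h" are assumptions about that toolchain):

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* assumption: samples/bpf helper header */

struct bpf_map_def SEC("maps") xsks_map = {
	.type = BPF_MAP_TYPE_XSKMAP,
	.key_size = sizeof(int),
	.value_size = sizeof(int),
	.max_entries = 4,
};

SEC("xdp_sock")
int xdp_sock_prog(struct xdp_md *ctx)
{
	int index = ctx->rx_queue_index;

	/* Redirect into the AF_XDP socket stored at this Rx queue's
	 * index; the redirect fails when no socket is bound there.
	 * The flags argument must be 0 here.
	 */
	return bpf_redirect_map(&xsks_map, index, 0);
}

char _license[] SEC("license") = "GPL";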
Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 25 ++++ include/linux/bpf_types.h | 3 + include/net/xdp_sock.h | 7 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 3 + kernel/bpf/verifier.c | 8 +- kernel/bpf/xskmap.c | 239 ++++++++++++++++++++++++++++++++++++++ net/xdp/xsk.c | 5 + 8 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/xskmap.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0d3ea0aa1929..4c1823569b6d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -693,6 +693,31 @@ static inline int sock_map_prog(struct bpf_map *map, } #endif +#if defined(CONFIG_XDP_SOCKETS) +struct xdp_sock; +struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key); +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs); +void __xsk_map_flush(struct bpf_map *map); +#else +struct xdp_sock; +static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, + u32 key) +{ + return NULL; +} + +static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + return -EOPNOTSUPP; +} + +static inline void __xsk_map_flush(struct bpf_map *map) +{ +} +#endif + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b28fcf6f6ae..d7df1b323082 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -49,4 +49,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) +#if defined(CONFIG_XDP_SOCKETS) +BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) +#endif #endif diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index a0342dff6a4d..ce3a2ab16b8f 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -28,6 +28,7 @@ struct xdp_sock { struct xsk_queue *rx; struct net_device *dev; struct xdp_umem *umem; + struct list_head flush_node; u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; @@ -39,6 +40,7 @@ struct xdp_buff; int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -53,6 +55,11 @@ static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static inline void xsk_flush(struct xdp_sock *xs) { } + +static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) +{ + return false; +} #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c9dffbbaadbe..cce6d4b3df54 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -116,6 +116,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, + BPF_MAP_TYPE_XSKMAP, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 2d12e26f751e..1a7e6c1faf91 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,6 +9,9 @@ obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +ifeq ($(CONFIG_XDP_SOCKETS),y) 
+obj-$(CONFIG_BPF_SYSCALL) += xskmap.o +endif obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) ifeq ($(CONFIG_INET),y) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9b5971f88a01..72f528ed9965 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2130,8 +2130,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_redirect_map) goto error; break; - /* Restrict bpf side of cpumap, open when use-cases appear */ + /* Restrict bpf side of cpumap and xskmap, open when use-cases + * appear. + */ case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; @@ -2178,7 +2181,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && - map->map_type != BPF_MAP_TYPE_CPUMAP) + map->map_type != BPF_MAP_TYPE_CPUMAP && + map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; break; case BPF_FUNC_sk_redirect_map: diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c new file mode 100644 index 000000000000..869dbb11b612 --- /dev/null +++ b/kernel/bpf/xskmap.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XSKMAP used for AF_XDP sockets + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include +#include +#include +#include +#include + +struct xsk_map { + struct bpf_map map; + struct xdp_sock **xsk_map; + struct list_head __percpu *flush_list; +}; + +static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) +{ + int cpu, err = -EINVAL; + struct xsk_map *m; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) + return ERR_PTR(-EINVAL); + + m = kzalloc(sizeof(*m), GFP_USER); + if (!m) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&m->map, attr); + + cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); + cost += sizeof(struct list_head) * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_m; + + m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* Notice returns -EPERM on if map size is larger than memlock limit */ + err = bpf_map_precharge_memlock(m->map.pages); + if (err) + goto free_m; + + m->flush_list = alloc_percpu(struct list_head); + if (!m->flush_list) + goto free_m; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); + + m->xsk_map = bpf_map_area_alloc(m->map.max_entries * + sizeof(struct xdp_sock *), + m->map.numa_node); + if (!m->xsk_map) + goto free_percpu; + return &m->map; + +free_percpu: + free_percpu(m->flush_list); +free_m: + kfree(m); + return ERR_PTR(err); +} + +static void xsk_map_free(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + int i; + + synchronize_net(); + + for (i = 0; i < map->max_entries; i++) { + struct xdp_sock *xs; + + xs = m->xsk_map[i]; + if (!xs) + continue; + + sock_put((struct sock *)xs); + } + + free_percpu(m->flush_list); + bpf_map_area_free(m->xsk_map); + kfree(m); +} + +static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 index = key ? 
*(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= m->map.max_entries) { + *next = 0; + return 0; + } + + if (index == m->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs; + + if (key >= map->max_entries) + return NULL; + + xs = READ_ONCE(m->xsk_map[key]); + return xs; +} + +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + int err; + + err = xsk_rcv(xs, xdp); + if (err) + return err; + + if (!xs->flush_node.prev) + list_add(&xs->flush_node, flush_list); + + return 0; +} + +void __xsk_map_flush(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct xdp_sock *xs, *tmp; + + list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { + xsk_flush(xs); + __list_del(xs->flush_node.prev, xs->flush_node.next); + xs->flush_node.prev = NULL; + } +} + +static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 i = *(u32 *)key, fd = *(u32 *)value; + struct xdp_sock *xs, *old_xs; + struct socket *sock; + int err; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(i >= m->map.max_entries)) + return -E2BIG; + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return err; + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + xs = (struct xdp_sock *)sock->sk; + + if (!xsk_is_setup_for_bpf_map(xs)) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + sock_hold(sock->sk); + + old_xs = xchg(&m->xsk_map[i], xs); + if (old_xs) { + /* Make sure we've flushed everything. */ + synchronize_net(); + sock_put((struct sock *)old_xs); + } + + sockfd_put(sock); + return 0; +} + +static int xsk_map_delete_elem(struct bpf_map *map, void *key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *old_xs; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + old_xs = xchg(&m->xsk_map[k], NULL); + if (old_xs) { + /* Make sure we've flushed everything. 
*/ + synchronize_net(); + sock_put((struct sock *)old_xs); + } + + return 0; +} + +const struct bpf_map_ops xsk_map_ops = { + .map_alloc = xsk_map_alloc, + .map_free = xsk_map_free, + .map_get_next_key = xsk_map_get_next_key, + .map_lookup_elem = xsk_map_lookup_elem, + .map_update_elem = xsk_map_update_elem, + .map_delete_elem = xsk_map_delete_elem, +}; + + diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4e1e6c581e1d..b931a0db5588 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,6 +41,11 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) +{ + return !!xs->rx; +} + static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 *id, len = xdp->data_end - xdp->data; From 27acf51b204a3b7b972fa06619c39831dc6c1045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:29 +0200 Subject: [PATCH 0272/1640] UPSTREAM: xsk: wire up XDP_DRV side of AF_XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit wires up the xskmap to XDP_DRV layer. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index f1f9574a38ac..bb20c5b3b45c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2851,7 +2851,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, { int err; - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: { struct net_device *dev = fwd; struct xdp_frame *xdpf; @@ -2869,14 +2870,25 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, if (err) return err; __dev_map_insert_ctx(map, index); - - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + break; + } + case BPF_MAP_TYPE_CPUMAP: { struct bpf_cpu_map_entry *rcpu = fwd; err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (err) return err; __cpu_map_insert_ctx(map, index); + break; + } + case BPF_MAP_TYPE_XSKMAP: { + struct xdp_sock *xs = fwd; + + err = __xsk_map_redirect(map, xdp, xs); + return err; + } + default: + break; } return 0; } @@ -2895,6 +2907,9 @@ void xdp_do_flush_map(void) case BPF_MAP_TYPE_CPUMAP: __cpu_map_flush(map); break; + case BPF_MAP_TYPE_XSKMAP: + __xsk_map_flush(map); + break; default: break; } @@ -2909,6 +2924,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) return __dev_map_lookup_elem(map, index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); + case BPF_MAP_TYPE_XSKMAP: + return __xsk_map_lookup_elem(map, index); default: return NULL; } From 0e1b0ea1f5e6d6578b78fae9cc6f044c28f1881d Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Tue, 17 Apr 2018 21:42:14 -0700 Subject: [PATCH 0273/1640] UPSTREAM: bpf: make generic xdp compatible w/ bpf_xdp_adjust_tail w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as well (only "decrease" of pointer's location is going to be supported). changing of this pointer will change packet's size. for generic XDP we need to reflect this packet's length change by adjusting skb's tail pointer Acked-by: Alexei Starovoitov Signed-off-by: Nikita V. 
Shirokov Signed-off-by: Daniel Borkmann --- net/core/dev.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 4eb45598b79d..e58a2a5929df 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3969,9 +3969,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct netdev_rx_queue *rxqueue; + void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; struct xdp_buff xdp; - void *orig_data; int hlen, off; u32 mac_len; @@ -4010,6 +4010,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp.data_meta = xdp.data; xdp.data_end = xdp.data + hlen; xdp.data_hard_start = skb->data - skb_headroom(skb); + orig_data_end = xdp.data_end; orig_data = xdp.data; rxqueue = netif_get_rxqueue(skb); @@ -4024,6 +4025,13 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_push(skb, -off); skb->mac_header += off; + /* check if bpf_xdp_adjust_tail was used. it can only "shrink" + * pckt. + */ + off = orig_data_end - xdp.data_end; + if (off != 0) + skb_set_tail_pointer(skb, xdp.data_end - xdp.data); + switch (act) { case XDP_REDIRECT: case XDP_TX: From c1442a2448e03229a9348db169e95d69ce5fd588 Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Wed, 25 Apr 2018 07:15:03 -0700 Subject: [PATCH 0274/1640] UPSTREAM: bpf: fix xdp_generic for bpf_adjust_tail usecase When bpf_adjust_tail was introduced for generic xdp, it changed skb's tail pointer, so it was pointing to the new "end of the packet". However skb's len field wasn't properly modified, so on the wire ethernet frame had original (or even bigger, if adjust_head was used) size. This diff is fixing this. Fixes: 198d83bb3 (" bpf: make generic xdp compatible w/ bpf_xdp_adjust_tail") Signed-off-by: Nikita V. Shirokov Signed-off-by: Daniel Borkmann --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index e58a2a5929df..c734e34d6f3e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4029,8 +4029,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, * pckt. */ off = orig_data_end - xdp.data_end; - if (off != 0) + if (off != 0) { skb_set_tail_pointer(skb, xdp.data_end - xdp.data); + skb->len -= off; + } switch (act) { case XDP_REDIRECT: From b0299cfa67e08181f30a55f8a01b4d7838fad533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:30 +0200 Subject: [PATCH 0275/1640] UPSTREAM: xsk: wire up XDP_SKB side of AF_XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit wires up the xskmap to XDP_SKB layer. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- net/core/dev.c | 35 +++++++++++++++++++---------------- net/core/filter.c | 17 ++++++++++++++--- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index a252f71b76b9..5471d0e81efe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -837,7 +837,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * This does not appear to be a real limitation for existing software. 
*/ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *prog); + struct xdp_buff *xdp, struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); diff --git a/net/core/dev.c b/net/core/dev.c index c734e34d6f3e..1f949a72bb94 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3966,12 +3966,12 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) } static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct netdev_rx_queue *rxqueue; void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; - struct xdp_buff xdp; int hlen, off; u32 mac_len; @@ -4006,19 +4006,19 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, */ mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; - xdp.data = skb->data - mac_len; - xdp.data_meta = xdp.data; - xdp.data_end = xdp.data + hlen; - xdp.data_hard_start = skb->data - skb_headroom(skb); - orig_data_end = xdp.data_end; - orig_data = xdp.data; + xdp->data = skb->data - mac_len; + xdp->data_meta = xdp->data; + xdp->data_end = xdp->data + hlen; + xdp->data_hard_start = skb->data - skb_headroom(skb); + orig_data_end = xdp->data_end; + orig_data = xdp->data; rxqueue = netif_get_rxqueue(skb); - xdp.rxq = &rxqueue->xdp_rxq; + xdp->rxq = &rxqueue->xdp_rxq; - act = bpf_prog_run_xdp(xdp_prog, &xdp); + act = bpf_prog_run_xdp(xdp_prog, xdp); - off = xdp.data - orig_data; + off = xdp->data - orig_data; if (off > 0) __skb_pull(skb, off); else if (off < 0) @@ -4028,10 +4028,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* check if bpf_xdp_adjust_tail was used. it can only "shrink" * pckt. */ - off = orig_data_end - xdp.data_end; + off = orig_data_end - xdp->data_end; if (off != 0) { - skb_set_tail_pointer(skb, xdp.data_end - xdp.data); + skb_set_tail_pointer(skb, xdp->data_end - xdp->data); skb->len -= off; + } switch (act) { @@ -4040,7 +4041,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_push(skb, mac_len); break; case XDP_PASS: - metalen = xdp.data - xdp.data_meta; + metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); break; @@ -4090,17 +4091,19 @@ static struct static_key generic_xdp_needed __read_mostly; int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { if (xdp_prog) { - u32 act = netif_receive_generic_xdp(skb, xdp_prog); + struct xdp_buff xdp; + u32 act; int err; + act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: err = xdp_do_generic_redirect(skb->dev, skb, - xdp_prog); + &xdp, xdp_prog); if (err) goto out_redir; - /* fallthru to submit skb */ + break; case XDP_TX: generic_xdp_tx(skb, xdp_prog); break; diff --git a/net/core/filter.c b/net/core/filter.c index bb20c5b3b45c..17f91e0a177c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -59,6 +59,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -3023,13 +3024,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; - struct net_device *fwd = NULL; u32 index = ri->ifindex; + void *fwd = NULL; int err = 0; ri->ifindex 
= 0; @@ -3051,6 +3053,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) goto err; skb->dev = fwd; + generic_xdp_tx(skb, xdp_prog); + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + struct xdp_sock *xs = fwd; + + err = xsk_generic_rcv(xs, xdp); + if (err) + goto err; + consume_skb(skb); } else { /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; @@ -3065,7 +3075,7 @@ err: } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); u32 index = ri->ifindex; @@ -3073,7 +3083,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int err = 0; if (ri->map) - return xdp_do_generic_redirect_map(dev, skb, xdp_prog); + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog); ri->ifindex = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); @@ -3087,6 +3097,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, skb->dev = fwd; _trace_xdp_redirect(dev, xdp_prog, index); + generic_xdp_tx(skb, xdp_prog); return 0; err: _trace_xdp_redirect_err(dev, xdp_prog, index, err); From 4a4d062143ce06ebd2504b87a4c3d05a37b3c7d2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 3 May 2018 17:04:59 +0100 Subject: [PATCH 0276/1640] BACKPORT: bpf: fix possible spectre-v1 in find_and_alloc_map() It's possible for userspace to control attr->map_type. Sanitize it when using it as an array index to prevent an out-of-bounds value being used under speculation. Found by smatch. Signed-off-by: Mark Rutland Cc: Alexei Starovoitov Cc: Dan Carpenter Cc: Daniel Borkmann Cc: Peter Zijlstra Cc: netdev@vger.kernel.org Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7f603537973c..43e0e72b8413 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -28,6 +28,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -105,12 +106,14 @@ const struct bpf_map_ops bpf_map_offload_ops = { static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { const struct bpf_map_ops *ops; + u32 type = attr->map_type; struct bpf_map *map; int err; - if (attr->map_type >= ARRAY_SIZE(bpf_map_types)) + if (type >= ARRAY_SIZE(bpf_map_types)) return ERR_PTR(-EINVAL); - ops = bpf_map_types[attr->map_type]; + type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types)); + ops = bpf_map_types[type]; if (!ops) return ERR_PTR(-EINVAL); @@ -125,7 +128,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) if (IS_ERR(map)) return map; map->ops = ops; - map->map_type = attr->map_type; + map->map_type = type; return map; } From 90c7f7eae9740fd41372b7375577112f8ecf5e54 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:12 +0200 Subject: [PATCH 0277/1640] UPSTREAM: bpf: prefix cbpf internal helpers with bpf_ No change in functionality, just remove the '__' prefix and replace it with a 'bpf_' prefix instead. We later on add a couple of more helpers for cBPF and keeping the scheme with '__' is suboptimal there. 
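The bounds-check-plus-clamp idiom from the Spectre-v1 fix above generalizes to any kernel array indexed by a user-controlled value. A minimal sketch of the pattern, with hypothetical foo_table/foo_ops names standing in for the real map/prog type tables:

#include <linux/kernel.h>
#include <linux/nospec.h>
#include <linux/types.h>

struct foo_ops;

static const struct foo_ops *foo_table[16];

static const struct foo_ops *foo_lookup(u32 type)
{
	if (type >= ARRAY_SIZE(foo_table))
		return NULL;
	/* Clamp the index so that even a mis-speculated load cannot
	 * read beyond the end of foo_table.
	 */
	type = array_index_nospec(type, ARRAY_SIZE(foo_table));
	return foo_table[type];
}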
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 17f91e0a177c..cc2f05760308 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -113,12 +113,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) } EXPORT_SYMBOL(sk_filter_trim_cap); -BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) +BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) { return skb_get_poff(skb); } -BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) +BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; @@ -138,7 +138,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) return 0; } -BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) +BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; @@ -162,13 +162,13 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } -BPF_CALL_0(__get_raw_cpu_id) +BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); } static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { - .func = __get_raw_cpu_id, + .func = bpf_get_raw_cpu_id, .gpl_only = false, .ret_type = RET_INTEGER, }; @@ -318,16 +318,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp, /* Emit call(arg1=CTX, arg2=A, arg3=X) */ switch (fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: - *insn = BPF_EMIT_CALL(__skb_get_pay_offset); + *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); break; case SKF_AD_OFF + SKF_AD_NLATTR: - *insn = BPF_EMIT_CALL(__skb_get_nlattr); + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); break; case SKF_AD_OFF + SKF_AD_NLATTR_NEST: - *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); break; case SKF_AD_OFF + SKF_AD_CPU: - *insn = BPF_EMIT_CALL(__get_raw_cpu_id); + *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); From 059e87de99a6fd8fa5cff0644d096466d8751d65 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:13 +0200 Subject: [PATCH 0278/1640] BACKPORT: bpf: migrate ebpf ld_abs/ld_ind tests to test_verifier Remove all eBPF tests involving LD_ABS/LD_IND from test_bpf.ko. Reason is that the eBPF tests from test_bpf module do not go via BPF verifier and therefore any instruction rewrites from verifier cannot take place. Therefore, move them into test_verifier which runs out of user space, so that the verifier can rewrite LD_ABS/LD_IND internally in upcoming patches. It will have the same effect since runtime tests are also performed from there. This also allows us to finally unexport bpf_skb_vlan_{push,pop}_proto and keep them internal to the core kernel. Additionally, add further cBPF LD_ABS/LD_IND test coverage into the test_bpf.ko suite.
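As context for the tests being migrated: from user space, LD_ABS/LD_IND usually enter the kernel as classic socket filters. A minimal sketch of such a filter for a packet socket, accepting only IPv4 frames (illustrative only, not part of the patch):

#include <linux/filter.h>
#include <linux/if_ether.h>
#include <sys/socket.h>

/* Classic BPF: LD_ABS loads from a fixed offset in the frame into A. */
static struct sock_filter ipv4_only[] = {
	BPF_STMT(BPF_LD  | BPF_H | BPF_ABS, 12),             /* A = ethertype */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 1), /* IPv4? */
	BPF_STMT(BPF_RET | BPF_K, 0xffff),                   /* accept */
	BPF_STMT(BPF_RET | BPF_K, 0),                        /* drop */
};

static const struct sock_fprog ipv4_only_prog = {
	.len    = sizeof(ipv4_only) / sizeof(ipv4_only[0]),
	.filter = ipv4_only,
};

/* Attached with:
 *   setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
 *              &ipv4_only_prog, sizeof(ipv4_only_prog));
 * which is exactly the path that goes through bpf_convert_filter() and
 * the LD_ABS handling this series reworks.
 */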
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 -- net/core/filter.c | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4c1823569b6d..0f5abf73f0fd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -732,8 +732,6 @@ extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; -extern const struct bpf_func_proto bpf_skb_vlan_push_proto; -extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; diff --git a/net/core/filter.c b/net/core/filter.c index cc2f05760308..4899873d4131 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2234,7 +2234,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, return ret; } -const struct bpf_func_proto bpf_skb_vlan_push_proto = { +static const struct bpf_func_proto bpf_skb_vlan_push_proto = { .func = bpf_skb_vlan_push, .gpl_only = false, .ret_type = RET_INTEGER, @@ -2242,7 +2242,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { @@ -2256,13 +2255,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) return ret; } -const struct bpf_func_proto bpf_skb_vlan_pop_proto = { +static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .func = bpf_skb_vlan_pop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { From 52bd8772b36535f3932edf826e3d85d53daed5b6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:14 +0200 Subject: [PATCH 0279/1640] BACKPORT: bpf: implement ld_abs/ld_ind in native bpf The main part of this work is to finally allow removal of LD_ABS and LD_IND from the BPF core by reimplementing them through native eBPF instead. Both LD_ABS/LD_IND were carried over from cBPF and keeping them around in native eBPF caused way more trouble than actually worth it. To just list some of the security issues in the past: * fdfaf64e7539 ("x86: bpf_jit: support negative offsets") * 35607b02dbef ("sparc: bpf_jit: fix loads from negative offsets") * e0ee9c12157d ("x86: bpf_jit: fix two bugs in eBPF JIT compiler") * 07aee9439454 ("bpf, sparc: fix usage of wrong reg for load_skb_regs after call") * 6d59b7dbf72e ("bpf, s390x: do not reload skb pointers in non-skb context") * 87338c8e2cbb ("bpf, ppc64: do not reload skb pointers in non-skb context") For programs in native eBPF, LD_ABS/LD_IND are pretty much legacy these days due to their limitations and more efficient/flexible alternatives that have been developed over time such as direct packet access. LD_ABS/LD_IND only cover 1/2/4 byte loads into a register, the load happens in host endianness and its exception handling can yield unexpected behavior. The latter is explained in depth in f6b1b3bf0d5f ("bpf: fix subprog verifier bypass by div/mod by 0 exception") with similar cases of exceptions we had. 
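To make the "direct packet access" alternative concrete, an equivalent check in a native eBPF XDP program looks roughly like this (a sketch assuming libbpf-style SEC() and bpf_htons() from bpf/bpf_helpers.h and bpf/bpf_endian.h):

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("xdp")
int xdp_ipv4_only(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;

	/* The verifier demands an explicit bounds check before any
	 * dereference - no hidden exception path as with LD_ABS.
	 */
	if ((void *)(eth + 1) > data_end)
		return XDP_DROP;

	return eth->h_proto == bpf_htons(ETH_P_IP) ? XDP_PASS : XDP_DROP;
}

char _license[] SEC("license") = "GPL";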
In native eBPF more recent program types will disable LD_ABS/LD_IND altogether through may_access_skb() in verifier, and given the limitations in terms of exception handling, it's also disabled in programs that use BPF to BPF calls. In terms of cBPF, the LD_ABS/LD_IND is used in networking programs to access packet data. It is not used in seccomp-BPF but programs that use it for socket filtering or reuseport for demuxing with cBPF. This is mostly relevant for applications that have not yet migrated to native eBPF. The main complexity and source of bugs in LD_ABS/LD_IND is coming from their implementation in the various JITs. Most of them keep the model around from cBPF times by implementing a fastpath written in asm. They use typically two from the BPF program hidden CPU registers for caching the skb's headlen (skb->len - skb->data_len) and skb->data. Throughout the JIT phase this requires to keep track whether LD_ABS/LD_IND are used and if so, the two registers need to be recached each time a BPF helper would change the underlying packet data in native eBPF case. At least in eBPF case, available CPU registers are rare and the additional exit path out of the asm written JIT helper makes it also inflexible since not all parts of the JITer are in control from plain C. A LD_ABS/LD_IND implementation in eBPF therefore allows to significantly reduce the complexity in JITs with comparable performance results for them, e.g.: test_bpf tcpdump port 22 tcpdump complex x64 - before 15 21 10 14 19 18 - after 7 10 10 7 10 15 arm64 - before 40 91 92 40 91 151 - after 51 64 73 51 62 113 For cBPF we now track any usage of LD_ABS/LD_IND in bpf_convert_filter() and cache the skb's headlen and data in the cBPF prologue. The BPF_REG_TMP gets remapped from R8 to R2 since it's mainly just used as a local temporary variable. This allows to shrink the image on x86_64 also for seccomp programs slightly since mapping to %rsi is not an ereg. In callee-saved R8 and R9 we now track skb data and headlen, respectively. For normal prologue emission in the JITs this does not add any extra instructions since R8, R9 are pushed to stack in any case from eBPF side. cBPF uses the convert_bpf_ld_abs() emitter which probes the fast path inline already and falls back to bpf_skb_load_helper_{8,16,32}() helper relying on the cached skb data and headlen as well. R8 and R9 never need to be reloaded due to bpf_helper_changes_pkt_data() since all skb access in cBPF is read-only. Then, for the case of native eBPF, we use the bpf_gen_ld_abs() emitter, which calls the bpf_skb_load_helper_{8,16,32}_no_cache() helper unconditionally, does neither cache skb data and headlen nor has an inlined fast path. The reason for the latter is that native eBPF does not have any extra registers available anyway, but even if there were, it avoids any reload of skb data and headlen in the first place. Additionally, for the negative offsets, we provide an alternative bpf_skb_load_bytes_relative() helper in eBPF which operates similarly as bpf_skb_load_bytes() and allows for more flexibility. Tested myself on x64, arm64, s390x, from Sandipan on ppc64. 
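The bpf_skb_load_bytes_relative() helper mentioned in the last paragraph is added and documented two patches further down; its intended use from a socket filter looks roughly like this (a sketch assuming libbpf-style SEC() from bpf/bpf_helpers.h):

#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <bpf/bpf_helpers.h>

SEC("socket")
int sock_tcp_only(struct __sk_buff *skb)
{
	struct iphdr iph;

	/* Offset 0 is taken from the network header, no matter where
	 * skb->data points - the case that negative LD_ABS offsets
	 * used to cover.
	 */
	if (bpf_skb_load_bytes_relative(skb, 0, &iph, sizeof(iph),
					BPF_HDR_START_NET))
		return 0;                       /* drop on error */

	return iph.protocol == IPPROTO_TCP ? skb->len : 0;
}

char _license[] SEC("license") = "GPL";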
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 + include/linux/filter.h | 4 +- kernel/bpf/core.c | 96 ++--------------- kernel/bpf/verifier.c | 24 +++++ net/core/filter.c | 236 +++++++++++++++++++++++++++++++++++++++-- 5 files changed, 262 insertions(+), 100 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0f5abf73f0fd..a656fcf870e4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -240,6 +240,8 @@ struct bpf_verifier_ops { struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); + int (*gen_ld_abs)(const struct bpf_insn *orig, + struct bpf_insn *insn_buf); u32 (*convert_ctx_access)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, diff --git a/include/linux/filter.h b/include/linux/filter.h index 5471d0e81efe..1a8d1b7b22c6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -47,7 +47,9 @@ struct xdp_buff; /* Additional register mappings for converted user programs. */ #define BPF_REG_A BPF_REG_0 #define BPF_REG_X BPF_REG_7 -#define BPF_REG_TMP BPF_REG_8 +#define BPF_REG_TMP BPF_REG_2 /* scratch reg */ +#define BPF_REG_D BPF_REG_8 /* data, callee-saved */ +#define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register. */ #define BPF_REG_AX MAX_BPF_REG diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index fa8b9dba465d..8c6042e45639 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -774,23 +774,6 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; - case BPF_LD | BPF_ABS | BPF_W: - case BPF_LD | BPF_ABS | BPF_H: - case BPF_LD | BPF_ABS | BPF_B: - *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); - *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); - break; - - case BPF_LD | BPF_IND | BPF_W: - case BPF_LD | BPF_IND | BPF_H: - case BPF_LD | BPF_IND | BPF_B: - *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); - *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg); - *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); - break; - case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); @@ -1031,14 +1014,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ /* Immediate based. */ \ - INSN_3(LD, IMM, DW), \ - /* Misc (old cBPF carry-over). */ \ - INSN_3(LD, ABS, B), \ - INSN_3(LD, ABS, H), \ - INSN_3(LD, ABS, W), \ - INSN_3(LD, IND, B), \ - INSN_3(LD, IND, H), \ - INSN_3(LD, IND, W) + INSN_3(LD, IMM, DW) bool bpf_opcode_in_insntable(u8 code) { @@ -1048,6 +1024,13 @@ bool bpf_opcode_in_insntable(u8 code) [0 ... 255] = false, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + /* UAPI exposed, but rewritten opcodes. cBPF carry-over. 
*/ + [BPF_LD | BPF_ABS | BPF_B] = true, + [BPF_LD | BPF_ABS | BPF_H] = true, + [BPF_LD | BPF_ABS | BPF_W] = true, + [BPF_LD | BPF_IND | BPF_B] = true, + [BPF_LD | BPF_IND | BPF_H] = true, + [BPF_LD | BPF_IND | BPF_W] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL @@ -1078,8 +1061,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; - void *ptr; - int off; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) @@ -1406,67 +1387,6 @@ out: atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) (DST + insn->off)); CONT; - LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ - off = IMM; -load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only - * appearing in the programs where ctx == skb - * (see may_access_skb() in the verifier). All programs - * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6, - * bpf_convert_filter() saves it in BPF_R6, internal BPF - * verifier will check that BPF_R6 == ctx. - * - * BPF_ABS and BPF_IND are wrappers of function calls, - * so they scratch BPF_R1-BPF_R5 registers, preserve - * BPF_R6-BPF_R9, and store return value into BPF_R0. - * - * Implicit input: - * ctx == skb == BPF_R6 == CTX - * - * Explicit input: - * SRC == any register - * IMM == 32-bit immediate - * - * Output: - * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness - */ - - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be32(ptr); - CONT; - } - - return 0; - LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ - off = IMM; -load_half: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be16(ptr); - CONT; - } - - return 0; - LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ - off = IMM; -load_byte: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = *(u8 *)ptr; - CONT; - } - - return 0; - LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_word; - LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_half; - LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ - off = IMM + SRC; - goto load_byte; default_label: /* If we ever reach this, we have a bug somewhere. 
Die hard here diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 72f528ed9965..5ac2a1191a9a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4182,6 +4182,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (!env->ops->gen_ld_abs) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + if (env->subprog_cnt) { /* when program has LD_ABS insn JITs and interpreter assume * that r1 == ctx == skb which is not the case for callees @@ -5858,6 +5863,25 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (BPF_CLASS(insn->code) == BPF_LD && + (BPF_MODE(insn->code) == BPF_ABS || + BPF_MODE(insn->code) == BPF_IND)) { + cnt = env->ops->gen_ld_abs(insn, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; diff --git a/net/core/filter.c b/net/core/filter.c index 4899873d4131..8a67c4044e3e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -162,6 +162,87 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } +BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u8 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return *(u8 *)ptr; + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u16 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be16(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u32 tmp, *ptr; + const int len = sizeof(tmp); + + if (likely(offset >= 0)) { + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be32(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, + offset); +} + 
BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); @@ -354,26 +435,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp, return true; } +static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) +{ + const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); + int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); + bool endian = BPF_SIZE(fp->code) == BPF_H || + BPF_SIZE(fp->code) == BPF_W; + bool indirect = BPF_MODE(fp->code) == BPF_IND; + const int ip_align = NET_IP_ALIGN; + struct bpf_insn *insn = *insnp; + int offset = fp->k; + + if (!indirect && + ((unaligned_ok && offset >= 0) || + (!unaligned_ok && offset >= 0 && + offset + ip_align >= 0 && + offset + ip_align % size == 0))) { + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian); + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, + offset); + if (endian) + *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); + *insn++ = BPF_JMP_A(8); + } + + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); + if (fp->k) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); + } + + switch (BPF_SIZE(fp->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); + break; + default: + return false; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn = BPF_EXIT_INSN(); + + *insnp = insn; + return true; +} + /** * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program + * @seen_ld_abs: bool whether we've seen ld_abs/ind * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' * style extended BPF (eBPF). * Conversion workflow: * * 1) First pass for calculating the new program length: - * bpf_convert_filter(old_prog, old_len, NULL, &new_len) + * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); + * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ static int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_prog *new_prog, int *new_len) + struct bpf_prog *new_prog, int *new_len, + bool *seen_ld_abs) { int new_flen = 0, pass = 0, target, i, stack_off; struct bpf_insn *new_insn, *first_insn = NULL; @@ -412,12 +554,27 @@ do_pass: * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + if (*seen_ld_abs) { + /* For packet access in classic BPF, cache skb->data + * in callee-saved BPF R8 and skb->len - skb->data_len + * (headlen) in BPF R9. Since classic BPF is read-only + * on CTX, we only need to cache it once. 
+ */ + *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + BPF_REG_D, BPF_REG_CTX, + offsetof(struct sk_buff, data)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, data_len)); + *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); + } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { - struct bpf_insn tmp_insns[6] = { }; + struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) @@ -460,6 +617,11 @@ do_pass: BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; + if (BPF_CLASS(fp->code) == BPF_LD && + convert_bpf_ld_abs(fp, &insn)) { + *seen_ld_abs = true; + break; + } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { @@ -569,21 +731,31 @@ jmp_rest: break; /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ - case BPF_LDX | BPF_MSH | BPF_B: - /* tmp = A */ - *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); + case BPF_LDX | BPF_MSH | BPF_B: { + struct sock_filter tmp = { + .code = BPF_LD | BPF_ABS | BPF_B, + .k = fp->k, + }; + + *seen_ld_abs = true; + + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ - *insn++ = BPF_LD_ABS(BPF_B, fp->k); + convert_bpf_ld_abs(&tmp, &insn); + insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); + /* tmp = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - + } /* RET_K is remaped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ @@ -665,6 +837,8 @@ jmp_rest: if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; + if (*seen_ld_abs) + *new_len += 4; /* Prologue bits. */ return 0; } @@ -1026,6 +1200,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; + bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it * won't be used at this point in time anymore internally @@ -1047,7 +1222,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) } /* 1st pass: calculate the new program length. */ - err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); + err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, + &seen_ld_abs); if (err) goto out_err_free; @@ -1066,7 +1242,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ - err = bpf_convert_filter(old_prog, old_len, fp, &new_len); + err = bpf_convert_filter(old_prog, old_len, fp, &new_len, + &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, @@ -4393,6 +4570,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, return insn - insn_buf; } +static int bpf_gen_ld_abs(const struct bpf_insn *orig, + struct bpf_insn *insn_buf) +{ + bool indirect = BPF_MODE(orig->code) == BPF_IND; + struct bpf_insn *insn = insn_buf; + + /* We're guaranteed here that CTX is in R6. 
*/ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); + if (orig->imm) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); + } + + switch (BPF_SIZE(orig->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); + break; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -5662,6 +5874,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops sk_filter_prog_ops = { @@ -5673,6 +5886,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { From 30cf0922f4e8ecb04fab8e05d47ac95840bf3cf6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:15 +0200 Subject: [PATCH 0280/1640] BACKPORT: bpf: add skb_load_bytes_relative helper This adds a small BPF helper similar to bpf_skb_load_bytes() that is able to load relative to mac/net header offset from the skb's linear data. Compared to bpf_skb_load_bytes(), it takes a fifth argument namely start_header, which is either BPF_HDR_START_MAC or BPF_HDR_START_NET. This allows for a more flexible alternative compared to LD_ABS/LD_IND with negative offset. It's enabled for tc BPF programs as well as sock filter program types where it's mainly useful in reuseport programs to ease access to lower header data. Reference: https://lists.iovisor.org/pipermail/iovisor-dev/2017-March/000698.html Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cce6d4b3df54..e04576278f5f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1802,6 +1802,30 @@ union bpf_attr { * Return * a non-negative value equal to or less than size on success, or * a negative error in case of failure. + * + * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) + * Description + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*. The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. 
+ * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * + * Return + * 0 on success, or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ From fd233965bdf2f16425c4b25f291e97c50a6be378 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 02:13:57 +0200 Subject: [PATCH 0281/1640] UPSTREAM: bpf: use array_index_nospec in find_prog_type Commit 9ef09e35e521 ("bpf: fix possible spectre-v1 in find_and_alloc_map()") converted find_and_alloc_map() over to use array_index_nospec() to sanitize map type that user space passes on map creation, and this patch does an analogous conversion for progs in find_prog_type() as it's also passed from user space when loading progs as attr->prog_type. Signed-off-by: Daniel Borkmann Cc: Mark Rutland Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 43e0e72b8413..f157e134abb4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -915,11 +915,17 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { - if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) + const struct bpf_prog_ops *ops; + + if (type >= ARRAY_SIZE(bpf_prog_types)) + return -EINVAL; + type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); + ops = bpf_prog_types[type]; + if (!ops) return -EINVAL; if (!bpf_prog_is_dev_bound(prog->aux)) - prog->aux->ops = bpf_prog_types[type]; + prog->aux->ops = ops; else prog->aux->ops = &bpf_offload_prog_ops; prog->type = type; From cd4ea003499055862d9317201d42cf8f496c1c1d Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Wed, 2 May 2018 16:17:17 -0400 Subject: [PATCH 0282/1640] UPSTREAM: bpf: unify main prog and subprog Currently, verifier treat main prog and subprog differently. All subprogs detected are kept in env->subprog_starts while main prog is not kept there. Instead, main prog is implicitly defined as the prog start at 0. There is actually no difference between main prog and subprog, it is better to unify them, and register all progs detected into env->subprog_starts. This could also help simplifying some code logic. 
Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 57 ++++++++++++++++++++---------------- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 07db43e120f4..d6e76bd871d4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -209,7 +209,7 @@ struct bpf_verifier_env { bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifier_log log; - u32 subprog_starts[BPF_MAX_SUBPROGS]; + u32 subprog_starts[BPF_MAX_SUBPROGS + 1]; /* computes the stack depth of each bpf function */ u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5ac2a1191a9a..cb5964784177 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -769,7 +769,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off) ret = find_subprog(env, off); if (ret >= 0) return 0; - if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { + if (env->subprog_cnt > BPF_MAX_SUBPROGS) { verbose(env, "too many subprograms\n"); return -E2BIG; } @@ -785,6 +785,11 @@ static int check_subprogs(struct bpf_verifier_env *env) struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; + /* Add entry function. */ + ret = add_subprog(env, 0); + if (ret < 0) + return ret; + /* determine subprog starts. The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { if (insn[i].code != (BPF_JMP | BPF_CALL)) @@ -810,10 +815,10 @@ static int check_subprogs(struct bpf_verifier_env *env) /* now check that all jumps are within the same subprog */ subprog_start = 0; - if (env->subprog_cnt == cur_subprog) + if (env->subprog_cnt == cur_subprog + 1) subprog_end = insn_cnt; else - subprog_end = env->subprog_starts[cur_subprog++]; + subprog_end = env->subprog_starts[cur_subprog + 1]; for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; @@ -837,11 +842,13 @@ next: verbose(env, "last insn is not an exit or jmp\n"); return -EINVAL; } + cur_subprog++; subprog_start = subprog_end; - if (env->subprog_cnt == cur_subprog) + if (env->subprog_cnt == cur_subprog + 1) subprog_end = insn_cnt; else - subprog_end = env->subprog_starts[cur_subprog++]; + subprog_end = + env->subprog_starts[cur_subprog + 1]; } } return 0; @@ -1563,10 +1570,10 @@ process_func: return -EACCES; } continue_func: - if (env->subprog_cnt == subprog) + if (env->subprog_cnt == subprog + 1) subprog_end = insn_cnt; else - subprog_end = env->subprog_starts[subprog]; + subprog_end = env->subprog_starts[subprog + 1]; for (; i < subprog_end; i++) { if (insn[i].code != (BPF_JMP | BPF_CALL)) continue; @@ -1584,7 +1591,6 @@ continue_func: i); return -EFAULT; } - subprog++; frame++; if (frame >= MAX_CALL_FRAMES) { WARN_ONCE(1, "verifier bug. 
Call stack is too deep\n"); @@ -1616,7 +1622,6 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, start); return -EFAULT; } - subprog++; return env->subprog_stack_depth[subprog]; } #endif @@ -2159,7 +2164,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; - if (env->subprog_cnt) { + if (env->subprog_cnt > 1) { verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); return -EINVAL; } @@ -2295,7 +2300,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* remember the callsite, it will be used by bpf_exit */ *insn_idx /* callsite */, state->curframe + 1 /* frameno within this callchain */, - subprog + 1 /* subprog number within this prog */); + subprog /* subprog number within this prog */); /* copy r1 - r5 args that callee can access */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) @@ -4187,7 +4192,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (env->subprog_cnt) { + if (env->subprog_cnt > 1) { /* when program has LD_ABS insn JITs and interpreter assume * that r1 == ctx == skb which is not the case for callees * that can have arbitrary arguments. It's problematic @@ -5233,11 +5238,11 @@ process_bpf_exit: verbose(env, "processed %d insns (limit %d), stack depth ", insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); - for (i = 0; i < env->subprog_cnt + 1; i++) { + for (i = 0; i < env->subprog_cnt; i++) { u32 depth = env->subprog_stack_depth[i]; verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt + 1) + if (i + 1 < env->subprog_cnt) verbose(env, "+"); } verbose(env, "\n"); @@ -5642,7 +5647,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) void *old_bpf_func; int err = -ENOMEM; - if (env->subprog_cnt == 0) + if (env->subprog_cnt <= 1) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { @@ -5658,7 +5663,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* temporarily remember subprog id inside insn instead of * aux_data, since next loop will split up all insns into funcs */ - insn->off = subprog + 1; + insn->off = subprog; /* remember original imm in case JIT fails and fallback * to interpreter will be needed */ @@ -5667,16 +5672,16 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->imm = 1; } - func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); + func = kzalloc(sizeof(prog) * env->subprog_cnt, GFP_KERNEL); if (!func) return -ENOMEM; - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; - if (env->subprog_cnt == i) + if (env->subprog_cnt == i + 1) subprog_end = prog->len; else - subprog_end = env->subprog_starts[i]; + subprog_end = env->subprog_starts[i + 1]; len = subprog_end - subprog_start; func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); @@ -5706,7 +5711,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) * now populate all bpf_calls with correct addresses and * run last pass of JIT */ - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || @@ -5719,7 +5724,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) __bpf_call_base; } } - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { old_bpf_func = func[i]->bpf_func; tmp = 
bpf_int_jit_compile(func[i]); if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { @@ -5733,7 +5738,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* finally lock prog and jit images for all functions and * populate kallsysm */ - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { bpf_prog_lock_ro(func[i]); bpf_prog_kallsyms_add(func[i]); } @@ -5750,7 +5755,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); - addr = (unsigned long)func[subprog + 1]->bpf_func; + addr = (unsigned long)func[subprog]->bpf_func; addr &= PAGE_MASK; insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) addr - __bpf_call_base; @@ -5759,10 +5764,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->aux->func = func; - prog->aux->func_cnt = env->subprog_cnt + 1; + prog->aux->func_cnt = env->subprog_cnt; return 0; out_free: - for (i = 0; i <= env->subprog_cnt; i++) + for (i = 0; i < env->subprog_cnt; i++) if (func[i]) bpf_jit_free(func[i]); kfree(func); From 303968193bcc41b4be0c098e2c3628881fb54b80 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Wed, 2 May 2018 16:17:18 -0400 Subject: [PATCH 0283/1640] UPSTREAM: bpf: centre subprog information fields It is better to centre all subprog information fields into one structure. This structure could later serve as function node in call graph. Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 9 ++++-- kernel/bpf/verifier.c | 62 +++++++++++++++++++----------------- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d6e76bd871d4..cf59ae681941 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -189,6 +189,11 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) #define BPF_MAX_SUBPROGS 256 +struct bpf_subprog_info { + u32 start; /* insn idx of function entry point */ + u16 stack_depth; /* max. 
stack depth used by this function */ +}; + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -209,9 +214,7 @@ struct bpf_verifier_env { bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifier_log log; - u32 subprog_starts[BPF_MAX_SUBPROGS + 1]; - /* computes the stack depth of each bpf function */ - u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; + struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cb5964784177..5ac2a60ec4e0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -742,18 +742,19 @@ enum reg_arg_type { static int cmp_subprogs(const void *a, const void *b) { - return *(int *)a - *(int *)b; + return ((struct bpf_subprog_info *)a)->start - + ((struct bpf_subprog_info *)b)->start; } static int find_subprog(struct bpf_verifier_env *env, int off) { - u32 *p; + struct bpf_subprog_info *p; - p = bsearch(&off, env->subprog_starts, env->subprog_cnt, - sizeof(env->subprog_starts[0]), cmp_subprogs); + p = bsearch(&off, env->subprog_info, env->subprog_cnt, + sizeof(env->subprog_info[0]), cmp_subprogs); if (!p) return -ENOENT; - return p - env->subprog_starts; + return p - env->subprog_info; } @@ -773,15 +774,16 @@ static int add_subprog(struct bpf_verifier_env *env, int off) verbose(env, "too many subprograms\n"); return -E2BIG; } - env->subprog_starts[env->subprog_cnt++] = off; - sort(env->subprog_starts, env->subprog_cnt, - sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); + env->subprog_info[env->subprog_cnt++].start = off; + sort(env->subprog_info, env->subprog_cnt, + sizeof(env->subprog_info[0]), cmp_subprogs, NULL); return 0; } static int check_subprogs(struct bpf_verifier_env *env) { int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -811,14 +813,14 @@ static int check_subprogs(struct bpf_verifier_env *env) if (env->log.level > 1) for (i = 0; i < env->subprog_cnt; i++) - verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); + verbose(env, "func#%d @%d\n", i, subprog[i].start); /* now check that all jumps are within the same subprog */ subprog_start = 0; if (env->subprog_cnt == cur_subprog + 1) subprog_end = insn_cnt; else - subprog_end = env->subprog_starts[cur_subprog + 1]; + subprog_end = subprog[cur_subprog + 1].start; for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; @@ -847,8 +849,7 @@ next: if (env->subprog_cnt == cur_subprog + 1) subprog_end = insn_cnt; else - subprog_end = - env->subprog_starts[cur_subprog + 1]; + subprog_end = subprog[cur_subprog + 1].start; } } return 0; @@ -1535,13 +1536,13 @@ static int update_stack_depth(struct bpf_verifier_env *env, const struct bpf_func_state *func, int off) { - u16 stack = env->subprog_stack_depth[func->subprogno]; + u16 stack = env->subprog_info[func->subprogno].stack_depth; if (stack >= -off) return 0; /* update known max for given subprogram */ - env->subprog_stack_depth[func->subprogno] = -off; + env->subprog_info[func->subprogno].stack_depth = -off; return 0; } @@ -1553,7 +1554,8 @@ static int update_stack_depth(struct bpf_verifier_env *env, */ static int check_max_stack_depth(struct bpf_verifier_env *env) { - int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end; + int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; + struct bpf_subprog_info *subprog = env->subprog_info; 
struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; int ret_insn[MAX_CALL_FRAMES]; @@ -1563,17 +1565,17 @@ process_func: /* round up to 32-bytes, since this is granularity * of interpreter stack size */ - depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); if (depth > MAX_BPF_STACK) { verbose(env, "combined stack size of %d calls is %d. Too large\n", frame + 1, depth); return -EACCES; } continue_func: - if (env->subprog_cnt == subprog + 1) + if (env->subprog_cnt == idx + 1) subprog_end = insn_cnt; else - subprog_end = env->subprog_starts[subprog + 1]; + subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { if (insn[i].code != (BPF_JMP | BPF_CALL)) continue; @@ -1581,12 +1583,12 @@ continue_func: continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; - ret_prog[frame] = subprog; + ret_prog[frame] = idx; /* find the callee */ i = i + insn[i].imm + 1; - subprog = find_subprog(env, i); - if (subprog < 0) { + idx = find_subprog(env, i); + if (idx < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", i); return -EFAULT; @@ -1603,10 +1605,10 @@ continue_func: */ if (frame == 0) return 0; - depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); frame--; i = ret_insn[frame]; - subprog = ret_prog[frame]; + idx = ret_prog[frame]; goto continue_func; } @@ -1622,7 +1624,7 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, start); return -EFAULT; } - return env->subprog_stack_depth[subprog]; + return env->subprog_info[subprog].stack_depth; } #endif @@ -5239,14 +5241,14 @@ process_bpf_exit: verbose(env, "processed %d insns (limit %d), stack depth ", insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_stack_depth[i]; + u32 depth = env->subprog_info[i].stack_depth; verbose(env, "%d", depth); if (i + 1 < env->subprog_cnt) verbose(env, "+"); } verbose(env, "\n"); - env->prog->aux->stack_depth = env->subprog_stack_depth[0]; + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -5453,9 +5455,9 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len if (len == 1) return; for (i = 0; i < env->subprog_cnt; i++) { - if (env->subprog_starts[i] < off) + if (env->subprog_info[i].start < off) continue; - env->subprog_starts[i] += len - 1; + env->subprog_info[i].start += len - 1; } } @@ -5681,7 +5683,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (env->subprog_cnt == i + 1) subprog_end = prog->len; else - subprog_end = env->subprog_starts[i + 1]; + subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); @@ -5698,7 +5700,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) * Long term would need debug info to populate names */ func[i]->aux->name[0] = 'F'; - func[i]->aux->stack_depth = env->subprog_stack_depth[i]; + func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { From 0d6bbd3213a26701b1f752aa3499e6e480e65177 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Wed, 2 May 2018 16:17:19 -0400 Subject: [PATCH 0284/1640] UPSTREAM: bpf: add faked "ending" subprog There are quite a few code snippet like the following in verifier: subprog_start = 0; if 
(env->subprog_cnt == cur_subprog + 1) subprog_end = insn_cnt; else subprog_end = env->subprog_info[cur_subprog + 1].start; The reason is there is no marker in subprog_info array to tell the end of it. We could resolve this issue by introducing a faked "ending" subprog. The special "ending" subprog is with "insn_cnt" as start offset, so it is serving as the end mark whenever we iterate over all subprogs. Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5ac2a60ec4e0..35df3993bf3b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -770,7 +770,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off) ret = find_subprog(env, off); if (ret >= 0) return 0; - if (env->subprog_cnt > BPF_MAX_SUBPROGS) { + if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { verbose(env, "too many subprograms\n"); return -E2BIG; } @@ -811,16 +811,18 @@ static int check_subprogs(struct bpf_verifier_env *env) return ret; } + /* Add a fake 'exit' subprog which could simplify subprog iteration + * logic. 'subprog_cnt' should not be increased. + */ + subprog[env->subprog_cnt].start = insn_cnt; + if (env->log.level > 1) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); /* now check that all jumps are within the same subprog */ - subprog_start = 0; - if (env->subprog_cnt == cur_subprog + 1) - subprog_end = insn_cnt; - else - subprog_end = subprog[cur_subprog + 1].start; + subprog_start = subprog[cur_subprog].start; + subprog_end = subprog[cur_subprog + 1].start; for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; @@ -844,11 +846,9 @@ next: verbose(env, "last insn is not an exit or jmp\n"); return -EINVAL; } - cur_subprog++; subprog_start = subprog_end; - if (env->subprog_cnt == cur_subprog + 1) - subprog_end = insn_cnt; - else + cur_subprog++; + if (cur_subprog < env->subprog_cnt) subprog_end = subprog[cur_subprog + 1].start; } } @@ -1557,7 +1557,6 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; @@ -1572,10 +1571,7 @@ process_func: return -EACCES; } continue_func: - if (env->subprog_cnt == idx + 1) - subprog_end = insn_cnt; - else - subprog_end = subprog[idx + 1].start; + subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { if (insn[i].code != (BPF_JMP | BPF_CALL)) continue; @@ -5454,7 +5450,8 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len if (len == 1) return; - for (i = 0; i < env->subprog_cnt; i++) { + /* NOTE: fake 'exit' subprog should be updated as well. 
*/ + for (i = 0; i <= env->subprog_cnt; i++) { if (env->subprog_info[i].start < off) continue; env->subprog_info[i].start += len - 1; @@ -5680,10 +5677,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; - if (env->subprog_cnt == i + 1) - subprog_end = prog->len; - else - subprog_end = env->subprog_info[i + 1].start; + subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); From 9b9b3de1e134562e54546e15e702eea7c75b4056 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:08 -0700 Subject: [PATCH 0285/1640] UPSTREAM: bpf: offload: allow offloaded programs to use perf event arrays BPF_MAP_TYPE_PERF_EVENT_ARRAY is special as far as offload goes. The map only holds glue to perf ring, not actual data. Allow non-offloaded perf event arrays to be used in offloaded programs. Offload driver can extract the events from HW and put them in the map for user space to retrieve. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 5 +++++ kernel/bpf/offload.c | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a656fcf870e4..9e7e605e9bf9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -115,6 +115,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) return container_of(map, struct bpf_offloaded_map, map); } +static inline bool bpf_map_offload_neutral(const struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; +} + static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { return map->ops->map_seq_show_elem && map->ops->map_check_btf; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index c9401075b58c..ac747d5cf7c6 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -474,8 +474,10 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) struct bpf_prog_offload *offload; bool ret; - if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) + if (!bpf_prog_is_dev_bound(prog->aux)) return false; + if (!bpf_map_is_dev_bound(map)) + return bpf_map_offload_neutral(map); down_read(&bpf_devs_lock); offload = prog->aux->offload; From 3fabbc78757767894b4c7dd21b07e003e96dd316 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:09 -0700 Subject: [PATCH 0286/1640] BACKPORT: nfp: bpf: record offload neutral maps in the driver For asynchronous events originating from the device, like perf event output, we need to be able to make sure that objects being referred to by the FW message are valid on the host. FW events can get queued and reordered. Even if we had a FW message "barrier" we should still protect ourselves from bogus FW output. Add a reverse-mapping hash table and record in it all raw map pointers FW may refer to. Only record neutral maps, i.e. perf event arrays. These are currently the only objects FW can refer to. Use RCU protection on the read side, update side is under RTNL. 
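As a rough illustration of that locking split, here is a minimal sketch using a plain RCU-protected list where the driver really uses a reverse-map hash table; every name below is hypothetical, and the extra map reference it takes is motivated next:

    struct neutral_map_record {
            struct hlist_node l;
            struct bpf_map *map;            /* pinned by an extra reference */
    };

    /* update side: caller holds RTNL */
    static int record_neutral_map(struct hlist_head *head, struct bpf_map *map)
    {
            struct neutral_map_record *rec;

            ASSERT_RTNL();
            rec = kzalloc(sizeof(*rec), GFP_KERNEL);
            if (!rec)
                    return -ENOMEM;
            rec->map = bpf_map_inc(map, false);
            hlist_add_head_rcu(&rec->l, head);
            return 0;
    }

    /* read side, e.g. the FW notification handler */
    static struct bpf_map *find_neutral_map(struct hlist_head *head, u64 raw_ptr)
    {
            struct neutral_map_record *rec;
            struct bpf_map *found = NULL;

            rcu_read_lock();
            hlist_for_each_entry_rcu(rec, head, l)
                    if ((u64)(unsigned long)rec->map == raw_ptr) {
                            found = rec->map;
                            break;
                    }
            rcu_read_unlock();
            return found;
    }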
Since program vs map destruction order is slightly painful for offload simply take an extra reference on all the recorded maps to make sure they don't disappear. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f157e134abb4..556ddb1d8fc1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -286,6 +286,7 @@ void bpf_map_put(struct bpf_map *map) { __bpf_map_put(map, true); } +EXPORT_SYMBOL_GPL(bpf_map_put); void bpf_map_put_with_uref(struct bpf_map *map) { @@ -547,6 +548,7 @@ struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) atomic_inc(&map->usercnt); return map; } +EXPORT_SYMBOL_GPL(bpf_map_inc); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { From 595bd0b8465f258ce950d27e805ed0cfb9a65be0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:10 -0700 Subject: [PATCH 0287/1640] UPSTREAM: bpf: export bpf_event_output() bpf_event_output() is useful for offloads to add events to BPF event rings, export it. Note that export is placed near the stub since tracing is optional and kernel/bpf/core.c is always going to be built. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8c6042e45639..a9ddc4c43f19 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1860,6 +1860,7 @@ bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, { return -ENOTSUPP; } +EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { From 62c6847a1d26be76594e65c5b9d4c152659b812e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:11 -0700 Subject: [PATCH 0288/1640] UPSTREAM: bpf: replace map pointer loads before calling into offloads Offloads may find host map pointers more useful than map fds. Map pointers can be used to identify the map, while fds are only valid within the context of loading process. Jump to skip_full_check on error in case verifier log overflow has to be handled (replace_map_fd_with_map_ptr() prints to the log, driver prep may do that too in the future). 
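With the map pointers already substituted by the time the device hook runs, a driver's prep pass can read them straight out of the ldimm64 instructions. A hedged sketch of such a scan follows; the function name is made up, and no driver is claimed to ship this exact code:

    static int example_offload_prep(struct bpf_verifier_env *env)
    {
            struct bpf_insn *insn = env->prog->insnsi;
            int i;

            for (i = 0; i < env->prog->len; i++, insn++) {
                    u64 addr;

                    if (insn->code != (BPF_LD | BPF_IMM | BPF_DW) ||
                        insn->src_reg != BPF_PSEUDO_MAP_FD)
                            continue;

                    /* the imm pair now carries the host map pointer, not an fd */
                    addr = (u32)insn->imm | ((u64)(u32)insn[1].imm << 32);
                    /* ... validate (struct bpf_map *)(unsigned long)addr
                     * against the device here ...
                     */

                    i++;            /* skip the second half of the ldimm64 */
                    insn++;
            }
            return 0;
    }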
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 35df3993bf3b..5f325e1bba99 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6133,16 +6133,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - if (bpf_prog_is_dev_bound(env->prog->aux)) { - ret = bpf_prog_offload_verifier_prep(env); - if (ret) - goto err_unlock; - } - ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; + if (bpf_prog_is_dev_bound(env->prog->aux)) { + ret = bpf_prog_offload_verifier_prep(env); + if (ret) + goto skip_full_check; + } + env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_USER); From 3ab8979f962f6332f589127d617c9a22a6b04e6c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 16:27:53 +0200 Subject: [PATCH 0289/1640] UPSTREAM: bpf, xskmap: fix crash in xsk_map_alloc error path handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If bpf_map_precharge_memlock() did not fail, then we set err to zero. However, any subsequent failure from either alloc_percpu() or the bpf_map_area_alloc() will return ERR_PTR(0) which in find_and_alloc_map() will cause NULL pointer deref. In devmap we have the convention that we return -EINVAL on page count overflow, so keep the same logic here and just set err to -ENOMEM after successful bpf_map_precharge_memlock(). Fixes: fbfc504a24f5 ("bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP") Signed-off-by: Daniel Borkmann Cc: Björn Töpel Acked-by: David S. Miller Signed-off-by: Alexei Starovoitov --- kernel/bpf/xskmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 869dbb11b612..cb3a12137404 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -56,6 +56,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) if (err) goto free_m; + err = -ENOMEM; + m->flush_list = alloc_percpu(struct list_head); if (!m->flush_list) goto free_m; From 303612743956e81d8f3ee5de26145f8fcd86b59b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 4 May 2018 14:49:50 -0700 Subject: [PATCH 0290/1640] UPSTREAM: bpf: btf: Avoid WARN_ON when CONFIG_REFCOUNT_FULL=y If CONFIG_REFCOUNT_FULL=y, refcount_inc() WARN when refcount is 0. When creating a new btf, the initial btf->refcnt is 0 and triggered the following: [ 34.855452] refcount_t: increment on 0; use-after-free. [ 34.856252] WARNING: CPU: 6 PID: 1857 at lib/refcount.c:153 refcount_inc+0x26/0x30 .... [ 34.868809] Call Trace: [ 34.869168] btf_new_fd+0x1af6/0x24d0 [ 34.869645] ? btf_type_seq_show+0x200/0x200 [ 34.870212] ? lock_acquire+0x3b0/0x3b0 [ 34.870726] ? security_capable+0x54/0x90 [ 34.871247] __x64_sys_bpf+0x1b2/0x310 [ 34.871761] ? __ia32_sys_bpf+0x310/0x310 [ 34.872285] ? bad_area_access_error+0x310/0x310 [ 34.872894] do_syscall_64+0x95/0x3f0 This patch uses refcount_set() instead. 
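Reduced to its essence (btf->refcnt is still zero right after the btf object is allocated), the change is:

    refcount_inc(&btf->refcnt);     /* WARNs: increment on 0 under CONFIG_REFCOUNT_FULL */
    refcount_set(&btf->refcnt, 1);  /* correct: the creator sets the initial reference */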
Reported-by: Yonghong Song Tested-by: Yonghong Song Signed-off-by: Martin KaFai Lau Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 22e1046a1a86..fa0dce0452e7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1977,7 +1977,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, if (!err) { btf_verifier_env_free(env); - btf_get(btf); + refcount_set(&btf->refcnt, 1); return btf; } From 6b1d023abf6c15cf3f519f917dd38d93703b18fe Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 4 May 2018 14:49:51 -0700 Subject: [PATCH 0291/1640] UPSTREAM: bpf: btf: Introduce BTF ID This patch gives an ID to each loaded BTF. The ID is allocated by the idr like the existing prog-id and map-id. The bpf_put(map->btf) is moved to __bpf_map_put() so that the userspace can stop seeing the BTF ID ASAP when the last BTF refcnt is gone. It also makes BTF accessible from userspace through the 1. new BPF_BTF_GET_FD_BY_ID command. It is limited to CAP_SYS_ADMIN which is inline with the BPF_BTF_LOAD cmd and the existing BPF_[MAP|PROG]_GET_FD_BY_ID cmd. 2. new btf_id (and btf_key_id + btf_value_id) in "struct bpf_map_info" Once the BTF ID handler is accessible from userspace, freeing a BTF object has to go through a rcu period. The BPF_BTF_GET_FD_BY_ID cmd can then be done under a rcu_read_lock() instead of taking spin_lock. [Note: A similar rcu usage can be done to the existing bpf_prog_get_fd_by_id() in a follow up patch] When processing the BPF_BTF_GET_FD_BY_ID cmd, refcount_inc_not_zero() is needed because the BTF object could be already in the rcu dead row . btf_get() is removed since its usage is currently limited to btf.c alone. refcount_inc() is used directly instead. 
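From userspace the new command is a one-liner. A minimal sketch with a made-up helper name; the id would typically come from the btf_id that this series exposes in bpf_map_info:

    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    static int btf_fd_from_id(__u32 id)
    {
            union bpf_attr attr = {};

            attr.btf_id = id;
            /* needs CAP_SYS_ADMIN; -ENOENT if the BTF object is already gone */
            return syscall(__NR_bpf, BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr));
    }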
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 2 + include/uapi/linux/bpf.h | 5 ++ kernel/bpf/btf.c | 108 +++++++++++++++++++++++++++++++++++---- kernel/bpf/syscall.c | 24 ++++++++- 4 files changed, 128 insertions(+), 11 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index a966dc6d61ee..e076c4697049 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -44,5 +44,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *ret_size); void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); +int btf_get_fd_by_id(u32 id); +u32 btf_id(const struct btf *btf); #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e04576278f5f..a9aa6939f83e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -96,6 +96,7 @@ enum bpf_cmd { BPF_PROG_QUERY, BPF_RAW_TRACEPOINT_OPEN, BPF_BTF_LOAD, + BPF_BTF_GET_FD_BY_ID, }; enum bpf_map_type { @@ -344,6 +345,7 @@ union bpf_attr { __u32 start_id; __u32 prog_id; __u32 map_id; + __u32 btf_id; }; __u32 next_id; __u32 open_flags; @@ -2187,6 +2189,9 @@ struct bpf_map_info { __u32 ifindex; __u64 netns_dev; __u64 netns_ino; + __u32 btf_id; + __u32 btf_key_id; + __u32 btf_value_id; } __attribute__((aligned(8))); /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index fa0dce0452e7..40950b6bf395 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -179,6 +180,9 @@ i < btf_type_vlen(struct_type); \ i++, member++) +static DEFINE_IDR(btf_idr); +static DEFINE_SPINLOCK(btf_idr_lock); + struct btf { union { struct btf_header *hdr; @@ -193,6 +197,8 @@ struct btf { u32 types_size; u32 data_size; refcount_t refcnt; + u32 id; + struct rcu_head rcu; }; enum verifier_phase { @@ -598,6 +604,42 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) return 0; } +static int btf_alloc_id(struct btf *btf) +{ + int id; + + idr_preload(GFP_KERNEL); + spin_lock_bh(&btf_idr_lock); + id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC); + if (id > 0) + btf->id = id; + spin_unlock_bh(&btf_idr_lock); + idr_preload_end(); + + if (WARN_ON_ONCE(!id)) + return -ENOSPC; + + return id > 0 ? 0 : id; +} + +static void btf_free_id(struct btf *btf) +{ + unsigned long flags; + + /* + * In map-in-map, calling map_delete_elem() on outer + * map will call bpf_map_put on the inner map. + * It will then eventually call btf_free_id() + * on the inner map. Some of the map_delete_elem() + * implementation may have irq disabled, so + * we need to use the _irqsave() version instead + * of the _bh() version. 
+ */ + spin_lock_irqsave(&btf_idr_lock, flags); + idr_remove(&btf_idr, btf->id); + spin_unlock_irqrestore(&btf_idr_lock, flags); +} + static void btf_free(struct btf *btf) { kvfree(btf->types); @@ -607,15 +649,19 @@ static void btf_free(struct btf *btf) kfree(btf); } -static void btf_get(struct btf *btf) +static void btf_free_rcu(struct rcu_head *rcu) { - refcount_inc(&btf->refcnt); + struct btf *btf = container_of(rcu, struct btf, rcu); + + btf_free(btf); } void btf_put(struct btf *btf) { - if (btf && refcount_dec_and_test(&btf->refcnt)) - btf_free(btf); + if (btf && refcount_dec_and_test(&btf->refcnt)) { + btf_free_id(btf); + call_rcu(&btf->rcu, btf_free_rcu); + } } static int env_resolve_init(struct btf_verifier_env *env) @@ -2006,10 +2052,15 @@ const struct file_operations btf_fops = { .release = btf_release, }; +static int __btf_new_fd(struct btf *btf) +{ + return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); +} + int btf_new_fd(const union bpf_attr *attr) { struct btf *btf; - int fd; + int ret; btf = btf_parse(u64_to_user_ptr(attr->btf), attr->btf_size, attr->btf_log_level, @@ -2018,12 +2069,23 @@ int btf_new_fd(const union bpf_attr *attr) if (IS_ERR(btf)) return PTR_ERR(btf); - fd = anon_inode_getfd("btf", &btf_fops, btf, - O_RDONLY | O_CLOEXEC); - if (fd < 0) + ret = btf_alloc_id(btf); + if (ret) { + btf_free(btf); + return ret; + } + + /* + * The BTF ID is published to the userspace. + * All BTF free must go through call_rcu() from + * now on (i.e. free by calling btf_put()). + */ + + ret = __btf_new_fd(btf); + if (ret < 0) btf_put(btf); - return fd; + return ret; } struct btf *btf_get_by_fd(int fd) @@ -2042,7 +2104,7 @@ struct btf *btf_get_by_fd(int fd) } btf = f.file->private_data; - btf_get(btf); + refcount_inc(&btf->refcnt); fdput(f); return btf; @@ -2062,3 +2124,29 @@ int btf_get_info_by_fd(const struct btf *btf, return 0; } + +int btf_get_fd_by_id(u32 id) +{ + struct btf *btf; + int fd; + + rcu_read_lock(); + btf = idr_find(&btf_idr, id); + if (!btf || !refcount_inc_not_zero(&btf->refcnt)) + btf = ERR_PTR(-ENOENT); + rcu_read_unlock(); + + if (IS_ERR(btf)) + return PTR_ERR(btf); + + fd = __btf_new_fd(btf); + if (fd < 0) + btf_put(btf); + + return fd; +} + +u32 btf_id(const struct btf *btf) +{ + return btf->id; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 556ddb1d8fc1..40d5cb8b8893 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -256,7 +256,6 @@ static void bpf_map_free_deferred(struct work_struct *work) bpf_map_uncharge_memlock(map); security_bpf_map_free(map); - btf_put(map->btf); /* implementation dependent freeing */ map->ops->map_free(map); } @@ -277,6 +276,7 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) if (atomic_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map, do_idr_lock); + btf_put(map->btf); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } @@ -2036,6 +2036,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.map_flags = map->map_flags; memcpy(info.name, map->name, sizeof(map->name)); + if (map->btf) { + info.btf_id = btf_id(map->btf); + info.btf_key_id = map->btf_key_id; + info.btf_value_id = map->btf_value_id; + } + if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_info_fill(&info, map); if (err) @@ -2093,6 +2099,19 @@ static int bpf_btf_load(const union bpf_attr *attr) return btf_new_fd(attr); } +#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id + +static int bpf_btf_get_fd_by_id(const union bpf_attr 
*attr) +{ + if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btf_get_fd_by_id(attr->btf_id); +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -2177,6 +2196,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_BTF_LOAD: err = bpf_btf_load(&attr); break; + case BPF_BTF_GET_FD_BY_ID: + err = bpf_btf_get_fd_by_id(&attr); + break; default: err = -EINVAL; break; From e7852ae5c692739d5e899b572e9cc30ab10a9793 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 4 May 2018 14:49:52 -0700 Subject: [PATCH 0292/1640] UPSTREAM: bpf: btf: Add struct bpf_btf_info During BPF_OBJ_GET_INFO_BY_FD on a btf_fd, the current bpf_attr's info.info is directly filled with the BTF binary data. It is not extensible. In this case, we want to add BTF ID. This patch adds "struct bpf_btf_info" which has the BTF ID as one of its member. The BTF binary data itself is exposed through the "btf" and "btf_size" members. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 6 ++++++ kernel/bpf/btf.c | 26 +++++++++++++++++++++----- kernel/bpf/syscall.c | 17 ++++++++++++++++- 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9aa6939f83e..ced5d8725f88 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2194,6 +2194,12 @@ struct bpf_map_info { __u32 btf_value_id; } __attribute__((aligned(8))); +struct bpf_btf_info { + __aligned_u64 btf; + __u32 btf_size; + __u32 id; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach attach type). 
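Before the kernel-side plumbing below, a hedged sketch of how userspace would drive the new struct through BPF_OBJ_GET_INFO_BY_FD (the helper name is hypothetical):

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    static int btf_info_by_fd(int btf_fd, void *buf, __u32 buf_size,
                              struct bpf_btf_info *info)
    {
            union bpf_attr attr = {};

            memset(info, 0, sizeof(*info));
            info->btf = (__u64)(unsigned long)buf;
            info->btf_size = buf_size;

            attr.info.bpf_fd = btf_fd;
            attr.info.info = (__u64)(unsigned long)info;
            attr.info.info_len = sizeof(*info);

            /* on success info->id holds the BTF ID and info->btf_size the
             * full object size, which may exceed buf_size (retry bigger)
             */
            return syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
    }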
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 40950b6bf395..ded10ab47b8a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2114,12 +2114,28 @@ int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { - void __user *udata = u64_to_user_ptr(attr->info.info); - u32 copy_len = min_t(u32, btf->data_size, - attr->info.info_len); + struct bpf_btf_info __user *uinfo; + struct bpf_btf_info info = {}; + u32 info_copy, btf_copy; + void __user *ubtf; + u32 uinfo_len; - if (copy_to_user(udata, btf->data, copy_len) || - put_user(btf->data_size, &uattr->info.info_len)) + uinfo = u64_to_user_ptr(attr->info.info); + uinfo_len = attr->info.info_len; + + info_copy = min_t(u32, uinfo_len, sizeof(info)); + if (copy_from_user(&info, uinfo, info_copy)) + return -EFAULT; + + info.id = btf->id; + ubtf = u64_to_user_ptr(info.btf); + btf_copy = min_t(u32, btf->data_size, info.btf_size); + if (copy_to_user(ubtf, btf->data, btf_copy)) + return -EFAULT; + info.btf_size = btf->data_size; + + if (copy_to_user(uinfo, &info, info_copy) || + put_user(info_copy, &uattr->info.info_len)) return -EFAULT; return 0; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 40d5cb8b8893..1675ed8bd7a3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2055,6 +2055,21 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return 0; } +static int bpf_btf_get_info_by_fd(struct btf *btf, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); + u32 info_len = attr->info.info_len; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); + if (err) + return err; + + return btf_get_info_by_fd(btf, attr, uattr); +} + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -2078,7 +2093,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, err = bpf_map_get_info_by_fd(f.file->private_data, attr, uattr); else if (f.file->f_op == &btf_fops) - err = btf_get_info_by_fd(f.file->private_data, attr, uattr); + err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); else err = -EINVAL; From ff93073c0886d992feccd43c59da44509a97b670 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 May 2018 19:37:06 -0700 Subject: [PATCH 0293/1640] UPSTREAM: bpf: xdp: allow offloads to store into rx_queue_index It's fairly easy for offloaded XDP programs to select the RX queue packets go to. We need a way of expressing this in the software. Allow write to the rx_queue_index field of struct xdp_md for device-bound programs. Skip convert_ctx_access callback entirely for offloads. 
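Seen from the program side, this lets a device-bound XDP program steer packets by storing to rx_queue_index. A minimal sketch with a made-up steering policy (header paths vary by tree; the same store from a non-offloaded program is still rejected):

    #include <linux/bpf.h>
    #include <linux/if_ether.h>
    #include <bpf/bpf_helpers.h>    /* for SEC() */

    SEC("xdp")
    int steer_by_dmac(struct xdp_md *ctx)
    {
            void *data = (void *)(long)ctx->data;
            void *data_end = (void *)(long)ctx->data_end;
            struct ethhdr *eth = data;

            if ((void *)(eth + 1) > data_end)
                    return XDP_DROP;

            /* write is accepted by the verifier only for offloaded progs */
            ctx->rx_queue_index = eth->h_dest[5] & 0x7;
            return XDP_PASS;
    }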
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- kernel/bpf/verifier.c | 2 +- net/core/filter.c | 9 ++++++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9e7e605e9bf9..dbb94e4ad302 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -644,7 +644,7 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); -static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) +static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) { return aux->offload_requested; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f325e1bba99..a463ae91d3c5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5528,7 +5528,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - if (!ops->convert_ctx_access) + if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux)) return 0; insn = env->prog->insnsi + delta; diff --git a/net/core/filter.c b/net/core/filter.c index 8a67c4044e3e..ec5cf41e9392 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4663,8 +4663,15 @@ static bool xdp_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - if (type == BPF_WRITE) + if (type == BPF_WRITE) { + if (bpf_prog_is_dev_bound(prog->aux)) { + switch (off) { + case offsetof(struct xdp_md, rx_queue_index): + return __is_valid_xdp_access(off, size); + } + } return false; + } switch (off) { case offsetof(struct xdp_md, data): From b9738a601bab883a889514b22532b1f107ef5996 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 21 May 2018 09:08:13 -0700 Subject: [PATCH 0294/1640] UPSTREAM: net/ipv4: Add helper to return path MTU based on fib result Determine path MTU from a FIB lookup result. Logic is a distillation of ip_dst_mtu_maybe_forward. Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- include/net/ip_fib.h | 2 ++ net/ipv4/route.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index b711317a796c..0e326ddfc39f 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -428,4 +428,6 @@ static inline void fib_proc_exit(struct net *net) } #endif +u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr); + #endif /* _NET_FIB_H */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c4d628d72b90..396d97c9cfa2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1402,6 +1402,37 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) return NULL; } +/* MTU selection: + * 1. mtu on route is locked - use it + * 2. mtu from nexthop exception + * 3. 
mtu from egress device + */ + +u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) +{ + struct fib_info *fi = res->fi; + struct fib_nh *nh = &fi->fib_nh[res->nh_sel]; + struct net_device *dev = nh->nh_dev; + u32 mtu = 0; + + if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu || + fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) + mtu = fi->fib_mtu; + + if (likely(!mtu)) { + struct fib_nh_exception *fnhe; + + fnhe = find_exception(nh, daddr); + if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) + mtu = fnhe->fnhe_pmtu; + } + + if (likely(!mtu)) + mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); + + return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu); +} + static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr, const bool do_cache) { From 1b885a4d7a5b8701930763fbe9152bfd092671e8 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:56 -0700 Subject: [PATCH 0295/1640] UPSTREAM: ipv6: introduce a new function fib6_update_sernum() This function takes a route as input and tries to update the sernum in the fib6_node this route is associated with. It will be used in later commit when adding a cached route into the exception table under that route. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 ++ net/ipv6/ip6_fib.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index ea4354cc767b..6c5ce30ac1b7 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -360,6 +360,8 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); +void fib6_update_sernum(struct rt6_info *rt); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 8400b64ec169..3bb6e1e61032 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -110,6 +110,20 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; +void fib6_update_sernum(struct rt6_info *rt) +{ + struct fib6_table *table = rt->rt6i_table; + struct net *net = dev_net(rt->dst.dev); + struct fib6_node *fn; + + write_lock_bh(&table->tb6_lock); + fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&table->tb6_lock)); + if (fn) + fn->fn_sernum = fib6_new_sernum(net); + write_unlock_bh(&table->tb6_lock); +} + /* * Auxiliary address test functions for the radix tree. * From 54b1796420f554d4b44a5c29f73e0fc9fc8b8ac6 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:57 -0700 Subject: [PATCH 0296/1640] UPSTREAM: ipv6: introduce a hash table to store dst cache Add a hash table into struct rt6_info in order to store dst caches created by pmtu discovery and ip redirect in ipv6 routing code. APIs to add dst cache, delete dst cache, find dst cache and update dst cache in the hash table are implemented and will be used in later commits. This is a preparation work to move all cache routes into the exception table instead of getting inserted into the fib6 tree. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 19 +++ include/net/ip6_route.h | 3 + net/ipv6/route.c | 341 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 363 insertions(+) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6c5ce30ac1b7..d0b0e19d2deb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -101,6 +101,22 @@ struct rt6key { struct fib6_table; +struct rt6_exception_bucket { + struct hlist_head chain; + int depth; +}; + +struct rt6_exception { + struct hlist_node hlist; + struct rt6_info *rt6i; + unsigned long stamp; + struct rcu_head rcu; +}; + +#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10 +#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) +#define FIB6_MAX_DEPTH 5 + struct rt6_info { struct dst_entry dst; @@ -137,12 +153,15 @@ struct rt6_info { struct inet6_dev *rt6i_idev; struct rt6_info * __percpu *rt6i_pcpu; + struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + u8 exception_bucket_flushed:1, + unused:7; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 34139bc9a853..9b6111d87d1a 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -96,6 +96,9 @@ int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); int ip6_ins_rt(struct rt6_info *); int ip6_del_rt(struct rt6_info *); +void rt6_flush_exceptions(struct rt6_info *rt); +int rt6_remove_exception_rt(struct rt6_info *rt); + static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, const struct in6_addr *daddr, unsigned int prefs, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c919fe02f09d..a536e2b0a5f4 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -106,6 +107,9 @@ static int rt6_fill_node(struct net *net, struct in6_addr *dst, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); +static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, + struct in6_addr *daddr, + struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_add_route_info(struct net *net, @@ -395,6 +399,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_exception_bucket *bucket; struct dst_entry *from = dst->from; struct inet6_dev *idev; @@ -407,6 +412,11 @@ static void ip6_dst_destroy(struct dst_entry *dst) rt->rt6i_idev = NULL; in6_dev_put(idev); } + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1); + if (bucket) { + rt->rt6i_exception_bucket = NULL; + kfree(bucket); + } dst->from = NULL; dst_release(from); @@ -1097,6 +1107,337 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return pcpu_rt; } +/* exception hash table implementation + */ +static DEFINE_SPINLOCK(rt6_exception_lock); + +/* Remove rt6_ex from hash table and free the memory + * Caller must hold rt6_exception_lock + */ +static void rt6_remove_exception(struct rt6_exception_bucket *bucket, + struct rt6_exception *rt6_ex) +{ + if (!bucket || !rt6_ex) + return; + rt6_ex->rt6i->rt6i_node = NULL; + hlist_del_rcu(&rt6_ex->hlist); + rt6_release(rt6_ex->rt6i); + kfree_rcu(rt6_ex, rcu); + WARN_ON_ONCE(!bucket->depth); + bucket->depth--; +} + +/* Remove oldest rt6_ex in bucket and free the memory + 
* Caller must hold rt6_exception_lock + */ +static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) +{ + struct rt6_exception *rt6_ex, *oldest = NULL; + + if (!bucket) + return; + + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) + oldest = rt6_ex; + } + rt6_remove_exception(bucket, oldest); +} + +static u32 rt6_exception_hash(const struct in6_addr *dst, + const struct in6_addr *src) +{ + static u32 seed __read_mostly; + u32 val; + + net_get_random_once(&seed, sizeof(seed)); + val = jhash(dst, sizeof(*dst), seed); + +#ifdef CONFIG_IPV6_SUBTREES + if (src) + val = jhash(src, sizeof(*src), val); +#endif + return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); +} + +/* Helper function to find the cached rt in the hash table + * and update bucket pointer to point to the bucket for this + * (daddr, saddr) pair + * Caller must hold rt6_exception_lock + */ +static struct rt6_exception * +__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_exception *rt6_ex; + u32 hval; + + if (!(*bucket) || !daddr) + return NULL; + + hval = rt6_exception_hash(daddr, saddr); + *bucket += hval; + + hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { + struct rt6_info *rt6 = rt6_ex->rt6i; + bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); + +#ifdef CONFIG_IPV6_SUBTREES + if (matched && saddr) + matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); +#endif + if (matched) + return rt6_ex; + } + return NULL; +} + +/* Helper function to find the cached rt in the hash table + * and update bucket pointer to point to the bucket for this + * (daddr, saddr) pair + * Caller must hold rcu_read_lock() + */ +static struct rt6_exception * +__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_exception *rt6_ex; + u32 hval; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!(*bucket) || !daddr) + return NULL; + + hval = rt6_exception_hash(daddr, saddr); + *bucket += hval; + + hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { + struct rt6_info *rt6 = rt6_ex->rt6i; + bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); + +#ifdef CONFIG_IPV6_SUBTREES + if (matched && saddr) + matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); +#endif + if (matched) + return rt6_ex; + } + return NULL; +} + +static int rt6_insert_exception(struct rt6_info *nrt, + struct rt6_info *ort) +{ + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + int err = 0; + + /* ort can't be a cache or pcpu route */ + if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) + ort = (struct rt6_info *)ort->dst.from; + WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); + + spin_lock_bh(&rt6_exception_lock); + + if (ort->exception_bucket_flushed) { + err = -EINVAL; + goto out; + } + + bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + if (!bucket) { + bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), + GFP_ATOMIC); + if (!bucket) { + err = -ENOMEM; + goto out; + } + rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); + } + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates ort is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. 
+ * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (ort->rt6i_src.plen) + src_key = &nrt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, + src_key); + if (rt6_ex) + rt6_remove_exception(bucket, rt6_ex); + + rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); + if (!rt6_ex) { + err = -ENOMEM; + goto out; + } + rt6_ex->rt6i = nrt; + rt6_ex->stamp = jiffies; + atomic_inc(&nrt->rt6i_ref); + nrt->rt6i_node = ort->rt6i_node; + hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); + bucket->depth++; + + if (bucket->depth > FIB6_MAX_DEPTH) + rt6_exception_remove_oldest(bucket); + +out: + spin_unlock_bh(&rt6_exception_lock); + + /* Update fn->fn_sernum to invalidate all cached dst */ + if (!err) + fib6_update_sernum(ort); + + return err; +} + +void rt6_flush_exceptions(struct rt6_info *rt) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + spin_lock_bh(&rt6_exception_lock); + /* Prevent rt6_insert_exception() to recreate the bucket list */ + rt->exception_bucket_flushed = 1; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + if (!bucket) + goto out; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) + rt6_remove_exception(bucket, rt6_ex); + WARN_ON_ONCE(bucket->depth); + bucket++; + } + +out: + spin_unlock_bh(&rt6_exception_lock); +} + +/* Find cached rt in the hash table inside passed in rt + * Caller has to hold rcu_read_lock() + */ +static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, + struct in6_addr *daddr, + struct in6_addr *saddr) +{ + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + struct rt6_info *res = NULL; + + bucket = rcu_dereference(rt->rt6i_exception_bucket); + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates rt is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (rt->rt6i_src.plen) + src_key = saddr; +#endif + rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); + + if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) + res = rt6_ex->rt6i; + + return res; +} + +/* Remove the passed in cached rt from the hash table that contains it */ +int rt6_remove_exception_rt(struct rt6_info *rt) +{ + struct rt6_info *from = (struct rt6_info *)rt->dst.from; + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + int err; + + if (!from || + !(rt->rt6i_flags | RTF_CACHE)) + return -EINVAL; + + if (!rcu_access_pointer(from->rt6i_exception_bucket)) + return -ENOENT; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(from->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. 
+ */ + if (from->rt6i_src.plen) + src_key = &rt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_spinlock(&bucket, + &rt->rt6i_dst.addr, + src_key); + if (rt6_ex) { + rt6_remove_exception(bucket, rt6_ex); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&rt6_exception_lock); + return err; +} + +/* Find rt6_ex which contains the passed in rt cache and + * refresh its stamp + */ +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct rt6_info *from = (struct rt6_info *)rt->dst.from; + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + + if (!from || + !(rt->rt6i_flags | RTF_CACHE)) + return; + + rcu_read_lock(); + bucket = rcu_dereference(from->rt6i_exception_bucket); + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (from->rt6i_src.plen) + src_key = &rt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_rcu(&bucket, + &rt->rt6i_dst.addr, + src_key); + if (rt6_ex) + rt6_ex->stamp = jiffies; + + rcu_read_unlock(); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { From 370d6ae4fee1a020d33cf4d62116dc0a6a789639 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:58 -0700 Subject: [PATCH 0297/1640] UPSTREAM: ipv6: prepare fib6_remove_prefsrc() for exception table After we move cached dst entries into the exception table under its parent route, current fib6_remove_prefsrc() no longer can access them. This commit makes fib6_remove_prefsrc() also go through all routes in the exception table to remove the pref src. This is a preparation patch in order to move all cached dst into the exception table. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a536e2b0a5f4..2e7e6690a0e1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1270,6 +1270,12 @@ static int rt6_insert_exception(struct rt6_info *nrt, if (ort->rt6i_src.plen) src_key = &nrt->rt6i_src.addr; #endif + + /* Update rt6i_prefsrc as it could be changed + * in rt6_remove_prefsrc() + */ + nrt->rt6i_prefsrc = ort->rt6i_prefsrc; + rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, src_key); if (rt6_ex) @@ -1438,6 +1444,25 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) rcu_read_unlock(); } +static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + rt6_ex->rt6i->rt6i_prefsrc.plen = 0; + } + bucket++; + } + } +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { @@ -3153,8 +3178,12 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) if (((void *)rt->dst.dev == dev || !dev) && rt != net->ipv6.ip6_null_entry && ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { + spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->rt6i_prefsrc.plen = 0; + /* need to update cache as well */ + rt6_exceptions_remove_prefsrc(rt); + spin_unlock_bh(&rt6_exception_lock); } return 0; } From a0ab3aad0e4dc7995e8f13bedf92874208cfa9a3 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:59 -0700 Subject: [PATCH 0298/1640] UPSTREAM: ipv6: prepare rt6_mtu_change() for exception table If we move all cached dst into the exception table under the main route, current rt6_mtu_change() will no longer be able to access them. This commit makes rt6_mtu_change_route() function to also go through all cached routes in the exception table under the main route and do proper updates on the mtu. This is a preparation in order to move all cached routes into the exception table. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2e7e6690a0e1..fce84f5cf387 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1275,6 +1275,14 @@ static int rt6_insert_exception(struct rt6_info *nrt, * in rt6_remove_prefsrc() */ nrt->rt6i_prefsrc = ort->rt6i_prefsrc; + /* rt6_mtu_change() might lower mtu on ort. + * Only insert this exception route if its mtu + * is less than ort's mtu value. 
+ */ + if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) { + err = -EINVAL; + goto out; + } rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, src_key); @@ -1463,6 +1471,32 @@ static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) } } +static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + struct rt6_info *entry = rt6_ex->rt6i; + /* For RTF_CACHE with rt6i_pmtu == 0 + * (i.e. a redirected route), + * the metrics of its rt->dst.from has already + * been updated. + */ + if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu) + entry->rt6i_pmtu = mtu; + } + bucket++; + } + } +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { @@ -3290,6 +3324,10 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) if (rt->dst.dev == arg->dev && dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { + spin_lock_bh(&rt6_exception_lock); + /* This case will be removed once the exception table + * is hooked up. + */ if (rt->rt6i_flags & RTF_CACHE) { /* For RTF_CACHE with rt6i_pmtu == 0 * (i.e. a redirected route), @@ -3303,6 +3341,8 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) dst_mtu(&rt->dst) == idev->cnf.mtu6)) { dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); } + rt6_exceptions_update_pmtu(rt, arg->mtu); + spin_unlock_bh(&rt6_exception_lock); } return 0; } From 1e6c476c618171a4970931fe6247e141e4e254bc Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:00 -0700 Subject: [PATCH 0299/1640] UPSTREAM: ipv6: prepare rt6_clean_tohost() for exception table If we move all cached dst into the exception table under the main route, current rt6_clean_tohost() will no longer be able to access them. This commit makes fib6_clean_tohost() to also go through all cached routes in exception table and removes cached gateway routes to the passed in gateway. This is a preparation in order to move all cached routes into the exception table. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fce84f5cf387..b68440b13d20 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1497,6 +1497,43 @@ static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu) } } +#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) + +static void rt6_exceptions_clean_tohost(struct rt6_info *rt, + struct in6_addr *gateway) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + return; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, + &bucket->chain, hlist) { + struct rt6_info *entry = rt6_ex->rt6i; + + if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == + RTF_CACHE_GATEWAY && + ipv6_addr_equal(gateway, + &entry->rt6i_gateway)) { + rt6_remove_exception(bucket, rt6_ex); + } + } + bucket++; + } + } + + spin_unlock_bh(&rt6_exception_lock); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { @@ -3234,18 +3271,27 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) } #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) -#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) /* Remove routers and update dst entries when gateway turn into host. */ static int fib6_clean_tohost(struct rt6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; + /* RTF_CACHE_GATEWAY case will be removed once the exception + * table is hooked up to store all cached routes. + */ if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { return -1; } + + /* Further clean up cached routes in exception table. + * This is needed because cached route may have a different + * gateway than its 'parent' in the case of an ip redirect. + */ + rt6_exceptions_clean_tohost(rt, gateway); + return 0; } From 538f4093cb68915d444fadc11aa9bcf95d3f78cb Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:01 -0700 Subject: [PATCH 0300/1640] UPSTREAM: ipv6: prepare fib6_age() for exception table If all dst cache entries are stored in the exception table under the main route, we have to go through them during fib6_age() when doing garbage collecting. Introduce a new function rt6_age_exception() which goes through all dst entries in the exception table and remove those entries that are expired. This function is called in fib6_age() so that all dst caches are also garbage collected. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 13 +++++++++ include/net/ip6_route.h | 2 ++ net/ipv6/ip6_fib.c | 26 +++++++----------- net/ipv6/route.c | 60 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 17 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index d0b0e19d2deb..dc683c5841a9 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -32,6 +32,14 @@ #define FIB6_TABLE_HASHSZ 1 #endif +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RT6_TRACE(x...) pr_debug(x) +#else +#define RT6_TRACE(x...) 
do { ; } while (0) +#endif + struct rt6_info; struct fib6_config { @@ -78,6 +86,11 @@ struct fib6_node { struct rcu_head rcu; }; +struct fib6_gc_args { + int timeout; + int more; +}; + #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL #else diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9b6111d87d1a..0e91ed9021a4 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -98,6 +98,8 @@ int ip6_del_rt(struct rt6_info *); void rt6_flush_exceptions(struct rt6_info *rt); int rt6_remove_exception_rt(struct rt6_info *rt); +void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, + unsigned long now); static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, const struct in6_addr *daddr, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 3bb6e1e61032..49c8171b82ad 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -38,14 +38,6 @@ #include #include -#define RT6_DEBUG 2 - -#if RT6_DEBUG >= 3 -#define RT6_TRACE(x...) pr_debug(x) -#else -#define RT6_TRACE(x...) do { ; } while (0) -#endif - static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { @@ -1891,12 +1883,6 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -struct fib6_gc_args -{ - int timeout; - int more; -}; - static int fib6_age(struct rt6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; @@ -1905,9 +1891,6 @@ static int fib6_age(struct rt6_info *rt, void *arg) /* * check addrconf expiration here. * Routes are expired even if they are in use. - * - * Also age clones. Note, that clones are aged out - * only if they are not in use now. */ if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { @@ -1916,6 +1899,9 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } gc_args->more++; + /* The following part will soon be removed when the exception + * table is hooked up to store all cached routes. + */ } else if (rt->rt6i_flags & RTF_CACHE) { if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -1941,6 +1927,12 @@ static int fib6_age(struct rt6_info *rt, void *arg) gc_args->more++; } + /* Also age clones in the exception table. + * Note, that clones are aged out + * only if they are not in use now. 
+ */ + rt6_age_exceptions(rt, gc_args, now); + return 0; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b68440b13d20..dbad6a8d33e1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1534,6 +1534,66 @@ static void rt6_exceptions_clean_tohost(struct rt6_info *rt, spin_unlock_bh(&rt6_exception_lock); } +static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, + struct rt6_exception *rt6_ex, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + struct rt6_info *rt = rt6_ex->rt6i; + + if (atomic_read(&rt->dst.__refcnt) == 1 && + time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { + RT6_TRACE("aging clone %p\n", rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } else if (rt->rt6i_flags & RTF_GATEWAY) { + struct neighbour *neigh; + __u8 neigh_flags = 0; + + neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); + if (neigh) { + neigh_flags = neigh->flags; + neigh_release(neigh); + } + if (!(neigh_flags & NTF_ROUTER)) { + RT6_TRACE("purging route %p via non-router but gateway\n", + rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } + } + gc_args->more++; +} + +void rt6_age_exceptions(struct rt6_info *rt, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + return; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, + &bucket->chain, hlist) { + rt6_age_examine_exception(bucket, rt6_ex, + gc_args, now); + } + bucket++; + } + } + spin_unlock_bh(&rt6_exception_lock); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { From 21da05936e78b0a0feb234f82e09a10bd359edcd Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:02 -0700 Subject: [PATCH 0301/1640] UPSTREAM: ipv6: prepare fib6_locate() for exception table fib6_locate() is used to find the fib6_node according to the passed-in prefix address key. It currently tries to find the fib6_node with an exact match of the passed-in key. However, when we move cached routes into the exception table, fib6_locate() will fail to find the fib6_node for it as the cached routes will be stored in the exception table under the fib6_node with the longest prefix match of the cache's dst addr key. This commit adds a new parameter to let the caller specify whether it needs an exact match or a longest prefix match. Right now, all callers still do an exact match when calling fib6_locate(). This will change in a later commit, where the exception table is hooked up to store cached routes. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 3 ++- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 30 +++++++++++++++++++++++------- net/ipv6/route.c | 5 +++-- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index dc683c5841a9..a3dce846b06d 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -359,7 +359,8 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len); + const struct in6_addr *saddr, int src_len, + bool exact_match); void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 095e109eda6e..6c4ff113833e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2353,7 +2353,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, return NULL; read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0); + fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true); if (!fn) goto out; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 49c8171b82ad..37c63c5354f7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1344,14 +1344,21 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) + * exact_match == true means we try to find fn with exact match of + * the passed in prefix addr + * exact_match == false means we try to find fn with longest prefix + * match of the passed in prefix addr. This is useful for finding fn + * for cached route as it will be stored in the exception table under + * the node with longest prefix length. 
*/ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, - int plen, int offset) + int plen, int offset, + bool exact_match) { - struct fib6_node *fn; + struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); @@ -1361,11 +1368,13 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) - return NULL; + goto out; if (plen == fn->fn_bit) return fn; + prev = fn; + /* * We have more bits to go */ @@ -1374,24 +1383,31 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, else fn = fn->left; } - return NULL; +out: + if (exact_match) + return NULL; + else + return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len) + const struct in6_addr *saddr, int src_len, + bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct rt6_info, rt6i_dst)); + offsetof(struct rt6_info, rt6i_dst), + exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn && fn->subtree) fn = fib6_locate_1(fn->subtree, saddr, src_len, - offsetof(struct rt6_info, rt6i_src)); + offsetof(struct rt6_info, rt6i_src), + exact_match); } #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dbad6a8d33e1..826a698bfaf0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2822,7 +2822,8 @@ static int ip6_route_del(struct fib6_config *cfg, fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, - &cfg->fc_src, cfg->fc_src_len); + &cfg->fc_src, cfg->fc_src_len, + true); if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { @@ -3030,7 +3031,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, return NULL; read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); + fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; From aa223246ee75673c1a4291c0a68fd33db08ad125 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:03 -0700 Subject: [PATCH 0302/1640] UPSTREAM: ipv6: hook up exception table to store dst cache This commit makes use of the exception hash table implementation to store dst caches created by pmtu discovery and ip redirect into the hash table under the rt6_info, and no longer inserts these routes into the fib6 tree. This makes the fib6 tree contain only statically configured routes, so it can now be protected by rcu instead of a rw lock. With this change, in the route lookup related functions, after finding the rt6_info with the longest prefix, we also need to search the exception table before doing backtracking. In the route delete function, if the route being deleted is not a dst cache, deletion of this route also needs to flush the whole hash table under it. If it is a dst cache, then only the cached dst is deleted from the hash table. Note: for the fib6_walk_continue() function, w->root now always points to a root node, considering that fib6_prune_clones() is removed from the code. So we add a WARN_ON() to make sure w->root always points to a root node, and also remove the update of w->root in fib6_repair_tree(). This is a prerequisite for a later patch because w->root then does not need to be rcu protected when replacing the rwlock with RCU. Also, we remove all prune-related variables as they are no longer used. 
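To make the lookup order described above concrete (cached clones now hang off their parent node in an exception table and are consulted at the matched node, rather than being found as separate tree entries), here is a minimal, self-contained C sketch. All names here (toy_node, toy_exception, toy_lookup) are invented for illustration, and locking and refcounting are deliberately ignored:

#include <stdio.h>
#include <string.h>

struct toy_exception {                  /* stands in for a pmtu/redirect clone */
        const char *daddr;              /* destination it was created for */
        const char *route;
        struct toy_exception *next;
};

struct toy_node {                       /* stands in for a fib6 tree node */
        const char *prefix;             /* static route, NULL if none */
        struct toy_exception *exceptions;
        struct toy_node *parent;        /* backtracking target */
};

static const char *toy_lookup(struct toy_node *fn, const char *daddr)
{
        while (fn) {
                struct toy_exception *ex;

                /* the matched node's exception table comes first */
                for (ex = fn->exceptions; ex; ex = ex->next)
                        if (!strcmp(ex->daddr, daddr))
                                return ex->route;
                if (fn->prefix)
                        return fn->prefix;      /* static best match */
                fn = fn->parent;                /* backtrack */
        }
        return "no route";
}

int main(void)
{
        struct toy_exception ex = { "2001:db8::1", "pmtu clone", NULL };
        struct toy_node root = { "default route", NULL, NULL };
        struct toy_node leaf = { NULL, &ex, &root };

        printf("%s\n", toy_lookup(&leaf, "2001:db8::1"));  /* pmtu clone */
        printf("%s\n", toy_lookup(&leaf, "2001:db8::2"));  /* default route */
        return 0;
}

In the patch itself this shows up as the rt6_find_cached_rt() calls added to ip6_pol_route_lookup(), ip6_pol_route(), __ip6_route_redirect() and ip6_route_del() in the hunks below.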
Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 - net/ipv6/addrconf.c | 1 - net/ipv6/ip6_fib.c | 95 +++++++------------------------------ net/ipv6/route.c | 108 +++++++++++++++++++++--------------------- 4 files changed, 72 insertions(+), 133 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index a3dce846b06d..09fd5391f85f 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -282,7 +282,6 @@ struct fib6_walker { struct fib6_node *root, *node; struct rt6_info *leaf; enum fib6_walk_state state; - bool prune; unsigned int skip; unsigned int count; int (*func)(struct fib6_walker *); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6c4ff113833e..4624390019cc 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2357,7 +2357,6 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, if (!fn) goto out; - noflags |= RTF_CACHE; for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { if (rt->dst.dev->ifindex != dev->ifindex) continue; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 37c63c5354f7..6365c0911954 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -54,7 +54,6 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); @@ -1102,6 +1101,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) return -EINVAL; + if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE)) + return -EINVAL; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -1193,11 +1194,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, #endif err = fib6_add_rt2node(fn, rt, info, mxc); - if (!err) { + if (!err) fib6_start_gc(info->nl_net, rt); - if (!(rt->rt6i_flags & RTF_CACHE)) - fib6_prune_clones(info->nl_net, pn); - } out: if (err) { @@ -1512,19 +1510,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { - if (w->root == fn) { - w->root = w->node = NULL; - RT6_TRACE("W %p adjusted by delroot 1\n", w); - } else if (w->node == fn) { + if (w->node == fn) { RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { - if (w->root == fn) { - w->root = child; - RT6_TRACE("W %p adjusted by delroot 2\n", w); - } if (w->node == fn) { w->node = child; if (children&2) { @@ -1558,12 +1549,17 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, RT6_TRACE("fib6_del_route\n"); + WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); + /* Unlink it */ *rtp = rt->dst.rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; + /* Flush all cached dst in exception table */ + rt6_flush_exceptions(rt); + /* Reset round-robin state, if necessary */ if (fn->rr_ptr == rt) fn->rr_ptr = NULL; @@ -1626,18 +1622,9 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - if (!(rt->rt6i_flags & RTF_CACHE)) { - struct fib6_node *pn = fn; -#ifdef CONFIG_IPV6_SUBTREES - /* clones of this route might be in another subtree */ - if (rt->rt6i_src.plen) { - while 
(!(pn->fn_flags & RTN_ROOT)) - pn = pn->parent; - pn = pn->parent; - } -#endif - fib6_prune_clones(info->nl_net, pn); - } + /* remove cached dst from exception table */ + if (rt->rt6i_flags & RTF_CACHE) + return rt6_remove_exception_rt(rt); /* * Walk the leaf entries looking for ourself @@ -1680,16 +1667,14 @@ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn; + /* w->root should always be table->tb6_root */ + WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); + for (;;) { fn = w->node; if (!fn) return 0; - if (w->prune && fn != w->root && - fn->fn_flags & RTN_RTINFO && w->state < FWS_C) { - w->state = FWS_C; - w->leaf = fn->leaf; - } switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: @@ -1821,20 +1806,16 @@ static int fib6_clean_node(struct fib6_walker *w) * func is called on each route. * It may return -1 -> delete this route. * 0 -> continue walking - * - * prune==1 -> only immediate children of node (certainly, - * ignoring pure split nodes) will be scanned. */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct rt6_info *, void *arg), - bool prune, int sernum, void *arg) + int sernum, void *arg) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; - c.w.prune = prune; c.w.count = 0; c.w.skip = 0; c.func = func; @@ -1859,7 +1840,7 @@ static void __fib6_clean_all(struct net *net, hlist_for_each_entry_rcu(table, head, tb6_hlist) { write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, false, sernum, arg); + func, sernum, arg); write_unlock_bh(&table->tb6_lock); } } @@ -1872,22 +1853,6 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); } -static int fib6_prune_clone(struct rt6_info *rt, void *arg) -{ - if (rt->rt6i_flags & RTF_CACHE) { - RT6_TRACE("pruning clone %p\n", rt); - return -1; - } - - return 0; -} - -static void fib6_prune_clones(struct net *net, struct fib6_node *fn) -{ - fib6_clean_tree(net, fn, fib6_prune_clone, true, - FIB6_NO_SERNUM_CHANGE, NULL); -} - static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); @@ -1915,32 +1880,6 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } gc_args->more++; - /* The following part will soon be removed when the exception - * table is hooked up to store all cached routes. - */ - } else if (rt->rt6i_flags & RTF_CACHE) { - if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) - rt->dst.obsolete = DST_OBSOLETE_KILL; - if (atomic_read(&rt->dst.__refcnt) == 1 && - rt->dst.obsolete == DST_OBSOLETE_KILL) { - RT6_TRACE("aging clone %p\n", rt); - return -1; - } else if (rt->rt6i_flags & RTF_GATEWAY) { - struct neighbour *neigh; - __u8 neigh_flags = 0; - - neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); - if (neigh) { - neigh_flags = neigh->flags; - neigh_release(neigh); - } - if (!(neigh_flags & NTF_ROUTER)) { - RT6_TRACE("purging route %p via non-router but gateway\n", - rt); - return -1; - } - } - gc_args->more++; } /* Also age clones in the exception table. 
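The route.c hunks that follow wire the exception table into the read and delete paths. The deletion rule introduced in fib6_del() above (a cached clone is merely unlinked from its parent's exception table, while deleting a static route must first flush every clone under it) can be modelled with another self-contained sketch; toy types again, invented names, no locking:

#include <stdlib.h>

struct clone {                          /* stands in for a cached dst */
        struct clone *next;
};

struct route {                          /* stands in for a static rt6_info */
        struct clone *exceptions;       /* its exception table */
};

/* RTF_CACHE case: unlink one clone, cf. rt6_remove_exception_rt() */
static void toy_remove_exception(struct route *parent, struct clone *c)
{
        struct clone **pp = &parent->exceptions;

        while (*pp && *pp != c)
                pp = &(*pp)->next;
        if (*pp) {
                *pp = c->next;
                free(c);
        }
}

/* non-cache case: flush the whole table, cf. rt6_flush_exceptions() */
static void toy_flush_exceptions(struct route *parent)
{
        while (parent->exceptions)
                toy_remove_exception(parent, parent->exceptions);
}

int main(void)
{
        struct route r = { NULL };
        struct clone *a = calloc(1, sizeof(*a));
        struct clone *b = calloc(1, sizeof(*b));

        a->next = b;
        r.exceptions = a;

        toy_remove_exception(&r, b);    /* delete one cached clone */
        toy_flush_exceptions(&r);       /* delete the parent's whole cache */
        return 0;
}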
diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 826a698bfaf0..d9bd93f24d5b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -881,8 +881,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) { + struct rt6_info *rt, *rt_cache; struct fib6_node *fn; - struct rt6_info *rt; if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) flags &= ~RT6_LOOKUP_F_IFACE; @@ -899,6 +899,11 @@ restart: if (fn) goto restart; } + /* Search through exception table */ + rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); + if (rt_cache) + rt = rt_cache; + dst_use(&rt->dst, jiffies); read_unlock_bh(&table->tb6_lock); @@ -1598,7 +1603,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt; + struct rt6_info *rt, *rt_cache; int strict = 0; strict |= flags & RT6_LOOKUP_F_IFACE; @@ -1630,6 +1635,10 @@ redo_rt6_select: } } + /*Search through exception table */ + rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); + if (rt_cache) + rt = rt_cache; if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { dst_use(&rt->dst, jiffies); @@ -2002,23 +2011,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); + /* update rt6_ex->stamp for cache */ + if (rt6->rt6i_flags & RTF_CACHE) + rt6_update_exception_stamp_rt(rt6); } else if (daddr) { struct rt6_info *nrt6; nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - - /* ip6_ins_rt(nrt6) will bump the - * rt6->rt6i_node->fn_sernum - * which will fail the next rt6_check() and - * invalidate the sk->sk_dst_cache. - */ - ip6_ins_rt(nrt6); - /* Release the reference taken in - * ip6_rt_cache_alloc() - */ - dst_release(&nrt6->dst); + if (rt6_insert_exception(nrt6, rt6)) + dst_release_immediate(&nrt6->dst); } } } @@ -2087,7 +2090,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *rt; + struct rt6_info *rt, *rt_cache; struct fib6_node *fn; /* Get the "current" route for this destination and @@ -2112,8 +2115,23 @@ restart: continue; if (fl6->flowi6_oif != rt->dst.dev->ifindex) continue; - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) + /* rt_cache's gateway might be different from its 'parent' + * in the case of an ip redirect. + * So we keep searching in the exception table if the gateway + * is different. 
+ */ + if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { + rt_cache = rt6_find_cached_rt(rt, + &fl6->daddr, + &fl6->saddr); + if (rt_cache && + ipv6_addr_equal(&rdfl->gateway, + &rt_cache->rt6i_gateway)) { + rt = rt_cache; + break; + } continue; + } break; } @@ -2807,9 +2825,9 @@ out_put: static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { + struct rt6_info *rt, *rt_cache; struct fib6_table *table; struct fib6_node *fn; - struct rt6_info *rt; int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); @@ -2823,13 +2841,17 @@ static int ip6_route_del(struct fib6_config *cfg, fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, &cfg->fc_src, cfg->fc_src_len, - true); + !(cfg->fc_flags & RTF_CACHE)); if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { - if ((rt->rt6i_flags & RTF_CACHE) && - !(cfg->fc_flags & RTF_CACHE)) - continue; + if (cfg->fc_flags & RTF_CACHE) { + rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, + &cfg->fc_src); + if (!rt_cache) + continue; + rt = rt_cache; + } if (cfg->fc_ifindex && (!rt->dst.dev || rt->dst.dev->ifindex != cfg->fc_ifindex)) @@ -2955,8 +2977,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu nrt->rt6i_protocol = RTPROT_REDIRECT; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; - if (ip6_ins_rt(nrt)) - goto out_release; + /* No need to remove rt from the exception table if rt is + * a cached route because rt6_insert_exception() will + * takes care of it + */ + if (rt6_insert_exception(nrt, rt)) { + dst_release_immediate(&nrt->dst); + goto out; + } netevent.old = &rt->dst; netevent.new = &nrt->dst; @@ -2964,17 +2992,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); - if (rt->rt6i_flags & RTF_CACHE) { - rt = (struct rt6_info *) dst_clone(&rt->dst); - ip6_del_rt(rt); - } - -out_release: - /* Release the reference taken in - * ip6_rt_cache_alloc() - */ - dst_release(&nrt->dst); - out: neigh_release(neigh); } @@ -3338,12 +3355,8 @@ static int fib6_clean_tohost(struct rt6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; - /* RTF_CACHE_GATEWAY case will be removed once the exception - * table is hooked up to store all cached routes. - */ - if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || - ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && - ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && + ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { return -1; } @@ -3432,20 +3445,9 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { spin_lock_bh(&rt6_exception_lock); - /* This case will be removed once the exception table - * is hooked up. - */ - if (rt->rt6i_flags & RTF_CACHE) { - /* For RTF_CACHE with rt6i_pmtu == 0 - * (i.e. a redirected route), - * the metrics of its rt->dst.from has already - * been updated. 
- */ - if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) - rt->rt6i_pmtu = arg->mtu; - } else if (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + if (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6)) { dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); } rt6_exceptions_update_pmtu(rt, arg->mtu); From 8bd709639dccae62f80d505cb2019cae78c6625e Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:04 -0700 Subject: [PATCH 0303/1640] UPSTREAM: ipv6: grab rt->rt6i_ref before allocating pcpu rt After rwlock is replaced with rcu and spinlock, ip6_pol_route() will be called with only rcu held. That means rt6 route deletion could happen simultaneously with rt6_make_pcpu_rt(). This could potentially cause memory leak if rt6_release() is called right before rt6_make_pcpu_rt() on the same route. This patch grabs rt->rt6i_ref safely before calling rt6_make_pcpu_rt() to make sure rt6_release() will not get triggered while rt6_make_pcpu_rt() is in progress. And rt6_release() is called after rt6_make_pcpu_rt() is finished. Note: As we are incrementing rt->rt6i_ref in ip6_pol_route(), there is a very slim chance that fib6_purge_rt() will be triggered unnecessarily when deleting a route if ip6_pol_route() running on another thread picks this route as well and tries to make pcpu cache for it. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 60 ++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d9bd93f24d5b..afa9fe739ff1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1076,7 +1076,6 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) { - struct fib6_table *table = rt->rt6i_table; struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(rt); @@ -1087,28 +1086,20 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return net->ipv6.ip6_null_entry; } - read_lock_bh(&table->tb6_lock); - if (rt->rt6i_pcpu) { - p = this_cpu_ptr(rt->rt6i_pcpu); - prev = cmpxchg(p, NULL, pcpu_rt); - if (prev) { - /* If someone did it before us, return prev instead */ - dst_release_immediate(&pcpu_rt->dst); - pcpu_rt = prev; - } - } else { - /* rt has been removed from the fib6 tree - * before we have a chance to acquire the read_lock. - * In this case, don't brother to create a pcpu rt - * since rt is going away anyway. The next - * dst_check() will trigger a re-lookup. - */ - dst_release_immediate(&pcpu_rt->dst); - pcpu_rt = rt; - } dst_hold(&pcpu_rt->dst); + p = this_cpu_ptr(rt->rt6i_pcpu); + prev = cmpxchg(p, NULL, pcpu_rt); + if (prev) { + /* If someone did it before us, return prev instead */ + /* release refcnt taken by ip6_rt_pcpu_alloc() */ + dst_release_immediate(&pcpu_rt->dst); + /* release refcnt taken by above dst_hold() */ + dst_release_immediate(&pcpu_rt->dst); + dst_hold(&prev->dst); + pcpu_rt = prev; + } + rt6_dst_from_metrics_check(pcpu_rt); - read_unlock_bh(&table->tb6_lock); return pcpu_rt; } @@ -1689,19 +1680,28 @@ redo_rt6_select: if (pcpu_rt) { read_unlock_bh(&table->tb6_lock); } else { - /* We have to do the read_unlock first - * because rt6_make_pcpu_route() may trigger - * ip6_dst_gc() which will take the write_lock. 
- */ - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - pcpu_rt = rt6_make_pcpu_route(rt); - dst_release(&rt->dst); + /* atomic_inc_not_zero() is needed when using rcu */ + if (atomic_inc_not_zero(&rt->rt6i_ref)) { + /* We have to do the read_unlock first + * because rt6_make_pcpu_route() may trigger + * ip6_dst_gc() which will take the write_lock. + * + * No dst_hold() on rt is needed because grabbing + * rt->rt6i_ref makes sure rt can't be released. + */ + read_unlock_bh(&table->tb6_lock); + pcpu_rt = rt6_make_pcpu_route(rt); + rt6_release(rt); + } else { + /* rt is already removed from tree */ + read_unlock_bh(&table->tb6_lock); + pcpu_rt = net->ipv6.ip6_null_entry; + dst_hold(&pcpu_rt->dst); + } } trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); return pcpu_rt; - } } EXPORT_SYMBOL_GPL(ip6_pol_route); From d9d443fa4505ba3f5520ee64c1c749b29e501e18 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:05 -0700 Subject: [PATCH 0304/1640] UPSTREAM: ipv6: don't release rt->rt6i_pcpu memory during rt6_release() After the rwlock is replaced with rcu and spinlock, route lookup can happen simultaneously with route deletion. This patch removes the call to free_percpu(rt->rt6i_pcpu) from rt6_release() to avoid the race condition between rt6_release() and rt6_get_pcpu_route(). And as free_percpu(rt->rt6i_pcpu) is already called in ip6_dst_destroy() after the rcu grace period, it is safe to make this change. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 6365c0911954..bebc8c0035b2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -190,9 +190,6 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) *ppcpu_rt = NULL; } } - - free_percpu(non_pcpu_rt->rt6i_pcpu); - non_pcpu_rt->rt6i_pcpu = NULL; } EXPORT_SYMBOL_GPL(rt6_free_pcpu); From 32cbd7088b2527f81cc591e78b72206e4a6171a2 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:06 -0700 Subject: [PATCH 0305/1640] BACKPORT: ipv6: replace dst_hold() with dst_hold_safe() in routing code With the rwlock, it is safe to call dst_hold() in a read thread because the read thread is guaranteed to be separated from the write thread. However, after we replace the rwlock with rcu, it is no longer safe to use dst_hold(). A dst might already have been deleted, and be waiting for the rcu grace period to pass before its memory is freed, when a read thread tries to do dst_hold(). This could potentially cause a double free issue. So this commit replaces all dst_hold() calls with dst_hold_safe() in all read threads to avoid this double free issue. And in order to make the code more compact, a new function ip6_hold_safe() is introduced. It calls dst_hold_safe() first, and if that fails, it will either fall back to holding and returning net->ipv6.ip6_null_entry or set rt to NULL, according to the caller's need. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 3 ++- net/ipv6/route.c | 62 +++++++++++++++++++++++++++++++++------------ 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4624390019cc..1ea850fe186e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2364,7 +2364,8 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, continue; if ((rt->rt6i_flags & noflags) != 0) continue; - dst_hold(&rt->dst); + if (!dst_hold_safe(&rt->dst)) + rt = NULL; break; } out: diff --git a/net/ipv6/route.c b/net/ipv6/route.c index afa9fe739ff1..cc949660a586 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -877,6 +877,23 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn, } } +static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, + bool null_fallback) +{ + struct rt6_info *rt = *prt; + + if (dst_hold_safe(&rt->dst)) + return true; + if (null_fallback) { + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } else { + rt = NULL; + } + *prt = rt; + return false; +} + static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) @@ -904,7 +921,9 @@ restart: if (rt_cache) rt = rt_cache; - dst_use(&rt->dst, jiffies); + if (ip6_hold_safe(net, &rt, true)) + dst_use_noref(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); @@ -1067,10 +1086,9 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) p = this_cpu_ptr(rt->rt6i_pcpu); pcpu_rt = *p; - if (pcpu_rt) { - dst_hold(&pcpu_rt->dst); + if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false)) rt6_dst_from_metrics_check(pcpu_rt); - } + return pcpu_rt; } @@ -1631,12 +1649,17 @@ redo_rt6_select: if (rt_cache) rt = rt_cache; - if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { - dst_use(&rt->dst, jiffies); + if (rt == net->ipv6.ip6_null_entry) { + read_unlock_bh(&table->tb6_lock); + dst_hold(&rt->dst); + trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + return rt; + } else if (rt->rt6i_flags & RTF_CACHE) { + if (ip6_hold_safe(net, &rt, true)) { + dst_use_noref(&rt->dst, jiffies); + rt6_dst_from_metrics_check(rt); + } read_unlock_bh(&table->tb6_lock); - - rt6_dst_from_metrics_check(rt); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && @@ -1649,7 +1672,13 @@ redo_rt6_select: struct rt6_info *uncached_rt; - dst_use(&rt->dst, jiffies); + if (ip6_hold_safe(net, &rt, true)) { + dst_use_noref(&rt->dst, jiffies); + } else { + read_unlock_bh(&table->tb6_lock); + uncached_rt = rt; + goto uncached_rt_out; + } read_unlock_bh(&table->tb6_lock); uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); @@ -1665,6 +1694,7 @@ redo_rt6_select: dst_hold(&uncached_rt->dst); } +uncached_rt_out: trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6); return uncached_rt; @@ -1673,8 +1703,7 @@ redo_rt6_select: struct rt6_info *pcpu_rt; - rt->dst.lastuse = jiffies; - rt->dst.__use++; + dst_use_noref(&rt->dst, jiffies); pcpu_rt = rt6_get_pcpu_route(rt); if (pcpu_rt) { @@ -2149,7 +2178,7 @@ restart: } out: - dst_hold(&rt->dst); + ip6_hold_safe(net, &rt, true); read_unlock_bh(&table->tb6_lock); @@ -2863,7 +2892,8 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) continue; - dst_hold(&rt->dst); + if (!dst_hold_safe(&rt->dst)) + break; read_unlock_bh(&table->tb6_lock); /* if 
gateway was specified only delete the one hop */ @@ -3059,7 +3089,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, continue; if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) continue; - dst_hold(&rt->dst); + ip6_hold_safe(NULL, &rt, false); break; } out: @@ -3117,7 +3147,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev break; } if (rt) - dst_hold(&rt->dst); + ip6_hold_safe(NULL, &rt, false); read_unlock_bh(&table->tb6_lock); return rt; } From c6ac3966a894be291a726817eb460b079c21ff2a Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:07 -0700 Subject: [PATCH 0306/1640] UPSTREAM: ipv6: update fn_sernum after route is inserted to tree fib6_add() logic currently calls fib6_add_1() to figure out what node should be used for the newly added route and then call fib6_add_rt2node() to insert the route to the node. And during the call of fib6_add_1(), fn_sernum is updated for all nodes that share the same prefix as the new route. This does not have issue in the current code because reader thread will not be able to access the tree while writer thread is inserting new route to it. However, it is not the case once we transition to use RCU. Reader thread could potentially see the new fn_sernum before the new route is inserted. As a result, reader thread's route lookup will return a stale route with the new fn_sernum. In order to solve this issue, we remove all the update of fn_sernum in fib6_add_1(), and instead, introduce a new function that updates fn_sernum for all related nodes and call this functions once the route is successfully inserted to the tree. Also, smp_wmb() is used after a route is successfully inserted into the fib tree and right before the updated of fn->sernum. And smp_rmb() is used right after fn->sernum is accessed in rt6_get_cookie_safe(). This is to guarantee that when the reader thread sees the new fn->sernum, the new route is already inserted in the tree in memory. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 ++ net/ipv6/ip6_fib.c | 39 +++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 09fd5391f85f..4ad5ab30a675 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -223,6 +223,8 @@ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, if (fn) { *cookie = fn->fn_sernum; + /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */ + smp_rmb(); status = true; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bebc8c0035b2..ecf35e6982c3 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -585,7 +585,7 @@ out: static struct fib6_node *fib6_add_1(struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, - int replace_required, int sernum, + int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; @@ -631,8 +631,6 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, fn->leaf = NULL; } - fn->fn_sernum = sernum; - return fn; } @@ -641,7 +639,6 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, */ /* Try to walk down on tree. */ - fn->fn_sernum = sernum; dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? 
fn->right : fn->left; @@ -677,7 +674,6 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, ln->fn_bit = plen; ln->parent = pn; - ln->fn_sernum = sernum; if (dir) pn->right = ln; @@ -737,8 +733,6 @@ insert_above: in->leaf = fn->leaf; atomic_inc(&in->leaf->rt6i_ref); - in->fn_sernum = sernum; - /* update parent pointer */ if (dir) pn->right = in; @@ -750,8 +744,6 @@ insert_above: ln->parent = in; fn->parent = in; - ln->fn_sernum = sernum; - if (addr_bit_set(addr, bit)) { in->right = ln; in->left = fn; @@ -776,8 +768,6 @@ insert_above: ln->parent = pn; - ln->fn_sernum = sernum; - if (dir) pn->right = ln; else @@ -1080,6 +1070,20 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } +static void fib6_update_sernum_upto_root(struct rt6_info *rt, + int sernum) +{ + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + + /* paired with smp_rmb() in rt6_get_cookie_safe() */ + smp_wmb(); + while (fn) { + fn->fn_sernum = sernum; + fn = fn->parent; + } +} + /* * Add routing information to the routing tree. * / @@ -1112,7 +1116,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, - replace_required, sernum, extack); + replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; @@ -1146,15 +1150,13 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); sfn->fn_flags = RTN_ROOT; - sfn->fn_sernum = sernum; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum, - extack); + allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated @@ -1173,8 +1175,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum, - extack); + allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); @@ -1191,8 +1192,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, #endif err = fib6_add_rt2node(fn, rt, info, mxc); - if (!err) + if (!err) { + fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); + } out: if (err) { From 06329846fa4c89ae4049cffe00ebef6261eeaa50 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:08 -0700 Subject: [PATCH 0307/1640] UPSTREAM: ipv6: check fn->leaf before it is used If rwlock is replaced with rcu and spinlock, it is possible that the reader thread will see fn->leaf as NULL in the following scenarios: 1. fib6_add() is in progress and we have already inserted a new node but not yet inserted the route. 2. fib6_del_route() is in progress and we have already set fn->leaf to NULL but not yet freed the node because of rcu grace period. This patch makes sure all the reader threads check fn->leaf first before using it. And together with later patch to grab rcu_read_lock() and rcu_dereference() fn->leaf, it makes sure reader threads are safe when accessing fn->leaf. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv6/ip6_fib.c | 23 ++++++++++++++++++----- net/ipv6/route.c | 20 ++++++++++++-------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ecf35e6982c3..f135a6852408 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1280,10 +1280,13 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, while (fn) { if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { + struct rt6_info *leaf = fn->leaf; struct rt6key *key; - key = (struct rt6key *) ((u8 *) fn->leaf + - args->offset); + if (!leaf) + goto backtrack; + + key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES @@ -1300,9 +1303,7 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, return fn; } } -#ifdef CONFIG_IPV6_SUBTREES backtrack: -#endif if (fn->fn_flags & RTN_ROOT) break; @@ -1359,7 +1360,18 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct rt6_info *leaf = fn->leaf; + struct rt6key *key; + + /* This node is being deleted */ + if (!leaf) { + if (plen <= fn->fn_bit) + goto out; + else + goto next; + } + + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match @@ -1373,6 +1385,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, prev = fn; +next: /* * We have more bits to go */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index cc949660a586..ad36ac4d2f8e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -715,6 +715,7 @@ out: } static struct rt6_info *find_rr_leaf(struct fib6_node *fn, + struct rt6_info *leaf, struct rt6_info *rr_head, u32 metric, int oif, int strict, bool *do_rr) @@ -733,7 +734,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = find_match(rt, oif, strict, &mpri, match, do_rr); } - for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + for (rt = leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -751,17 +752,21 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, return match; } -static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) +static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, + int oif, int strict) { + struct rt6_info *leaf = fn->leaf; struct rt6_info *match, *rt0; - struct net *net; bool do_rr = false; + if (!leaf) + return net->ipv6.ip6_null_entry; + rt0 = fn->rr_ptr; if (!rt0) - fn->rr_ptr = rt0 = fn->leaf; + fn->rr_ptr = rt0 = leaf; - match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict, + match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, &do_rr); if (do_rr) { @@ -769,13 +774,12 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) - next = fn->leaf; + next = leaf; if (next != rt0) fn->rr_ptr = next; } - net = dev_net(rt0->dst.dev); return match ? 
match : net->ipv6.ip6_null_entry; } @@ -1629,7 +1633,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, oif = 0; redo_rt6_select: - rt = rt6_select(fn, oif, strict); + rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) rt = rt6_multipath_select(rt, fl6, oif, strict); if (rt == net->ipv6.ip6_null_entry) { From b721a2ca99db1d3d5b13b9ff91ef2546b60c608c Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:09 -0700 Subject: [PATCH 0308/1640] UPSTREAM: ipv6: add key length check into rt6_select() After rwlock is replaced with rcu and spinlock, fib6_lookup() could potentially return an intermediate node if other thread is doing fib6_del() on a route which is the only route on the node so that fib6_repair_tree() will be called on this node and potentially assigns fn->leaf to the its child's fn->leaf. In order to detect this situation in rt6_select(), we have to check if fn->fn_bit is consistent with the key length stored in the route. And depending on if the fn is in the subtree or not, the key is either rt->rt6i_dst or rt->rt6i_src. If any inconsistency is found, that means the node no longer holds valid routes in it. So net->ipv6.ip6_null_entry is returned. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad36ac4d2f8e..5e11153c2d8b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -758,6 +758,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, struct rt6_info *leaf = fn->leaf; struct rt6_info *match, *rt0; bool do_rr = false; + int key_plen; if (!leaf) return net->ipv6.ip6_null_entry; @@ -766,6 +767,19 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, if (!rt0) fn->rr_ptr = rt0 = leaf; + /* Double check to make sure fn is not an intermediate node + * and fn->leaf does not points to its child's leaf + * (This might happen if all routes under fn are deleted from + * the tree and fib6_repair_tree() is called on the node.) + */ + key_plen = rt0->rt6i_dst.plen; +#ifdef CONFIG_IPV6_SUBTREES + if (rt0->rt6i_src.plen) + key_plen = rt0->rt6i_src.plen; +#endif + if (fn->fn_bit != key_plen) + return net->ipv6.ip6_null_entry; + match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, &do_rr); From e549ce293c5bea51d313eb1e46f6a5fd0f5fe216 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:10 -0700 Subject: [PATCH 0309/1640] BACKPORT: ipv6: replace rwlock with rcu and spinlock in fib6_table With all the preparation work before, we are now ready to replace rwlock with rcu and spinlock in fib6_table. That means now all fib6_node in fib6_table are protected by rcu. And when freeing fib6_node, call_rcu() is used to wait for the rcu grace period before releasing the memory. When accessing fib6_node, corresponding rcu APIs need to be used. And all previous sessions protected by the write lock will now be protected by the spin lock per table. All previous sessions protected by read lock will now be protected by rcu_read_lock(). A couple of things to note here: 1. As part of the work of replacing rwlock with rcu, the linked list of fn->leaf now has to be rcu protected as well. So both fn->leaf and rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are used when manipulating them. 2. 
For fn->rr_ptr, first of all, it also needs to be rcu protected now and is tagged with __rcu and rcu APIs are used in corresponding places. Secondly, fn->rr_ptr is changed in rt6_select() which is a reader thread. This makes the issue a bit complicated. We think a valid solution for it is to let rt6_select() grab the tb6_lock if it decides to change it. As it is not in the normal operation and only happens when there is no valid neighbor cache for the route, we think the performance impact should be low. 3. fib6_walk_continue() has to be called with tb6_lock held even in the route dumping related functions, e.g. inet6_dump_fib(), fib6_tables_dump() and ipv6_route_seq_ops. It is because fib6_walk_continue() makes modifications to the walker structure, and so are fib6_repair_tree() and fib6_del_route(). In order to do proper syncing between them, we need to let fib6_walk_continue() hold the lock. We may be able to do further improvement on the way we do the tree walk to get rid of the need for holding the spin lock. But not for now. 4. When fib6_del_route() removes a route from the tree, we no longer mark rt->dst.rt6_next to NULL to make simultaneous reader be able to further traverse the list with rcu. However, rt->dst.rt6_next is only valid within this same rcu period. No one should access it later. 5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be performed before we publish this route (either by linking it to fn->leaf or insert it in the list pointed by fn->leaf) just to be safe because as soon as we publish the route, some read thread will be able to access it. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- include/net/ip6_fib.h | 24 ++- net/ipv6/addrconf.c | 11 +- net/ipv6/ip6_fib.c | 405 +++++++++++++++++++++++++----------------- net/ipv6/route.c | 111 +++++++----- 5 files changed, 328 insertions(+), 225 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 938a0f02ff1c..c9985d2867a8 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -102,7 +102,7 @@ struct dst_entry { union { struct dst_entry *next; struct rtable __rcu *rt_next; - struct rt6_info *rt6_next; + struct rt6_info __rcu *rt6_next; struct dn_route __rcu *dn_next; }; }; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 4ad5ab30a675..150b3d1b5577 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -71,18 +71,18 @@ struct fib6_config { }; struct fib6_node { - struct fib6_node *parent; - struct fib6_node *left; - struct fib6_node *right; + struct fib6_node __rcu *parent; + struct fib6_node __rcu *left; + struct fib6_node __rcu *right; #ifdef CONFIG_IPV6_SUBTREES - struct fib6_node *subtree; + struct fib6_node __rcu *subtree; #endif - struct rt6_info *leaf; + struct rt6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; - struct rt6_info *rr_ptr; + struct rt6_info __rcu *rr_ptr; struct rcu_head rcu; }; @@ -94,7 +94,7 @@ struct fib6_gc_args { #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL #else -#define FIB6_SUBTREE(fn) ((fn)->subtree) +#define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif struct mx6_config { @@ -177,6 +177,14 @@ struct rt6_info { unused:7; }; +#define for_each_fib6_node_rt_rcu(fn) \ + for (rt = rcu_dereference((fn)->leaf); rt; \ + rt = rcu_dereference(rt->dst.rt6_next)) + +#define for_each_fib6_walker_rt(w) \ + for (rt = (w)->leaf; rt; \ + rt = rcu_dereference_protected(rt->dst.rt6_next, 
1)) + static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { return ((struct rt6_info *)dst)->rt6i_idev; @@ -312,7 +320,7 @@ struct rt6_statistics { struct fib6_table { struct hlist_node tb6_hlist; u32 tb6_id; - rwlock_t tb6_lock; + spinlock_t tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 1ea850fe186e..aa112d8ab60a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2352,12 +2352,12 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true); if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt->dst.dev->ifindex != dev->ifindex) continue; if ((rt->rt6i_flags & flags) != flags) @@ -2369,7 +2369,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, break; } out: - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -5995,10 +5995,9 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) spin_lock(&ifa->lock); if (ifa->rt) { struct rt6_info *rt = ifa->rt; - struct fib6_table *table = rt->rt6i_table; int cpu; - read_lock(&table->tb6_lock); + rcu_read_lock(); addrconf_set_nopolicy(ifa->rt, val); if (rt->rt6i_pcpu) { for_each_possible_cpu(cpu) { @@ -6008,7 +6007,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) addrconf_set_nopolicy(*rtp, val); } } - read_unlock(&table->tb6_lock); + rcu_read_unlock(); } spin_unlock(&ifa->lock); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f135a6852408..022ef102324c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -54,8 +54,12 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); -static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); +static struct rt6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); +static struct fib6_node *fib6_repair_tree(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); @@ -107,12 +111,12 @@ void fib6_update_sernum(struct rt6_info *rt) struct net *net = dev_net(rt->dst.dev); struct fib6_node *fn; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&table->tb6_lock)); if (fn) fn->fn_sernum = fib6_new_sernum(net); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); } /* @@ -207,8 +211,7 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb) * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. 
*/ - rwlock_init(&tb->tb6_lock); - + spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* @@ -227,7 +230,8 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; - table->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(table->tb6_root.leaf, + net->ipv6.ip6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } @@ -324,11 +328,8 @@ unsigned int fib6_tables_seq_read(struct net *net) struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) { - read_lock_bh(&tb->tb6_lock); + hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += tb->fib_seq; - read_unlock_bh(&tb->tb6_lock); - } } rcu_read_unlock(); @@ -374,7 +375,7 @@ static int fib6_node_dump(struct fib6_walker *w) { struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) + for_each_fib6_walker_rt(w) fib6_rt_dump(rt, w->args); w->leaf = NULL; return 0; @@ -384,9 +385,9 @@ static void fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { w->root = &tb->tb6_root; - read_lock_bh(&tb->tb6_lock); + spin_lock_bh(&tb->tb6_lock); fib6_walk(net, w); - read_unlock_bh(&tb->tb6_lock); + spin_unlock_bh(&tb->tb6_lock); } /* Called with rcu_read_lock() */ @@ -423,7 +424,7 @@ static int fib6_dump_node(struct fib6_walker *w) int res; struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args); if (res < 0) { /* Frame is full, suspend walking */ @@ -482,9 +483,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->count = 0; w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = w->root->fn_sernum; @@ -499,9 +500,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, } else w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; @@ -582,11 +583,12 @@ out: * node. 
*/ -static struct fib6_node *fib6_add_1(struct fib6_node *root, - struct in6_addr *addr, int plen, - int offset, int allow_create, - int replace_required, - struct netlink_ext_ack *extack) +static struct fib6_node *fib6_add_1(struct fib6_table *table, + struct fib6_node *root, + struct in6_addr *addr, int plen, + int offset, int allow_create, + int replace_required, + struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; @@ -601,7 +603,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, fn = root; do { - key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match @@ -627,8 +631,8 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { - rt6_release(fn->leaf); - fn->leaf = NULL; + RCU_INIT_POINTER(fn->leaf, NULL); + rt6_release(leaf); } return fn; @@ -641,7 +645,11 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; - fn = dir ? fn->right : fn->left; + fn = dir ? + rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)) : + rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { @@ -672,13 +680,12 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; - - ln->parent = pn; + RCU_INIT_POINTER(ln->parent, pn); if (dir) - pn->right = ln; + rcu_assign_pointer(pn->right, ln); else - pn->left = ln; + rcu_assign_pointer(pn->left, ln); return ln; @@ -692,7 +699,8 @@ insert_above: * and the current */ - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. 
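The fib6_add_1() hunks above and below follow the standard RCU publication idiom: a new node is fully initialised while it is still private (plain stores and RCU_INIT_POINTER() are enough there), and only the final store that makes it reachable goes through rcu_assign_pointer(), whose write barrier pairs with the rcu_dereference() on the reader side. A condensed sketch of the pattern, using the kernel's real macros but invented types, not code from this patch:

struct toy_rcu_node {
        int key;
        struct toy_rcu_node __rcu *next;
};

static void toy_publish(struct toy_rcu_node __rcu **head,
                        struct toy_rcu_node *n, int key)
{
        n->key = key;                           /* still private: plain store */
        RCU_INIT_POINTER(n->next, NULL);        /* no reader can see n yet */
        rcu_assign_pointer(*head, n);           /* publish: readers now find a
                                                 * fully initialised node via
                                                 * rcu_dereference(*head)
                                                 */
}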
@@ -729,27 +737,28 @@ insert_above: in->fn_bit = bit; - in->parent = pn; + RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; - atomic_inc(&in->leaf->rt6i_ref); + atomic_inc(&rcu_dereference_protected(in->leaf, + lockdep_is_held(&table->tb6_lock))->rt6i_ref); /* update parent pointer */ if (dir) - pn->right = in; + rcu_assign_pointer(pn->right, in); else - pn->left = in; + rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; - ln->parent = in; - fn->parent = in; + RCU_INIT_POINTER(ln->parent, in); + rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { - in->right = ln; - in->left = fn; + rcu_assign_pointer(in->right, ln); + rcu_assign_pointer(in->left, fn); } else { - in->left = ln; - in->right = fn; + rcu_assign_pointer(in->left, ln); + rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ @@ -766,19 +775,19 @@ insert_above: ln->fn_bit = plen; - ln->parent = pn; - - if (dir) - pn->right = ln; - else - pn->left = ln; + RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) - ln->right = fn; + RCU_INIT_POINTER(ln->right, fn); else - ln->left = fn; + RCU_INIT_POINTER(ln->left, fn); - fn->parent = ln; + rcu_assign_pointer(fn->parent, ln); + + if (dir) + rcu_assign_pointer(pn->right, ln); + else + rcu_assign_pointer(pn->left, ln); } return ln; } @@ -824,6 +833,8 @@ static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc) static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, struct net *net) { + struct fib6_table *table = rt->rt6i_table; + if (atomic_read(&rt->rt6i_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, @@ -832,12 +843,17 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * to still alive ones. 
*/ while (fn) { - if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { - fn->leaf = fib6_find_prefix(net, fn); - atomic_inc(&fn->leaf->rt6i_ref); + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *new_leaf; + if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { + new_leaf = fib6_find_prefix(net, table, fn); + atomic_inc(&new_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_leaf); rt6_release(rt); } - fn = fn->parent; + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); } } } @@ -849,9 +865,11 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc) { + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct rt6_info *iter = NULL; - struct rt6_info **ins; - struct rt6_info **fallback_ins = NULL; + struct rt6_info __rcu **ins; + struct rt6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -866,7 +884,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &fn->leaf; - for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { + for (iter = leaf; iter; + iter = rcu_dereference_protected(iter->dst.rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock))) { /* * Search for duplicates */ @@ -929,7 +949,8 @@ next_iter: * first matching route */ ins = fallback_ins; - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); found++; } @@ -943,7 +964,7 @@ next_iter: struct rt6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ - sibling = fn->leaf; + sibling = leaf; while (sibling) { if (sibling->rt6i_metric == rt->rt6i_metric && rt6_qualify_for_ecmp(sibling)) { @@ -951,7 +972,8 @@ next_iter: &sibling->rt6i_siblings); break; } - sibling = sibling->dst.rt6_next; + sibling = rcu_dereference_protected(sibling->dst.rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } /* For each sibling in the list, increment the counter of * siblings. 
BUG() if counters does not match, list of siblings @@ -980,10 +1002,10 @@ add: if (err) return err; - rt->dst.rt6_next = iter; - *ins = rt; - rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(rt->dst.rt6_next, iter); atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, rt); if (!info->skip_notify) @@ -1009,10 +1031,10 @@ add: if (err) return err; - *ins = rt; + atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; - atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt); if (!info->skip_notify) @@ -1024,14 +1046,15 @@ add: nsiblings = iter->rt6i_nsiblings; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); - if (fn->rr_ptr == iter) + if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; rt6_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->dst.rt6_next; - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); while (iter) { if (iter->rt6i_metric > rt->rt6i_metric) break; @@ -1039,14 +1062,15 @@ add: *ins = iter->dst.rt6_next; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); - if (fn->rr_ptr == iter) + if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; } else { ins = &iter->dst.rt6_next; } - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } WARN_ON(nsiblings != 0); } @@ -1080,7 +1104,8 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt, smp_wmb(); while (fn) { fn->fn_sernum = sernum; - fn = fn->parent; + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } } @@ -1088,12 +1113,14 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt, * Add routing information to the routing tree. 
* <destination addr>/<source addr> * with source addr info in sub-trees + * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc, struct netlink_ext_ack *extack) { + struct fib6_table *table = rt->rt6i_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; @@ -1114,7 +1141,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); - fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, + fn = fib6_add_1(table, root, + &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { @@ -1129,7 +1157,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (rt->rt6i_src.plen) { struct fib6_node *sn; - if (!fn->subtree) { + if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* @@ -1147,13 +1175,14 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!sfn) goto failure; - sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + rcu_assign_pointer(sfn->leaf, + info->nl_net->ipv6.ip6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ - sn = fib6_add_1(sfn, &rt->rt6i_src.addr, + sn = fib6_add_1(table, sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1169,10 +1198,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } /* Now link new subtree to main tree */ - sfn->parent = fn; - fn->subtree = sfn; + rcu_assign_pointer(sfn->parent, fn); + rcu_assign_pointer(fn->subtree, sfn); } else { - sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, + sn = fib6_add_1(table, FIB6_SUBTREE(fn), &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1183,9 +1212,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } } - if (!fn->leaf) { - fn->leaf = rt; + if (!rcu_access_pointer(fn->leaf)) { atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(fn->leaf, rt); } fn = sn; } @@ -1204,19 +1233,23 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ - if (pn != fn && pn->leaf == rt) { - pn->leaf = NULL; + struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn != fn && pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); atomic_dec(&rt->rt6i_ref); } - if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn->leaf = fib6_find_prefix(info->nl_net, pn); + if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, pn); #if RT6_DEBUG >= 2 - if (!pn->leaf) { - WARN_ON(pn->leaf == NULL); - pn->leaf = info->nl_net->ipv6.ip6_null_entry; + if (!pn_leaf) { + WARN_ON(!pn_leaf); + pn_leaf = info->nl_net->ipv6.ip6_null_entry; } #endif - atomic_inc(&pn->leaf->rt6i_ref); + atomic_inc(&pn_leaf->rt6i_ref); + rcu_assign_pointer(pn->leaf, pn_leaf); } #endif goto failure; @@ -1231,7 +1264,7 @@ failure: * fn->leaf.
*/ if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) - fib6_repair_tree(info->nl_net, fn); + fib6_repair_tree(info->nl_net, table, fn); /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function */ @@ -1269,7 +1302,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, dir = addr_bit_set(args->addr, fn->fn_bit); - next = dir ? fn->right : fn->left; + next = dir ? rcu_dereference(fn->right) : + rcu_dereference(fn->left); if (next) { fn = next; @@ -1279,8 +1313,10 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, } while (fn) { - if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { - struct rt6_info *leaf = fn->leaf; + struct fib6_node *subtree = FIB6_SUBTREE(fn); + + if (subtree || fn->fn_flags & RTN_RTINFO) { + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) @@ -1290,10 +1326,9 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES - if (fn->subtree) { + if (subtree) { struct fib6_node *sfn; - sfn = fib6_lookup_1(fn->subtree, - args + 1); + sfn = fib6_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; @@ -1307,12 +1342,14 @@ backtrack: if (fn->fn_flags & RTN_ROOT) break; - fn = fn->parent; + fn = rcu_dereference(fn->parent); } return NULL; } +/* called with rcu_read_lock() held + */ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { @@ -1360,7 +1397,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6_info *leaf = fn->leaf; + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ @@ -1390,9 +1427,9 @@ next: * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) - fn = fn->right; + fn = rcu_dereference(fn->right); else - fn = fn->left; + fn = rcu_dereference(fn->left); } out: if (exact_match) @@ -1414,9 +1451,11 @@ struct fib6_node *fib6_locate(struct fib6_node *root, #ifdef CONFIG_IPV6_SUBTREES if (src_len) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + WARN_ON(saddr == NULL); - if (fn && fn->subtree) - fn = fib6_locate_1(fn->subtree, saddr, src_len, + if (fn && subtree) + fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct rt6_info, rt6i_src), exact_match); } @@ -1434,16 +1473,26 @@ struct fib6_node *fib6_locate(struct fib6_node *root, * */ -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) +static struct rt6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn) { + struct fib6_node *child_left, *child_right; + if (fn->fn_flags & RTN_ROOT) return net->ipv6.ip6_null_entry; while (fn) { - if (fn->left) - return fn->left->leaf; - if (fn->right) - return fn->right->leaf; + child_left = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + child_right = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + if (child_left) + return rcu_dereference_protected(child_left->leaf, + lockdep_is_held(&table->tb6_lock)); + if (child_right) + return rcu_dereference_protected(child_right->leaf, + lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } @@ -1453,31 +1502,49 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) /* * Called to trim the tree of intermediate nodes when possible. 
"fn" * is the node we want to try and remove. + * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, - struct fib6_node *fn) + struct fib6_table *table, + struct fib6_node *fn) { int children; int nstate; - struct fib6_node *child, *pn; + struct fib6_node *child; struct fib6_walker *w; int iter = 0; for (;;) { + struct fib6_node *fn_r = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *fn_l = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_r = rcu_dereference_protected(pn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_l = rcu_dereference_protected(pn->left, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *new_fn_leaf; + RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); - WARN_ON(fn->leaf); + WARN_ON(fn_leaf); children = 0; child = NULL; - if (fn->right) - child = fn->right, children |= 1; - if (fn->left) - child = fn->left, children |= 2; + if (fn_r) + child = fn_r, children |= 1; + if (fn_l) + child = fn_l, children |= 2; if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES @@ -1485,36 +1552,36 @@ static struct fib6_node *fib6_repair_tree(struct net *net, || (children && fn->fn_flags & RTN_ROOT) #endif ) { - fn->leaf = fib6_find_prefix(net, fn); + new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 - if (!fn->leaf) { - WARN_ON(!fn->leaf); - fn->leaf = net->ipv6.ip6_null_entry; + if (!new_fn_leaf) { + WARN_ON(!new_fn_leaf); + new_fn_leaf = net->ipv6.ip6_null_entry; } #endif - atomic_inc(&fn->leaf->rt6i_ref); - return fn->parent; + atomic_inc(&new_fn_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_fn_leaf); + return pn; } - pn = fn->parent; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); - FIB6_SUBTREE(pn) = NULL; + RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif - if (pn->right == fn) - pn->right = child; - else if (pn->left == fn) - pn->left = child; + if (pn_r == fn) + rcu_assign_pointer(pn->right, child); + else if (pn_l == fn) + rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) - child->parent = pn; + rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } @@ -1547,17 +1614,18 @@ static struct fib6_node *fib6_repair_tree(struct net *net, if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; - rt6_release(pn->leaf); - pn->leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + rt6_release(pn_leaf); fn = pn; } } -static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, - struct nl_info *info) +static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, + struct rt6_info __rcu **rtp, struct nl_info *info) { struct fib6_walker *w; - struct rt6_info *rt = *rtp; + struct rt6_info *rt = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; RT6_TRACE("fib6_del_route\n"); @@ -1574,7 +1642,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info 
**rtp, rt6_flush_exceptions(rt); /* Reset round-robin state, if necessary */ - if (fn->rr_ptr == rt) + if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ @@ -1593,20 +1661,19 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rt->dst.rt6_next; + w->leaf = rcu_dereference_protected(rt->dst.rt6_next, + lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); - rt->dst.rt6_next = NULL; - /* If it was last route, expunge its radix tree node */ - if (!fn->leaf) { + if (!rcu_access_pointer(fn->leaf)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; - fn = fib6_repair_tree(net, fn); + fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); @@ -1617,12 +1684,15 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, rt6_release(rt); } +/* Need to own table->tb6_lock */ int fib6_del(struct rt6_info *rt, struct nl_info *info) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); + struct fib6_table *table = rt->rt6i_table; struct net *net = info->nl_net; - struct rt6_info **rtp; + struct rt6_info __rcu **rtp; + struct rt6_info __rcu **rtp_next; #if RT6_DEBUG >= 2 if (rt->dst.obsolete > 0) { @@ -1643,11 +1713,14 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * Walk the leaf entries looking for ourself */ - for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { - if (*rtp == rt) { - fib6_del_route(fn, rtp, info); + for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { + struct rt6_info *cur = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); + if (rt == cur) { + fib6_del_route(table, fn, rtp, info); return 0; } + rtp_next = &cur->dst.rt6_next; } return -ENOENT; } @@ -1674,11 +1747,13 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. + * + * This function is called with tb6_lock held. 
*/ static int fib6_walk_continue(struct fib6_walker *w) { - struct fib6_node *fn, *pn; + struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); @@ -1698,20 +1773,22 @@ static int fib6_walk_continue(struct fib6_walker *w) w->state = FWS_L; #endif case FWS_L: - if (fn->left) { - w->node = fn->left; + left = rcu_dereference_protected(fn->left, 1); + if (left) { + w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; case FWS_R: - if (fn->right) { - w->node = fn->right; + right = rcu_dereference_protected(fn->right, 1); + if (right) { + w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; - w->leaf = fn->leaf; + w->leaf = rcu_dereference_protected(fn->leaf, 1); case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; @@ -1733,7 +1810,9 @@ skip: case FWS_U: if (fn == w->root) return 0; - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, 1); + left = rcu_dereference_protected(pn->left, 1); + right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { @@ -1742,13 +1821,13 @@ skip: continue; } #endif - if (pn->left == fn) { + if (left == fn) { w->state = FWS_R; continue; } - if (pn->right == fn) { + if (right == fn) { w->state = FWS_C; - w->leaf = w->node->leaf; + w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 @@ -1791,7 +1870,7 @@ static int fib6_clean_node(struct fib6_walker *w) return 0; } - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res < 0) { w->leaf = rt; @@ -1851,10 +1930,10 @@ static void __fib6_clean_all(struct net *net, for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); @@ -1968,7 +2047,8 @@ static int __net_init fib6_net_init(struct net *net) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; - net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, + net->ipv6.ip6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); @@ -1979,7 +2059,8 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; - net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, + net->ipv6.ip6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); @@ -2109,7 +2190,9 @@ static int ipv6_route_yield(struct fib6_walker *w) return 1; do { - iter->w.leaf = iter->w.leaf->dst.rt6_next; + iter->w.leaf = rcu_dereference_protected( + iter->w.leaf->dst.rt6_next, + lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; @@ -2174,7 +2257,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = ((struct rt6_info *)v)->dst.rt6_next; + n = rcu_dereference(((struct rt6_info 
*)v)->dst.rt6_next); if (n) { ++*pos; return n; @@ -2182,9 +2265,9 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) iter_table: ipv6_route_check_sernum(iter); - read_lock(&iter->tbl->tb6_lock); + spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); - read_unlock(&iter->tbl->tb6_lock); + spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { if (v) ++*pos; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5e11153c2d8b..883026e33b74 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -491,7 +491,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, } /* - * Route lookup. Any table->tb6_lock is implied. + * Route lookup. rcu_read_lock() should be held. */ static inline struct rt6_info *rt6_device_match(struct net *net, @@ -506,7 +506,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (!oif && ipv6_addr_any(saddr)) goto out; - for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) { + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) { struct net_device *dev = sprt->dst.dev; if (oif) { @@ -725,7 +725,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rt->dst.rt6_next) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -734,7 +734,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = find_match(rt, oif, strict, &mpri, match, do_rr); } - for (rt = leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + for (rt = leaf; rt && rt != rr_head; + rt = rcu_dereference(rt->dst.rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -746,7 +747,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, if (match || !cont) return match; - for (rt = cont; rt; rt = rt->dst.rt6_next) + for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -755,7 +756,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, int oif, int strict) { - struct rt6_info *leaf = fn->leaf; + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6_info *match, *rt0; bool do_rr = false; int key_plen; @@ -763,9 +764,9 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, if (!leaf) return net->ipv6.ip6_null_entry; - rt0 = fn->rr_ptr; + rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) - fn->rr_ptr = rt0 = leaf; + rt0 = leaf; /* Double check to make sure fn is not an intermediate node * and fn->leaf does not points to its child's leaf @@ -784,14 +785,19 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, &do_rr); if (do_rr) { - struct rt6_info *next = rt0->dst.rt6_next; + struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) next = leaf; - if (next != rt0) - fn->rr_ptr = next; + if (next != rt0) { + spin_lock_bh(&leaf->rt6i_table->tb6_lock); + /* make sure next is not being deleted from the tree */ + if (next->rt6i_node) + rcu_assign_pointer(fn->rr_ptr, next); + spin_unlock_bh(&leaf->rt6i_table->tb6_lock); + } } return match ? 
match : net->ipv6.ip6_null_entry; @@ -881,13 +887,14 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { - struct fib6_node *pn; + struct fib6_node *pn, *sn; while (1) { if (fn->fn_flags & RTN_TL_ROOT) return NULL; - pn = fn->parent; - if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) - fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); + pn = rcu_dereference(fn->parent); + sn = FIB6_SUBTREE(pn); + if (sn && sn != fn) + fn = fib6_lookup(sn, NULL, saddr); else fn = pn; if (fn->fn_flags & RTN_RTINFO) @@ -922,13 +929,19 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) flags &= ~RT6_LOOKUP_F_IFACE; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - rt = fn->leaf; - rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); - if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags); + rt = rcu_dereference(fn->leaf); + if (!rt) { + rt = net->ipv6.ip6_null_entry; + } else { + rt = rt6_device_match(net, rt, &fl6->saddr, + fl6->flowi6_oif, flags); + if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) + rt = rt6_multipath_select(rt, fl6, + fl6->flowi6_oif, flags); + } if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -942,7 +955,7 @@ restart: if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); @@ -996,9 +1009,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, struct fib6_table *table; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info, mxc, extack); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); return err; } @@ -1096,7 +1109,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) return pcpu_rt; } -/* It should be called with read_lock_bh(&tb6_lock) acquired */ +/* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) { struct rt6_info *pcpu_rt, **p; @@ -1638,7 +1651,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; @@ -1668,7 +1681,7 @@ redo_rt6_select: rt = rt_cache; if (rt == net->ipv6.ip6_null_entry) { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); dst_hold(&rt->dst); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; @@ -1677,7 +1690,7 @@ redo_rt6_select: dst_use_noref(&rt->dst, jiffies); rt6_dst_from_metrics_check(rt); } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && @@ -1693,11 +1706,11 @@ redo_rt6_select: if (ip6_hold_safe(net, &rt, true)) { dst_use_noref(&rt->dst, jiffies); } else { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); uncached_rt = rt; goto uncached_rt_out; } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); dst_release(&rt->dst); @@ -1725,7 +1738,7 @@ 
uncached_rt_out: pcpu_rt = rt6_get_pcpu_route(rt); if (pcpu_rt) { - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); } else { /* atomic_inc_not_zero() is needed when using rcu */ if (atomic_inc_not_zero(&rt->rt6i_ref)) { @@ -1736,12 +1749,12 @@ uncached_rt_out: * No dst_hold() on rt is needed because grabbing * rt->rt6i_ref makes sure rt can't be released. */ - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); pcpu_rt = rt6_make_pcpu_route(rt); rt6_release(rt); } else { /* rt is already removed from tree */ - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); pcpu_rt = net->ipv6.ip6_null_entry; dst_hold(&pcpu_rt->dst); } @@ -2150,10 +2163,10 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, * routes. */ - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt6_check_expired(rt)) continue; if (rt->dst.error) @@ -2198,7 +2211,7 @@ restart: out: ip6_hold_safe(net, &rt, true); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; @@ -2800,9 +2813,9 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) } table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); out: ip6_rt_put(rt); @@ -2828,7 +2841,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) if (rt == net->ipv6.ip6_null_entry) goto out_put; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { struct rt6_info *sibling, *next_sibling; @@ -2858,7 +2871,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) err = fib6_del(rt, info); out_unlock: - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); out_put: ip6_rt_put(rt); @@ -2883,7 +2896,7 @@ static int ip6_route_del(struct fib6_config *cfg, return err; } - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, @@ -2891,7 +2904,7 @@ static int ip6_route_del(struct fib6_config *cfg, !(cfg->fc_flags & RTF_CACHE)); if (fn) { - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (cfg->fc_flags & RTF_CACHE) { rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, &cfg->fc_src); @@ -2912,7 +2925,7 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (!dst_hold_safe(&rt->dst)) break; - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) @@ -2921,7 +2934,7 @@ static int ip6_route_del(struct fib6_config *cfg, return __ip6_del_rt_siblings(rt, cfg); } } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return err; } @@ -3095,12 +3108,12 @@ static struct rt6_info *rt6_get_route_info(struct net *net, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt->dst.dev->ifindex != dev->ifindex) continue; if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) @@ -3111,7 +3124,7 @@ static struct rt6_info 
*rt6_get_route_info(struct net *net, break; } out: - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -3157,8 +3170,8 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev if (!table) return NULL; - read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + rcu_read_lock(); + for_each_fib6_node_rt_rcu(&table->tb6_root) { if (dev == rt->dst.dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->rt6i_gateway, addr)) @@ -3166,7 +3179,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev } if (rt) ip6_hold_safe(NULL, &rt, false); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } From d797325016ac7fbfd26dc25cea40ba5822a38a28 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:11 -0700 Subject: [PATCH 0310/1640] UPSTREAM: ipv6: take care of rt6_stats Currently, most of the rt6_stats are not hooked up correctly. As the last part of this patch series, hook up all existing rt6_stats and add one new stat fib_rt_uncache to indicate the number of routes in the uncached list. For details of the stats, please refer to the comments added in include/net/ip6_fib.h. Note: fib_rt_alloc and fib_rt_uncache are not guaranteed to be modified under a lock. So atomic_t is used for them. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 15 +++++++++------ net/ipv6/ip6_fib.c | 42 ++++++++++++++++++++++++------------------ net/ipv6/route.c | 16 ++++++++++++++-- 3 files changed, 47 insertions(+), 26 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 150b3d1b5577..e56cc1a2fe9f 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -299,12 +299,15 @@ struct fib6_walker { }; struct rt6_statistics { - __u32 fib_nodes; - __u32 fib_route_nodes; - __u32 fib_rt_alloc; /* permanent routes */ - __u32 fib_rt_entries; /* rt entries in table */ - __u32 fib_rt_cache; /* cache routes */ - __u32 fib_discarded_routes; + __u32 fib_nodes; /* all fib6 nodes */ + __u32 fib_route_nodes; /* intermediate nodes */ + __u32 fib_rt_entries; /* rt entries in fib table */ + __u32 fib_rt_cache; /* cached rt entries in exception table */ + __u32 fib_discarded_routes; /* total number of routes delete */ + + /* The following stats are not protected by any lock */ + atomic_t fib_rt_alloc; /* total number of routes alloced */ + atomic_t fib_rt_uncache; /* rt entries in uncached list */ }; #define RTN_TL_ROOT 0x0001 diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 022ef102324c..3d65fd054697 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -149,18 +149,21 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -static struct fib6_node *node_alloc(void) +static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); + if (fn) + net->ipv6.rt6_stats->fib_nodes++; return fn; } -static void node_free_immediate(struct fib6_node *fn) +static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); + net->ipv6.rt6_stats->fib_nodes--; } static void node_free_rcu(struct rcu_head *head) @@ -170,9 +173,10 @@ static void node_free_rcu(struct rcu_head *head) kmem_cache_free(fib6_node_kmem, fn); } -static void node_free(struct fib6_node *fn) +static void node_free(struct 
net *net, struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); + net->ipv6.rt6_stats->fib_nodes--; } void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) @@ -583,7 +587,8 @@ out: * node. */ -static struct fib6_node *fib6_add_1(struct fib6_table *table, +static struct fib6_node *fib6_add_1(struct net *net, + struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, @@ -675,7 +680,7 @@ static struct fib6_node *fib6_add_1(struct fib6_table *table, * Create new leaf node without children. */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); @@ -716,14 +721,14 @@ insert_above: * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { - in = node_alloc(); - ln = node_alloc(); + in = node_alloc(net); + ln = node_alloc(net); if (!in || !ln) { if (in) - node_free_immediate(in); + node_free_immediate(net, in); if (ln) - node_free_immediate(ln); + node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } @@ -768,7 +773,7 @@ insert_above: * (old node)[fn] NULL */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); @@ -1066,6 +1071,7 @@ add: fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->dst.rt6_next; } @@ -1141,7 +1147,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); - fn = fib6_add_1(table, root, + fn = fib6_add_1(info->nl_net, table, root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, replace_required, extack); @@ -1171,7 +1177,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, */ /* Create subtree root node */ - sfn = node_alloc(); + sfn = node_alloc(info->nl_net); if (!sfn) goto failure; @@ -1182,8 +1188,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, /* Now add the first leaf node to new subtree */ - sn = fib6_add_1(table, sfn, &rt->rt6i_src.addr, - rt->rt6i_src.plen, + sn = fib6_add_1(info->nl_net, table, sfn, + &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1192,7 +1198,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, root, and then (in failure) stale node in main tree. 
*/ - node_free_immediate(sfn); + node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } @@ -1201,8 +1207,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { - sn = fib6_add_1(table, FIB6_SUBTREE(fn), &rt->rt6i_src.addr, - rt->rt6i_src.plen, + sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), + &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, extack); @@ -1610,7 +1616,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } read_unlock(&net->ipv6.fib6_walker_lock); - node_free(fn); + node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 883026e33b74..a1c0d7f105b1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -145,9 +145,11 @@ static void rt6_uncached_list_del(struct rt6_info *rt) { if (!list_empty(&rt->rt6i_uncached)) { struct uncached_list *ul = rt->rt6i_uncached_list; + struct net *net = dev_net(rt->dst.dev); spin_lock_bh(&ul->lock); list_del(&rt->rt6i_uncached); + atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); spin_unlock_bh(&ul->lock); } } @@ -362,8 +364,10 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, flags); - if (rt) + if (rt) { rt6_info_init(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); + } return rt; } @@ -1162,6 +1166,8 @@ static DEFINE_SPINLOCK(rt6_exception_lock); static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { + struct net *net = dev_net(rt6_ex->rt6i->dst.dev); + if (!bucket || !rt6_ex) return; rt6_ex->rt6i->rt6i_node = NULL; @@ -1170,6 +1176,7 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; + net->ipv6.rt6_stats->fib_rt_cache--; } /* Remove oldest rt6_ex in bucket and free the memory @@ -1276,6 +1283,7 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_info *ort) { + struct net *net = dev_net(ort->dst.dev); struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -1345,6 +1353,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, nrt->rt6i_node = ort->rt6i_node; hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); bucket->depth++; + net->ipv6.rt6_stats->fib_rt_cache++; if (bucket->depth > FIB6_MAX_DEPTH) rt6_exception_remove_oldest(bucket); @@ -1720,6 +1729,7 @@ redo_rt6_select: * No need for another dst_hold() */ rt6_uncached_list_add(uncached_rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); } else { uncached_rt = net->ipv6.ip6_null_entry; dst_hold(&uncached_rt->dst); @@ -1905,6 +1915,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); new = &rt->dst; new->__use = 1; @@ -2361,6 +2372,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, * do proper release of the net_device */ rt6_uncached_list_add(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); @@ -4454,7 +4466,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 
net->ipv6.rt6_stats->fib_nodes, net->ipv6.rt6_stats->fib_route_nodes, - net->ipv6.rt6_stats->fib_rt_alloc, + atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), net->ipv6.rt6_stats->fib_rt_entries, net->ipv6.rt6_stats->fib_rt_cache, dst_entries_get_slow(&net->ipv6.ip6_dst_ops), From 7aa4ed98006cafa887490c07fdbc85763dadba46 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:39:59 -0500 Subject: [PATCH 0311/1640] UPSTREAM: net: dst->rt_next is unused. Delete it. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- include/net/dst.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/dst.h b/include/net/dst.h index c9985d2867a8..3832c39a2527 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -101,7 +101,6 @@ struct dst_entry { struct lwtunnel_state *lwtstate; union { struct dst_entry *next; - struct rtable __rcu *rt_next; struct rt6_info __rcu *rt6_next; struct dn_route __rcu *dn_next; }; From 0f5015ba834dea052047c09a974332ed038c54b6 Mon Sep 17 00:00:00 2001 From: Tim Zimmermann Date: Thu, 31 Jul 2025 09:01:46 +0000 Subject: [PATCH 0312/1640] UPSTREAM: net: Remove unused dst->dn_next --- include/net/dst.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/dst.h b/include/net/dst.h index 3832c39a2527..1c92f024f17b 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -102,7 +102,6 @@ struct dst_entry { union { struct dst_entry *next; struct rt6_info __rcu *rt6_next; - struct dn_route __rcu *dn_next; }; }; From b5745075aedf01ce1e40d962e8c1fcc63dd9db8e Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 9 Oct 2017 17:17:26 -0700 Subject: [PATCH 0313/1640] UPSTREAM: ipv6: use rcu_dereference_bh() in ipv6_route_seq_next() This patch replaces rcu_dereference() with rcu_dereference_bh() in ipv6_route_seq_next() to avoid the following warning: [ 19.431685] WARNING: suspicious RCU usage [ 19.433451] 4.14.0-rc3-00914-g66f5d6c #118 Not tainted [ 19.435509] ----------------------------- [ 19.437267] net/ipv6/ip6_fib.c:2259 suspicious rcu_dereference_check() usage! [ 19.440790] [ 19.440790] other info that might help us debug this: [ 19.440790] [ 19.444734] [ 19.444734] rcu_scheduler_active = 2, debug_locks = 1 [ 19.447757] 2 locks held by odhcpd/3720: [ 19.449480] #0: (&p->lock){+.+.}, at: [] seq_read+0x3c/0x333 [ 19.452720] #1: (rcu_read_lock_bh){....}, at: [] ipv6_route_seq_start+0x5/0xfd [ 19.456323] [ 19.456323] stack backtrace: [ 19.458812] CPU: 0 PID: 3720 Comm: odhcpd Not tainted 4.14.0-rc3-00914-g66f5d6c #118 [ 19.462042] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 19.465414] Call Trace: [ 19.466788] dump_stack+0x86/0xc0 [ 19.468358] lockdep_rcu_suspicious+0xea/0xf3 [ 19.470183] ipv6_route_seq_next+0x71/0x164 [ 19.471963] seq_read+0x244/0x333 [ 19.473522] proc_reg_read+0x48/0x67 [ 19.475152] ? proc_reg_write+0x67/0x67 [ 19.476862] __vfs_read+0x26/0x10b [ 19.478463] ?
__might_fault+0x37/0x84 [ 19.480148] vfs_read+0xba/0x146 [ 19.481690] SyS_read+0x51/0x8e [ 19.483197] do_int80_syscall_32+0x66/0x15a [ 19.484969] entry_INT80_compat+0x32/0x50 [ 19.486707] RIP: 0023:0xf7f0be8e [ 19.488244] RSP: 002b:00000000ffa75d04 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 [ 19.491431] RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 0000000008056068 [ 19.493886] RDX: 0000000000001000 RSI: 0000000008056008 RDI: 0000000000001000 [ 19.496331] RBP: 00000000000001ff R08: 0000000000000000 R09: 0000000000000000 [ 19.498768] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 19.501217] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Reported-by: Xiaolong Ye Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 3d65fd054697..221dfd6fe9b7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2263,7 +2263,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = rcu_dereference(((struct rt6_info *)v)->dst.rt6_next); + n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next); if (n) { ++*pos; return n; From 506ce184bad58cd14e52ccff21a44e080669fd84 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:15 -0500 Subject: [PATCH 0314/1640] UPSTREAM: ipv6: Move rt6_next from dst_entry into ipv6 route structure. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- include/net/dst.h | 1 - include/net/ip6_fib.h | 5 +++-- net/ipv6/ip6_fib.c | 26 +++++++++++++------------- net/ipv6/route.c | 10 +++++----- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 1c92f024f17b..cf3c67e8aabf 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -101,7 +101,6 @@ struct dst_entry { struct lwtunnel_state *lwtstate; union { struct dst_entry *next; - struct rt6_info __rcu *rt6_next; }; }; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index e56cc1a2fe9f..1bc52fea49f7 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -132,6 +132,7 @@ struct rt6_exception { struct rt6_info { struct dst_entry dst; + struct rt6_info __rcu *rt6_next; /* * Tail elements of dst_entry (__refcnt etc.) 
@@ -179,11 +180,11 @@ struct rt6_info { #define for_each_fib6_node_rt_rcu(fn) \ for (rt = rcu_dereference((fn)->leaf); rt; \ - rt = rcu_dereference(rt->dst.rt6_next)) + rt = rcu_dereference(rt->rt6_next)) #define for_each_fib6_walker_rt(w) \ for (rt = (w)->leaf; rt; \ - rt = rcu_dereference_protected(rt->dst.rt6_next, 1)) + rt = rcu_dereference_protected(rt->rt6_next, 1)) static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 221dfd6fe9b7..b9d434d06857 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -890,7 +890,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &fn->leaf; for (iter = leaf; iter; - iter = rcu_dereference_protected(iter->dst.rt6_next, + iter = rcu_dereference_protected(iter->rt6_next, lockdep_is_held(&rt->rt6i_table->tb6_lock))) { /* * Search for duplicates @@ -946,7 +946,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, break; next_iter: - ins = &iter->dst.rt6_next; + ins = &iter->rt6_next; } if (fallback_ins && !found) { @@ -977,7 +977,7 @@ next_iter: &sibling->rt6i_siblings); break; } - sibling = rcu_dereference_protected(sibling->dst.rt6_next, + sibling = rcu_dereference_protected(sibling->rt6_next, lockdep_is_held(&rt->rt6i_table->tb6_lock)); } /* For each sibling in the list, increment the counter of @@ -1007,7 +1007,7 @@ add: if (err) return err; - rcu_assign_pointer(rt->dst.rt6_next, iter); + rcu_assign_pointer(rt->rt6_next, iter); atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rcu_assign_pointer(*ins, rt); @@ -1038,7 +1038,7 @@ add: atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); - rt->dst.rt6_next = iter->dst.rt6_next; + rt->rt6_next = iter->rt6_next; rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt); @@ -1057,14 +1057,14 @@ add: if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ - ins = &rt->dst.rt6_next; + ins = &rt->rt6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->rt6i_table->tb6_lock)); while (iter) { if (iter->rt6i_metric > rt->rt6i_metric) break; if (rt6_qualify_for_ecmp(iter)) { - *ins = iter->dst.rt6_next; + *ins = iter->rt6_next; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) @@ -1073,7 +1073,7 @@ add: nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { - ins = &iter->dst.rt6_next; + ins = &iter->rt6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->rt6i_table->tb6_lock)); @@ -1639,7 +1639,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); /* Unlink it */ - *rtp = rt->dst.rt6_next; + *rtp = rt->rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; @@ -1667,7 +1667,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rcu_dereference_protected(rt->dst.rt6_next, + w->leaf = rcu_dereference_protected(rt->rt6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; @@ -1726,7 +1726,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) fib6_del_route(table, fn, rtp, info); return 0; } - rtp_next = &cur->dst.rt6_next; + rtp_next = &cur->rt6_next; } return -ENOENT; } @@ -2197,7 +2197,7 @@ static int 
ipv6_route_yield(struct fib6_walker *w) do { iter->w.leaf = rcu_dereference_protected( - iter->w.leaf->dst.rt6_next, + iter->w.leaf->rt6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) @@ -2263,7 +2263,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next); + n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next); if (n) { ++*pos; return n; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a1c0d7f105b1..31aaf44f88e7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -510,7 +510,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (!oif && ipv6_addr_any(saddr)) goto out; - for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) { + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { struct net_device *dev = sprt->dst.dev; if (oif) { @@ -729,7 +729,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -739,7 +739,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, } for (rt = leaf; rt && rt != rr_head; - rt = rcu_dereference(rt->dst.rt6_next)) { + rt = rcu_dereference(rt->rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -751,7 +751,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, if (match || !cont) return match; - for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next)) + for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -789,7 +789,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, &do_rr); if (do_rr) { - struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next); + struct rt6_info *next = rcu_dereference(rt0->rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) From 84a5a35c08969fa931a019a79a1a74cdd85fd334 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:22 -0500 Subject: [PATCH 0315/1640] BACKPORT: net: Create and use new helper xfrm_dst_child(). Only IPSEC routes have a non-NULL dst->child pointer. And IPSEC routes are identified by a non-NULL dst->xfrm pointer. Signed-off-by: David S. 
Miller --- include/net/xfrm.h | 9 +++++++++ net/core/dst.c | 8 +++++--- net/ipv4/xfrm4_mode_tunnel.c | 2 +- net/ipv6/xfrm6_mode_tunnel.c | 2 +- net/ipv6/xfrm6_policy.c | 2 +- net/xfrm/xfrm_output.c | 2 +- net/xfrm/xfrm_policy.c | 12 ++++++------ security/selinux/xfrm.c | 2 +- 8 files changed, 25 insertions(+), 14 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 3137313b58fd..a244257dcc38 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1005,6 +1005,15 @@ struct xfrm_dst { u32 path_cookie; }; +static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) +{ +#ifdef CONFIG_XFRM + if (dst->xfrm) + return dst->child; +#endif + return NULL; +} + #ifdef CONFIG_XFRM static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) { diff --git a/net/core/dst.c b/net/core/dst.c index 53a5f40b16ea..da058e603309 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -116,12 +116,14 @@ EXPORT_SYMBOL(dst_alloc); struct dst_entry *dst_destroy(struct dst_entry * dst) { - struct dst_entry *child; + struct dst_entry *child = NULL; smp_rmb(); - child = dst->child; - +#ifdef CONFIG_XFRM + if (dst->xfrm) + child = dst->child; +#endif if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index e6265e2c274e..7d885a44dc9d 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -62,7 +62,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - top_iph->ttl = ip4_dst_hoplimit(dst->child); + top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 02556e356f87..e66b94f46532 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -59,7 +59,7 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) if (x->props.flags & XFRM_STATE_NOECN) dsfield &= ~INET_ECN_MASK; ipv6_change_dsfield(top_iph, 0, dsfield); - top_iph->hop_limit = ip6_dst_hoplimit(dst->child); + top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst)); top_iph->saddr = *(struct in6_addr *)&x->props.saddr; top_iph->daddr = *(struct in6_addr *)&x->id.daddr; return 0; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 3e4aa4def9f8..df717f9cbc02 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -271,7 +271,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, in6_dev_put(xdst->u.rt6.rt6i_idev); xdst->u.rt6.rt6i_idev = loopback_idev; in6_dev_hold(loopback_idev); - xdst = (struct xfrm_dst *)xdst->u.dst.child; + xdst = (struct xfrm_dst *)xfrm_dst_child(&xdst->u.dst); } while (xdst->u.dst.xfrm); __in6_dev_put(loopback_idev); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 00ff2a1c5e5f..c46162887b94 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -44,7 +44,7 @@ static int xfrm_skb_check_space(struct sk_buff *skb) static struct dst_entry *skb_dst_pop(struct sk_buff *skb) { - struct dst_entry *child = dst_clone(skb_dst(skb)->child); + struct dst_entry *child = dst_clone(xfrm_dst_child(skb_dst(skb))); skb_dst_drop(skb); return child; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4ab3d5484839..2a599bee7e59 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ 
-1658,7 +1658,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len); xfrm_init_pmtu(dst_prev); - for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) { + for (dst_prev = dst0; dst_prev != dst; dst_prev = xfrm_dst_child(dst_prev)) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev; err = xfrm_fill_dst(xdst, dev, fl); @@ -2539,7 +2539,7 @@ static int stale_bundle(struct dst_entry *dst) void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { - while ((dst = dst->child) && dst->xfrm && dst->dev == dev) { + while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { dst->dev = dev_net(dev)->loopback_dev; dev_hold(dst->dev); dev_put(dev); @@ -2564,7 +2564,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst) struct xfrm_dst *xdst = (struct xfrm_dst *)dst; u32 pmtu, route_mtu_cached; - pmtu = dst_mtu(dst->child); + pmtu = dst_mtu(xfrm_dst_child(dst)); xdst->child_mtu_cached = pmtu; pmtu = xfrm_state_mtu(dst->xfrm, pmtu); @@ -2609,7 +2609,7 @@ static int xfrm_bundle_ok(struct xfrm_dst *first) xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0; - mtu = dst_mtu(dst->child); + mtu = dst_mtu(xfrm_dst_child(dst)); if (xdst->child_mtu_cached != mtu) { last = xdst; xdst->child_mtu_cached = mtu; @@ -2623,7 +2623,7 @@ static int xfrm_bundle_ok(struct xfrm_dst *first) xdst->route_mtu_cached = mtu; } - dst = dst->child; + dst = xfrm_dst_child(dst); } while (dst->xfrm); if (likely(!last)) @@ -2665,7 +2665,7 @@ static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, { const struct dst_entry *path = dst->path; - for (; dst != path; dst = dst->child) { + for (; dst != path; dst = xfrm_dst_child(dst)) { const struct xfrm_state *xfrm = dst->xfrm; if (xfrm->props.mode == XFRM_MODE_TRANSPORT) diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 8768e6b0226f..9e803d2a687a 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -459,7 +459,7 @@ int selinux_xfrm_postroute_last(u32 sk_sid, struct sk_buff *skb, if (dst) { struct dst_entry *iter; - for (iter = dst; iter != NULL; iter = iter->child) { + for (iter = dst; iter != NULL; iter = xfrm_dst_child(iter)) { struct xfrm_state *x = iter->xfrm; if (x && selinux_authorizable_xfrm(x)) From a3b4e7fda26d8b63be54b9cbab4bc47f3799b8b6 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:28 -0500 Subject: [PATCH 0316/1640] UPSTREAM: ipsec: Create and use new helpers for dst child access. This will make a future change moving the dst->child pointer less invasive. Signed-off-by: David S. 
Miller Reviewed-by: Eric Dumazet --- include/net/xfrm.h | 5 +++++ net/xfrm/xfrm_policy.c | 47 +++++++++++++++++++++--------------------- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a244257dcc38..d5a34d700b75 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1015,6 +1015,11 @@ static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) } #ifdef CONFIG_XFRM +static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child) +{ + xdst->u.dst.child = child; +} + static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) { xfrm_pols_put(xdst->pols, xdst->num_pols); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 2a599bee7e59..e2ef586af4af 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1564,8 +1564,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, unsigned long now = jiffies; struct net_device *dev; struct xfrm_mode *inner_mode; - struct dst_entry *dst_prev = NULL; - struct dst_entry *dst0 = NULL; + struct xfrm_dst *xdst_prev = NULL; + struct xfrm_dst *xdst0 = NULL; int i = 0; int err; int header_len = 0; @@ -1591,13 +1591,13 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, goto put_states; } - if (!dst_prev) - dst0 = dst1; + if (!xdst_prev) + xdst0 = xdst; else /* Ref count is taken during xfrm_alloc_dst() * No need to do dst_clone() on dst1 */ - dst_prev->child = dst1; + xfrm_dst_set_child(xdst_prev, &xdst->u.dst); if (xfrm[i]->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(xfrm[i], @@ -1638,8 +1638,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->input = dst_discard; dst1->output = inner_mode->afinfo->output; - dst1->next = dst_prev; - dst_prev = dst1; + dst1->next = &xdst_prev->u.dst; + xdst_prev = xdst; header_len += xfrm[i]->props.header_len; if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT) @@ -1647,40 +1647,39 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, trailer_len += xfrm[i]->props.trailer_len; } - dst_prev->child = dst; - dst0->path = dst; + xfrm_dst_set_child(xdst_prev, dst); + xdst0->u.dst.path = dst; err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; - xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len); - xfrm_init_pmtu(dst_prev); + xfrm_init_path(xdst0, dst, nfheader_len); + xfrm_init_pmtu(&xdst_prev->u.dst); - for (dst_prev = dst0; dst_prev != dst; dst_prev = xfrm_dst_child(dst_prev)) { - struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev; - - err = xfrm_fill_dst(xdst, dev, fl); + for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst; + xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) { + err = xfrm_fill_dst(xdst_prev, dev, fl); if (err) goto free_dst; - dst_prev->header_len = header_len; - dst_prev->trailer_len = trailer_len; - header_len -= xdst->u.dst.xfrm->props.header_len; - trailer_len -= xdst->u.dst.xfrm->props.trailer_len; + xdst_prev->u.dst.header_len = header_len; + xdst_prev->u.dst.trailer_len = trailer_len; + header_len -= xdst_prev->u.dst.xfrm->props.header_len; + trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len; } out: - return dst0; + return &xdst0->u.dst; put_states: for (; i < nx; i++) xfrm_state_put(xfrm[i]); free_dst: - if (dst0) - dst_release_immediate(dst0); - dst0 = ERR_PTR(err); + if (xdst0) + dst_release_immediate(&xdst0->u.dst); + xdst0 = ERR_PTR(err); goto out; } @@ -1923,7 +1922,7 @@ static struct xfrm_dst 
*xfrm_create_dummy_bundle(struct net *net, dst1->output = xdst_queue_output; dst_hold(dst); - dst1->child = dst; + xfrm_dst_set_child(xdst, dst); dst1->path = dst; xfrm_init_path((struct xfrm_dst *)dst1, dst, 0); From 92a317ff71b508648cba1cea99935bb2bc1536b9 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:45:44 -0500 Subject: [PATCH 0317/1640] UPSTREAM: xfrm: Move child route linkage into xfrm_dst. XFRM bundle child chains look like this: xdst1 --> xdst2 --> xdst3 --> path_dst All of xdstN are xfrm_dst objects and xdst->u.dst.xfrm is non-NULL. The final child pointer in the chain, here called 'path_dst', is some other kind of route such as an ipv4 or ipv6 one. The xfrm output path pops routes, one at a time, via the child pointer, until we hit one which has a dst->xfrm pointer which is NULL. We can easily preserve the above mechanisms with child sitting only in the xfrm_dst structure. All children in the chain before we break out of the xfrm_output() loop have dst->xfrm non-NULL and are therefore xfrm_dst objects. Since we break out of the loop when we find dst->xfrm NULL, we will not try to dereference 'dst' as if it were an xfrm_dst. Signed-off-by: David S. Miller --- include/net/dst.h | 3 +-- include/net/xfrm.h | 15 ++++++++++----- net/core/dst.c | 9 ++++++--- net/core/pktgen.c | 12 ++++++------ net/netfilter/xt_policy.c | 3 ++- net/xfrm/xfrm_device.c | 2 +- 6 files changed, 26 insertions(+), 18 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index cf3c67e8aabf..5c8717ec08e7 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -35,7 +35,6 @@ struct sk_buff; struct dst_entry { struct net_device *dev; struct rcu_head rcu_head; - struct dst_entry *child; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; @@ -89,7 +88,7 @@ struct dst_entry { * Align __refcnt to a 64 bytes alignment * (L1_CACHE_SIZE would be too much) */ - long __pad_to_align_refcnt[2]; + long __pad_to_align_refcnt[3]; #endif /* * __refcnt wants to be on a different cache line from diff --git a/include/net/xfrm.h b/include/net/xfrm.h index d5a34d700b75..d8281b2287ff 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -979,7 +979,7 @@ static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_c /* A struct encoding bundle of transformations to apply to some set of flow. * - * dst->child points to the next element of bundle. + * xdst->child points to the next element of bundle. * dst->xfrm points to an instanse of transformer. 
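 *
 * (Illustrative sketch, not part of the patch: a hypothetical walk over
 * a bundle using the helper above; handle_xdst() is a placeholder name.
 * The loop never casts a plain route to xfrm_dst, because it stops at
 * the first entry whose dst->xfrm is NULL:
 *
 *	struct dst_entry *d;
 *
 *	for (d = dst; d && d->xfrm; d = xfrm_dst_child(d))
 *		handle_xdst((struct xfrm_dst *)d);
 * )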
* * Due to unfortunate limitations of current routing cache, which we @@ -995,6 +995,7 @@ struct xfrm_dst { struct rt6_info rt6; } u; struct dst_entry *route; + struct dst_entry *child; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols, num_xfrms; u32 xfrm_genid; @@ -1008,8 +1009,10 @@ struct xfrm_dst { static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) { #ifdef CONFIG_XFRM - if (dst->xfrm) - return dst->child; + if (dst->xfrm) { + struct xfrm_dst *xdst = (struct xfrm_dst *) dst; + return xdst->child; + } #endif return NULL; } @@ -1017,7 +1020,7 @@ static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) #ifdef CONFIG_XFRM static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child) { - xdst->u.dst.child = child; + xdst->child = child; } static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) @@ -1928,12 +1931,14 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { struct xfrm_state *x = dst->xfrm; + struct xfrm_dst *xdst; if (!x || !x->type_offload) return false; + xdst = (struct xfrm_dst *) dst; if (x->xso.offload_handle && (x->xso.dev == dst->path->dev) && - !dst->child->xfrm) + !xdst->child->xfrm) return true; return false; diff --git a/net/core/dst.c b/net/core/dst.c index da058e603309..b8773e7b1ec6 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -62,7 +63,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) { - dst->child = NULL; dst->dev = dev; if (dev) dev_hold(dev); @@ -121,8 +121,11 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) smp_rmb(); #ifdef CONFIG_XFRM - if (dst->xfrm) - child = dst->child; + if (dst->xfrm) { + struct xfrm_dst *xdst = (struct xfrm_dst *) dst; + + child = xdst->child; + } #endif if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index a1a8afd64846..835ef82eeb45 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -399,7 +399,7 @@ struct pktgen_dev { __u8 ipsmode; /* IPSEC mode (config) */ __u8 ipsproto; /* IPSEC type (config) */ __u32 spi; - struct dst_entry dst; + struct xfrm_dst xdst; struct dst_ops dstops; #endif char result[512]; @@ -2609,7 +2609,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) * supports both transport/tunnel mode + ESP/AH type. */ if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) - skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF; + skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF; rcu_read_lock_bh(); err = x->outer_mode->output(x, skb); @@ -3745,10 +3745,10 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) * performance under such circumstance. 
*/ pkt_dev->dstops.family = AF_INET; - pkt_dev->dst.dev = pkt_dev->odev; - dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false); - pkt_dev->dst.child = &pkt_dev->dst; - pkt_dev->dst.ops = &pkt_dev->dstops; + pkt_dev->xdst.u.dst.dev = pkt_dev->odev; + dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false); + pkt_dev->xdst.child = &pkt_dev->xdst.u.dst; + pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops; #endif return add_dev_to_thread(t, pkt_dev); diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index 2b4ab189bba7..5639fb03bdd9 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -93,7 +93,8 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, if (dst->xfrm == NULL) return -1; - for (i = 0; dst && dst->xfrm; dst = dst->child, i++) { + for (i = 0; dst && dst->xfrm; + dst = ((struct xfrm_dst *)dst)->child, i++) { pos = strict ? i : 0; if (pos >= info->len) return 0; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index a07df7bb41c2..f0aa1bbf584c 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -126,7 +126,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) return false; if ((x->xso.offload_handle && (dev == dst->path->dev)) && - !dst->child->xfrm && x->type->get_mtu) { + !xdst->child->xfrm && x->type->get_mtu) { mtu = x->type->get_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) From 9efaeb4568248ce102b4970379a1eb5a051bb3a8 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:40 -0500 Subject: [PATCH 0318/1640] BACKPORT: ipv6: Move dst->from into struct rt6_info. The dst->from value is only used by ipv6 routes to track where a route "came from". Any time we clone or copy a core ipv6 route in the ipv6 routing tables, we have the copy/clone's ->from point to the base route. This is used to handle route expiration properly. Only ipv6 uses this mechanism, and only ipv6 code references it. So it is safe to move it into rt6_info. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- include/net/dst.h | 3 +-- include/net/ip6_fib.h | 9 ++++----- net/core/dst.c | 1 - net/ipv6/route.c | 39 +++++++++++++++++++-------------------- 4 files changed, 24 insertions(+), 28 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 5c8717ec08e7..a70ea5c675b3 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -39,7 +39,6 @@ struct dst_entry { unsigned long _metrics; unsigned long expires; struct dst_entry *path; - struct dst_entry *from; #ifdef CONFIG_XFRM struct xfrm_state *xfrm; #else @@ -88,7 +87,7 @@ struct dst_entry { * Align __refcnt to a 64 bytes alignment * (L1_CACHE_SIZE would be too much) */ - long __pad_to_align_refcnt[3]; + long __pad_to_align_refcnt[4]; #endif /* * __refcnt wants to be on a different cache line from diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1bc52fea49f7..357e7741ded8 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -133,6 +133,7 @@ struct rt6_exception { struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; + struct rt6_info *from; /* * Tail elements of dst_entry (__refcnt etc.) 
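(Illustrative sketch, not part of the patch: with ->from now typed as struct rt6_info, following a clone back to its base route needs no cast through dst->from. rt6_base_route() is a hypothetical helper name, shown only to make the new linkage concrete.)

static struct rt6_info *rt6_base_route(struct rt6_info *rt)
{
	while (rt->from)	/* copies/clones point at the route they came from */
		rt = rt->from;
	return rt;
}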
@@ -207,11 +208,9 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) { struct rt6_info *rt; - for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); - rt = (struct rt6_info *)rt->dst.from); + for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); rt = rt->from); if (rt && rt != rt0) rt0->dst.expires = rt->dst.expires; - dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; } @@ -245,8 +244,8 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) { u32 cookie = 0; - if (rt->dst.from) - rt = (struct rt6_info *)(rt->dst.from); + if (rt->from) + rt = rt->from; rt6_get_cookie_safe(rt, &cookie); diff --git a/net/core/dst.c b/net/core/dst.c index b8773e7b1ec6..50b20411d797 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -70,7 +70,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; dst->path = dst; - dst->from = NULL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 31aaf44f88e7..f8e21eb80628 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -188,7 +188,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) { - return dst_metrics_write_ptr(rt->dst.from); + return dst_metrics_write_ptr(&rt->from->dst); } static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) @@ -404,7 +404,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; struct rt6_exception_bucket *bucket; - struct dst_entry *from = dst->from; + struct rt6_info *from = rt->from; struct inet6_dev *idev; dst_destroy_metrics_generic(dst); @@ -422,8 +422,8 @@ static void ip6_dst_destroy(struct dst_entry *dst) kfree(bucket); } - dst->from = NULL; - dst_release(from); + rt->from = NULL; + dst_release(&from->dst); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -456,9 +456,9 @@ static bool rt6_check_expired(const struct rt6_info *rt) if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, rt->dst.expires)) return true; - } else if (rt->dst.from) { + } else if (rt->from) { return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || - rt6_check_expired((struct rt6_info *)rt->dst.from); + rt6_check_expired(rt->from); } return false; } @@ -1065,7 +1065,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, */ if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = (struct rt6_info *)ort->dst.from; + ort = ort->from; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); @@ -1291,7 +1291,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, /* ort can't be a cache or pcpu route */ if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = (struct rt6_info *)ort->dst.from; + ort = ort->from; WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); spin_lock_bh(&rt6_exception_lock); @@ -1430,8 +1430,8 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, /* Remove the passed in cached rt from the hash table that contains it */ int rt6_remove_exception_rt(struct rt6_info *rt) { - struct rt6_info *from = (struct rt6_info *)rt->dst.from; struct rt6_exception_bucket *bucket; + struct rt6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; int err; @@ -1475,8 +1475,8 @@ int rt6_remove_exception_rt(struct rt6_info *rt) */ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { - struct rt6_info *from = (struct rt6_info *)rt->dst.from; 
struct rt6_exception_bucket *bucket; + struct rt6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -1945,9 +1945,9 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori static void rt6_dst_from_metrics_check(struct rt6_info *rt) { - if (rt->dst.from && - dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from)) - dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true); + if (rt->from && + dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst)) + dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true); } static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) @@ -1967,7 +1967,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && - rt6_check((struct rt6_info *)(rt->dst.from), cookie)) + rt6_check(rt->from, cookie)) return &rt->dst; else return NULL; @@ -1987,7 +1987,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt6_dst_from_metrics_check(rt); if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) + (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) return rt6_dst_from_check(rt, cookie); else return rt6_check(rt, cookie); @@ -3075,11 +3075,11 @@ out: static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { - BUG_ON(from->dst.from); + BUG_ON(from->from); rt->rt6i_flags &= ~RTF_EXPIRES; dst_hold(&from->dst); - rt->dst.from = &from->dst; + rt->from = from; dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); } @@ -4347,9 +4347,8 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } - if (fibmatch && rt->dst.from) { - struct rt6_info *ort = container_of(rt->dst.from, - struct rt6_info, dst); + if (fibmatch && rt->from) { + struct rt6_info *ort = rt->from; dst_hold(&ort->dst); ip6_rt_put(rt); From 51f51d226421c6ee348f435d3829aeb4816fefee Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:46 -0500 Subject: [PATCH 0319/1640] BACKPORT: xfrm: Move dst->path into struct xfrm_dst The first member of an IPSEC route bundle chain sets its dst->path to the underlying ipv4/ipv6 route that carries the bundle. Stated another way, if one were to follow the xfrm_dst->child chain of the bundle, the final non-NULL pointer would be the path and point to either an ipv4 or an ipv6 route. This is largely used to make sure that PMTU events propagate down to the correct ipv4 or ipv6 route. When we don't have the top of an IPSEC bundle 'dst->path == dst'. Move it down into xfrm_dst and key off of dst->xfrm. Change-Id: Id3115c7cbb24b989973cffa5a70cf03305a527ea Signed-off-by: David S.
Miller Reviewed-by: Eric Dumazet --- include/net/dst.h | 3 +-- include/net/xfrm.h | 15 ++++++++++++++- net/bridge/br_nf_core.c | 1 - net/core/dst.c | 1 - net/ipv4/route.c | 2 +- net/ipv6/ip6_output.c | 4 ++-- net/ipv6/route.c | 6 ------ net/xfrm/xfrm_device.c | 2 +- net/xfrm/xfrm_policy.c | 28 ++++++++++++++-------------- 9 files changed, 33 insertions(+), 29 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index a70ea5c675b3..e64916e8daaa 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -38,7 +38,6 @@ struct dst_entry { struct dst_ops *ops; unsigned long _metrics; unsigned long expires; - struct dst_entry *path; #ifdef CONFIG_XFRM struct xfrm_state *xfrm; #else @@ -87,7 +86,7 @@ struct dst_entry { * Align __refcnt to a 64 bytes alignment * (L1_CACHE_SIZE would be too much) */ - long __pad_to_align_refcnt[4]; + long __pad_to_align_refcnt[5]; #endif /* * __refcnt wants to be on a different cache line from diff --git a/include/net/xfrm.h b/include/net/xfrm.h index d8281b2287ff..f1638792aa5c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -996,6 +996,7 @@ struct xfrm_dst { } u; struct dst_entry *route; struct dst_entry *child; + struct dst_entry *path; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols, num_xfrms; u32 xfrm_genid; @@ -1006,6 +1007,18 @@ struct xfrm_dst { u32 path_cookie; }; +static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst) +{ +#ifdef CONFIG_XFRM + if (dst->xfrm) { + const struct xfrm_dst *xdst = (const struct xfrm_dst *) dst; + + return xdst->path; + } +#endif + return (struct dst_entry *) dst; +} + static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) { #ifdef CONFIG_XFRM @@ -1937,7 +1950,7 @@ static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) return false; xdst = (struct xfrm_dst *) dst; - if (x->xso.offload_handle && (x->xso.dev == dst->path->dev) && + if (x->xso.offload_handle && (x->xso.dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) return true; diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c index c217276bd76a..d88e724d5755 100644 --- a/net/bridge/br_nf_core.c +++ b/net/bridge/br_nf_core.c @@ -79,7 +79,6 @@ void br_netfilter_rtable_init(struct net_bridge *br) atomic_set(&rt->dst.__refcnt, 1); rt->dst.dev = br->dev; - rt->dst.path = &rt->dst; dst_init_metrics(&rt->dst, br_dst_default_metrics, true); rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; rt->dst.ops = &fake_dst_ops; diff --git a/net/core/dst.c b/net/core/dst.c index 50b20411d797..2cca136aa3e3 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -69,7 +69,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; - dst->path = dst; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 396d97c9cfa2..396eb42b23c6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1139,7 +1139,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); + __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5f725d4ee73e..8b7a28b83d87 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1272,7 +1272,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); else mtu = 
np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; @@ -1281,7 +1281,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.gso_size = sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0; - if (dst_allfrag(rt->dst.path)) + if (dst_allfrag(xfrm_dst_path(&rt->dst))) cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f8e21eb80628..e6ce3fdea3fa 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4629,8 +4629,6 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_null_entry) goto out_ip6_dst_entries; - net->ipv6.ip6_null_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); @@ -4642,8 +4640,6 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_prohibit_entry) goto out_ip6_null_entry; - net->ipv6.ip6_prohibit_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_prohibit_entry; net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); @@ -4653,8 +4649,6 @@ static int __net_init ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_blk_hole_entry) goto out_ip6_prohibit_entry; - net->ipv6.ip6_blk_hole_entry->dst.path = - (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index f0aa1bbf584c..b6ab6590f4b6 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -125,7 +125,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) if (!x->type_offload || x->encap) return false; - if ((x->xso.offload_handle && (dev == dst->path->dev)) && + if ((x->xso.offload_handle && (dev == xfrm_dst_path(dst)->dev)) && !xdst->child->xfrm && x->type->get_mtu) { mtu = x->type->get_mtu(x, xdst->child_mtu_cached); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e2ef586af4af..b79cd7711f4e 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1648,7 +1648,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, } xfrm_dst_set_child(xdst_prev, dst); - xdst0->u.dst.path = dst; + xdst0->path = dst; err = -ENODEV; dev = dst->dev; @@ -1790,8 +1790,8 @@ static void xfrm_policy_queue_process(unsigned long arg) xfrm_decode_session(skb, &fl, dst->ops->family); spin_unlock(&pq->hold_queue.lock); - dst_hold(dst->path); - dst = xfrm_lookup(net, dst->path, &fl, sk, 0); + dst_hold(xfrm_dst_path(dst)); + dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, 0); if (IS_ERR(dst)) goto purge_queue; @@ -1820,8 +1820,8 @@ static void xfrm_policy_queue_process(unsigned long arg) skb = __skb_dequeue(&list); xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); - dst_hold(skb_dst(skb)->path); - dst = xfrm_lookup(net, skb_dst(skb)->path, &fl, skb->sk, 0); + dst_hold(xfrm_dst_path(skb_dst(skb))); + dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; @@ -1923,7 +1923,7 @@ 
static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, dst_hold(dst); xfrm_dst_set_child(xdst, dst); - dst1->path = dst; + xdst->path = dst; xfrm_init_path((struct xfrm_dst *)dst1, dst, 0); @@ -2588,7 +2588,7 @@ static int xfrm_bundle_ok(struct xfrm_dst *first) struct xfrm_dst *last; u32 mtu; - if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) || + if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; @@ -2649,22 +2649,20 @@ static int xfrm_bundle_ok(struct xfrm_dst *first) static unsigned int xfrm_default_advmss(const struct dst_entry *dst) { - return dst_metric_advmss(dst->path); + return dst_metric_advmss(xfrm_dst_path(dst)); } static unsigned int xfrm_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - return mtu ? : dst_mtu(dst->path); + return mtu ? : dst_mtu(xfrm_dst_path(dst)); } static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, const void *daddr) { - const struct dst_entry *path = dst->path; - - for (; dst != path; dst = xfrm_dst_child(dst)) { + while (dst->xfrm) { const struct xfrm_state *xfrm = dst->xfrm; if (xfrm->props.mode == XFRM_MODE_TRANSPORT) @@ -2673,6 +2671,8 @@ static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, daddr = xfrm->coaddr; else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; + + dst = xfrm_dst_child(dst); } return daddr; } @@ -2681,7 +2681,7 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { - const struct dst_entry *path = dst->path; + const struct dst_entry *path = xfrm_dst_path(dst); if (!skb) daddr = xfrm_get_dst_nexthop(dst, daddr); @@ -2690,7 +2690,7 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) { - const struct dst_entry *path = dst->path; + const struct dst_entry *path = xfrm_dst_path(dst); daddr = xfrm_get_dst_nexthop(dst, daddr); path->ops->confirm_neigh(path, daddr); From a942ace08544584489509503de04914fa1d6b211 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:53 -0500 Subject: [PATCH 0320/1640] UPSTREAM: net: Rearrange dst_entry layout to avoid useless padding. We have padding to try and align the refcount on a separate cache line. But after several simplifications the padding has increased substantially. So now it's easy to change the layout to get rid of the padding entirely. We group the write-heavy __refcnt and __use with less often used items such as the rcu_head and the error code. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- include/net/dst.h | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index e64916e8daaa..67f147c66369 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -34,7 +34,6 @@ struct sk_buff; struct dst_entry { struct net_device *dev; - struct rcu_head rcu_head; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; @@ -56,8 +55,6 @@ struct dst_entry { #define DST_XFRM_QUEUE 0x0040 #define DST_METADATA 0x0080 - short error; - /* A non-zero value of dst->obsolete forces by-hand validation * of the route entry. 
Positive values are set by the generic * dst layer to indicate that the entry has been forcefully @@ -73,29 +70,25 @@ struct dst_entry { #define DST_OBSOLETE_KILL -2 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ - unsigned short __pad3; -#ifdef CONFIG_IP_ROUTE_CLASSID - __u32 tclassid; -#else - __u32 __pad2; -#endif - -#ifdef CONFIG_64BIT - /* - * Align __refcnt to a 64 bytes alignment - * (L1_CACHE_SIZE would be too much) - */ - long __pad_to_align_refcnt[5]; -#endif /* * __refcnt wants to be on a different cache line from * input/output/ops or performance tanks badly */ - atomic_t __refcnt; /* client references */ +#ifdef CONFIG_64BIT + atomic_t __refcnt; /* 64-bit offset 64 */ +#endif int __use; unsigned long lastuse; struct lwtunnel_state *lwtstate; + struct rcu_head rcu_head; + short error; + short __pad; + __u32 tclassid; +#ifndef CONFIG_64BIT + atomic_t __refcnt; /* 32-bit offset 64 */ +#endif + union { struct dst_entry *next; }; @@ -244,7 +237,7 @@ static inline void dst_hold(struct dst_entry *dst) { /* * If your kernel compilation stops here, please check - * __pad_to_align_refcnt declaration in struct dst_entry + * the placement of __refcnt in struct dst_entry */ BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); From f6325aa6888ca68469cbba5591a90247f065bb51 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:11 +0200 Subject: [PATCH 0321/1640] UPSTREAM: ipv6: Add explicit flush indication to routes When routes that are a part of a multipath route are evaluated by fib6_ifdown() in response to NETDEV_DOWN and NETDEV_UNREGISTER events the state of their sibling routes is not considered. This will change in subsequent patches in order to align IPv6 with IPv4's behavior. For example, when the last sibling in a multipath route becomes dead, the entire multipath route needs to be removed. To prevent the tree walker from re-evaluating all the sibling routes each time, we can simply evaluate them once - when the first sibling is traversed. If we determine the entire multipath route needs to be removed, then the 'should_flush' bit is set in all the siblings, which will cause the walker to flush them when it traverses them. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 357e7741ded8..9a1cf928b85b 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -176,7 +176,8 @@ struct rt6_info { unsigned short rt6i_nfheader_len; u8 rt6i_protocol; u8 exception_bucket_flushed:1, - unused:7; + should_flush:1, + unused:6; }; #define for_each_fib6_node_rt_rcu(fn) \ From 31624c87875f5b893fa5b897be10b505c7291018 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:13 +0200 Subject: [PATCH 0322/1640] UPSTREAM: ipv6: Export sernum update function We are going to allow dead routes to stay in the FIB tree (e.g., when they are part of a multipath route, directly connected route with no carrier) and revive them when their nexthop device gains carrier or when it is put administratively up. This is equivalent to the addition of the route to the FIB tree and we should therefore take care of updating the sernum of all the parent nodes of the node where the route is stored. Otherwise, we risk sockets caching and using sub-optimal dst entries. 
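(Illustrative sketch, not part of the patch, of the invalidation this relies on; rt6_get_cookie() is the existing accessor seen earlier in this series, and the caller-side comparison is paraphrased:)

/* A socket stores the cookie it saw at lookup time; the dst is later
 * revalidated against the current value, which reflects fn->fn_sernum.
 * Bumping the sernum of the node (or any parent) makes the comparison
 * fail and forces a fresh route lookup.
 */
static bool cached_rt_still_valid(const struct rt6_info *rt, u32 cached_cookie)
{
	return rt6_get_cookie(rt) == cached_cookie;
}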
Export the function that performs the above, so that it could be invoked from fib6_ifup() later on. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + net/ipv6/ip6_fib.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 9a1cf928b85b..0660dc0c924f 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -407,6 +407,7 @@ unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); void fib6_update_sernum(struct rt6_info *rt); +void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b9d434d06857..236ee0e44682 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1100,8 +1100,8 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } -static void fib6_update_sernum_upto_root(struct rt6_info *rt, - int sernum) +static void __fib6_update_sernum_upto_root(struct rt6_info *rt, + int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); @@ -1115,6 +1115,11 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt, } } +void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) +{ + __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); +} + /* * Add routing information to the routing tree. */ @@ -1228,7 +1233,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, err = fib6_add_rt2node(fn, rt, info, mxc); if (!err) { - fib6_update_sernum_upto_root(rt, sernum); + __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); } From 1abbec9c5b319b1cf86bd6a41f98fa3e93fbee37 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:01 +0200 Subject: [PATCH 0323/1640] UPSTREAM: ipv6: Remove redundant route flushing during namespace dismantle By the time fib6_net_exit() is executed all the netdevs in the namespace have been either unregistered or pushed back to the default namespace. That is because pernet subsys operations are always ordered before pernet device operations and therefore invoked after them during namespace dismantle. Thus, all the routing tables in the namespace are empty by the time fib6_net_exit() is invoked and the call to rt6_ifdown() can be removed. This allows us to simplify the condition in fib6_ifdown() as it's only ever called with an actual netdev. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S.
Miller --- net/ipv6/ip6_fib.c | 1 - net/ipv6/route.c | 8 +++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 236ee0e44682..eda20f9de264 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2097,7 +2097,6 @@ static void fib6_net_exit(struct net *net) { unsigned int i; - rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e6ce3fdea3fa..b754987fd658 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3458,10 +3458,9 @@ static int fib6_ifdown(struct rt6_info *rt, void *arg) const struct arg_dev_net *adn = arg; const struct net_device *dev = adn->dev; - if ((rt->dst.dev == dev || !dev) && + if (rt->dst.dev == dev && rt != adn->net->ipv6.ip6_null_entry && - (rt->rt6i_nsiblings == 0 || - (dev && netdev_unregistering(dev)) || + (rt->rt6i_nsiblings == 0 || netdev_unregistering(dev) || !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) return -1; @@ -3476,8 +3475,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev) }; fib6_clean_all(net, fib6_ifdown, &adn); - if (dev) - rt6_uncached_list_flush_dev(net, dev); + rt6_uncached_list_flush_dev(net, dev); } struct rt6_mtu_change_arg { From 22128c6b8e0dc7fe785fd69d19ac7c5710fb9a5b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:02 +0200 Subject: [PATCH 0324/1640] UPSTREAM: ipv6: Mark dead nexthops with appropriate flags When a netdev is put administratively down or unregistered all the nexthops using it as their nexthop device should be marked with the 'dead' and 'linkdown' flags. Currently, when a route is dumped its nexthop device is tested and the flags are set accordingly. A similar check is performed during route lookup. Instead, we can simply mark the nexthops based on netdev events and avoid checking the netdev's state during route dump and lookup. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b754987fd658..702fa4cb3bf2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3461,8 +3461,10 @@ static int fib6_ifdown(struct rt6_info *rt, void *arg) if (rt->dst.dev == dev && rt != adn->net->ipv6.ip6_null_entry && (rt->rt6i_nsiblings == 0 || netdev_unregistering(dev) || - !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) + !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) { + rt->rt6i_nh_flags |= (RTNH_F_DEAD | RTNH_F_LINKDOWN); return -1; + } return 0; } From 571403882332736d739e05c8b44f6be5cf9b7404 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:03 +0200 Subject: [PATCH 0325/1640] UPSTREAM: ipv6: Clear nexthop flags upon netdev up Previous patch marked nexthops with the 'dead' and 'linkdown' flags. Clear these flags when the netdev comes back up. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 1 + net/ipv6/addrconf.c | 3 +++ net/ipv6/route.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 0e91ed9021a4..d47aee6e1f0f 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -169,6 +169,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); +void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index aa112d8ab60a..2c614ace4cf6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3520,6 +3520,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (run_pending) addrconf_dad_run(idev); + /* Device has an address by now */ + rt6_sync_up(dev, RTNH_F_DEAD); + /* * If the MTU changed during the interface down, * when the interface up, the changed MTU must be diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 702fa4cb3bf2..dabb2d2246d5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3447,6 +3447,35 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) fib6_clean_all(net, fib6_clean_tohost, gateway); } +struct arg_netdev_event { + const struct net_device *dev; + unsigned int nh_flags; +}; + +static int fib6_ifup(struct rt6_info *rt, void *p_arg) +{ + const struct arg_netdev_event *arg = p_arg; + const struct net *net = dev_net(arg->dev); + + if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) + rt->rt6i_nh_flags &= ~arg->nh_flags; + + return 0; +} + +void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) +{ + struct arg_netdev_event arg = { + .dev = dev, + .nh_flags = nh_flags, + }; + + if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) + arg.nh_flags |= RTNH_F_LINKDOWN; + + fib6_clean_all(dev_net(dev), fib6_ifup, &arg); +} + struct arg_dev_net { struct net_device *dev; struct net *net; From c9b8469a3be5c2b07317d7585405c7e847a132d5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:04 +0200 Subject: [PATCH 0326/1640] UPSTREAM: ipv6: Prepare to handle multiple netdev events To make IPv6 more in line with IPv4 we need to be able to respond differently to different netdev events. For example, when a netdev is unregistered all the routes using it as their nexthop device should be flushed, whereas when the netdev's carrier changes only the 'linkdown' flag should be toggled. Currently, this is not possible, as the function that traverses the routing tables is not aware of the triggering event. Propagate the triggering event down, so that it could be used in later patches. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 2 +- net/ipv6/addrconf.c | 4 ++-- net/ipv6/route.c | 37 +++++++++++++++++++++---------------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index d47aee6e1f0f..f662e83738ac 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -165,11 +165,11 @@ struct rt6_rtnl_dump_arg { }; int rt6_dump_route(struct rt6_info *rt, void *p_arg); -void rt6_ifdown(struct net *net, struct net_device *dev); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); +void rt6_disable_ip(struct net_device *dev, unsigned long event); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2c614ace4cf6..627796683e9b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3616,6 +3616,7 @@ static bool addr_is_local(const struct in6_addr *addr) static int addrconf_ifdown(struct net_device *dev, int how) { + unsigned long event = how ? NETDEV_UNREGISTER : NETDEV_DOWN; struct net *net = dev_net(dev); struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; @@ -3627,8 +3628,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) ASSERT_RTNL(); - rt6_ifdown(net, dev); - neigh_ifdown(&nd_tbl, dev); + rt6_disable_ip(dev, event); idev = __in6_dev_get(dev); if (!idev) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dabb2d2246d5..cb84c9817f74 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2368,7 +2368,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); - /* Add this dst into uncached_list so that rt6_ifdown() can + /* Add this dst into uncached_list so that rt6_disable_ip() can * do proper release of the net_device */ rt6_uncached_list_add(rt); @@ -3449,7 +3449,10 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) struct arg_netdev_event { const struct net_device *dev; - unsigned int nh_flags; + union { + unsigned int nh_flags; + unsigned long event; + }; }; static int fib6_ifup(struct rt6_info *rt, void *p_arg) @@ -3476,19 +3479,15 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } -struct arg_dev_net { - struct net_device *dev; - struct net *net; -}; - /* called with write lock held for table with rt */ -static int fib6_ifdown(struct rt6_info *rt, void *arg) +static int fib6_ifdown(struct rt6_info *rt, void *p_arg) { - const struct arg_dev_net *adn = arg; - const struct net_device *dev = adn->dev; + const struct arg_netdev_event *arg = p_arg; + const struct net_device *dev = arg->dev; + const struct net *net = dev_net(dev); if (rt->dst.dev == dev && - rt != adn->net->ipv6.ip6_null_entry && + rt != net->ipv6.ip6_null_entry && (rt->rt6i_nsiblings == 0 || netdev_unregistering(dev) || !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) { rt->rt6i_nh_flags |= (RTNH_F_DEAD | RTNH_F_LINKDOWN); @@ -3498,15 +3497,21 @@ static int fib6_ifdown(struct rt6_info *rt, void *arg) return 0; } -void rt6_ifdown(struct net *net, struct net_device *dev) +static void rt6_sync_down_dev(struct net_device *dev, unsigned long event) { - struct arg_dev_net adn = { + struct arg_netdev_event arg = { .dev = dev, - .net = net, + .event = event, }; - fib6_clean_all(net, fib6_ifdown, 
&adn); - rt6_uncached_list_flush_dev(net, dev); + fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); +} + +void rt6_disable_ip(struct net_device *dev, unsigned long event) +{ + rt6_sync_down_dev(dev, event); + rt6_uncached_list_flush_dev(dev_net(dev), dev); + neigh_ifdown(&nd_tbl, dev); } struct rt6_mtu_change_arg { From d1375af452439068e46d673bedccf0bd04a0b8d3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:05 +0200 Subject: [PATCH 0327/1640] UPSTREAM: ipv6: Set nexthop flags upon carrier change Similar to IPv4, when the carrier of a netdev changes we should toggle the 'linkdown' flag on all the nexthops using it as their nexthop device. This will later allow us to test for the presence of this flag during route lookup and dump. Up until commit 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") host and anycast routes used the loopback netdev as their nexthop device and thus were not marked with the 'linkdown' flag. The patch preserves this behavior and allows one to ping the local address even when the nexthop device does not have a carrier and the 'ignore_routes_with_linkdown' sysctl is set. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 1 + net/ipv6/addrconf.c | 2 ++ net/ipv6/route.c | 23 +++++++++++++++++------ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f662e83738ac..01df7dd760e0 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -170,6 +170,7 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); +void rt6_sync_down_dev(struct net_device *dev, unsigned long event); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 627796683e9b..670aeb1b7a5d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3474,6 +3474,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, } else if (event == NETDEV_CHANGE) { if (!addrconf_link_ready(dev)) { /* device is still not ready. 
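	 * (Illustrative note, not part of the patch: the
	 * rt6_sync_down_dev(dev, event) call added just below marks the
	 * affected nexthops RTNH_F_LINKDOWN as soon as NETDEV_CHANGE
	 * reports the link not ready, instead of leaving them stale.)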
*/ + rt6_sync_down_dev(dev, event); break; } @@ -3485,6 +3486,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, * multicast snooping switches */ ipv6_mc_up(idev); + rt6_sync_up(dev, RTNH_F_LINKDOWN); break; } idev->if_flags |= IF_READY; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index cb84c9817f74..4a1c451ebba1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3486,18 +3486,29 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) const struct net_device *dev = arg->dev; const struct net *net = dev_net(dev); - if (rt->dst.dev == dev && - rt != net->ipv6.ip6_null_entry && - (rt->rt6i_nsiblings == 0 || netdev_unregistering(dev) || - !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) { - rt->rt6i_nh_flags |= (RTNH_F_DEAD | RTNH_F_LINKDOWN); + if (rt->dst.dev != dev || rt == net->ipv6.ip6_null_entry) + return 0; + + switch (arg->event) { + case NETDEV_UNREGISTER: return -1; + case NETDEV_DOWN: + if (rt->rt6i_nsiblings == 0 || + !rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + return -1; + rt->rt6i_nh_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) + break; + rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + break; } return 0; } -static void rt6_sync_down_dev(struct net_device *dev, unsigned long event) +void rt6_sync_down_dev(struct net_device *dev, unsigned long event) { struct arg_netdev_event arg = { .dev = dev, From fb9f9a22c2e68d315758406ed803b1d28f505077 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:06 +0200 Subject: [PATCH 0328/1640] UPSTREAM: ipv6: Set nexthop flags during route creation It is valid to install routes with a nexthop device that does not have a carrier, so we need to make sure they're marked accordingly. As explained in the previous patch, host and anycast routes are never marked with the 'linkdown' flag. Note that reject routes are unaffected, as these use the loopback device which always has a carrier. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4a1c451ebba1..8dcd9d09f4e8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2765,6 +2765,9 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, rt->rt6i_flags = cfg->fc_flags; install_route: + if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && + !netif_carrier_ok(dev)) + rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; From 01f95c9e8b607563efb94a4e469b23b91d5945b2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 21 Nov 2017 09:50:12 +0200 Subject: [PATCH 0329/1640] UPSTREAM: ipv6: Do not consider linkdown nexthops during multipath When the 'ignore_routes_with_linkdown' sysctl is set, we should not consider linkdown nexthops during route lookup. While the code correctly verifies that the initially selected route ('match') has a carrier, it does not perform the same check in the subsequent multipath selection, resulting in a potential packet loss. In case the chosen route does not have a carrier and the sysctl is set, choose the initially selected route. Fixes: 35103d11173b ("net: ipv6 sysctl option to ignore routes when nexthop link is down") Signed-off-by: Ido Schimmel Acked-by: David Ahern Acked-by: Andy Gospodarek Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8dcd9d09f4e8..99c0d9c8e6b1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -485,6 +485,11 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, &match->rt6i_siblings, rt6i_siblings) { route_choosen--; if (route_choosen == 0) { + struct inet6_dev *idev = sibling->rt6i_idev; + + if (!netif_carrier_ok(sibling->dst.dev) && + idev->cnf.ignore_routes_with_linkdown) + break; if (rt6_score_route(sibling, oif, strict) < 0) break; match = sibling; From d088b2ff159db93d38abecd6056b080c2b4cf9f0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:07 +0200 Subject: [PATCH 0330/1640] UPSTREAM: ipv6: Check nexthop flags during route lookup instead of carrier Now that the RTNH_F_LINKDOWN flag is set in nexthops, we can avoid the need to dereference the nexthop device and check its carrier and instead check for the presence of the flag. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 99c0d9c8e6b1..82b2510b7590 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -487,7 +487,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, if (route_choosen == 0) { struct inet6_dev *idev = sibling->rt6i_idev; - if (!netif_carrier_ok(sibling->dst.dev) && + if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN && idev->cnf.ignore_routes_with_linkdown) break; if (rt6_score_route(sibling, oif, strict) < 0) @@ -692,10 +692,9 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, int m; bool match_do_rr = false; struct inet6_dev *idev = rt->rt6i_idev; - struct net_device *dev = rt->dst.dev; - if (dev && !netif_carrier_ok(dev) && - idev->cnf.ignore_routes_with_linkdown && + if (idev->cnf.ignore_routes_with_linkdown && + rt->rt6i_nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; From 5f54a1f83f8b0b605f75c707178f9e1e5872acd9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:08 +0200 Subject: [PATCH 0331/1640] UPSTREAM: ipv6: Check nexthop flags in route dump instead of carrier Similar to previous patch, there is no need to check for the carrier of the nexthop device when dumping the route and we can instead check for the presence of the RTNH_F_LINKDOWN flag. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 82b2510b7590..3a9899b5aaf6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4070,7 +4070,7 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, unsigned int *flags, bool skip_oif) { - if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { + if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) *flags |= RTNH_F_DEAD; From a66b051a74d8dd97e6665a22934361005520c7fe Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:09 +0200 Subject: [PATCH 0332/1640] UPSTREAM: ipv6: Ignore dead routes during lookup Currently, dead routes are only present in the routing tables in case the 'ignore_routes_with_linkdown' sysctl is set. Otherwise, they are flushed. 
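(Illustrative sketch, not part of the patch, of the guard the diff below adds at each lookup site:)

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;	/* dead nexthop: never match it */
		/* ... existing matching logic runs only for live routes ... */
	}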
Subsequent patches are going to remove the reliance on this sysctl and make IPv6 more consistent with IPv4. Before this is done, we need to make sure dead routes are skipped during route lookup, so as to not cause packet loss. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3a9899b5aaf6..55306d687530 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -487,6 +487,8 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, if (route_choosen == 0) { struct inet6_dev *idev = sibling->rt6i_idev; + if (sibling->rt6i_nh_flags & RTNH_F_DEAD) + break; if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN && idev->cnf.ignore_routes_with_linkdown) break; @@ -512,12 +514,15 @@ static inline struct rt6_info *rt6_device_match(struct net *net, struct rt6_info *local = NULL; struct rt6_info *sprt; - if (!oif && ipv6_addr_any(saddr)) - goto out; + if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD)) + return rt; for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { struct net_device *dev = sprt->dst.dev; + if (sprt->rt6i_nh_flags & RTNH_F_DEAD) + continue; + if (oif) { if (dev->ifindex == oif) return sprt; @@ -546,8 +551,8 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (flags & RT6_LOOKUP_F_IFACE) return net->ipv6.ip6_null_entry; } -out: - return rt; + + return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -693,6 +698,9 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, bool match_do_rr = false; struct inet6_dev *idev = rt->rt6i_idev; + if (rt->rt6i_nh_flags & RTNH_F_DEAD) + goto out; + if (idev->cnf.ignore_routes_with_linkdown && rt->rt6i_nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) @@ -2182,6 +2190,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { + if (rt->rt6i_nh_flags & RTNH_F_DEAD) + continue; if (rt6_check_expired(rt)) continue; if (rt->dst.error) From 7d14d7d6812a6b5eaa2f26c1c91c10d3446cf3d0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:10 +0200 Subject: [PATCH 0333/1640] UPSTREAM: ipv6: Report dead flag during route dump Up until now the RTNH_F_DEAD flag was only reported in route dump when the 'ignore_routes_with_linkdown' sysctl was set. This is expected as dead routes were flushed otherwise. The reliance on this sysctl is going to be removed, so we need to report the flag regardless of the sysctl's value. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 55306d687530..6d6e46052c70 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4080,6 +4080,9 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, unsigned int *flags, bool skip_oif) { + if (rt->rt6i_nh_flags & RTNH_F_DEAD) + *flags |= RTNH_F_DEAD; + if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) From 077621af15e2e8d89b54bac08c7cc608a3f5899a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Oct 2017 16:07:10 +0200 Subject: [PATCH 0334/1640] UPSTREAM: ipv6: start fib6 gc on RTF_CACHE dst creation After the commit 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache"), the fib6 gc is not started after the creation of a RTF_CACHE via a redirect or pmtu update, since fib6_add() isn't invoked anymore for such dsts. We need the fib6 gc to run periodically to clean the RTF_CACHE, or the dst will stay there forever. Fix it by explicitly calling fib6_force_start_gc() on successful exception creation. gc_args->more accounting will ensure that the gc timer will run for whatever time needed to properly clean the table. v2 -> v3: - clarified the commit message Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") Signed-off-by: Paolo Abeni Acked-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6d6e46052c70..5375f5d8a958 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1374,8 +1374,10 @@ out: spin_unlock_bh(&rt6_exception_lock); /* Update fn->fn_sernum to invalidate all cached dst */ - if (!err) + if (!err) { fib6_update_sernum(ort); + fib6_force_start_gc(net); + } return err; } From e06bc7aee2bf8ef88a3b68afe29a04aac03d99a6 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:14 +0200 Subject: [PATCH 0335/1640] UPSTREAM: ipv6: Take table lock outside of sernum update function The next patch is going to allow dead routes to remain in the FIB tree in certain situations. When this happens we need to be sure to bump the sernum of the nodes where these are stored so that potential copies cached in sockets are invalidated. The function that performs this update assumes the table lock is not taken when it is invoked, but that will not be the case when it is invoked by the tree walker. Have the function assume the lock is taken and make the single caller take the lock itself. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/ip6_fib.c | 5 +---- net/ipv6/route.c | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index eda20f9de264..1542ddfeeabe 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -107,16 +107,13 @@ enum { void fib6_update_sernum(struct rt6_info *rt) { - struct fib6_table *table = rt->rt6i_table; struct net *net = dev_net(rt->dst.dev); struct fib6_node *fn; - spin_lock_bh(&table->tb6_lock); fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&table->tb6_lock)); + lockdep_is_held(&rt->rt6i_table->tb6_lock)); if (fn) fn->fn_sernum = fib6_new_sernum(net); - spin_unlock_bh(&table->tb6_lock); } /* diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5375f5d8a958..d6ecf8fea12e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1375,7 +1375,9 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { + spin_lock_bh(&ort->rt6i_table->tb6_lock); fib6_update_sernum(ort); + spin_unlock_bh(&ort->rt6i_table->tb6_lock); fib6_force_start_gc(net); } From b4802d76a16c90d079e4af57d5d9fc303b51172a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:15 +0200 Subject: [PATCH 0336/1640] UPSTREAM: ipv6: Flush multipath routes when all siblings are dead By default, IPv6 deletes nexthops from a multipath route when the nexthop device is put administratively down. This differs from IPv4 where the nexthops are kept, but marked with the RTNH_F_DEAD flag. A multipath route is flushed when all of its nexthops become dead. Align IPv6 with IPv4 and have it conform to the same guidelines. In case the multipath route needs to be flushed, its siblings are flushed one by one. Otherwise, the nexthops are marked with the appropriate flags and the tree walker is instructed to skip all the siblings. As explained in previous patches, care is taken to update the sernum of the affected tree nodes, so as to prevent the use of wrong dst entries. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 83 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 75 insertions(+), 8 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d6ecf8fea12e..8649fd4853a6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3481,8 +3481,10 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; const struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) + if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { rt->rt6i_nh_flags &= ~arg->nh_flags; + fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); + } return 0; } @@ -3500,6 +3502,58 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } +static bool rt6_multipath_uses_dev(const struct rt6_info *rt, + const struct net_device *dev) +{ + struct rt6_info *iter; + + if (rt->dst.dev == dev) + return true; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + if (iter->dst.dev == dev) + return true; + + return false; +} + +static void rt6_multipath_flush(struct rt6_info *rt) +{ + struct rt6_info *iter; + + rt->should_flush = 1; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + iter->should_flush = 1; +} + +static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, + const struct net_device *down_dev) +{ + struct rt6_info *iter; + unsigned int dead = 0; + + if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD) + dead++; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + if (iter->dst.dev == down_dev || + iter->rt6i_nh_flags & RTNH_F_DEAD) + dead++; + + return dead; +} + +static void rt6_multipath_nh_flags_set(struct rt6_info *rt, + const struct net_device *dev, + unsigned int nh_flags) +{ + struct rt6_info *iter; + + if (rt->dst.dev == dev) + rt->rt6i_nh_flags |= nh_flags; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + if (iter->dst.dev == dev) + iter->rt6i_nh_flags |= nh_flags; +} + /* called with write lock held for table with rt */ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) { @@ -3507,20 +3561,33 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) const struct net_device *dev = arg->dev; const struct net *net = dev_net(dev); - if (rt->dst.dev != dev || rt == net->ipv6.ip6_null_entry) + if (rt == net->ipv6.ip6_null_entry) return 0; switch (arg->event) { case NETDEV_UNREGISTER: - return -1; + return rt->dst.dev == dev ? -1 : 0; case NETDEV_DOWN: - if (rt->rt6i_nsiblings == 0 || - !rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + if (rt->should_flush) return -1; - rt->rt6i_nh_flags |= RTNH_F_DEAD; - /* fall through */ + if (!rt->rt6i_nsiblings) + return rt->dst.dev == dev ? 
-1 : 0; + if (rt6_multipath_uses_dev(rt, dev)) { + unsigned int count; + + count = rt6_multipath_dead_count(rt, dev); + if (rt->rt6i_nsiblings + 1 == count) { + rt6_multipath_flush(rt); + return -1; + } + rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | + RTNH_F_LINKDOWN); + fib6_update_sernum(rt); + } + return -2; case NETDEV_CHANGE: - if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) + if (rt->dst.dev != dev || + rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; break; } From 0c36045114dc167fb0498b11a6e6e482b45983ce Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Jan 2018 16:40:25 +0200 Subject: [PATCH 0337/1640] UPSTREAM: ipv6: Calculate hash thresholds for IPv6 nexthops Before we convert IPv6 to use hash-threshold instead of modulo-N, we first need each nexthop to store its region boundary in the hash function's output space. The boundary is calculated by dividing the output space equally between the different active nexthops. That is, nexthops that are not dead or linkdown. The boundaries are rebalanced whenever a nexthop is added to or removed from a multipath route and whenever a nexthop becomes active or inactive. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + include/net/ip6_route.h | 7 +++ net/ipv6/ip6_fib.c | 8 +--- net/ipv6/route.c | 96 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 106 insertions(+), 6 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0660dc0c924f..ec7d820cfceb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -152,6 +152,7 @@ struct rt6_info { */ struct list_head rt6i_siblings; unsigned int rt6i_nsiblings; + atomic_t rt6i_nh_upper_bound; atomic_t rt6i_ref; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 01df7dd760e0..c6451d40cba1 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } +static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; +} + void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, @@ -171,6 +177,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); +void rt6_multipath_rebalance(struct rt6_info *rt); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1542ddfeeabe..0cb666eaf283 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -794,12 +794,6 @@ insert_above: return ln; } -static bool rt6_qualify_for_ecmp(struct rt6_info *rt) -{ - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; -} - static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) { int i; @@ -989,6 +983,7 @@ next_iter: rt6i_nsiblings++; } BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); + rt6_multipath_rebalance(temp_sibling); } /* @@ -1662,6 +1657,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, sibling->rt6i_nsiblings--; rt->rt6i_nsiblings = 0; list_del_init(&rt->rt6i_siblings); +
rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8649fd4853a6..899290c39e36 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3476,6 +3476,99 @@ struct arg_netdev_event { }; }; +static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) +{ + struct rt6_info *iter; + struct fib6_node *fn; + + fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + iter = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + while (iter) { + if (iter->rt6i_metric == rt->rt6i_metric && + rt6_qualify_for_ecmp(iter)) + return iter; + iter = rcu_dereference_protected(iter->rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + } + + return NULL; +} + +static bool rt6_is_dead(const struct rt6_info *rt) +{ + if (rt->rt6i_nh_flags & RTNH_F_DEAD || + (rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) + return true; + + return false; +} + +static int rt6_multipath_total_weight(const struct rt6_info *rt) +{ + struct rt6_info *iter; + int total = 0; + + if (!rt6_is_dead(rt)) + total++; + + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { + if (!rt6_is_dead(iter)) + total++; + } + + return total; +} + +static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) +{ + int upper_bound = -1; + + if (!rt6_is_dead(rt)) { + (*weight)++; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, + total) - 1; + } + atomic_set(&rt->rt6i_nh_upper_bound, upper_bound); +} + +static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) +{ + struct rt6_info *iter; + int weight = 0; + + rt6_upper_bound_set(rt, &weight, total); + + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + rt6_upper_bound_set(iter, &weight, total); +} + +void rt6_multipath_rebalance(struct rt6_info *rt) +{ + struct rt6_info *first; + int total; + + /* In case the entire multipath route was marked for flushing, + * then there is no need to rebalance upon the removal of every + * sibling route. + */ + if (!rt->rt6i_nsiblings || rt->should_flush) + return; + + /* During lookup routes are evaluated in order, so we need to + * make sure upper bounds are assigned from the first sibling + * onwards. 
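+ *
+ * (Editor's note, not part of the upstream comment: as a worked
+ * example, three live siblings of equal weight have cumulative
+ * weights 1, 2 and 3 out of a total of 3, so rt6_upper_bound_set()
+ * above assigns upper bounds of roughly 2^31/3 - 1, 2*2^31/3 - 1
+ * and 2^31 - 1; a 31-bit flow hash is then compared against these
+ * bounds in order during lookup.)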
+ */ + first = rt6_multipath_first_sibling(rt); + if (WARN_ON_ONCE(!first)) + return; + + total = rt6_multipath_total_weight(first); + rt6_multipath_upper_bound_set(first, total); +} + static int fib6_ifup(struct rt6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; @@ -3484,6 +3577,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg) if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { rt->rt6i_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); + rt6_multipath_rebalance(rt); } return 0; @@ -3583,6 +3677,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); fib6_update_sernum(rt); + rt6_multipath_rebalance(rt); } return -2; case NETDEV_CHANGE: @@ -3590,6 +3685,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt6_multipath_rebalance(rt); break; } From 572aad3276785840f7dcb23b41fc4b0f0c8c54ff Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Jan 2018 16:40:26 +0200 Subject: [PATCH 0338/1640] UPSTREAM: ipv6: Use a 31-bit multipath hash The hash thresholds assigned to IPv6 nexthops are in the range of [-1, 2^31 - 1], where a negative value is assigned to nexthops that should not be considered during multipath selection. Therefore, in a similar fashion to IPv4, we need to use the upper 31-bits of the multipath hash for multipath selection. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 899290c39e36..0dd40792b5b2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1856,10 +1856,10 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb) if (skb) { ip6_multipath_l3_keys(skb, &hash_keys); - return flow_hash_from_keys(&hash_keys); + return flow_hash_from_keys(&hash_keys) >> 1; } - return get_hash_from_flowi6(fl6); + return get_hash_from_flowi6(fl6) >> 1; } void ip6_route_input(struct sk_buff *skb) From b3feb71b04051ee6a588012b151ccbfb07305b1c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Jan 2018 16:40:27 +0200 Subject: [PATCH 0339/1640] UPSTREAM: ipv6: Use hash-threshold instead of modulo-N Now that each nexthop stores its region boundary in the multipath hash function's output space, we can use hash-threshold instead of modulo-N in multipath selection. This reduces the number of checks we need to perform during lookup, as dead and linkdown nexthops are assigned a negative region boundary. In addition, in contrast to modulo-N, only flows near region boundaries are affected when a nexthop is added or removed. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0dd40792b5b2..dfd618ac328d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -468,7 +468,6 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, int strict) { struct rt6_info *sibling, *next_sibling; - int route_choosen; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. 
@@ -476,28 +475,19 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(fl6, NULL); - route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1); - /* Don't change the route, if route_choosen == 0 - * (siblings does not include ourself) - */ - if (route_choosen) - list_for_each_entry_safe(sibling, next_sibling, - &match->rt6i_siblings, rt6i_siblings) { - route_choosen--; - if (route_choosen == 0) { - struct inet6_dev *idev = sibling->rt6i_idev; + if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) + return match; + + list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, + rt6i_siblings) { + if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound)) + continue; + if (rt6_score_route(sibling, oif, strict) < 0) + break; + match = sibling; + break; + } - if (sibling->rt6i_nh_flags & RTNH_F_DEAD) - break; - if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN && - idev->cnf.ignore_routes_with_linkdown) - break; - if (rt6_score_route(sibling, oif, strict) < 0) - break; - match = sibling; - break; - } - } return match; } From 09fbdfc8c0a44a7cdb92c035f3580be3dd4447c5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Jan 2018 16:40:28 +0200 Subject: [PATCH 0340/1640] UPSTREAM: ipv6: Add support for non-equal-cost multipath The use of hash-threshold instead of modulo-N makes it trivial to add support for non-equal-cost multipath. Instead of dividing the multipath hash function's output space equally between the nexthops, each nexthop is assigned a region size which is proportional to its weight. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + net/ipv6/route.c | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index ec7d820cfceb..982cb2ceb22d 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -174,6 +174,7 @@ struct rt6_info { u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ + int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; u8 exception_bucket_flushed:1, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dfd618ac328d..870abe2e8bf1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2620,6 +2620,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, #endif rt->rt6i_metric = cfg->fc_metric; + rt->rt6i_nh_weight = 1; /* We cannot add true routes via loopback here, they would result in kernel looping; promote them to reject routes @@ -3502,11 +3503,11 @@ static int rt6_multipath_total_weight(const struct rt6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total++; + total += rt->rt6i_nh_weight; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { if (!rt6_is_dead(iter)) - total++; + total += iter->rt6i_nh_weight; } return total; @@ -3517,7 +3518,7 @@ static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - (*weight)++; + *weight += rt->rt6i_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } @@ -4042,6 +4043,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } + rt->rt6i_nh_weight = rtnh->rtnh_hops + 1; + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { dst_release_immediate(&rt->dst); @@ -4279,7 +4282,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) if (!rtnh) goto nla_put_failure; - 
rtnh->rtnh_hops = 0; + rtnh->rtnh_hops = rt->rt6i_nh_weight - 1; rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; if (rt6_nexthop_info(skb, rt, &flags, true) < 0) From aada07cc1b8807ef187d0c89c84a79336d7a6c7f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 12 Jan 2018 22:07:36 +0200 Subject: [PATCH 0341/1640] UPSTREAM: ipv6: Fix build with gcc-4.4.5 Emil reported the following compiler errors: net/ipv6/route.c: In function `rt6_sync_up`: net/ipv6/route.c:3586: error: unknown field `nh_flags` specified in initializer net/ipv6/route.c:3586: warning: missing braces around initializer net/ipv6/route.c:3586: warning: (near initialization for `arg.`) net/ipv6/route.c: In function `rt6_sync_down_dev`: net/ipv6/route.c:3695: error: unknown field `event` specified in initializer net/ipv6/route.c:3695: warning: missing braces around initializer net/ipv6/route.c:3695: warning: (near initialization for `arg.`) Problem is with the named initializers for the anonymous union members. Fix this by adding curly braces around the initialization. Fixes: 4c981e28d373 ("ipv6: Prepare to handle multiple netdev events") Signed-off-by: Ido Schimmel Reported-by: Emil S Tantilov Tested-by: Emil S Tantilov Signed-off-by: David S. Miller --- net/ipv6/route.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 870abe2e8bf1..be366bb9536b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3578,7 +3578,9 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) { struct arg_netdev_event arg = { .dev = dev, - .nh_flags = nh_flags, + { + .nh_flags = nh_flags, + }, }; if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) @@ -3687,7 +3689,9 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event) { struct arg_netdev_event arg = { .dev = dev, - .event = event, + { + .event = event, + }, }; fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); From 1097236bfb95533972a65e98e4fc325f0398d59c Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Tue, 20 Feb 2018 08:55:58 -0500 Subject: [PATCH 0342/1640] UPSTREAM: net: Allow a rule to track originating protocol Allow a rule that is being added/deleted/modified or dumped to contain the originating protocol's id. The protocol is handled just like a route's originating protocol is. This is especially useful because there is starting to be a plethora of different user space programs adding rules. Allow the vrf device to specify that the kernel is the originator of the rule created for this device. Signed-off-by: Donald Sharp Signed-off-by: David S. Miller
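[Editor's note: a hedged sketch of the matching semantics the diff below introduces. A protocol value of 0 in a delete request acts as a wildcard, mirroring how the action field is already treated; the kernel expresses this as a "skip on mismatch" filter. Illustrative standalone C, not a kernel API:

	#include <stdbool.h>

	/* Should a delete request carrying want_proto match a rule
	 * installed with rule_proto? 0 means the caller did not
	 * specify a protocol, i.e. match any. */
	static bool rule_proto_matches(unsigned char want_proto,
				       unsigned char rule_proto)
	{
		return want_proto == 0 || want_proto == rule_proto;
	}

In fib_nl_delrule() below the same test appears inverted, as "if (frh->proto && (frh->proto != rule->proto)) continue;".]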
--- drivers/net/vrf.c | 1 + include/net/fib_rules.h | 3 ++- include/uapi/linux/fib_rules.h | 2 +- net/core/fib_rules.c | 7 ++++++- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 5c7c608e24c9..73838f341124 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1228,6 +1228,7 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) memset(frh, 0, sizeof(*frh)); frh->family = family; frh->action = FR_ACT_TO_TBL; + frh->proto = RTPROT_KERNEL; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index b8fd023ba625..2d184231fb83 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -26,7 +26,8 @@ struct fib_rule { u32 table; u8 action; u8 l3mdev; - /* 2 bytes hole, try to use */ + u8 proto; + /* 1 byte hole, try to use */ u32 target; __be64 tun_id; struct fib_rule __rcu *ctarget; diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 2b642bf9b5a0..925539172d5b 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -23,8 +23,8 @@ struct fib_rule_hdr { __u8 tos; __u8 table; + __u8 proto; __u8 res1; /* reserved */ - __u8 res2; /* reserved */ __u8 action; __u32 flags; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 76c3f602ee15..64258939fc09 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -51,6 +51,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->pref = pref; r->table = table; r->flags = flags; + r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; @@ -463,6 +464,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, } refcount_set(&rule->refcnt, 1); rule->fr_net = net; + rule->proto = frh->proto; rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) : fib_default_rule_pref(ops); @@ -662,6 +664,9 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } list_for_each_entry(rule, &ops->rules_list, list) { + if (frh->proto && (frh->proto != rule->proto)) + continue; + if (frh->action && (frh->action != rule->action)) continue; @@ -805,9 +810,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; - frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; + frh->proto = rule->proto; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) From 6fb1545418d58709e0e2242971ff312060e6366c Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Fri, 23 Feb 2018 14:01:52 -0500 Subject: [PATCH 0343/1640] UPSTREAM: net: fib_rules: Add new attribute to set protocol For ages iproute2 has used `struct rtmsg` as the ancillary header for FIB rules and in the process set the protocol value to RTPROT_BOOT. Until cac56209a66 ("net: Allow a rule to track originating protocol") the kernel rules code ignored the protocol value sent from userspace and always returned 0 in notifications. To avoid incompatibility with existing iproute2, send the protocol as a new attribute. Fixes: cac56209a66 ("net: Allow a rule to track originating protocol") Signed-off-by: Donald Sharp Signed-off-by: David S.
Miller --- drivers/net/vrf.c | 5 ++++- include/net/fib_rules.h | 3 ++- include/uapi/linux/fib_rules.h | 5 +++-- net/core/fib_rules.c | 15 +++++++++++---- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 73838f341124..704eabb05dd1 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1199,6 +1199,7 @@ static inline size_t vrf_fib_rule_nl_size(void) sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)); sz += nla_total_size(sizeof(u8)); /* FRA_L3MDEV */ sz += nla_total_size(sizeof(u32)); /* FRA_PRIORITY */ + sz += nla_total_size(sizeof(u8)); /* FRA_PROTOCOL */ return sz; } @@ -1228,7 +1229,9 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) memset(frh, 0, sizeof(*frh)); frh->family = family; frh->action = FR_ACT_TO_TBL; - frh->proto = RTPROT_KERNEL; + + if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL)) + goto nla_put_failure; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 2d184231fb83..8c95a592a910 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -110,7 +110,8 @@ struct fib_rule_notifier_info { [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 }, \ [FRA_L3MDEV] = { .type = NLA_U8 }, \ - [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) } + [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, \ + [FRA_PROTOCOL] = { .type = NLA_U8 } static inline void fib_rule_get(struct fib_rule *rule) { diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 925539172d5b..77d90ae38114 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -23,8 +23,8 @@ struct fib_rule_hdr { __u8 tos; __u8 table; - __u8 proto; - __u8 res1; /* reserved */ + __u8 res1; /* reserved */ + __u8 res2; /* reserved */ __u8 action; __u32 flags; @@ -58,6 +58,7 @@ enum { FRA_PAD, FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ FRA_UID_RANGE, /* UID range */ + FRA_PROTOCOL, /* Originator of the rule */ __FRA_MAX }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 64258939fc09..538c0403d68b 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -464,11 +464,13 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, } refcount_set(&rule->refcnt, 1); rule->fr_net = net; - rule->proto = frh->proto; rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) : fib_default_rule_pref(ops); + rule->proto = tb[FRA_PROTOCOL] ? 
+ nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; + if (tb[FRA_IIFNAME]) { struct net_device *dev; @@ -664,7 +666,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } list_for_each_entry(rule, &ops->rules_list, list) { - if (frh->proto && (frh->proto != rule->proto)) + if (tb[FRA_PROTOCOL] && + (rule->proto != nla_get_u8(tb[FRA_PROTOCOL]))) continue; if (frh->action && (frh->action != rule->action)) @@ -783,7 +786,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ - + nla_total_size(sizeof(struct fib_kuid_range)); + + nla_total_size(sizeof(struct fib_kuid_range)) + + nla_total_size(1); /* FRA_PROTOCOL */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -810,9 +814,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; + frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; - frh->proto = rule->proto; + + if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) + goto nla_put_failure; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) From eab9cc88ddb2eda23cd843a7b91af387b3fc0ea0 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 28 Feb 2018 22:40:16 -0500 Subject: [PATCH 0344/1640] UPSTREAM: net: fib_rules: support for match on ip_proto, sport and dport uapi for ip_proto, sport and dport range match in fib rules. Signed-off-by: Roopa Prabhu Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/fib_rules.h | 36 ++++++++++++- include/uapi/linux/fib_rules.h | 8 +++ net/core/fib_rules.c | 92 +++++++++++++++++++++++++++++++++- 3 files changed, 133 insertions(+), 3 deletions(-) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 8c95a592a910..a302a839a830 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -27,7 +27,7 @@ struct fib_rule { u8 action; u8 l3mdev; u8 proto; - /* 1 byte hole, try to use */ + u8 ip_proto; u32 target; __be64 tun_id; struct fib_rule __rcu *ctarget; @@ -40,6 +40,8 @@ struct fib_rule { char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; struct fib_kuid_range uid_range; + struct fib_rule_port_range sport_range; + struct fib_rule_port_range dport_range; struct rcu_head rcu; }; @@ -145,6 +147,38 @@ static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla) return frh->table; } +static inline bool fib_rule_port_range_set(const struct fib_rule_port_range *range) +{ + return range->start != 0 && range->end != 0; +} + +static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a, + __be16 port) +{ + return ntohs(port) >= a->start && + ntohs(port) <= a->end; +} + +static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a) +{ + return a->start != 0 && a->end != 0 && a->end < 0xffff && + a->start <= a->end; +} + +static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a, + struct fib_rule_port_range *b) +{ + return a->start == b->start && + a->end == b->end; +} + +static inline bool fib_rule_requires_fldissect(struct fib_rule *rule) +{ + return rule->ip_proto || + fib_rule_port_range_set(&rule->sport_range) || + fib_rule_port_range_set(&rule->dport_range); +} + struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, struct net *); void fib_rules_unregister(struct fib_rules_ops *); diff --git 
a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 77d90ae38114..232df14e1287 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -35,6 +35,11 @@ struct fib_rule_uid_range { __u32 end; }; +struct fib_rule_port_range { + __u16 start; + __u16 end; +}; + enum { FRA_UNSPEC, FRA_DST, /* destination address */ @@ -59,6 +64,9 @@ enum { FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ FRA_UID_RANGE, /* UID range */ FRA_PROTOCOL, /* Originator of the rule */ + FRA_IP_PROTO, /* ip proto */ + FRA_SPORT_RANGE, /* sport */ + FRA_DPORT_RANGE, /* dport */ __FRA_MAX }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 538c0403d68b..c2804c660207 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -33,6 +33,10 @@ bool fib_rule_matchall(const struct fib_rule *rule) if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) return false; + if (fib_rule_port_range_set(&rule->sport_range)) + return false; + if (fib_rule_port_range_set(&rule->dport_range)) + return false; return true; } EXPORT_SYMBOL_GPL(fib_rule_matchall); @@ -221,6 +225,26 @@ static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); } +static int nla_get_port_range(struct nlattr *pattr, + struct fib_rule_port_range *port_range) +{ + const struct fib_rule_port_range *pr = nla_data(pattr); + + if (!fib_rule_port_range_valid(pr)) + return -EINVAL; + + port_range->start = pr->start; + port_range->end = pr->end; + + return 0; +} + +static int nla_put_port_range(struct sk_buff *skb, int attrtype, + struct fib_rule_port_range *range) +{ + return nla_put(skb, attrtype, sizeof(*range), range); +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) @@ -423,6 +447,17 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, !uid_eq(r->uid_range.end, rule->uid_range.end)) continue; + if (r->ip_proto != rule->ip_proto) + continue; + + if (!fib_rule_port_range_compare(&r->sport_range, + &rule->sport_range)) + continue; + + if (!fib_rule_port_range_compare(&r->dport_range, + &rule->dport_range)) + continue; + if (!ops->compare(r, frh, tb)) continue; return 1; @@ -567,6 +602,23 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, rule->uid_range = fib_kuid_range_unset; } + if (tb[FRA_IP_PROTO]) + rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); + + if (tb[FRA_SPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_SPORT_RANGE], + &rule->sport_range); + if (err) + goto errout_free; + } + + if (tb[FRA_DPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_DPORT_RANGE], + &rule->dport_range); + if (err) + goto errout_free; + } + if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; @@ -632,6 +684,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rule_port_range sprange = {0, 0}; + struct fib_rule_port_range dprange = {0, 0}; struct fib_rules_ops *ops = NULL; struct fib_rule *rule, *r; struct nlattr *tb[FRA_MAX+1]; @@ -665,6 +719,20 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, range = fib_kuid_range_unset; } + if (tb[FRA_SPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_SPORT_RANGE], + &sprange); + if (err) + goto errout; + } + + if (tb[FRA_DPORT_RANGE]) { + err = 
nla_get_port_range(tb[FRA_DPORT_RANGE], + &dprange); + if (err) + goto errout; + } + list_for_each_entry(rule, &ops->rules_list, list) { if (tb[FRA_PROTOCOL] && (rule->proto != nla_get_u8(tb[FRA_PROTOCOL]))) @@ -710,6 +778,18 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, !uid_eq(rule->uid_range.end, range.end))) continue; + if (tb[FRA_IP_PROTO] && + (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO]))) + continue; + + if (fib_rule_port_range_set(&sprange) && + !fib_rule_port_range_compare(&rule->sport_range, &sprange)) + continue; + + if (fib_rule_port_range_set(&dprange) && + !fib_rule_port_range_compare(&rule->dport_range, &dprange)) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -787,7 +867,10 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ + nla_total_size(sizeof(struct fib_kuid_range)) - + nla_total_size(1); /* FRA_PROTOCOL */ + + nla_total_size(1) /* FRA_PROTOCOL */ + + nla_total_size(1) /* FRA_IP_PROTO */ + + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -852,7 +935,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || (uid_range_set(&rule->uid_range) && - nla_put_uid_range(skb, &rule->uid_range))) + nla_put_uid_range(skb, &rule->uid_range)) || + (fib_rule_port_range_set(&rule->sport_range) && + nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || + (fib_rule_port_range_set(&rule->dport_range) && + nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || + (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { From 8bae520f14df4e8eba2de4d449a6a60383517317 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 28 Feb 2018 22:43:22 -0500 Subject: [PATCH 0345/1640] BACKPORT: ipv6: route: dissect flow in input path if fib rules need it Dissect flow in fwd path if fib rules require it. Controlled by a flag to avoid penalty for the common case. Flag is set when fib rules with sport, dport and proto match that require flow dissect are installed. Also passes the dissected hash keys to the multipath hash function when applicable to avoid dissecting the flow again. ICMP packets will continue to use inner header for hash calculations. Signed-off-by: Roopa Prabhu Acked-by: Paolo Abeni Acked-by: Nikolay Aleksandrov Signed-off-by: David S.
Miller --- include/net/ip6_fib.h | 25 +++++++++++++++++++++++++ include/net/ip6_route.h | 4 +++- include/net/netns/ipv6.h | 3 ++- net/ipv6/fib6_rules.c | 16 ++++++++++++++++ net/ipv6/icmp.c | 2 +- net/ipv6/route.c | 34 +++++++++++++++++++++++++--------- 6 files changed, 72 insertions(+), 12 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 982cb2ceb22d..e3f078b4bb99 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -417,6 +417,24 @@ void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); int fib6_rules_dump(struct net *net, struct notifier_block *nb); unsigned int fib6_rules_seq_read(struct net *net); + +static inline bool fib6_rules_early_flow_dissect(struct net *net, + struct sk_buff *skb, + struct flowi6 *fl6, + struct flow_keys *flkeys) +{ + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; + + if (!net->ipv6.fib6_rules_require_fldissect) + return false; + + skb_flow_dissect_flow_keys(skb, flkeys, flag); + fl6->fl6_sport = flkeys->ports.src; + fl6->fl6_dport = flkeys->ports.dst; + fl6->flowi6_proto = flkeys->basic.ip_proto; + + return true; +} #else static inline int fib6_rules_init(void) { @@ -438,5 +456,12 @@ static inline unsigned int fib6_rules_seq_read(struct net *net) { return 0; } +static inline bool fib6_rules_early_flow_dissect(struct net *net, + struct sk_buff *skb, + struct flowi6 *fl6, + struct flow_keys *flkeys) +{ + return false; +} #endif #endif diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index c6451d40cba1..787086e19161 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -127,7 +127,8 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, int flags); -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb); +u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, + struct flow_keys *hkeys); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); @@ -267,4 +268,5 @@ static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) ipv6_addr_equal(&a->rt6i_gateway, &b->rt6i_gateway) && !lwtunnel_cmp_encap(a->dst.lwtstate, b->dst.lwtstate); } + #endif diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 290ca18589ee..2e020868ae67 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -67,7 +67,8 @@ struct netns_ipv6 { atomic_t ip6_rt_gc_expire; unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES - bool fib6_has_custom_rules; + unsigned int fib6_rules_require_fldissect; + bool fib6_has_custom_rules; struct rt6_info *ip6_prohibit_entry; struct rt6_info *ip6_blk_hole_entry; struct fib6_table *fib6_local_tbl; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 1e19fdbbd2bf..8528fd12452b 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -272,12 +272,26 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule6->dst.plen = frh->dst_len; rule6->tclass = frh->tos; + if (fib_rule_requires_fldissect(rule)) + net->ipv6.fib6_rules_require_fldissect++; + net->ipv6.fib6_has_custom_rules = true; err = 0; errout: return err; } +static int fib6_rule_delete(struct fib_rule *rule) +{ + struct net *net = rule->fr_net; + + if (net->ipv6.fib6_rules_require_fldissect && + fib_rule_requires_fldissect(rule)) + net->ipv6.fib6_rules_require_fldissect--; + + return 0; +} + static 
int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { @@ -342,6 +356,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .match = fib6_rule_match, .suppress = fib6_rule_suppress, .configure = fib6_rule_configure, + .delete = fib6_rule_delete, .compare = fib6_rule_compare, .fill = fib6_rule_fill, .nlmsg_payload = fib6_rule_nlmsg_payload, @@ -370,6 +385,7 @@ static int __net_init fib6_rules_net_init(struct net *net) goto out_fib6_rules_ops; net->ipv6.fib6_rules_ops = ops; + net->ipv6.fib6_rules_require_fldissect = 0; out: return err; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index d47dbe421922..463d1cfa2475 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -527,7 +527,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); - fl6.mp_hash = rt6_multipath_hash(&fl6, skb); + fl6.mp_hash = rt6_multipath_hash(&fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index be366bb9536b..c6ff9fc83ffa 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -473,7 +473,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, * case it will always be non-zero. Otherwise now is the time to do it. */ if (!fl6->mp_hash) - fl6->mp_hash = rt6_multipath_hash(fl6, NULL); + fl6->mp_hash = rt6_multipath_hash(fl6, NULL, NULL); if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) return match; @@ -1800,10 +1800,12 @@ struct dst_entry *ip6_route_input_lookup(struct net *net, EXPORT_SYMBOL_GPL(ip6_route_input_lookup); static void ip6_multipath_l3_keys(const struct sk_buff *skb, - struct flow_keys *keys) + struct flow_keys *keys, + struct flow_keys *flkeys) { const struct ipv6hdr *outer_iph = ipv6_hdr(skb); const struct ipv6hdr *key_iph = outer_iph; + struct flow_keys *_flkeys = flkeys; const struct ipv6hdr *inner_iph; const struct icmp6hdr *icmph; struct ipv6hdr _inner_iph; @@ -1830,22 +1832,31 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb, goto out; key_iph = inner_iph; + _flkeys = NULL; out: memset(keys, 0, sizeof(*keys)); keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - keys->addrs.v6addrs.src = key_iph->saddr; - keys->addrs.v6addrs.dst = key_iph->daddr; - keys->tags.flow_label = ip6_flowlabel(key_iph); - keys->basic.ip_proto = key_iph->nexthdr; + if (_flkeys) { + keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; + keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; + keys->tags.flow_label = _flkeys->tags.flow_label; + keys->basic.ip_proto = _flkeys->basic.ip_proto; + } else { + keys->addrs.v6addrs.src = key_iph->saddr; + keys->addrs.v6addrs.dst = key_iph->daddr; + keys->tags.flow_label = ip6_flowlabel(key_iph); + keys->basic.ip_proto = key_iph->nexthdr; + } } /* if skb is set it will be used and fl6 can be NULL */ -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb) +u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, + struct flow_keys *flkeys) { struct flow_keys hash_keys; if (skb) { - ip6_multipath_l3_keys(skb, &hash_keys); + ip6_multipath_l3_keys(skb, &hash_keys, flkeys); return flow_hash_from_keys(&hash_keys) >> 1; } @@ -1866,12 +1877,17 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, }; + struct flow_keys *flkeys = NULL, _flkeys; tun_info = skb_tunnel_info(skb); if 
(tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; + + if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) + flkeys = &_flkeys; + if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) - fl6.mp_hash = rt6_multipath_hash(&fl6, skb); + fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys); skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } From a141d150daba1df8a9408e564d0798bfe6ca00a5 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 1 Mar 2018 17:55:37 -0800 Subject: [PATCH 0346/1640] UPSTREAM: fib_rules: FRA_GENERIC_POLICY updates for ip proto, sport and dport attrs Fixes: bfff4862653b ("net: fib_rules: support for match on ip_proto, sport and dport") Reported-by: Eric Dumazet Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/fib_rules.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index a302a839a830..8be473d3227f 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -113,7 +113,11 @@ struct fib_rule_notifier_info { [FRA_GOTO] = { .type = NLA_U32 }, \ [FRA_L3MDEV] = { .type = NLA_U8 }, \ [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, \ - [FRA_PROTOCOL] = { .type = NLA_U8 } + [FRA_PROTOCOL] = { .type = NLA_U8 }, \ + [FRA_IP_PROTO] = { .type = NLA_U8 }, \ + [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, \ + [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) } + static inline void fib_rule_get(struct fib_rule *rule) { From e2e9682acbb92e3e2e3692af19bc4b894c15b334 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 25 Jan 2018 16:55:07 -0800 Subject: [PATCH 0347/1640] UPSTREAM: net/ipv6: Move gateway validation into helper Move existing code to validate nexthop into a helper. Follow on patch adds support for nexthops marked with onlink, and this helper keeps the complexity of ip6_route_info_create in check. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 85 ++++++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c6ff9fc83ffa..ec327e761433 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2517,6 +2517,54 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, return rt; } +static int ip6_route_check_nh(struct net *net, + struct fib6_config *cfg, + struct net_device **_dev, + struct inet6_dev **idev) +{ + const struct in6_addr *gw_addr = &cfg->fc_gateway; + struct net_device *dev = _dev ? 
*_dev : NULL; + struct rt6_info *grt = NULL; + int err = -EHOSTUNREACH; + + if (cfg->fc_table) { + grt = ip6_nh_lookup_table(net, cfg, gw_addr); + if (grt) { + if (grt->rt6i_flags & RTF_GATEWAY || + (dev && dev != grt->dst.dev)) { + ip6_rt_put(grt); + grt = NULL; + } + } + } + + if (!grt) + grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + + if (!grt) + goto out; + + if (dev) { + if (dev != grt->dst.dev) { + ip6_rt_put(grt); + goto out; + } + } else { + *_dev = dev = grt->dst.dev; + *idev = grt->rt6i_idev; + dev_hold(dev); + in6_dev_hold(grt->rt6i_idev); + } + + if (!(grt->rt6i_flags & RTF_GATEWAY)) + err = 0; + + ip6_rt_put(grt); + +out: + return err; +} + static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, struct netlink_ext_ack *extack) { @@ -2706,8 +2754,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, rt->rt6i_gateway = *gw_addr; if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { - struct rt6_info *grt = NULL; - /* IPv6 strictly inhibits using not link-local addresses as nexthop address. Otherwise, router will not able to send redirects. @@ -2724,40 +2770,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } - if (cfg->fc_table) { - grt = ip6_nh_lookup_table(net, cfg, gw_addr); - - if (grt) { - if (grt->rt6i_flags & RTF_GATEWAY || - (dev && dev != grt->dst.dev)) { - ip6_rt_put(grt); - grt = NULL; - } - } - } - - if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, - cfg->fc_ifindex, 1); - - err = -EHOSTUNREACH; - if (!grt) - goto out; - if (dev) { - if (dev != grt->dst.dev) { - ip6_rt_put(grt); - goto out; - } - } else { - dev = grt->dst.dev; - idev = grt->rt6i_idev; - dev_hold(dev); - in6_dev_hold(grt->rt6i_idev); - } - if (!(grt->rt6i_flags & RTF_GATEWAY)) - err = 0; - ip6_rt_put(grt); - + err = ip6_route_check_nh(net, cfg, &dev, &idev); if (err) goto out; } From 8d057ae5258a3c1020d6f74ff8107ee88fc5ef9c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 25 Jan 2018 16:55:08 -0800 Subject: [PATCH 0348/1640] UPSTREAM: net/ipv6: Add flags and table id to ip6_nh_lookup_table onlink verification needs to do a lookup in a potentially different table than the table in fib6_config and without the RT6_LOOKUP_F_IFACE flag. Change ip6_nh_lookup_table to take table id and flags as input arguments. Both verifications want to ignore link state, so that flag can stay in the lookup helper. Signed-off-by: David Ahern Signed-off-by: David S. Miller
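[Editor's note: a sketch of the two call shapes this refactor enables, assuming the signature shown in the diff below; the onlink caller is only described by the commit message here and lands in a later patch:

	/* Regular gateway validation: restrict the lookup to the
	 * egress interface and the table named in the route request. */
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, cfg->fc_table,
				  RT6_LOOKUP_F_IFACE);

	/* Onlink validation (later patch, hypothetical tbid variable):
	 * possibly a different table and no RT6_LOOKUP_F_IFACE. */
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);

Since both callers want to ignore link state, RT6_LOOKUP_F_IGNORE_LINKSTATE is OR'd in by the helper itself.]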
--- net/ipv6/route.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ec327e761433..97a034dda27c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2488,7 +2488,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc, static struct rt6_info *ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, - const struct in6_addr *gw_addr) + const struct in6_addr *gw_addr, + u32 tbid, int flags) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, @@ -2497,15 +2498,15 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, }; struct fib6_table *table; struct rt6_info *rt; - int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE; - table = fib6_get_table(net, cfg->fc_table); + table = fib6_get_table(net, tbid); if (!table) return NULL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; + flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); /* if table lookup failed, fall back to full lookup */ @@ -2528,7 +2529,10 @@ static int ip6_route_check_nh(struct net *net, int err = -EHOSTUNREACH; if (cfg->fc_table) { - grt = ip6_nh_lookup_table(net, cfg, gw_addr); + int flags = RT6_LOOKUP_F_IFACE; + + grt = ip6_nh_lookup_table(net, cfg, gw_addr, + cfg->fc_table, flags); + if (grt) { if (grt->rt6i_flags & RTF_GATEWAY || (dev && dev != grt->dst.dev)) { From 0ff879c3d46d8e3202ba78737d80352f54de9a70 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Tue, 19 Dec 2017 16:59:21 +0300 Subject: [PATCH 0349/1640] UPSTREAM: ip6_vti: adjust vti mtu according to mtu of lower device LTP/udp6_ipsec_vti tests fail when sending large UDP datagrams over ip6_vti that require fragmentation and the underlying device has an MTU smaller than 1500 plus some extra space for headers. This happens because ip6_vti, by default, sets MTU to ETH_DATA_LEN and does not update it depending on a destination address or link parameter. Further attempts to send UDP packets may succeed because pmtu gets updated on ICMPV6_PKT_TOOBIG in vti6_err(). In case the lower device has a larger MTU, e.g. 9000, ip6_vti works but does not use the maximum possible size; output packets are limited to 1500. The above cases require manual MTU setup after ip6_vti creation. However, ip_vti already updates MTU based on lower device with ip_tunnel_bind_dev(). Here is an example when the lower device MTU is set to 9000: # ip a sh ltp_ns_veth2 ltp_ns_veth2@if7: mtu 9000 ... inet 10.0.0.2/24 scope global ltp_ns_veth2 inet6 fd00::2/64 scope global # ip li add vti6 type vti6 local fd00::2 remote fd00::1 # ip li show vti6 vti6@NONE: mtu 1500 ... link/tunnel6 fd00::2 peer fd00::1 After the patch: # ip li add vti6 type vti6 local fd00::2 remote fd00::1 # ip li show vti6 vti6@NONE: mtu 8832 ... link/tunnel6 fd00::2 peer fd00::1 Reported-by: Petr Vorel Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller
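[Editor's note: the "mtu 8832" in the example above follows directly from the formula in the diff below, dev->mtu = max_t(int, tdev->mtu - dev->hard_header_len, IPV6_MIN_MTU). With a lower-device MTU of 9000 and ip6_vti's hard_header_len of 168 bytes (assuming LL_MAX_HEADER is 128 on this configuration, plus the 40-byte outer IPv6 header), 9000 - 168 = 8832. IPV6_MIN_MTU (1280) acts as the floor when the lower device is small.]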
--- net/ipv6/ip6_vti.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 534dac0e4385..40c35dc963c9 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -657,6 +657,7 @@ static void vti6_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; + struct net_device *tdev = NULL; memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); @@ -669,6 +670,25 @@ static void vti6_link_config(struct ip6_tnl *t) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; + + if (p->flags & IP6_TNL_F_CAP_XMIT) { + int strict = (ipv6_addr_type(&p->raddr) & + (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); + struct rt6_info *rt = rt6_lookup(t->net, + &p->raddr, &p->laddr, + p->link, strict); + + if (rt) + tdev = rt->dst.dev; + ip6_rt_put(rt); + } + + if (!tdev && p->link) + tdev = __dev_get_by_index(t->net, p->link); + + if (tdev) + dev->mtu = max_t(int, tdev->mtu - dev->hard_header_len, + IPV6_MIN_MTU); } /** From d8c9c39c95870055fe6c40e3998c0823138ddd1f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:17 -0800 Subject: [PATCH 0350/1640] BACKPORT: net/ipv6: Pass skb to route lookup IPv6 does path selection for multipath routes deep in the lookup functions. The next patch adds an L4 hash option and needs the skb for the forward path. To get the skb to the relevant FIB lookup functions it needs to go through the fib rules layer, so add a lookup_data argument to the fib_lookup_arg struct. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- drivers/infiniband/core/cma.c | 2 +- drivers/net/ipvlan/ipvlan_core.c | 3 +- drivers/net/vrf.c | 7 +-- include/net/fib_rules.h | 1 + include/net/ip6_fib.h | 4 +- include/net/ip6_route.h | 11 +++-- net/ipv6/anycast.c | 2 +- net/ipv6/fib6_rules.c | 8 ++-- net/ipv6/icmp.c | 3 +- net/ipv6/ip6_fib.c | 3 +- net/ipv6/ip6_gre.c | 2 +- net/ipv6/ip6_tunnel.c | 4 +- net/ipv6/ip6_vti.c | 2 +- net/ipv6/mcast.c | 4 +- net/ipv6/netfilter/ip6t_rpfilter.c | 2 +- net/ipv6/netfilter/nft_fib_ipv6.c | 3 +- net/ipv6/route.c | 72 ++++++++++++++++++------------ net/ipv6/seg6_local.c | 4 +- 18 files changed, 83 insertions(+), 54 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index dd00530675d0..f5aa1d65ff4d 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1330,7 +1330,7 @@ static bool validate_ipv6_net_dev(struct net_device *net_dev, IPV6_ADDR_LINKLOCAL; struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, &src_addr->sin6_addr, net_dev->ifindex, - strict); + NULL, strict); bool ret; if (!rt) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index ab9beca09ca5..df8c6d0dae1f 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -776,7 +776,8 @@ struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, }; skb_dst_drop(skb); - dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, flags); + dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, + skb, flags); skb_dst_set(skb, dst); break; } diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 704eabb05dd1..1b5cce954072 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -995,6 +995,7 @@ static struct rt6_info *vrf_ip6_route_lookup(struct net *net, const struct net_device *dev, struct
flowi6 *fl6, int ifindex, + const struct sk_buff *skb, int flags) { struct net_vrf *vrf = netdev_priv(dev); @@ -1013,7 +1014,7 @@ static struct rt6_info *vrf_ip6_route_lookup(struct net *net, if (!table) return NULL; - return ip6_pol_route(net, table, ifindex, fl6, flags); + return ip6_pol_route(net, table, ifindex, fl6, skb, flags); } static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, @@ -1031,7 +1032,7 @@ static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; - rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, + rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb, RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); if (unlikely(!rt6)) return; @@ -1164,7 +1165,7 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, if (!ipv6_addr_any(&fl6->saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); + rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags); if (rt) dst = &rt->dst; diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 8be473d3227f..c836289e68b8 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -47,6 +47,7 @@ struct fib_rule { struct fib_lookup_arg { void *lookup_ptr; + const void *lookup_data; void *result; struct fib_rule *rule; u32 table; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index e3f078b4bb99..03d13d2e9432 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -352,7 +352,8 @@ struct fib6_table { typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_table *, - struct flowi6 *, int); + struct flowi6 *, + const struct sk_buff *, int); struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ @@ -366,6 +367,7 @@ struct fib6_entry_notifier_info { struct fib6_table *fib6_get_table(struct net *net, u32 id); struct fib6_table *fib6_new_table(struct net *net, u32 id); struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + const struct sk_buff *skb, int flags, pol_lookup_t lookup); struct fib6_node *fib6_lookup(struct fib6_node *root, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 787086e19161..bc4f89fc8cc0 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -75,7 +75,8 @@ static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, - struct flowi6 *fl6, int flags); + struct flowi6 *fl6, + const struct sk_buff *skb, int flags); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); @@ -88,9 +89,10 @@ static inline struct dst_entry *ip6_route_output(struct net *net, } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, - int flags); + const struct sk_buff *skb, int flags); struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, - int ifindex, struct flowi6 *fl6, int flags); + int ifindex, struct flowi6 *fl6, + const struct sk_buff *skb, int flags); void ip6_route_init_special_entries(void); int ip6_route_init(void); @@ -126,7 +128,8 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, } struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, - const struct in6_addr *saddr, int oif, int flags); + const struct in6_addr *saddr, int oif, + const 
struct sk_buff *skb, int flags); u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *hkeys); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 4d8b3d1d530b..eb9c0faae5b5 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -78,7 +78,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (ifindex == 0) { struct rt6_info *rt; - rt = rt6_lookup(net, addr, NULL, 0, 0); + rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 8528fd12452b..da08885a2d32 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -61,11 +61,13 @@ unsigned int fib6_rules_seq_read(struct net *net) } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + const struct sk_buff *skb, int flags, pol_lookup_t lookup) { if (net->ipv6.fib6_has_custom_rules) { struct fib_lookup_arg arg = { .lookup_ptr = lookup, + .lookup_data = skb, .flags = FIB_LOOKUP_NOREF, }; @@ -80,11 +82,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, } else { struct rt6_info *rt; - rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, flags); + rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put(rt); - rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put(rt); @@ -155,7 +157,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } - rt = lookup(net, table, flp6, flags); + rt = lookup(net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { struct inet6_dev *idev = ip6_dst_idev(&rt->dst); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 463d1cfa2475..bf9539e328ad 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -636,7 +636,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, skb_pull(skb2, nhs); skb_reset_network_header(skb2); - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0); + rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, + skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0cb666eaf283..3dfdc54eefb8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -299,11 +299,12 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; - rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put(rt); rt = net->ipv6.ip6_null_entry; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4466b8a5e2b4..4e99bd77e0be 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -975,7 +975,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, - p->link, strict); + p->link, NULL, strict); if (!rt) return; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 0be6cb160345..bf992c0d05ad 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -691,7 +691,7 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Try to guess incoming interface */ 
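/* Illustrative aside, not part of the upstream hunk: rt6_lookup() now
 * takes the skb so multipath selection can hash the packet headers.
 * Callers without packet context keep the old behaviour by passing
 * NULL for the new argument; a hypothetical lookup with no skb (net
 * and daddr stand in for the caller's own variables) would read:
 *
 *	rt = rt6_lookup(net, &daddr, NULL, 0, NULL, 0);
 */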
rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, - NULL, 0, 0); + NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; @@ -1464,7 +1464,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, - p->link, strict); + p->link, NULL, strict); if (!rt) return; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 40c35dc963c9..199067817e79 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -676,7 +676,7 @@ static void vti6_link_config(struct ip6_tnl *t) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, - p->link, strict); + p->link, NULL, strict); if (rt) tdev = rt->dst.dev; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index f3a291a9b2f8..9eb33abe83b9 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -165,7 +165,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (ifindex == 0) { struct rt6_info *rt; - rt = rt6_lookup(net, addr, NULL, 0, 0); + rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); @@ -254,7 +254,7 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, struct inet6_dev *idev = NULL; if (ifindex == 0) { - struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, 0); + struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index 2a7c8f864fa7..5262d4e68dec 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -63,7 +63,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, (flags & XT_RPFILTER_LOOSE) == 0) fl6.flowi6_oif = dev->ifindex; - rt = (void *) ip6_route_lookup(net, &fl6, lookup_flags); + rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags); if (rt->dst.error) goto out; diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index bcc673e433e6..bb5eafc282ac 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -185,7 +185,8 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, } *dest = 0; - rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, lookup_flags); + rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb, + lookup_flags); if (rt->dst.error) goto put_rt_err; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 97a034dda27c..2b7c2ff56161 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -465,6 +465,7 @@ static bool rt6_check_expired(const struct rt6_info *rt) static struct rt6_info *rt6_multipath_select(struct rt6_info *match, struct flowi6 *fl6, int oif, + const struct sk_buff *skb, int strict) { struct rt6_info *sibling, *next_sibling; @@ -473,7 +474,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, * case it will always be non-zero. Otherwise now is the time to do it. 
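 * (For illustration: mp_hash is a 31-bit value and, per the
 * rt6_upper_bound_set() math later in this series, each sibling
 * advertises an upper bound of DIV_ROUND_CLOSEST_ULL((u64)weight << 31,
 * total) - 1, so the comparison below picks nexthops in proportion to
 * their weights.)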
*/ if (!fl6->mp_hash) - fl6->mp_hash = rt6_multipath_hash(fl6, NULL, NULL); + fl6->mp_hash = rt6_multipath_hash(fl6, skb, NULL); if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) return match; @@ -927,7 +928,9 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, - struct flowi6 *fl6, int flags) + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { struct rt6_info *rt, *rt_cache; struct fib6_node *fn; @@ -945,8 +948,8 @@ restart: rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, - fl6->flowi6_oif, flags); + rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, + skb, flags); } if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); @@ -970,14 +973,15 @@ restart: } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, - int flags) + const struct sk_buff *skb, int flags) { - return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup); + return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); } EXPORT_SYMBOL_GPL(ip6_route_lookup); struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, - const struct in6_addr *saddr, int oif, int strict) + const struct in6_addr *saddr, int oif, + const struct sk_buff *skb, int strict) { struct flowi6 fl6 = { .flowi6_oif = oif, @@ -991,7 +995,7 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, flags |= RT6_LOOKUP_F_HAS_SADDR; } - dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup); + dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); if (dst->error == 0) return (struct rt6_info *) dst; @@ -1655,7 +1659,8 @@ void rt6_age_exceptions(struct rt6_info *rt, } struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, int flags) + int oif, struct flowi6 *fl6, + const struct sk_buff *skb, int flags) { struct fib6_node *fn, *saved_fn; struct rt6_info *rt, *rt_cache; @@ -1677,7 +1682,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) - rt = rt6_multipath_select(rt, fl6, oif, strict); + rt = rt6_multipath_select(rt, fl6, oif, skb, strict); if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -1782,20 +1787,25 @@ uncached_rt_out: } EXPORT_SYMBOL_GPL(ip6_pol_route); -static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, - struct flowi6 *fl6, int flags) +static struct rt6_info *ip6_pol_route_input(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { - return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); + return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); } struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, - struct flowi6 *fl6, int flags) + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; - return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); + return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); } EXPORT_SYMBOL_GPL(ip6_route_input_lookup); @@ -1889,13 +1899,17 @@ void ip6_route_input(struct sk_buff *skb) if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 
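	/* Hedged sketch, not upstream code: with the skb available here,
	 * an L4 multipath policy could derive the hash from the transport
	 * header via the flow dissector, roughly:
	 *
	 *	struct flow_keys keys;
	 *
	 *	skb_flow_dissect_flow_keys(skb, &keys, 0);
	 *	fl6.mp_hash = flow_hash_from_keys(&keys);
	 */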
fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys); skb_dst_drop(skb); - skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); + skb_dst_set(skb, + ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); } -static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, - struct flowi6 *fl6, int flags) +static struct rt6_info *ip6_pol_route_output(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { - return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); + return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, @@ -1923,7 +1937,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, else if (sk) flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); - return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); + return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } EXPORT_SYMBOL_GPL(ip6_route_output_flags); @@ -2180,6 +2194,7 @@ struct ip6rd_flowi { static struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, + const struct sk_buff *skb, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; @@ -2253,8 +2268,9 @@ out: }; static struct dst_entry *ip6_route_redirect(struct net *net, - const struct flowi6 *fl6, - const struct in6_addr *gateway) + const struct flowi6 *fl6, + const struct sk_buff *skb, + const struct in6_addr *gateway) { int flags = RT6_LOOKUP_F_HAS_SADDR; struct ip6rd_flowi rdfl; @@ -2262,7 +2278,7 @@ static struct dst_entry *ip6_route_redirect(struct net *net, rdfl.fl6 = *fl6; rdfl.gateway = *gateway; - return fib6_rule_lookup(net, &rdfl.fl6, + return fib6_rule_lookup(net, &rdfl.fl6, skb, flags, __ip6_route_redirect); } @@ -2282,7 +2298,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, fl6.flowlabel = ip6_flowinfo(iph); fl6.flowi6_uid = uid; - dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); + dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } @@ -2304,7 +2320,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, fl6.saddr = iph->daddr; fl6.flowi6_uid = sock_net_uid(net, NULL); - dst = ip6_route_redirect(net, &fl6, &iph->saddr); + dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } @@ -2507,7 +2523,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; - rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); + rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); /* if table lookup failed, fall back to full lookup */ if (rt == net->ipv6.ip6_null_entry) { @@ -2543,7 +2559,7 @@ static int ip6_route_check_nh(struct net *net, } if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); if (!grt) goto out; @@ -4586,7 +4602,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - dst = ip6_route_input_lookup(net, dev, &fl6, flags); + dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); rcu_read_unlock(); } else { diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 8f8ea7a76b99..f702c39728ab 100644 --- 
a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -163,7 +163,7 @@ static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; if (!tbl_id) { - dst = ip6_route_input_lookup(net, skb->dev, &fl6, flags); + dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags); } else { struct fib6_table *table; @@ -171,7 +171,7 @@ static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, if (!table) goto out; - rt = ip6_pol_route(net, table, 0, &fl6, flags); + rt = ip6_pol_route(net, table, 0, &fl6, skb, flags); dst = &rt->dst; } From 0ddc2e5fbbfc6ecb8b55277dd239c0803a1405de Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:10 -0700 Subject: [PATCH 0351/1640] UPSTREAM: net/ipv6: Pass net to fib6_update_sernum Pass net namespace to fib6_update_sernum. It can not be marked const as fib6_new_sernum will change ipv6.fib6_sernum. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 +- net/ipv6/ip6_fib.c | 3 +-- net/ipv6/route.c | 10 +++++----- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 03d13d2e9432..59f0c14b2e15 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -410,7 +410,7 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); -void fib6_update_sernum(struct rt6_info *rt); +void fib6_update_sernum(struct net *net, struct rt6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); #ifdef CONFIG_IPV6_MULTIPLE_TABLES diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 3dfdc54eefb8..7563cbbbf513 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -105,9 +105,8 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; -void fib6_update_sernum(struct rt6_info *rt) +void fib6_update_sernum(struct net *net, struct rt6_info *rt) { - struct net *net = dev_net(rt->dst.dev); struct fib6_node *fn; fn = rcu_dereference_protected(rt->rt6i_node, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2b7c2ff56161..e1bd3aa236d5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1370,7 +1370,7 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { spin_lock_bh(&ort->rt6i_table->tb6_lock); - fib6_update_sernum(ort); + fib6_update_sernum(net, ort); spin_unlock_bh(&ort->rt6i_table->tb6_lock); fib6_force_start_gc(net); } @@ -3612,11 +3612,11 @@ void rt6_multipath_rebalance(struct rt6_info *rt) static int fib6_ifup(struct rt6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; - const struct net *net = dev_net(arg->dev); + struct net *net = dev_net(arg->dev); if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { rt->rt6i_nh_flags &= ~arg->nh_flags; - fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); + fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -3695,7 +3695,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; const struct net_device *dev = arg->dev; - const struct net *net = dev_net(dev); + struct net *net = dev_net(dev); if (rt == net->ipv6.ip6_null_entry) return 0; @@ -3718,7 +3718,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) } rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); - fib6_update_sernum(rt); + fib6_update_sernum(net, rt); rt6_multipath_rebalance(rt); } return -2; From 
2ddf74eb84ddbdb558bf1fe4f08f1bf84ac440bc Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 27 Oct 2017 17:30:12 -0700 Subject: [PATCH 0352/1640] UPSTREAM: ipv6: prevent user from adding cached routes Cached routes should only be created by the system when receiving a PMTU discovery or IP redirect message. Users should not be allowed to create cached routes. Furthermore, after the patch series to move cached routes into the exception table, user-added cached routes will trigger the following warning in fib6_add(): WARNING: CPU: 0 PID: 2985 at net/ipv6/ip6_fib.c:1137 fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 2985 Comm: syzkaller320388 Not tainted 4.14.0-rc3+ #74 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:181 __warn+0x1c4/0x1d9 kernel/panic.c:542 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:261 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:fib6_add+0x20d9/0x2c10 net/ipv6/ip6_fib.c:1137 RSP: 0018:ffff8801cf09f6a0 EFLAGS: 00010297 RAX: ffff8801ce45e340 RBX: 1ffff10039e13eec RCX: ffff8801d749c814 RDX: 0000000000000000 RSI: ffff8801d749c700 RDI: ffff8801d749c780 RBP: ffff8801cf09fa08 R08: 0000000000000000 R09: ffff8801cf09f360 R10: ffff8801cf09f2d8 R11: 1ffff10039c8befb R12: 0000000000000001 R13: dffffc0000000000 R14: ffff8801d749c700 R15: ffffffff860655c0 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1011 ip6_route_add+0x148/0x1a0 net/ipv6/route.c:2782 ipv6_route_ioctl+0x4d5/0x690 net/ipv6/route.c:3291 inet6_ioctl+0xef/0x1e0 net/ipv6/af_inet6.c:521 sock_do_ioctl+0x65/0xb0 net/socket.c:961 sock_ioctl+0x2c2/0x440 net/socket.c:1058 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe So we fix this by failing the attempt to add cached routes from userspace and returning an EINVAL error. Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache") Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- include/uapi/linux/ipv6_route.h | 2 +- net/ipv6/route.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index a96eb17ad6fc..593800a18799 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -29,7 +29,7 @@ #define RTF_ROUTEINFO 0x00800000 /* route information - RA */ -#define RTF_CACHE 0x01000000 /* cache entry */ +#define RTF_CACHE 0x01000000 /* read-only: can not be set by user */ #define RTF_FLOW 0x02000000 /* flow significant route */ #define RTF_POLICY 0x04000000 /* policy route */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e1bd3aa236d5..d2658c12e498 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2602,6 +2602,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + /* RTF_CACHE is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_CACHE) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); + goto out; + } + if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; From 4bbd3114bb3041271cbd4b4a804e33232d8bbed4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:13 -0700 Subject: [PATCH 0353/1640] UPSTREAM: net/ipv6: Save route type in rt6_info The RTN_ type for IPv6 FIB entries is currently embedded in rt6i_flags and dst.error. Since dst is going to be removed, it can no longer be relied on for FIB dumps so save the route type as fib6_type. fc_type is set in current users based on the algorithm in rt6_fill_node: - rt6i_flags contains RTF_LOCAL: fc_type = RTN_LOCAL - rt6i_flags contains RTF_ANYCAST: fc_type = RTN_ANYCAST - else fc_type = RTN_UNICAST Similarly, fib6_type is set in the rt6_info templates based on the RTF_REJECT section of rt6_fill_node converting dst.error to RTN type. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 1 + net/ipv6/addrconf.c | 2 ++ net/ipv6/route.c | 46 +++++++++++++++++++------------------------ 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 59f0c14b2e15..3b016c011615 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -177,6 +177,7 @@ struct rt6_info { int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, unused:6; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 670aeb1b7a5d..696b837fdadb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2321,6 +2321,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, .fc_flags = RTF_UP | flags, .fc_nlinfo.nl_net = dev_net(dev), .fc_protocol = RTPROT_KERNEL, + .fc_type = RTN_UNICAST, }; cfg.fc_dst = *pfx; @@ -2384,6 +2385,7 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_ifindex = dev->ifindex, .fc_dst_len = 8, .fc_flags = RTF_UP, + .fc_type = RTN_UNICAST, .fc_nlinfo.nl_net = dev_net(dev), .fc_protocol = RTPROT_KERNEL, }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d2658c12e498..0bed80226538 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -311,6 +311,7 @@ static const struct rt6_info ip6_null_entry_template = { .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_UNREACHABLE, }; #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -328,6 +329,7 @@ static const struct rt6_info ip6_prohibit_entry_template = { .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_PROHIBIT, }; static const struct rt6_info ip6_blk_hole_entry_template = { @@ -343,6 +345,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = { .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_BLACKHOLE, }; #endif @@ -2608,6 +2611,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + if (cfg->fc_type > RTN_MAX) { + NL_SET_ERR_MSG(extack, "Invalid route type"); + goto out; + } + if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; @@ -2712,6 +2720,8 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, rt->rt6i_metric = cfg->fc_metric; rt->rt6i_nh_weight = 1; + rt->fib6_type = cfg->fc_type; + /* We cannot add true routes via loopback here, they would result in kernel looping; promote them to reject routes */ @@ -3220,6 +3230,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), .fc_protocol = RTPROT_RA, + .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, @@ -3273,6 +3284,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), .fc_protocol = RTPROT_RA, + .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = dev_net(dev), @@ -3317,6 +3329,7 @@ static void rtmsg_to_fib6_config(struct net *net, cfg->fc_dst_len = rtmsg->rtmsg_dst_len; cfg->fc_src_len = rtmsg->rtmsg_src_len; cfg->fc_flags = rtmsg->rtmsg_flags; + cfg->fc_type = rtmsg->rtmsg_type; cfg->fc_nlinfo.nl_net = net; @@ -3437,10 +3450,13 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_protocol = RTPROT_KERNEL; 
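	/* Editorial sketch mirroring the rt6_fill_node() hunk below: with
	 * fib6_type recorded at creation time, a route dump no longer
	 * decodes rt6i_flags and dst.error and can simply do:
	 *
	 *	rtm->rtm_type = rt->fib6_type;
	 */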
rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; - if (anycast) + if (anycast) { + rt->fib6_type = RTN_ANYCAST; rt->rt6i_flags |= RTF_ANYCAST; - else + } else { + rt->fib6_type = RTN_LOCAL; rt->rt6i_flags |= RTF_LOCAL; + } rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; @@ -4386,30 +4402,8 @@ static int rt6_fill_node(struct net *net, rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table)) goto nla_put_failure; - if (rt->rt6i_flags & RTF_REJECT) { - switch (rt->dst.error) { - case -EINVAL: - rtm->rtm_type = RTN_BLACKHOLE; - break; - case -EACCES: - rtm->rtm_type = RTN_PROHIBIT; - break; - case -EAGAIN: - rtm->rtm_type = RTN_THROW; - break; - default: - rtm->rtm_type = RTN_UNREACHABLE; - break; - } - } - else if (rt->rt6i_flags & RTF_LOCAL) - rtm->rtm_type = RTN_LOCAL; - else if (rt->rt6i_flags & RTF_ANYCAST) - rtm->rtm_type = RTN_ANYCAST; - else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) - rtm->rtm_type = RTN_LOCAL; - else - rtm->rtm_type = RTN_UNICAST; + + rtm->rtm_type = rt->fib6_type; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; From 8f647acb33d7c05dafa6bf98f8b1f6acde44a4d8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:12 -0700 Subject: [PATCH 0354/1640] UPSTREAM: net/ipv6: Move support functions up in route.c Code move only. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 119 +++++++++++++++++++++++------------------------ 1 file changed, 59 insertions(+), 60 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0bed80226538..b2fe194491e0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -78,7 +78,6 @@ enum rt6_nud_state { RT6_NUD_SUCCEED = 1 }; -static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); static unsigned int ip6_mtu(const struct dst_entry *dst); @@ -894,6 +893,65 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, } #endif +/* + * Misc support functions + */ + +/* called with rcu_lock held */ +static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) +{ + struct net_device *dev = rt->dst.dev; + + if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { + /* for copies of local routes, dst->dev needs to be the + * device if it is a master device, the master device if + * device is enslaved, and the loopback as the default + */ + if (netif_is_l3_slave(dev) && + !rt6_need_strict(&rt->rt6i_dst.addr)) + dev = l3mdev_master_dev_rcu(dev); + else if (!netif_is_l3_master(dev)) + dev = dev_net(dev)->loopback_dev; + /* last case is netif_is_l3_master(dev) is true in which + * case we want dev returned to be dev + */ + } + + return dev; +} + +static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) +{ + BUG_ON(from->from); + + rt->rt6i_flags &= ~RTF_EXPIRES; + dst_hold(&from->dst); + rt->from = from; + dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); +} + +static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) +{ + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + rt->rt6i_dst = ort->rt6i_dst; + rt->dst.error = ort->dst.error; + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + rt->dst.lastuse = jiffies; + rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_flags = ort->rt6i_flags; + rt6_set_from(rt, ort); + rt->rt6i_metric = ort->rt6i_metric; 
+#ifdef CONFIG_IPV6_SUBTREES + rt->rt6i_src = ort->rt6i_src; +#endif + rt->rt6i_prefsrc = ort->rt6i_prefsrc; + rt->rt6i_table = ort->rt6i_table; + rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); +} + static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { @@ -1039,29 +1097,6 @@ int ip6_ins_rt(struct rt6_info *rt) return __ip6_ins_rt(rt, &info, &mxc, NULL); } -/* called with rcu_lock held */ -static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) -{ - struct net_device *dev = rt->dst.dev; - - if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { - /* for copies of local routes, dst->dev needs to be the - * device if it is a master device, the master device if - * device is enslaved, and the loopback as the default - */ - if (netif_is_l3_slave(dev) && - !rt6_need_strict(&rt->rt6i_dst.addr)) - dev = l3mdev_master_dev_rcu(dev); - else if (!netif_is_l3_master(dev)) - dev = dev_net(dev)->loopback_dev; - /* last case is netif_is_l3_master(dev) is true in which - * case we want dev returned to be dev - */ - } - - return dev; -} - static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, const struct in6_addr *daddr, const struct in6_addr *saddr) @@ -3146,42 +3181,6 @@ out: neigh_release(neigh); } -/* - * Misc support functions - */ - -static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) -{ - BUG_ON(from->from); - - rt->rt6i_flags &= ~RTF_EXPIRES; - dst_hold(&from->dst); - rt->from = from; - dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); -} - -static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) -{ - rt->dst.input = ort->dst.input; - rt->dst.output = ort->dst.output; - rt->rt6i_dst = ort->rt6i_dst; - rt->dst.error = ort->dst.error; - rt->rt6i_idev = ort->rt6i_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); - rt->dst.lastuse = jiffies; - rt->rt6i_gateway = ort->rt6i_gateway; - rt->rt6i_flags = ort->rt6i_flags; - rt6_set_from(rt, ort); - rt->rt6i_metric = ort->rt6i_metric; -#ifdef CONFIG_IPV6_SUBTREES - rt->rt6i_src = ort->rt6i_src; -#endif - rt->rt6i_prefsrc = ort->rt6i_prefsrc; - rt->rt6i_table = ort->rt6i_table; - rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); -} - #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, From 6079dec403f4c348e576a11cd80bbecd159aa084 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Feb 2018 20:32:04 -0800 Subject: [PATCH 0355/1640] BACKPORT: net: Move ipv4 set_lwt_redirect helper to lwtunnel IPv4 uses set_lwt_redirect to set the lwtunnel redirect functions as needed. Move it to lwtunnel.h as lwtunnel_set_redirect and change IPv6 to also use it. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/lwtunnel.h | 15 +++++++++++++++ net/ipv4/route.c | 17 ++--------------- net/ipv6/route.c | 9 +-------- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 0ab4647ccc24..ec75c0a1c529 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -130,6 +130,17 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); int lwtunnel_xmit(struct sk_buff *skb); +static inline void lwtunnel_set_redirect(struct dst_entry *dst) +{ + if (lwtunnel_output_redirect(dst->lwtstate)) { + dst->lwtstate->orig_output = dst->output; + dst->output = lwtunnel_output; + } + if (lwtunnel_input_redirect(dst->lwtstate)) { + dst->lwtstate->orig_input = dst->input; + dst->input = lwtunnel_input; + } +} #else static inline void lwtstate_free(struct lwtunnel_state *lws) @@ -161,6 +172,10 @@ static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) return false; } +static inline void lwtunnel_set_redirect(struct dst_entry *dst) +{ +} + static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, unsigned int mtu) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 396eb42b23c6..368df8c52bd8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1750,19 +1750,6 @@ static void ip_handle_martian_source(struct net_device *dev, #endif } -static void set_lwt_redirect(struct rtable *rth) -{ - if (lwtunnel_output_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_output = rth->dst.output; - rth->dst.output = lwtunnel_output; - } - - if (lwtunnel_input_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_input = rth->dst.input; - rth->dst.input = lwtunnel_input; - } -} - /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1843,7 +1830,7 @@ static int __mkroute_input(struct sk_buff *skb, rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, do_cache); - set_lwt_redirect(rth); + lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: err = 0; @@ -2358,7 +2345,7 @@ add: } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); - set_lwt_redirect(rth); + lwtunnel_set_redirect(&rth->dst); return rth; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b2fe194491e0..0ef3a69ebf89 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2732,14 +2732,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (err) goto out; rt->dst.lwtstate = lwtstate_get(lwtstate); - if (lwtunnel_output_redirect(rt->dst.lwtstate)) { - rt->dst.lwtstate->orig_output = rt->dst.output; - rt->dst.output = lwtunnel_output; - } - if (lwtunnel_input_redirect(rt->dst.lwtstate)) { - rt->dst.lwtstate->orig_input = rt->dst.input; - rt->dst.input = lwtunnel_input; - } + lwtunnel_set_redirect(&rt->dst); } ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); From abaf072936513c3d49062343cd64b7b389674889 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Tue, 6 Mar 2018 11:10:19 +0100 Subject: [PATCH 0356/1640] UPSTREAM: ipv6: Reflect MTU changes on PMTU of exceptions for MTU-less routes Currently, administrative MTU changes on a given netdevice are not reflected on route exceptions for MTU-less routes, with a set PMTU value, for that device: # ip -6 route get 2001:db8::b 2001:db8::b from :: dev vti_a proto kernel src 2001:db8::a metric 256 pref medium # ping6 -c 1 -q -s10000 2001:db8::b > /dev/null # ip netns exec a ip -6 
route get 2001:db8::b 2001:db8::b from :: dev vti_a src 2001:db8::a metric 0 cache expires 571sec mtu 4926 pref medium # ip link set dev vti_a mtu 3000 # ip -6 route get 2001:db8::b 2001:db8::b from :: dev vti_a src 2001:db8::a metric 0 cache expires 571sec mtu 4926 pref medium # ip link set dev vti_a mtu 9000 # ip -6 route get 2001:db8::b 2001:db8::b from :: dev vti_a src 2001:db8::a metric 0 cache expires 571sec mtu 4926 pref medium The first issue is that since commit fb56be83e43d ("net-ipv6: on device mtu change do not add mtu to mtu-less routes") we don't call rt6_exceptions_update_pmtu() from rt6_mtu_change_route(), which handles administrative MTU changes, if the regular route is MTU-less. However, PMTU exceptions should be always updated, as long as RTAX_MTU is not locked. Keep the check for MTU-less main route, as introduced by that commit, but, for exceptions, call rt6_exceptions_update_pmtu() regardless of that check. Once that is fixed, one problem remains: MTU changes are not reflected if the new MTU is higher than the previous one, because rt6_exceptions_update_pmtu() doesn't allow that. We should instead allow PMTU increase if the old PMTU matches the local MTU, as that implies that the old MTU was the lowest in the path, and PMTU discovery might lead to different results. The existing check in rt6_mtu_change_route() correctly took that case into account (for regular routes only), so factor it out and re-use it also in rt6_exceptions_update_pmtu(). While at it, fix comments style and grammar, and try to be a bit more descriptive. Reported-by: Xiumei Mu Fixes: fb56be83e43d ("net-ipv6: on device mtu change do not add mtu to mtu-less routes") Fixes: f5bbe7ee79c2 ("ipv6: prepare rt6_mtu_change() for exception table") Signed-off-by: Stefano Brivio Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 71 ++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0ef3a69ebf89..138e1074ee88 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1573,7 +1573,30 @@ static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) } } -static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu) +static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, + struct rt6_info *rt, int mtu) +{ + /* If the new MTU is lower than the route PMTU, this new MTU will be the + * lowest MTU in the path: always allow updating the route PMTU to + * reflect PMTU decreases. + * + * If the new MTU is higher, and the route PMTU is equal to the local + * MTU, this means the old MTU is the lowest in the path, so allow + * updating it: if other nodes now have lower MTUs, PMTU discovery will + * handle this. + */ + + if (dst_mtu(&rt->dst) >= mtu) + return true; + + if (dst_mtu(&rt->dst) == idev->cnf.mtu6) + return true; + + return false; +} + +static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, + struct rt6_info *rt, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1582,20 +1605,22 @@ static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu) bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); - if (bucket) { - for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { - hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { - struct rt6_info *entry = rt6_ex->rt6i; - /* For RTF_CACHE with rt6i_pmtu == 0 - * (i.e. 
a redirected route), - * the metrics of its rt->dst.from has already - * been updated. - */ - if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu) - entry->rt6i_pmtu = mtu; - } - bucket++; + if (!bucket) + return; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + struct rt6_info *entry = rt6_ex->rt6i; + + /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected + * route), the metrics of its rt->dst.from have already + * been updated. + */ + if (entry->rt6i_pmtu && + rt6_mtu_change_route_allowed(idev, entry, mtu)) + entry->rt6i_pmtu = mtu; } + bucket++; } } @@ -3792,25 +3817,13 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) Since RFC 1981 doesn't include administrative MTU increase update PMTU increase is a MUST. (i.e. jumbo frame) */ - /* - If new MTU is less than route PMTU, this new MTU will be the - lowest MTU in the path, update the route PMTU to reflect PMTU - decreases; if new MTU is greater than route PMTU, and the - old MTU is the lowest MTU in the path, update the route PMTU - to reflect the increase. In this case if the other nodes' MTU - also have the lowest MTU, TOO BIG MESSAGE will be lead to - PMTU discovery. - */ if (rt->dst.dev == arg->dev && - dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { spin_lock_bh(&rt6_exception_lock); - if (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + if (dst_metric_raw(&rt->dst, RTAX_MTU) && + rt6_mtu_change_route_allowed(idev, rt, arg->mtu)) dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); - } - rt6_exceptions_update_pmtu(rt, arg->mtu); + rt6_exceptions_update_pmtu(idev, rt, arg->mtu); spin_unlock_bh(&rt6_exception_lock); } return 0; From 1dd770558acd39dac946795fa91339e646a19bff Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 25 Jan 2018 16:55:09 -0800 Subject: [PATCH 0357/1640] UPSTREAM: net/ipv6: Add support for onlink flag Similar to IPv4 allow routes to be added with the RTNH_F_ONLINK flag. The onlink option requires a gateway and a nexthop device. Any unicast gateway is allowed (including IPv4 mapped addresses and unresolved ones) as long as the gateway is not a local address and if it resolves it must match the given device. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 138e1074ee88..ba54e041b7f4 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2597,6 +2597,31 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, return rt; } +static int ip6_route_check_nh_onlink(struct net *net, + struct fib6_config *cfg, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + u32 tbid = l3mdev_fib_table(dev) ? 
: RT_TABLE_LOCAL; + const struct in6_addr *gw_addr = &cfg->fc_gateway; + u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; + struct rt6_info *grt; + int err; + + err = 0; + grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); + if (grt) { + if (grt->rt6i_flags & flags || dev != grt->dst.dev) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + err = -EINVAL; + } + + ip6_rt_put(grt); + } + + return err; +} + static int ip6_route_check_nh(struct net *net, struct fib6_config *cfg, struct net_device **_dev, @@ -2704,6 +2729,21 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_metric == 0) cfg->fc_metric = IP6_RT_PRIO_USER; + if (cfg->fc_flags & RTNH_F_ONLINK) { + if (!dev) { + NL_SET_ERR_MSG(extack, + "Nexthop device required for onlink"); + err = -ENODEV; + goto out; + } + + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } + } + err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { @@ -2859,7 +2899,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } - err = ip6_route_check_nh(net, cfg, &dev, &idev); + if (cfg->fc_flags & RTNH_F_ONLINK) { + err = ip6_route_check_nh_onlink(net, cfg, dev, + extack); + } else { + err = ip6_route_check_nh(net, cfg, &dev, &idev); + } if (err) goto out; } @@ -2895,6 +2940,7 @@ install_route: if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && !netif_carrier_ok(dev)) rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -3893,6 +3939,8 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (rtm->rtm_flags & RTM_F_CLONED) cfg->fc_flags |= RTF_CACHE; + cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); + cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = sock_net(skb->sk); @@ -4334,6 +4382,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, goto nla_put_failure; } + *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK); if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) *flags |= RTNH_F_OFFLOAD; From d1c74b0cc35f9eaac5a6642d39f1489ad1fba89e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Mar 2018 08:29:36 -0700 Subject: [PATCH 0358/1640] UPSTREAM: net/ipv6: Refactor gateway validation on route add Move gateway validation code from ip6_route_info_create into ip6_validate_gw. Code move plus adjustments to handle the potential reset of dev and idev and to make checkpatch happy. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv6/route.c | 120 ++++++++++++++++++++++++++--------------------- 1 file changed, 66 insertions(+), 54 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ba54e041b7f4..1cd1cc2fa6dc 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2599,7 +2599,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, static int ip6_route_check_nh_onlink(struct net *net, struct fib6_config *cfg, - struct net_device *dev, + const struct net_device *dev, struct netlink_ext_ack *extack) { u32 tbid = l3mdev_fib_table(dev) ? 
: RT_TABLE_LOCAL; @@ -2673,6 +2673,68 @@ out: return err; } +static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, + struct net_device **_dev, struct inet6_dev **idev, + struct netlink_ext_ack *extack) +{ + const struct in6_addr *gw_addr = &cfg->fc_gateway; + int gwa_type = ipv6_addr_type(gw_addr); + const struct net_device *dev = *_dev; + int err = -EINVAL; + + /* if gw_addr is local we will fail to detect this in case + * address is still TENTATIVE (DAD in progress). rt6_lookup() + * will return already-added prefix route via interface that + * prefix route was assigned to, which might be non-loopback. + */ + if (ipv6_chk_addr_and_flags(net, gw_addr, + gwa_type & IPV6_ADDR_LINKLOCAL ? + dev : NULL, 0, 0)) { + NL_SET_ERR_MSG(extack, "Invalid gateway address"); + goto out; + } + + if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { + /* IPv6 strictly inhibits using not link-local + * addresses as nexthop address. + * Otherwise, router will not able to send redirects. + * It is very good, but in some (rare!) circumstances + * (SIT, PtP, NBMA NOARP links) it is handy to allow + * some exceptions. --ANK + * We allow IPv4-mapped nexthops to support RFC4798-type + * addressing + */ + if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { + NL_SET_ERR_MSG(extack, "Invalid gateway address"); + goto out; + } + + if (cfg->fc_flags & RTNH_F_ONLINK) + err = ip6_route_check_nh_onlink(net, cfg, dev, extack); + else + err = ip6_route_check_nh(net, cfg, _dev, idev); + + if (err) + goto out; + } + + /* reload in case device was changed */ + dev = *_dev; + + err = -EINVAL; + if (!dev) { + NL_SET_ERR_MSG(extack, "Egress device not specified"); + goto out; + } else if (dev->flags & IFF_LOOPBACK) { + NL_SET_ERR_MSG(extack, + "Egress device can not be loopback device for this route"); + goto out; + } + err = 0; +out: + return err; +} + static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, struct netlink_ext_ack *extack) { @@ -2862,61 +2924,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, } if (cfg->fc_flags & RTF_GATEWAY) { - const struct in6_addr *gw_addr; - int gwa_type; - - gw_addr = &cfg->fc_gateway; - gwa_type = ipv6_addr_type(gw_addr); - - /* if gw_addr is local we will fail to detect this in case - * address is still TENTATIVE (DAD in progress). rt6_lookup() - * will return already-added prefix route via interface that - * prefix route was assigned to, which might be non-loopback. - */ - err = -EINVAL; - if (ipv6_chk_addr_and_flags(net, gw_addr, - gwa_type & IPV6_ADDR_LINKLOCAL ? - dev : NULL, 0, 0)) { - NL_SET_ERR_MSG(extack, "Invalid gateway address"); + err = ip6_validate_gw(net, cfg, &dev, &idev, extack); + if (err) goto out; - } - rt->rt6i_gateway = *gw_addr; - if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { - /* IPv6 strictly inhibits using not link-local - addresses as nexthop address. - Otherwise, router will not able to send redirects. - It is very good, but in some (rare!) circumstances - (SIT, PtP, NBMA NOARP links) it is handy to allow - some exceptions. 
--ANK - We allow IPv4-mapped nexthops to support RFC4798-type - addressing - */ - if (!(gwa_type & (IPV6_ADDR_UNICAST | - IPV6_ADDR_MAPPED))) { - NL_SET_ERR_MSG(extack, - "Invalid gateway address"); - goto out; - } - - if (cfg->fc_flags & RTNH_F_ONLINK) { - err = ip6_route_check_nh_onlink(net, cfg, dev, - extack); - } else { - err = ip6_route_check_nh(net, cfg, &dev, &idev); - } - if (err) - goto out; - } - err = -EINVAL; - if (!dev) { - NL_SET_ERR_MSG(extack, "Egress device not specified"); - goto out; - } else if (dev->flags & IFF_LOOPBACK) { - NL_SET_ERR_MSG(extack, - "Egress device can not be loopback device for this route"); - goto out; - } + rt->rt6i_gateway = cfg->fc_gateway; } err = -ENODEV; From f3d322d7e12146d24b2b44ef9bd772694725ce4b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:14 -0700 Subject: [PATCH 0359/1640] BACKPORT: net/ipv6: Move nexthop data to fib6_nh Introduce fib6_nh structure and move nexthop related data from rt6_info and rt6_info.dst to fib6_nh. References to dev, gateway or lwtstate from a FIB lookup perspective are converted to use fib6_nh; datapath references to dst version are left as is. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 16 +++- include/net/ip6_route.h | 6 +- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 6 +- net/ipv6/route.c | 162 ++++++++++++++++++++++------------------ 5 files changed, 109 insertions(+), 83 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 3b016c011615..3cc5a965f082 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -130,6 +130,16 @@ struct rt6_exception { #define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) #define FIB6_MAX_DEPTH 5 +struct fib6_nh { + struct in6_addr nh_gw; + struct net_device *nh_dev; + struct lwtunnel_state *nh_lwtstate; + + unsigned int nh_flags; + atomic_t nh_upper_bound; + int nh_weight; +}; + struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; @@ -152,12 +162,9 @@ struct rt6_info { */ struct list_head rt6i_siblings; unsigned int rt6i_nsiblings; - atomic_t rt6i_nh_upper_bound; atomic_t rt6i_ref; - unsigned int rt6i_nh_flags; - /* These are in a separate cache line. 
*/ struct rt6key rt6i_dst ____cacheline_aligned_in_smp; u32 rt6i_flags; @@ -174,13 +181,14 @@ struct rt6_info { u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ - int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, unused:6; + + struct fib6_nh fib6_nh; }; #define for_each_fib6_node_rt_rcu(fn) \ diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index bc4f89fc8cc0..b493609a863a 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -266,10 +266,10 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) { - return a->dst.dev == b->dst.dev && + return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && a->rt6i_idev == b->rt6i_idev && - ipv6_addr_equal(&a->rt6i_gateway, &b->rt6i_gateway) && - !lwtunnel_cmp_encap(a->dst.lwtstate, b->dst.lwtstate); + ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && + !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } #endif diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 696b837fdadb..8c03d4ced95d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2359,7 +2359,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->dst.dev->ifindex != dev->ifindex) + if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex) continue; if ((rt->rt6i_flags & flags) != flags) continue; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7563cbbbf513..ab045d35201d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2164,6 +2164,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct rt6_info *rt = v; struct ipv6_route_iter *iter = seq->private; + const struct net_device *dev; seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); @@ -2173,14 +2174,15 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (rt->rt6i_flags & RTF_GATEWAY) - seq_printf(seq, "%pi6", &rt->rt6i_gateway); + seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw); else seq_puts(seq, "00000000000000000000000000000000"); + dev = rt->fib6_nh.nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), rt->dst.__use, rt->rt6i_flags, - rt->dst.dev ? rt->dst.dev->name : ""); + dev ? 
dev->name : ""); iter->w.leaf = NULL; return 0; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1cd1cc2fa6dc..edd05b26023d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -478,12 +478,15 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) + if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) return match; list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, rt6i_siblings) { - if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound)) + int nh_upper_bound; + + nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); + if (fl6->mp_hash > nh_upper_bound) continue; if (rt6_score_route(sibling, oif, strict) < 0) break; @@ -507,13 +510,14 @@ static inline struct rt6_info *rt6_device_match(struct net *net, struct rt6_info *local = NULL; struct rt6_info *sprt; - if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD)) + if (!oif && ipv6_addr_any(saddr) && + !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) return rt; for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { - struct net_device *dev = sprt->dst.dev; + const struct net_device *dev = sprt->fib6_nh.nh_dev; - if (sprt->rt6i_nh_flags & RTNH_F_DEAD) + if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) continue; if (oif) { @@ -545,7 +549,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, return net->ipv6.ip6_null_entry; } - return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; + return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -570,7 +574,10 @@ static void rt6_probe_deferred(struct work_struct *w) static void rt6_probe(struct rt6_info *rt) { struct __rt6_probe_work *work; + const struct in6_addr *nh_gw; struct neighbour *neigh; + struct net_device *dev; + /* * Okay, this does not seem to be appropriate * for now, however, we need to check if it @@ -581,8 +588,11 @@ static void rt6_probe(struct rt6_info *rt) */ if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) return; + + nh_gw = &rt->fib6_nh.nh_gw; + dev = rt->fib6_nh.nh_dev; rcu_read_lock_bh(); - neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { if (neigh->nud_state & NUD_VALID) goto out; @@ -604,9 +614,9 @@ static void rt6_probe(struct rt6_info *rt) if (work) { INIT_WORK(&work->work, rt6_probe_deferred); - work->target = rt->rt6i_gateway; - dev_hold(rt->dst.dev); - work->dev = rt->dst.dev; + work->target = *nh_gw; + dev_hold(dev); + work->dev = dev; schedule_work(&work->work); } @@ -624,7 +634,8 @@ static inline void rt6_probe(struct rt6_info *rt) */ static inline int rt6_check_dev(struct rt6_info *rt, int oif) { - struct net_device *dev = rt->dst.dev; + const struct net_device *dev = rt->fib6_nh.nh_dev; + if (!oif || dev->ifindex == oif) return 2; if ((dev->flags & IFF_LOOPBACK) && @@ -635,15 +646,16 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif) static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) { - struct neighbour *neigh; enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; + struct neighbour *neigh; if (rt->rt6i_flags & RTF_NONEXTHOP || !(rt->rt6i_flags & RTF_GATEWAY)) return RT6_NUD_SUCCEED; rcu_read_lock_bh(); - neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, + &rt->fib6_nh.nh_gw); if (neigh) { 
read_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) @@ -691,11 +703,11 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, bool match_do_rr = false; struct inet6_dev *idev = rt->rt6i_idev; - if (rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) goto out; if (idev->cnf.ignore_routes_with_linkdown && - rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; @@ -900,7 +912,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, /* called with rcu_lock held */ static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) { - struct net_device *dev = rt->dst.dev; + struct net_device *dev = rt->fib6_nh.nh_dev; if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the @@ -940,7 +952,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); rt->dst.lastuse = jiffies; - rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_gateway = ort->fib6_nh.nh_gw; rt->rt6i_flags = ort->rt6i_flags; rt6_set_from(rt, ort); rt->rt6i_metric = ort->rt6i_metric; @@ -949,7 +961,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) #endif rt->rt6i_prefsrc = ort->rt6i_prefsrc; rt->rt6i_table = ort->rt6i_table; - rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); + rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); } static struct fib6_node* fib6_backtrack(struct fib6_node *fn, @@ -1327,7 +1339,7 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_info *ort) { - struct net *net = dev_net(ort->dst.dev); + struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -2278,7 +2290,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { - if (rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) continue; if (rt6_check_expired(rt)) continue; @@ -2286,14 +2298,14 @@ restart: break; if (!(rt->rt6i_flags & RTF_GATEWAY)) continue; - if (fl6->flowi6_oif != rt->dst.dev->ifindex) + if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) continue; /* rt_cache's gateway might be different from its 'parent' * in the case of an ip redirect. * So we keep searching in the exception table if the gateway * is different. 
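 * (Illustration: after an ICMPv6 redirect, the cached clone's
 * fib6_nh.nh_gw holds the gateway named by the redirect, while the
 * parent route keeps the gateway installed in the FIB.)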
*/ - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { + if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); @@ -2858,7 +2870,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, &lwtstate, extack); if (err) goto out; - rt->dst.lwtstate = lwtstate_get(lwtstate); + rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); lwtunnel_set_redirect(&rt->dst); } @@ -2873,7 +2885,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, #endif rt->rt6i_metric = cfg->fc_metric; - rt->rt6i_nh_weight = 1; + rt->fib6_nh.nh_weight = 1; rt->fib6_type = cfg->fc_type; @@ -2928,7 +2940,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (err) goto out; - rt->rt6i_gateway = cfg->fc_gateway; + rt->fib6_nh.nh_gw = rt->rt6i_gateway = cfg->fc_gateway; } err = -ENODEV; @@ -2951,9 +2963,9 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, install_route: if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && !netif_carrier_ok(dev)) - rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; - rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); - rt->dst.dev = dev; + rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); + rt->fib6_nh.nh_dev = rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -3113,11 +3125,11 @@ static int ip6_route_del(struct fib6_config *cfg, rt = rt_cache; } if (cfg->fc_ifindex && - (!rt->dst.dev || - rt->dst.dev->ifindex != cfg->fc_ifindex)) + (!rt->fib6_nh.nh_dev || + rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && - !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) + !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) continue; if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) continue; @@ -3278,11 +3290,11 @@ static struct rt6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->dst.dev->ifindex != dev->ifindex) + if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex) continue; if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) continue; - if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) continue; ip6_hold_safe(NULL, &rt, false); break; @@ -3337,9 +3349,9 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - if (dev == rt->dst.dev && + if (dev == rt->fib6_nh.nh_dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && - ipv6_addr_equal(&rt->rt6i_gateway, addr)) + ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) break; } if (rt) @@ -3533,6 +3545,8 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_flags |= RTF_LOCAL; } + rt->fib6_nh.nh_gw = *addr; + rt->fib6_nh.nh_dev = dev; rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; @@ -3555,7 +3569,7 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->dst.dev == dev || !dev) && + if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && rt != net->ipv6.ip6_null_entry && ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); @@ -3587,7 +3601,7 @@ static int fib6_clean_tohost(struct rt6_info *rt, void *arg) struct in6_addr *gateway = 
(struct in6_addr *)arg; if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { return -1; } @@ -3635,8 +3649,8 @@ static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) static bool rt6_is_dead(const struct rt6_info *rt) { - if (rt->rt6i_nh_flags & RTNH_F_DEAD || - (rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || + (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) return true; @@ -3649,11 +3663,11 @@ static int rt6_multipath_total_weight(const struct rt6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total += rt->rt6i_nh_weight; + total += rt->fib6_nh.nh_weight; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { if (!rt6_is_dead(iter)) - total += iter->rt6i_nh_weight; + total += iter->fib6_nh.nh_weight; } return total; @@ -3664,11 +3678,11 @@ static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->rt6i_nh_weight; + *weight += rt->fib6_nh.nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->rt6i_nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) @@ -3711,8 +3725,8 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { - rt->rt6i_nh_flags &= ~arg->nh_flags; + if (rt != net->ipv6.ip6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { + rt->fib6_nh.nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -3740,10 +3754,10 @@ static bool rt6_multipath_uses_dev(const struct rt6_info *rt, { struct rt6_info *iter; - if (rt->dst.dev == dev) + if (rt->fib6_nh.nh_dev == dev) return true; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) - if (iter->dst.dev == dev) + if (iter->fib6_nh.nh_dev == dev) return true; return false; @@ -3764,11 +3778,12 @@ static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, struct rt6_info *iter; unsigned int dead = 0; - if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_dev == down_dev || + rt->fib6_nh.nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) - if (iter->dst.dev == down_dev || - iter->rt6i_nh_flags & RTNH_F_DEAD) + if (iter->fib6_nh.nh_dev == down_dev || + iter->fib6_nh.nh_flags & RTNH_F_DEAD) dead++; return dead; @@ -3780,11 +3795,11 @@ static void rt6_multipath_nh_flags_set(struct rt6_info *rt, { struct rt6_info *iter; - if (rt->dst.dev == dev) - rt->rt6i_nh_flags |= nh_flags; + if (rt->fib6_nh.nh_dev == dev) + rt->fib6_nh.nh_flags |= nh_flags; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) - if (iter->dst.dev == dev) - iter->rt6i_nh_flags |= nh_flags; + if (iter->fib6_nh.nh_dev == dev) + iter->fib6_nh.nh_flags |= nh_flags; } /* called with write lock held for table with rt */ @@ -3799,12 +3814,12 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) switch (arg->event) { case NETDEV_UNREGISTER: - return rt->dst.dev == dev ? -1 : 0; + return rt->fib6_nh.nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->rt6i_nsiblings) - return rt->dst.dev == dev ? 
-1 : 0; + return rt->fib6_nh.nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; @@ -3820,10 +3835,10 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) } return -2; case NETDEV_CHANGE: - if (rt->dst.dev != dev || + if (rt->fib6_nh.nh_dev != dev || rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -3875,7 +3890,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) Since RFC 1981 doesn't include administrative MTU increase update PMTU increase is a MUST. (i.e. jumbo frame) */ - if (rt->dst.dev == arg->dev && + if (rt->fib6_nh.nh_dev == arg->dev && !dst_metric_locked(&rt->dst, RTAX_MTU)) { spin_lock_bh(&rt6_exception_lock); if (dst_metric_raw(&rt->dst, RTAX_MTU) && @@ -4183,7 +4198,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - rt->rt6i_nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { @@ -4355,7 +4370,7 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->dst.lwtstate); + + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); nexthop_len *= rt->rt6i_nsiblings; } @@ -4373,38 +4388,38 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->dst.lwtstate) + + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) + nexthop_len; } static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, unsigned int *flags, bool skip_oif) { - if (rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; - if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) { + if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) *flags |= RTNH_F_DEAD; } if (rt->rt6i_flags & RTF_GATEWAY) { - if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) + if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) goto nla_put_failure; } - *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK); - if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) + *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); + if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) *flags |= RTNH_F_OFFLOAD; /* not needed for multipath encoding b/c it has a rtnexthop struct */ - if (!skip_oif && rt->dst.dev && - nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) + if (!skip_oif && rt->fib6_nh.nh_dev && + nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) goto nla_put_failure; - if (rt->dst.lwtstate && - lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) + if (rt->fib6_nh.nh_lwtstate && + lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) goto nla_put_failure; return 0; @@ -4416,6 +4431,7 @@ nla_put_failure: /* add multipath next hop */ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) { + const struct net_device *dev = rt->fib6_nh.nh_dev; struct rtnexthop *rtnh; unsigned int flags = 0; @@ -4423,8 +4439,8 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) if (!rtnh) goto nla_put_failure; - rtnh->rtnh_hops = rt->rt6i_nh_weight - 1; - rtnh->rtnh_ifindex = rt->dst.dev ? 
rt->dst.dev->ifindex : 0; + rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; + rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; if (rt6_nexthop_info(skb, rt, &flags, true) < 0) goto nla_put_failure; From 715dbd16fefce25b2bdf34c216afa89d70b30f52 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 11:39:13 -0700 Subject: [PATCH 0360/1640] UPSTREAM: net: ipv4: Change fib notifiers to take a fib_alias All of the notifier data (fib_info, tos, type and table id) are contained in the fib_alias. Pass it to the notifier instead of each field separately, shortening the argument list by 3. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 0c8fcc050ad2..b7bd99d67dfc 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -87,32 +87,30 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + int dst_len, struct fib_alias *fa) { struct fib_entry_notifier_info info = { .dst = dst, .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .tb_id = tb_id, + .fi = fa->fa_info, + .tos = fa->fa_tos, + .type = fa->fa_type, + .tb_id = fa->tb_id, }; return call_fib4_notifier(nb, net, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + int dst_len, struct fib_alias *fa) { struct fib_entry_notifier_info info = { .dst = dst, .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .tb_id = tb_id, + .fi = fa->fa_info, + .tos = fa->fa_tos, + .type = fa->fa_type, + .tb_id = fa->tb_id, }; return call_fib4_notifiers(net, event_type, &info.info); } @@ -1216,9 +1214,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, fi, - new_fa->fa_tos, cfg->fc_type, - tb->tb_id); + key, plen, new_fa); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1273,8 +1269,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type, - tb->tb_id); + call_fib_entry_notifiers(net, event, key, plen, new_fa); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: @@ -1574,8 +1569,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, return -ESRCH; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete->fa_info, tos, - fa_to_delete->fa_type, tb->tb_id); + fa_to_delete); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1901,9 +1895,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, fa->fa_type, - tb->tb_id); + KEYLENGTH - fa->fa_slen, fa); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -1941,8 +1933,7 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, continue; call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
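The consolidation is mechanical: the four values every notifier call needs already travel together in one struct fib_alias, so the callers stop unpacking them and the helper does it once, exactly as the new call_fib_entry_notifiers() above builds fib_entry_notifier_info. A sketch of the shape of the change, with simplified stand-in types (not the kernel's):

struct demo_alias {
	void *fa_info;
	unsigned char fa_tos;
	unsigned char fa_type;
	unsigned int tb_id;
};

struct demo_entry_info {
	void *fi;
	unsigned char tos;
	unsigned char type;
	unsigned int tb_id;
};

static void demo_fill(struct demo_entry_info *info, const struct demo_alias *fa)
{
	info->fi = fa->fa_info;
	info->tos = fa->fa_tos;
	info->type = fa->fa_type;
	info->tb_id = fa->tb_id;
}

A side effect visible in the fib_table_insert() hunks: the REPLACE notification now takes tos and type from the fib_alias it announces instead of mixing new_fa->fa_tos with cfg->fc_type.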
- fa->fa_type, fa->tb_id); + KEYLENGTH - fa->fa_slen, fa); } } From 0f4c03eae09f779c5c61bee7b0f5eb8e3440c340 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 27 Oct 2017 17:37:13 -0700 Subject: [PATCH 0361/1640] UPSTREAM: net: Add extack to fib_notifier_info Add extack to fib_notifier_info and plumb it through the stack to call_fib_rule_notifiers, call_fib_entry_notifiers and call_fib6_entry_notifiers. This allows notifier handlers to return messages to the user. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/fib_notifier.h | 1 + net/core/fib_rules.c | 9 ++++++--- net/ipv4/fib_trie.c | 13 ++++++++----- net/ipv6/ip6_fib.c | 15 +++++++++------ 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 669b9716dc7a..4ffcb646ba1f 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -9,6 +9,7 @@ struct fib_notifier_info { struct net *net; int family; + struct netlink_ext_ack *extack; }; enum fib_event_type { diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index c2804c660207..aac6f41e3691 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -339,10 +339,12 @@ static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, static int call_fib_rule_notifiers(struct net *net, enum fib_event_type event_type, struct fib_rule *rule, - struct fib_rules_ops *ops) + struct fib_rules_ops *ops, + struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = ops->family, + .info.extack = extack, .rule = rule, }; @@ -665,7 +667,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (rule->tun_id) ip_tunnel_need_metadata(); - call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -837,7 +839,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } } - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, + NULL); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b7bd99d67dfc..b9277a530eab 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -102,9 +102,11 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_alias *fa) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, @@ -1214,7 +1216,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_default = -1; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, new_fa); + key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); @@ -1269,7 +1271,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, new_fa); + call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: @@
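What the new info.extack field enables, sketched as a hypothetical consumer (the demo_* names are invented; the events, helpers and macros are the kernel's): a handler can veto an entry and tell the requesting user why, instead of failing silently. The NULL passed by fib_nl_delrule() above is safe since NL_SET_ERR_MSG() ignores a NULL extack.

#include <linux/types.h>
#include <linux/notifier.h>
#include <linux/netlink.h>
#include <net/fib_notifier.h>

static bool demo_hw_table_full(void)
{
	return false;	/* stand-in for a real capacity check */
}

static int demo_fib_event(struct notifier_block *nb, unsigned long event,
			  void *ptr)
{
	struct fib_notifier_info *info = ptr;

	if (event == FIB_EVENT_ENTRY_ADD && demo_hw_table_full()) {
		NL_SET_ERR_MSG(info->extack, "demo: offload table full");
		return notifier_from_errno(-ENOSPC);
	}
	return NOTIFY_DONE;
}

The next patch in the series makes such a veto effective for IPv6 adds by calling the notifier before the route is linked into the tree.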
-1569,7 +1571,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, return -ESRCH; call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete); + fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1895,7 +1897,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, n->key, - KEYLENGTH - fa->fa_slen, fa); + KEYLENGTH - fa->fa_slen, fa, + NULL); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ab045d35201d..7edc9fb9f2ee 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -350,9 +350,11 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, static int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, - struct rt6_info *rt) + struct rt6_info *rt, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { + .info.extack = extack, .rt = rt, }; @@ -859,7 +861,8 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc) + struct nl_info *info, struct mx6_config *mxc, + struct netlink_ext_ack *extack) { struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->rt6i_table->tb6_lock)); @@ -1004,7 +1007,7 @@ add: rcu_assign_pointer(rt->rt6i_node, fn); rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, - rt); + rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; @@ -1033,7 +1036,7 @@ add: rt->rt6_next = iter->rt6_next; rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, - rt); + rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { @@ -1223,7 +1226,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, mxc); + err = fib6_add_rt2node(fn, rt, info, mxc, extack); if (!err) { __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); @@ -1682,7 +1685,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fib6_purge_rt(rt, fn, net); - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt); + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); From 6b9f6a67ce65c70ec68802ec8daaa8a16c45085c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 27 Mar 2018 18:21:59 -0700 Subject: [PATCH 0362/1640] UPSTREAM: net/ipv6: Move call_fib6_entry_notifiers up for route adds Move call to call_fib6_entry_notifiers for new IPv6 routes to right before the insertion into the FIB. At this point notifier handlers can decide the fate of the new route with a clean path to delete the potential new entry if the notifier returns non-0. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. 
Miller --- net/ipv6/ip6_fib.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7edc9fb9f2ee..04ce92505ff6 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1002,12 +1002,16 @@ add: if (err) return err; + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_ADD, + rt, extack); + if (err) + return err; + rcu_assign_pointer(rt->rt6_next, iter); atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rcu_assign_pointer(*ins, rt); - call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, - rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; @@ -1031,12 +1035,16 @@ add: if (err) return err; + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_REPLACE, + rt, extack); + if (err) + return err; + atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rt->rt6_next = iter->rt6_next; rcu_assign_pointer(*ins, rt); - call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, - rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { From e354071e40cf5a5a1980e431541f7fc3933bd665 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:11 -0700 Subject: [PATCH 0363/1640] BACKPORT: net/ipv6: Pass net namespace to route functions Pass network namespace reference into route add, delete and get functions. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 12 ++++++----- net/ipv6/addrconf.c | 33 ++++++++++++++++------------- net/ipv6/anycast.c | 10 +++++---- net/ipv6/ndisc.c | 12 ++++++----- net/ipv6/route.c | 47 +++++++++++++++++++++-------------------- 5 files changed, 62 insertions(+), 52 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index b493609a863a..d4b1fa72403c 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -101,8 +101,8 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct rt6_info *); -int ip6_del_rt(struct rt6_info *); +int ip6_ins_rt(struct net *net, struct rt6_info *rt); +int ip6_del_rt(struct net *net, struct rt6_info *rt); void rt6_flush_exceptions(struct rt6_info *rt); int rt6_remove_exception_rt(struct rt6_info *rt); @@ -137,7 +137,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, +struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, @@ -147,9 +147,11 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, * support functions for ND * */ -struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, +struct rt6_info *rt6_get_dflt_router(struct net *net, + const struct in6_addr *addr, struct net_device *dev); -struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, +struct rt6_info *rt6_add_dflt_router(struct net *net, + const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); void rt6_purge_dflt_routers(struct net *net); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8c03d4ced95d..b4da2c755d89 100644 --- 
a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1018,7 +1018,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - rt = addrconf_dst_alloc(idev, addr, false); + rt = addrconf_dst_alloc(net, idev, addr, false); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto out; @@ -1161,7 +1161,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r 0, RTF_GATEWAY | RTF_DEFAULT); if (rt) { if (del_rt) - ip6_del_rt(rt); + ip6_del_rt(dev_net(ifp->idev->dev), rt); else { if (!(rt->rt6i_flags & RTF_EXPIRES)) rt6_set_expires(rt, expires); @@ -2666,7 +2666,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (rt) { /* Autoconf prefix route */ if (valid_lft == 0) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ @@ -3341,7 +3341,8 @@ static void addrconf_gre_config(struct net_device *dev) } #endif -static int fixup_permanent_addr(struct inet6_dev *idev, +static int fixup_permanent_addr(struct net *net, + struct inet6_dev *idev, struct inet6_ifaddr *ifp) { /* !rt6i_node means the host route was removed from the @@ -3351,7 +3352,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, if (!ifp->rt || !ifp->rt->rt6i_node) { struct rt6_info *rt, *prev; - rt = addrconf_dst_alloc(idev, &ifp->addr, false); + rt = addrconf_dst_alloc(net, idev, &ifp->addr, false); if (unlikely(IS_ERR(rt))) return PTR_ERR(rt); @@ -3375,7 +3376,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, return 0; } -static void addrconf_permanent_addr(struct net_device *dev) +static void addrconf_permanent_addr(struct net *net, struct net_device *dev) { struct inet6_ifaddr *ifp, *tmp; struct inet6_dev *idev; @@ -3388,7 +3389,7 @@ static void addrconf_permanent_addr(struct net_device *dev) list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) { if ((ifp->flags & IFA_F_PERMANENT) && - fixup_permanent_addr(idev, ifp) < 0) { + fixup_permanent_addr(net, idev, ifp) < 0) { write_unlock_bh(&idev->lock); in6_ifa_hold(ifp); ipv6_del_addr(ifp); @@ -3457,7 +3458,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (event == NETDEV_UP) { /* restore routes for permanent addresses */ - addrconf_permanent_addr(dev); + addrconf_permanent_addr(net, dev); if (!addrconf_link_ready(dev)) { /* device is not ready yet. 
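The conversion follows one pattern throughout this patch: resolve the namespace once at the entry point and pass it down, instead of re-deriving it with dev_net() deep in the call chain, which also drops the assumption that every FIB entry has a dst.dev to query. A sketch of a converted caller against the new signatures (demo_add_host_route is illustrative, the two callees are the patch's):

static int demo_add_host_route(struct inet6_dev *idev,
			       const struct in6_addr *addr)
{
	struct net *net = dev_net(idev->dev);	/* derived once, at the edge */
	struct rt6_info *rt;

	rt = addrconf_dst_alloc(net, idev, addr, false);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	return ip6_ins_rt(net, rt);	/* no dev_net(rt->dst.dev) inside */
}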
*/ @@ -3751,7 +3752,7 @@ restart: spin_unlock_bh(&ifa->lock); if (rt) - ip6_del_rt(rt); + ip6_del_rt(net, rt); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); @@ -3874,6 +3875,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; bool bump_id, notify = false; + struct net *net; addrconf_join_solict(dev, &ifp->addr); @@ -3884,8 +3886,9 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) if (ifp->state == INET6_IFADDR_STATE_DEAD) goto out; + net = dev_net(dev); if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || - (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 && + (net->ipv6.devconf_all->accept_dad < 1 && idev->cnf.accept_dad < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { @@ -3921,8 +3924,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) * Frames right away */ if (ifp->flags & IFA_F_OPTIMISTIC) { - ip6_ins_rt(ifp->rt); - if (ipv6_use_optimistic_addr(dev_net(dev), idev)) { + ip6_ins_rt(net, ifp->rt); + if (ipv6_use_optimistic_addr(net, idev)) { /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ @@ -5628,7 +5631,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * the device is brought up. */ if (ifp->rt && !rcu_access_pointer(ifp->rt->rt6i_node)) { - ip6_ins_rt(ifp->rt); + ip6_ins_rt(net, ifp->rt); } else if (!ifp->rt && (ifp->idev->dev->flags & IFF_UP)) { pr_warn("BUG: Address %pI6c on device %s is missing its host route.\n", &ifp->addr, ifp->idev->dev->name); @@ -5650,11 +5653,11 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, ifp->idev->dev, 0, 0); if (rt) - ip6_del_rt(rt); + ip6_del_rt(net, rt); } if (ifp->rt) { if (dst_hold_safe(&ifp->rt->dst)) - ip6_del_rt(ifp->rt); + ip6_del_rt(net, ifp->rt); } rt_genid_bump_ipv6(net); break; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index eb9c0faae5b5..bfe331234729 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -251,6 +251,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; struct rt6_info *rt; + struct net *net; int err; ASSERT_RTNL(); @@ -269,7 +270,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } } - rt = addrconf_dst_alloc(idev, addr, true); + net = dev_net(idev->dev); + rt = addrconf_dst_alloc(net, idev, addr, true); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto out; @@ -290,7 +292,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) aca_get(aca); write_unlock_bh(&idev->lock); - ip6_ins_rt(rt); + ip6_ins_rt(net, rt); addrconf_join_solict(idev->dev, &aca->aca_addr); @@ -333,7 +335,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) addrconf_leave_solict(idev, &aca->aca_addr); dst_hold(&aca->aca_rt->dst); - ip6_del_rt(aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); return 0; @@ -361,7 +363,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); dst_hold(&aca->aca_rt->dst); - ip6_del_rt(aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 2874a95e848c..93c9e00e09a3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1170,6 +1170,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) struct neighbour *neigh = NULL; struct inet6_dev *in6_dev; 
struct rt6_info *rt = NULL; + struct net *net; int lifetime; struct ndisc_options ndopts; int optlen; @@ -1267,9 +1268,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) /* Do not accept RA with source-addr found on local machine unless * accept_ra_from_local is set to true. */ + net = dev_net(in6_dev->dev); if (!in6_dev->cnf.accept_ra_from_local && - ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - in6_dev->dev, 0)) { + ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", skb->dev->name); @@ -1286,7 +1287,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif - rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev); + rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); @@ -1299,7 +1300,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } } if (rt && lifetime == 0) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } @@ -1308,7 +1309,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (!rt && lifetime) { ND_PRINTK(3, info, "RA: adding default router\n"); - rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); + rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, + skb->dev, pref); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", diff --git a/net/ipv6/route.c b/net/ipv6/route.c index edd05b26023d..5254a899ea1b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -876,13 +876,13 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, } if (rinfo->prefix_len == 0) - rt = rt6_get_dflt_router(gwaddr, dev); + rt = rt6_get_dflt_router(net, gwaddr, dev); else rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev); if (rt && !lifetime) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } @@ -1099,9 +1099,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, return err; } -int ip6_ins_rt(struct rt6_info *rt) +int ip6_ins_rt(struct net *net, struct rt6_info *rt) { - struct nl_info info = { .nl_net = dev_net(rt->dst.dev), }; + struct nl_info info = { .nl_net = net, }; struct mx6_config mxc = { .mx = NULL, }; /* Hold dst to account for the reference from the fib6 tree */ @@ -1183,14 +1183,13 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) return pcpu_rt; } -static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) +static struct rt6_info *rt6_make_pcpu_route(struct net *net, + struct rt6_info *rt) { struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(rt); if (!pcpu_rt) { - struct net *net = dev_net(rt->dst.dev); - dst_hold(&net->ipv6.ip6_null_entry->dst); return net->ipv6.ip6_null_entry; } @@ -1846,7 +1845,7 @@ uncached_rt_out: * rt->rt6i_ref makes sure rt can't be released. 
*/ rcu_read_unlock(); - pcpu_rt = rt6_make_pcpu_route(rt); + pcpu_rt = rt6_make_pcpu_route(net, rt); rt6_release(rt); } else { /* rt is already removed from tree */ @@ -2115,7 +2114,7 @@ static void ip6_negative_advice(struct sock *sk, dst_hold(dst); sk_dst_reset(sk); - ip6_del_rt(rt); + ip6_del_rt(dev_net(dst->dev), rt); } return; } @@ -2132,7 +2131,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) - ip6_del_rt(rt); + ip6_del_rt(dev_net(rt->dst.dev), rt); } else { struct fib6_node *fn; @@ -3015,9 +3014,9 @@ out: static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) { - int err; + struct net *net = info->nl_net; struct fib6_table *table; - struct net *net = dev_net(rt->dst.dev); + int err; if (rt == net->ipv6.ip6_null_entry) { err = -ENOENT; @@ -3034,11 +3033,10 @@ out: return err; } -int ip6_del_rt(struct rt6_info *rt) +int ip6_del_rt(struct net *net, struct rt6_info *rt) { - struct nl_info info = { - .nl_net = dev_net(rt->dst.dev), - }; + struct nl_info info = { .nl_net = net }; + return __ip6_del_rt(rt, &info); } @@ -3337,13 +3335,15 @@ static struct rt6_info *rt6_add_route_info(struct net *net, } #endif -struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) +struct rt6_info *rt6_get_dflt_router(struct net *net, + const struct in6_addr *addr, + struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_MAIN); struct rt6_info *rt; struct fib6_table *table; - table = fib6_get_table(dev_net(dev), tb_id); + table = fib6_get_table(net, tb_id); if (!table) return NULL; @@ -3360,7 +3360,8 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev return rt; } -struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, +struct rt6_info *rt6_add_dflt_router(struct net *net, + const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) { @@ -3374,7 +3375,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, - .fc_nlinfo.nl_net = dev_net(dev), + .fc_nlinfo.nl_net = net, }; cfg.fc_gateway = *gwaddr; @@ -3387,7 +3388,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; } - return rt6_get_dflt_router(gwaddr, dev); + return rt6_get_dflt_router(net, gwaddr, dev); } int rt6_addrconf_purge(struct rt6_info *rt, void *arg) { @@ -3515,12 +3516,12 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff * Allocate a dst for local (unicast / anycast) address. */ -struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, +struct rt6_info *addrconf_dst_alloc(struct net *net, + struct inet6_dev *idev, const struct in6_addr *addr, bool anycast) { u32 tb_id; - struct net *net = dev_net(idev->dev); struct net_device *dev = idev->dev; struct rt6_info *rt; From 04ead0c4211d808d13d2be131991935bf7ac0d09 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:07 -0700 Subject: [PATCH 0364/1640] BACKPORT: net: Move fib_convert_metrics to metrics file Move the logic of fib_convert_metrics into ip_metrics_convert. This allows the code that converts netlink attributes into a metrics struct to be reused by IPv6 in a later patch.
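The reuse this sets up, in brief (essentially what ip6_convert_metrics does two patches later in this section; demo_parse_metrics is an illustrative wrapper):

#include <net/ip.h>

static int demo_parse_metrics(struct net *net, struct nlattr *fc_mx,
			      int fc_mx_len, u32 metrics[RTAX_MAX])
{
	/* same validation IPv4 kept private until now: RTAX_CC_ALGO name
	 * lookup, ADVMSS/MTU/HOPLIMIT clamping, -EINVAL on bad attributes;
	 * a NULL fc_mx is accepted and leaves metrics untouched */
	return ip_metrics_convert(net, fc_mx, fc_mx_len, metrics);
}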
This is mostly a code move with the following changes to variable names: - fi->fib_net becomes net - fc_mx and fc_mx_len are passed as inputs pulled from fib_config - metrics array is passed as an input from fi->fib_metrics->metrics Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 3 +++ net/ipv4/Makefile | 3 ++- net/ipv4/fib_semantics.c | 45 ++------------------------------ net/ipv4/metrics.c | 56 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 44 deletions(-) create mode 100644 net/ipv4/metrics.c diff --git a/include/net/ip.h b/include/net/ip.h index fb1f38263266..5de30eb9798e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -395,6 +395,9 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk, return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); } +int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, + u32 *metrics); + u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index c6c8ad1d4b6d..30ee969c5957 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -13,7 +13,8 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ - inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o + inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ + metrics.o obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b08ef1b24cb5..22dca7273392 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1029,49 +1029,8 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) static int fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) { - bool ecn_ca = false; - struct nlattr *nla; - int remaining; - - if (!cfg->fc_mx) - return 0; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - u32 val; - - if (!type) - continue; - if (type > RTAX_MAX) - return -EINVAL; - - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); - if (val == TCP_CA_UNSPEC) - return -EINVAL; - } else { - if (nla_len(nla) != sizeof(u32)) - return -EINVAL; - val = nla_get_u32(nla); - } - if (type == RTAX_ADVMSS && val > 65535 - 40) - val = 65535 - 40; - if (type == RTAX_MTU && val > 65535 - 15) - val = 65535 - 15; - if (type == RTAX_HOPLIMIT && val > 255) - val = 255; - if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) - return -EINVAL; - fi->fib_metrics->metrics[type - 1] = val; - } - - if (ecn_ca) - fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; - - return 0; + return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len, + fi->fib_metrics->metrics); } struct fib_info *fib_create_info(struct fib_config *cfg, diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c new file mode 100644 index 000000000000..55baf3e251aa --- /dev/null +++ b/net/ipv4/metrics.c @@ -0,0 +1,56 @@ +#include +#include +#include +#include +#include +#include + +int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, + u32 *metrics) +{ + bool ecn_ca = false; + struct nlattr *nla; + int remaining; + + if (!fc_mx) + return 0; + + nla_for_each_attr(nla, fc_mx, 
fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (type > RTAX_MAX) + return -EINVAL; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); + if (val == TCP_CA_UNSPEC) + return -EINVAL; + } else { + if (nla_len(nla) != sizeof(u32)) + return -EINVAL; + + val = nla_get_u32(nla); + } + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; + metrics[type - 1] = val; + } + + if (ecn_ca) + metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + + return 0; +} +EXPORT_SYMBOL_GPL(ip_metrics_convert); From a03f88b549f5c2e4b2f104d3e5f1478cefcd750a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:08 -0700 Subject: [PATCH 0365/1640] UPSTREAM: net: Handle null dst in rtnl_put_cacheinfo Need to keep expires time for IPv6 routes in a dump of FIB entries. Update rtnl_put_cacheinfo to allow dst to be NULL in which case rta_cacheinfo will only contain non-dst data. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ed3c304ab418..ad6a31c485f5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -714,13 +714,15 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error) { struct rta_cacheinfo ci = { - .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse), - .rta_used = dst->__use, - .rta_clntref = atomic_read(&(dst->__refcnt)), .rta_error = error, .rta_id = id, }; + if (dst) { + ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); + ci.rta_used = dst->__use; + ci.rta_clntref = atomic_read(&dst->__refcnt); + } if (expires) { unsigned long clock; From 2b3e99ce6ad53b7c8d600c16192e734a100418d4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:16 -0700 Subject: [PATCH 0366/1640] BACKPORT: net/ipv6: move metrics from dst to rt6_info Similar to IPv4, add fib metrics to the fib struct, which at the moment is rt6_info. Will be moved to fib6_info in a later patch. Copy metrics into dst by reference using refcount. To make the transition: - add dst_metrics to rt6_info. Default to dst_default_metrics if no metrics are passed during route add. No need for a separate pmtu entry; it can reference the MTU slot in fib6_metrics - ip6_convert_metrics allocates memory in the FIB entry and uses ip_metrics_convert to copy from netlink attribute to metrics entry - the convert metrics call is done in ip6_route_info_create simplifying the route add path + fib6_commit_metrics and fib6_copy_metrics and the temporary mx6_config are no longer needed - add fib6_metric_set helper to change the value of a metric in the fib entry since dst_metric_set can no longer be used - cow_metrics for IPv6 can drop to dst_cow_metrics_generic - rt6_dst_from_metrics_check is no longer needed - rt6_fill_node needs the FIB entry and dst as separate arguments to keep compatibility with existing output. Current dst address is renamed to dest. 
(to be consistent with IPv4 rt6_fill_node really should be split into 2 functions similar to fib_dump_info and rt_fill_info) - rt6_fill_node no longer needs the temporary metrics variable Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 17 +-- net/core/dst.c | 1 + net/ipv6/ip6_fib.c | 66 ++++------- net/ipv6/ndisc.c | 10 +- net/ipv6/route.c | 257 ++++++++++++++++-------------------------- 5 files changed, 133 insertions(+), 218 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 3cc5a965f082..a62210baffb2 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -97,11 +97,6 @@ struct fib6_gc_args { #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif -struct mx6_config { - const u32 *mx; - DECLARE_BITMAP(mx_valid, RTAX_MAX); -}; - /* * routing information * @@ -179,7 +174,6 @@ struct rt6_info { struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 rt6i_metric; - u32 rt6i_pmtu; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; @@ -188,6 +182,8 @@ struct rt6_info { should_flush:1, unused:6; + struct dst_metrics *fib6_metrics; +#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct fib6_nh fib6_nh; }; @@ -392,8 +388,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); int fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, - struct netlink_ext_ack *extack); + struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct rt6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, @@ -422,6 +417,12 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb); void fib6_update_sernum(struct net *net, struct rt6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); +void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val); +static inline bool fib6_metric_locked(struct rt6_info *f6i, int metric) +{ + return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); +} + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); diff --git a/net/core/dst.c b/net/core/dst.c index 2cca136aa3e3..88f7ddc05013 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -58,6 +58,7 @@ const struct dst_metrics dst_default_metrics = { */ .refcnt = REFCOUNT_INIT(1), }; +EXPORT_SYMBOL(dst_default_metrics); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 04ce92505ff6..b5e3a6d1d05b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -578,6 +578,24 @@ out: return res; } +void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val) +{ + if (!f6i) + return; + + if (f6i->fib6_metrics == &dst_default_metrics) { + struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); + + if (!p) + return; + + refcount_set(&p->refcnt, 1); + f6i->fib6_metrics = p; + } + + f6i->fib6_metrics->metrics[metric - 1] = val; +} + /* * Routing Table * @@ -796,38 +814,6 @@ insert_above: return ln; } -static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) -{ - int i; - - for (i = 0; i < RTAX_MAX; i++) { - if (test_bit(i, mxc->mx_valid)) - mp[i] = mxc->mx[i]; - } -} - -static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc) -{ - if (!mxc->mx) - return 0; - - if (dst->flags & DST_HOST) 
{ - u32 *mp = dst_metrics_write_ptr(dst); - - if (unlikely(!mp)) - return -ENOMEM; - - fib6_copy_metrics(mp, mxc); - } else { - dst_init_metrics(dst, mxc->mx, false); - - /* We've stolen mx now. */ - mxc->mx = NULL; - } - - return 0; -} - static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, struct net *net) { @@ -861,7 +847,7 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, + struct nl_info *info, struct netlink_ext_ack *extack) { struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, @@ -917,7 +903,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, rt6_clean_expires(iter); else rt6_set_expires(iter, rt->dst.expires); - iter->rt6i_pmtu = rt->rt6i_pmtu; + fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, @@ -998,9 +984,6 @@ next_iter: add: nlflags |= NLM_F_CREATE; - err = fib6_commit_metrics(&rt->dst, mxc); - if (err) - return err; err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, @@ -1031,10 +1014,6 @@ add: return -ENOENT; } - err = fib6_commit_metrics(&rt->dst, mxc); - if (err) - return err; - err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); @@ -1131,8 +1110,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) */ int fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, - struct netlink_ext_ack *extack) + struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->rt6i_table; struct fib6_node *fn, *pn = NULL; @@ -1234,7 +1212,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, mxc, extack); + err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 93c9e00e09a3..b0ee10528927 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1337,9 +1337,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) ra_msg->icmph.icmp6_hop_limit) { if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; - if (rt) - dst_metric_set(&rt->dst, RTAX_HOPLIMIT, - ra_msg->icmph.icmp6_hop_limit); + fib6_metric_set(rt, RTAX_HOPLIMIT, + ra_msg->icmph.icmp6_hop_limit); } else { ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); } @@ -1491,10 +1490,7 @@ skip_routeinfo: ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { in6_dev->cnf.mtu6 = mtu; - - if (rt) - dst_metric_set(&rt->dst, RTAX_MTU, mtu); - + fib6_metric_set(rt, RTAX_MTU, mtu); rt6_mtu_change(skb->dev, mtu); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5254a899ea1b..8e10c2607789 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -98,12 +98,11 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, bool confirm_neigh); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static void rt6_dst_from_metrics_check(struct rt6_info *rt); static int rt6_score_route(struct rt6_info *rt, int oif, int strict); static size_t rt6_nlmsg_size(struct rt6_info *rt); -static int rt6_fill_node(struct net *net, - struct sk_buff *skb, struct rt6_info *rt, - 
struct in6_addr *dst, struct in6_addr *src, +static int rt6_fill_node(struct net *net, struct sk_buff *skb, + struct rt6_info *rt, struct dst_entry *dst, + struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, @@ -185,23 +184,6 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } } -static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) -{ - return dst_metrics_write_ptr(&rt->from->dst); -} - -static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) -{ - struct rt6_info *rt = (struct rt6_info *)dst; - - if (rt->rt6i_flags & RTF_PCPU) - return rt6_pcpu_cow_metrics(rt); - else if (rt->rt6i_flags & RTF_CACHE) - return NULL; - else - return dst_cow_metrics_generic(dst, old); -} - static inline const void *choose_neigh_daddr(struct rt6_info *rt, struct sk_buff *skb, const void *daddr) @@ -251,7 +233,7 @@ static struct dst_ops ip6_dst_ops_template = { .check = ip6_dst_check, .default_advmss = ip6_default_advmss, .mtu = ip6_mtu, - .cow_metrics = ipv6_cow_metrics, + .cow_metrics = dst_cow_metrics_generic, .destroy = ip6_dst_destroy, .ifdown = ip6_dst_ifdown, .negative_advice = ip6_negative_advice, @@ -356,6 +338,7 @@ static void rt6_info_init(struct rt6_info *rt) memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); INIT_LIST_HEAD(&rt->rt6i_siblings); INIT_LIST_HEAD(&rt->rt6i_uncached); + rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; } /* allocate dst with ip6_dst_ops */ @@ -408,6 +391,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) struct rt6_exception_bucket *bucket; struct rt6_info *from = rt->from; struct inet6_dev *idev; + struct dst_metrics *m; dst_destroy_metrics_generic(dst); free_percpu(rt->rt6i_pcpu); @@ -424,6 +408,10 @@ static void ip6_dst_destroy(struct dst_entry *dst) kfree(bucket); } + m = rt->fib6_metrics; + if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) + kfree(m); + rt->from = NULL; dst_release(&from->dst); } @@ -939,7 +927,11 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) rt->rt6i_flags &= ~RTF_EXPIRES; dst_hold(&from->dst); rt->from = from; - dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); + dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); + if (from->fib6_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + refcount_inc(&from->fib6_metrics->refcnt); + } } static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) @@ -1085,7 +1077,6 @@ EXPORT_SYMBOL(rt6_lookup); */ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, - struct mx6_config *mxc, struct netlink_ext_ack *extack) { int err; @@ -1093,7 +1084,7 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, table = rt->rt6i_table; spin_lock_bh(&table->tb6_lock); - err = fib6_add(&table->tb6_root, rt, info, mxc, extack); + err = fib6_add(&table->tb6_root, rt, info, extack); spin_unlock_bh(&table->tb6_lock); return err; @@ -1102,11 +1093,10 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, int ip6_ins_rt(struct net *net, struct rt6_info *rt) { struct nl_info info = { .nl_net = net, }; - struct mx6_config mxc = { .mx = NULL, }; /* Hold dst to account for the reference from the fib6 tree */ dst_hold(&rt->dst); - return __ip6_ins_rt(rt, &info, &mxc, NULL); + return __ip6_ins_rt(rt, &info, NULL); } static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, @@ -1177,8 +1167,8 @@ static 
struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) p = this_cpu_ptr(rt->rt6i_pcpu); pcpu_rt = *p; - if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false)) - rt6_dst_from_metrics_check(pcpu_rt); + if (pcpu_rt) + ip6_hold_safe(NULL, &pcpu_rt, false); return pcpu_rt; } @@ -1207,7 +1197,6 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, pcpu_rt = prev; } - rt6_dst_from_metrics_check(pcpu_rt); return pcpu_rt; } @@ -1335,6 +1324,16 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, return NULL; } +static unsigned int fib6_mtu(const struct rt6_info *rt) +{ + unsigned int mtu; + + mtu = rt->fib6_pmtu ? : rt->rt6i_idev->cnf.mtu6; + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); + + return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); +} + static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_info *ort) { @@ -1387,7 +1386,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, * Only insert this exception route if its mtu * is less than ort's mtu value. */ - if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) { + if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { err = -EINVAL; goto out; } @@ -1624,12 +1623,12 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, struct rt6_info *entry = rt6_ex->rt6i; /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected - * route), the metrics of its rt->dst.from have already + * route), the metrics of its rt->from have already * been updated. */ - if (entry->rt6i_pmtu && + if (dst_metric_raw(&entry->dst, RTAX_MTU) && rt6_mtu_change_route_allowed(idev, entry, mtu)) - entry->rt6i_pmtu = mtu; + dst_metric_set(&entry->dst, RTAX_MTU, mtu); } bucket++; } @@ -1780,10 +1779,9 @@ redo_rt6_select: trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; } else if (rt->rt6i_flags & RTF_CACHE) { - if (ip6_hold_safe(net, &rt, true)) { + if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); - rt6_dst_from_metrics_check(rt); - } + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; @@ -2053,13 +2051,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ -static void rt6_dst_from_metrics_check(struct rt6_info *rt) -{ - if (rt->from && - dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst)) - dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true); -} - static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { u32 rt_cookie = 0; @@ -2094,8 +2085,6 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * into this function always. 
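The ownership rule these route.c hunks implement, in miniature: the FIB entry holds one reference on its dst_metrics block, rt6_set_from() above lets each cached dst borrow the same array and take a reference (marking it DST_METRICS_REFCOUNTED), and the last put frees it, except for the static, shared dst_default_metrics which is never freed. The release side restated (mirrors the new ip6_dst_destroy() logic):

static void demo_metrics_put(struct dst_metrics *m)
{
	/* the preallocated default block is shared and never freed */
	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
		kfree(m);
}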
*/ - rt6_dst_from_metrics_check(rt); - if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) return rt6_dst_from_check(rt, cookie); @@ -2148,8 +2137,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) { struct net *net = dev_net(rt->dst.dev); + dst_metric_set(&rt->dst, RTAX_MTU, mtu); rt->rt6i_flags |= RTF_MODIFIED; - rt->rt6i_pmtu = mtu; rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); } @@ -2198,10 +2187,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, } else if (daddr) { struct rt6_info *nrt6; - nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); + nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - if (rt6_insert_exception(nrt6, rt6)) + if (rt6_insert_exception(nrt6, rt6->from)) dst_release_immediate(&nrt6->dst); } } @@ -2430,12 +2419,8 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) static unsigned int ip6_mtu(const struct dst_entry *dst) { - const struct rt6_info *rt = (const struct rt6_info *)dst; - unsigned int mtu = rt->rt6i_pmtu; struct inet6_dev *idev; - - if (mtu) - goto out; + unsigned int mtu; mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) @@ -2520,60 +2505,24 @@ out: atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); } -static int ip6_convert_metrics(struct mx6_config *mxc, - const struct fib6_config *cfg) +static int ip6_convert_metrics(struct net *net, struct rt6_info *rt, + struct fib6_config *cfg) { - struct net *net = cfg->fc_nlinfo.nl_net; - bool ecn_ca = false; - struct nlattr *nla; - int remaining; - u32 *mp; + int err = 0; - if (!cfg->fc_mx) - return 0; + if (cfg->fc_mx) { + rt->fib6_metrics = kzalloc(sizeof(*rt->fib6_metrics), + GFP_KERNEL); + if (unlikely(!rt->fib6_metrics)) + return -ENOMEM; - mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); - if (unlikely(!mp)) - return -ENOMEM; + refcount_set(&rt->fib6_metrics->refcnt, 1); - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - u32 val; - - if (!type) - continue; - if (unlikely(type > RTAX_MAX)) - goto err; - - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); - if (val == TCP_CA_UNSPEC) - goto err; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_HOPLIMIT && val > 255) - val = 255; - if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) - goto err; - - mp[type - 1] = val; - __set_bit(type - 1, mxc->mx_valid); + err = ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, + rt->fib6_metrics->metrics); } - if (ecn_ca) { - __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); - mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; - } - - mxc->mx = mp; - return 0; - err: - kfree(mp); - return -EINVAL; + return err; } static struct rt6_info *ip6_nh_lookup_table(struct net *net, @@ -2840,6 +2789,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + err = ip6_convert_metrics(net, rt, cfg); + if (err < 0) + goto out; + if (cfg->fc_flags & RTF_EXPIRES) rt6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); @@ -2982,32 +2935,16 @@ out: return ERR_PTR(err); } -int ip6_route_add(struct fib6_config *cfg, - struct netlink_ext_ack *extack) +int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct mx6_config mxc = { .mx = NULL, }; struct rt6_info *rt; int err; rt = ip6_route_info_create(cfg, extack); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - 
rt = NULL; - goto out; - } + if (IS_ERR(rt)) + return PTR_ERR(rt); - err = ip6_convert_metrics(&mxc, cfg); - if (err) - goto out; - - err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack); - - kfree(mxc.mx); - - return err; -out: - if (rt) - dst_release_immediate(&rt->dst); + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); return err; } @@ -3061,7 +2998,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) if (skb) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; - if (rt6_fill_node(net, skb, rt, + if (rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) { kfree_skb(skb); @@ -3252,7 +3189,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * a cached route because rt6_insert_exception() will * takes care of it */ - if (rt6_insert_exception(nrt, rt)) { + if (rt6_insert_exception(nrt, rt->from)) { dst_release_immediate(&nrt->dst); goto out; } @@ -3892,11 +3829,14 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) update PMTU increase is a MUST. (i.e. jumbo frame) */ if (rt->fib6_nh.nh_dev == arg->dev && - !dst_metric_locked(&rt->dst, RTAX_MTU)) { + !fib6_metric_locked(rt, RTAX_MTU)) { + u32 mtu = rt->fib6_pmtu; + + if (mtu >= arg->mtu || + (mtu < arg->mtu && mtu == idev->cnf.mtu6)) + fib6_metric_set(rt, RTAX_MTU, arg->mtu); + spin_lock_bh(&rt6_exception_lock); - if (dst_metric_raw(&rt->dst, RTAX_MTU) && - rt6_mtu_change_route_allowed(idev, rt, arg->mtu)) - dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); rt6_exceptions_update_pmtu(idev, rt, arg->mtu); spin_unlock_bh(&rt6_exception_lock); } @@ -4063,7 +4003,6 @@ errout: struct rt6_nh { struct rt6_info *rt6_info; struct fib6_config r_cfg; - struct mx6_config mxc; struct list_head next; }; @@ -4078,7 +4017,8 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) } } -static int ip6_route_info_append(struct list_head *rt6_nh_list, +static int ip6_route_info_append(struct net *net, + struct list_head *rt6_nh_list, struct rt6_info *rt, struct fib6_config *r_cfg) { struct rt6_nh *nh; @@ -4094,7 +4034,7 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list, if (!nh) return -ENOMEM; nh->rt6_info = rt; - err = ip6_convert_metrics(&nh->mxc, r_cfg); + err = ip6_convert_metrics(net, rt, r_cfg); if (err) { kfree(nh); return err; @@ -4201,7 +4141,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; - err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); + err = ip6_route_info_append(info->nl_net, &rt6_nh_list, + rt, &r_cfg); if (err) { dst_release_immediate(&rt->dst); goto cleanup; @@ -4218,7 +4159,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { - err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); + err = __ip6_ins_rt(nh->rt6_info, info, extack); if (!err) { /* save reference to last route successfully inserted */ @@ -4276,7 +4217,6 @@ cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { if (nh->rt6_info) dst_release_immediate(&nh->rt6_info->dst); - kfree(nh->mxc.mx); list_del(&nh->next); kfree(nh); } @@ -4457,16 +4397,16 @@ nla_put_failure: return -EMSGSIZE; } -static int rt6_fill_node(struct net *net, - struct sk_buff *skb, struct rt6_info *rt, - struct in6_addr *dst, struct in6_addr *src, +static int rt6_fill_node(struct net *net, struct sk_buff *skb, + struct rt6_info *rt, struct dst_entry *dst, + struct in6_addr *dest, struct in6_addr *src, int iif, 
int type, u32 portid, u32 seq, unsigned int flags) { - u32 metrics[RTAX_MAX]; struct rtmsg *rtm; struct nlmsghdr *nlh; - long expires; + long expires = 0; + u32 *pmetrics; u32 table; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); @@ -4494,8 +4434,8 @@ static int rt6_fill_node(struct net *net, if (rt->rt6i_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; - if (dst) { - if (nla_put_in6_addr(skb, RTA_DST, dst)) + if (dest) { + if (nla_put_in6_addr(skb, RTA_DST, dest)) goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) @@ -4523,9 +4463,9 @@ static int rt6_fill_node(struct net *net, #endif if (nla_put_u32(skb, RTA_IIF, iif)) goto nla_put_failure; - } else if (dst) { + } else if (dest) { struct in6_addr saddr_buf; - if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && + if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } @@ -4537,10 +4477,8 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; } - memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); - if (rt->rt6i_pmtu) - metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; - if (rtnetlink_put_metrics(skb, metrics) < 0) + pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; + if (rtnetlink_put_metrics(skb, pmetrics) < 0) goto nla_put_failure; if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) @@ -4572,9 +4510,10 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; } - expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; + if (rt->rt6i_flags & RTF_EXPIRES && dst) + expires = dst->expires - jiffies; - if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) + if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? 
dst->error : 0) < 0) goto nla_put_failure; if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) @@ -4608,10 +4547,9 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) } } - return rt6_fill_node(net, - arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, - NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, - NLM_F_MULTI); + return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, + RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, + arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); } static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, @@ -4725,13 +4663,14 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb_dst_set(skb, &rt->dst); if (fibmatch) - err = rt6_fill_node(net, skb, rt, NULL, NULL, iif, + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else - err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, - RTM_NEWROUTE, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0); + err = rt6_fill_node(net, skb, rt, dst, &fl6.daddr, &fl6.saddr, + iif, RTM_NEWROUTE, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, + 0); if (err < 0) { kfree_skb(skb); goto errout; @@ -4757,8 +4696,8 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, if (!skb) goto errout; - err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->portid, seq, nlm_flags); + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, + event, info->portid, seq, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); From 0180207cf38218949bc4bb2239fd56a6c3eace8c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:17 -0700 Subject: [PATCH 0367/1640] UPSTREAM: net/ipv6: move expires into rt6_info Add expires to rt6_info for FIB entries, and add fib6 helpers to manage it. Data path use of dst.expires remains. The transition is fairly straightforward: when working with fib entries, rt->dst.expires is just rt->expires, rt6_clean_expires is replaced with fib6_clean_expires, rt6_set_expires becomes fib6_set_expires, and rt6_check_expired becomes fib6_check_expired, where the fib6 versions are added by this patch. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 27 +++++++++++++++++++++++---- net/ipv6/addrconf.c | 6 +++--- net/ipv6/ip6_fib.c | 8 ++++---- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 20 +++++++++++--------- 5 files changed, 42 insertions(+), 21 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index a62210baffb2..bb674beb8189 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -182,6 +182,7 @@ struct rt6_info { should_flush:1, unused:6; + unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct fib6_nh fib6_nh; @@ -200,6 +201,26 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) return ((struct rt6_info *)dst)->rt6i_idev; } +static inline void fib6_clean_expires(struct rt6_info *f6i) +{ + f6i->rt6i_flags &= ~RTF_EXPIRES; + f6i->expires = 0; +} + +static inline void fib6_set_expires(struct rt6_info *f6i, + unsigned long expires) +{ + f6i->expires = expires; + f6i->rt6i_flags |= RTF_EXPIRES; +} + +static inline bool fib6_check_expired(const struct rt6_info *f6i) +{ + if (f6i->rt6i_flags & RTF_EXPIRES) + return time_after(jiffies, f6i->expires); + return false; +} + static inline void rt6_clean_expires(struct rt6_info *rt) { rt->rt6i_flags &= ~RTF_EXPIRES; @@ -214,11 +235,9 @@ static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires) static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) { - struct rt6_info *rt; + if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) + rt0->dst.expires = rt0->from->expires; - for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); rt = rt->from); - if (rt && rt != rt0) - rt0->dst.expires = rt->dst.expires; dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b4da2c755d89..a46bc7191355 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1164,7 +1164,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r ip6_del_rt(dev_net(ifp->idev->dev), rt); else { if (!(rt->rt6i_flags & RTF_EXPIRES)) - rt6_set_expires(rt, expires); + fib6_set_expires(rt, expires); ip6_rt_put(rt); } } @@ -2670,9 +2670,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ - rt6_set_expires(rt, jiffies + rt_expires); + fib6_set_expires(rt, jiffies + rt_expires); } else { - rt6_clean_expires(rt); + fib6_clean_expires(rt); } } else if (valid_lft) { clock_t expires = 0; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b5e3a6d1d05b..26b0f8edf0aa 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -900,9 +900,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (!(iter->rt6i_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->rt6i_flags & RTF_EXPIRES)) - rt6_clean_expires(iter); + fib6_clean_expires(iter); else - rt6_set_expires(iter, rt->dst.expires); + fib6_set_expires(iter, rt->expires); fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } @@ -1959,8 +1959,8 @@ static int fib6_age(struct rt6_info *rt, void *arg) * Routes are expired even if they are in use. 
*/ - if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { - if (time_after(now, rt->dst.expires)) { + if (rt->rt6i_flags & RTF_EXPIRES && rt->expires) { + if (time_after(now, rt->expires)) { RT6_TRACE("expiring %p\n", rt); return -1; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b0ee10528927..8aeb50ac4bd5 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1332,7 +1332,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } if (rt) - rt6_set_expires(rt, jiffies + (HZ * lifetime)); + fib6_set_expires(rt, jiffies + (HZ * lifetime)); if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && ra_msg->icmph.icmp6_hop_limit) { if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8e10c2607789..598ebe7087c9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -448,7 +448,7 @@ static bool rt6_check_expired(const struct rt6_info *rt) return true; } else if (rt->from) { return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || - rt6_check_expired(rt->from); + fib6_check_expired(rt->from); } return false; } @@ -699,7 +699,7 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; - if (rt6_check_expired(rt)) + if (fib6_check_expired(rt)) goto out; m = rt6_score_route(rt, oif, strict); @@ -883,9 +883,9 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, if (rt) { if (!addrconf_finite_timeout(lifetime)) - rt6_clean_expires(rt); + fib6_clean_expires(rt); else - rt6_set_expires(rt, jiffies + HZ * lifetime); + fib6_set_expires(rt, jiffies + HZ * lifetime); ip6_rt_put(rt); } @@ -2280,7 +2280,7 @@ restart: for_each_fib6_node_rt_rcu(fn) { if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) continue; - if (rt6_check_expired(rt)) + if (fib6_check_expired(rt)) continue; if (rt->dst.error) break; @@ -2794,10 +2794,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; if (cfg->fc_flags & RTF_EXPIRES) - rt6_set_expires(rt, jiffies + + fib6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); else - rt6_clean_expires(rt); + fib6_clean_expires(rt); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; @@ -4510,8 +4510,10 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; } - if (rt->rt6i_flags & RTF_EXPIRES && dst) - expires = dst->expires - jiffies; + if (rt->rt6i_flags & RTF_EXPIRES) { + expires = dst ? dst->expires : rt->expires; + expires -= jiffies; + } if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) goto nla_put_failure; From 1d36ef7c80e02bbcd25586c23f4ac94ab56afc7d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:15 -0700 Subject: [PATCH 0368/1640] UPSTREAM: net/ipv6: Defer initialization of dst to data path Defer setting dst input, output and error until fib entry is copied. The reject path from ip6_route_info_create is moved to a new function ip6_rt_init_dst_reject with a helper doing the conversion from fib6_type to dst error. The remainder of the new ip6_rt_init_dst is an amalgamation of dst code from addrconf_dst_alloc and the non-reject path of ip6_route_info_create. The dst output function is always ip6_output and the input function is either ip6_input (local routes), ip6_mc_input (multicast routes) or ip6_forward (anything else). A couple of places using dst.error are updated to look at rt6i_flags. Signed-off-by: David Ahern Signed-off-by: David S.
Miller --- net/ipv6/route.c | 115 ++++++++++++++++++++++++++++++----------------- 1 file changed, 74 insertions(+), 41 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 598ebe7087c9..fd4f81d976b9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -920,6 +920,75 @@ static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) return dev; } +static const int fib6_prop[RTN_MAX + 1] = { + [RTN_UNSPEC] = 0, + [RTN_UNICAST] = 0, + [RTN_LOCAL] = 0, + [RTN_BROADCAST] = 0, + [RTN_ANYCAST] = 0, + [RTN_MULTICAST] = 0, + [RTN_BLACKHOLE] = -EINVAL, + [RTN_UNREACHABLE] = -EHOSTUNREACH, + [RTN_PROHIBIT] = -EACCES, + [RTN_THROW] = -EAGAIN, + [RTN_NAT] = -EINVAL, + [RTN_XRESOLVE] = -EINVAL, +}; + +static int ip6_rt_type_to_error(u8 fib6_type) +{ + return fib6_prop[fib6_type]; +} + +static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) +{ + rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); + + switch (ort->fib6_type) { + case RTN_BLACKHOLE: + rt->dst.output = dst_discard_out; + rt->dst.input = dst_discard; + break; + case RTN_PROHIBIT: + rt->dst.output = ip6_pkt_prohibit_out; + rt->dst.input = ip6_pkt_prohibit; + break; + case RTN_THROW: + case RTN_UNREACHABLE: + default: + rt->dst.output = ip6_pkt_discard_out; + rt->dst.input = ip6_pkt_discard; + break; + } +} + +static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) +{ + if (ort->rt6i_flags & RTF_REJECT) { + ip6_rt_init_dst_reject(rt, ort); + return; + } + + rt->dst.error = 0; + rt->dst.output = ip6_output; + + if (ort->fib6_type == RTN_LOCAL) { + rt->dst.flags |= DST_HOST; + rt->dst.input = ip6_input; + } else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) { + rt->dst.input = ip6_mc_input; + } else { + rt->dst.input = ip6_forward; + } + + if (ort->fib6_nh.nh_lwtstate) { + rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); + lwtunnel_set_redirect(&rt->dst); + } + + rt->dst.lastuse = jiffies; +} + static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { BUG_ON(from->from); @@ -936,14 +1005,12 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) { - rt->dst.input = ort->dst.input; - rt->dst.output = ort->dst.output; + ip6_rt_init_dst(rt, ort); + rt->rt6i_dst = ort->rt6i_dst; - rt->dst.error = ort->dst.error; rt->rt6i_idev = ort->rt6i_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); - rt->dst.lastuse = jiffies; rt->rt6i_gateway = ort->fib6_nh.nh_gw; rt->rt6i_flags = ort->rt6i_flags; rt6_set_from(rt, ort); @@ -2282,7 +2349,7 @@ restart: continue; if (fib6_check_expired(rt)) continue; - if (rt->dst.error) + if (rt->rt6i_flags & RTF_REJECT) break; if (!(rt->rt6i_flags & RTF_GATEWAY)) continue; @@ -2310,7 +2377,7 @@ restart: if (!rt) rt = net->ipv6.ip6_null_entry; - else if (rt->dst.error) { + else if (rt->rt6i_flags & RTF_REJECT) { rt = net->ipv6.ip6_null_entry; goto out; } @@ -2805,15 +2872,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, addr_type = ipv6_addr_type(&cfg->fc_dst); - if (addr_type & IPV6_ADDR_MULTICAST) - rt->dst.input = ip6_mc_input; - else if (cfg->fc_flags & RTF_LOCAL) - rt->dst.input = ip6_input; - else - rt->dst.input = ip6_forward; - - rt->dst.output = ip6_output; - if (cfg->fc_encap) { struct lwtunnel_state *lwtstate; @@ -2823,7 +2881,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (err) goto out; rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); - lwtunnel_set_redirect(&rt->dst); 
} ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); @@ -2863,27 +2920,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, } } rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; - switch (cfg->fc_type) { - case RTN_BLACKHOLE: - rt->dst.error = -EINVAL; - rt->dst.output = dst_discard_out; - rt->dst.input = dst_discard; - break; - case RTN_PROHIBIT: - rt->dst.error = -EACCES; - rt->dst.output = ip6_pkt_prohibit_out; - rt->dst.input = ip6_pkt_prohibit; - break; - case RTN_THROW: - case RTN_UNREACHABLE: - default: - rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN - : (cfg->fc_type == RTN_UNREACHABLE) - ? -EHOSTUNREACH : -ENETUNREACH; - rt->dst.output = ip6_pkt_discard_out; - rt->dst.input = ip6_pkt_discard; - break; - } goto install_route; } @@ -3467,12 +3503,9 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, return ERR_PTR(-ENOMEM); in6_dev_hold(idev); - - rt->dst.flags |= DST_HOST; - rt->dst.input = ip6_input; - rt->dst.output = ip6_output; rt->rt6i_idev = idev; + rt->dst.flags |= DST_HOST; rt->rt6i_protocol = RTPROT_KERNEL; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) { From 6ab1f7865c7800a93db66b40671920291a39f768 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:19 -0700 Subject: [PATCH 0369/1640] UPSTREAM: net/ipv6: Add rt6_info create function for ip6_pol_route_lookup ip6_pol_route_lookup is the lookup function for ip6_route_lookup and rt6_lookup. At the moment it returns either a reference to a FIB entry or a cached exception. To move FIB entries to a separate struct, this lookup function needs to convert FIB entries to an rt6_info that is returned to the caller. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fd4f81d976b9..19216c05e1b5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1058,6 +1058,19 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, return false; } +/* called with rcu_lock held */ +static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt) +{ + struct net_device *dev = rt->fib6_nh.nh_dev; + struct rt6_info *nrt; + + nrt = __ip6_dst_alloc(dev_net(dev), dev, 0); + if (nrt) + ip6_rt_copy_init(nrt, rt); + + return nrt; +} + static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, @@ -1090,18 +1103,26 @@ restart: } /* Search through exception table */ rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); - if (rt_cache) + if (rt_cache) { rt = rt_cache; + if (ip6_hold_safe(net, &rt, true)) + dst_use_noref(&rt->dst, jiffies); + } else if (dst_hold_safe(&rt->dst)) { + struct rt6_info *nrt; - if (ip6_hold_safe(net, &rt, true)) - dst_use_noref(&rt->dst, jiffies); + nrt = ip6_create_rt_rcu(rt); + dst_release(&rt->dst); + rt = nrt; + } else { + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; - } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, From ee697ffac375df93215a268b748da3ac1b25c439 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:20 -0700 Subject: [PATCH 0370/1640] UPSTREAM: net/ipv6: Move dst flags to booleans in fib entries Continuing to wean FIB paths off of dst_entry, use a bool to hold requests for certain dst settings. 
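As a rough sketch of the idea (not the hunks themselves, which follow below), the FIB entry carries request bits that are converted to DST_* flags only at the moment a dst_entry is created from it:

	/* Sketch: request bits kept on the FIB entry ... */
	u8 dst_nocount:1,	/* request DST_NOCOUNT on the generated dst */
	   dst_nopolicy:1,	/* request DST_NOPOLICY */
	   dst_host:1;		/* request DST_HOST */

	/* ... converted once, when a dst_entry is built */
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;
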
Add a helper to convert the flags to DST flags when a FIB entry is converted to a dst_entry. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 5 ++++- net/ipv6/addrconf.c | 4 ++-- net/ipv6/route.c | 29 ++++++++++++++++++++++++----- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index bb674beb8189..22f8c2f64149 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -180,7 +180,10 @@ struct rt6_info { u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, - unused:6; + dst_nocount:1, + dst_nopolicy:1, + dst_host:1, + unused:3; unsigned long expires; struct dst_metrics *fib6_metrics; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a46bc7191355..751c35be2fd7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1026,7 +1026,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, if (net->ipv6.devconf_all->disable_policy || idev->cnf.disable_policy) - rt->dst.flags |= DST_NOPOLICY; + rt->dst_nopolicy = true; neigh_parms_data_state_setall(idev->nd_parms); @@ -6008,7 +6008,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) int cpu; rcu_read_lock(); - addrconf_set_nopolicy(ifa->rt, val); + ifa->rt->dst_nopolicy = val ? true : false; if (rt->rt6i_pcpu) { for_each_possible_cpu(cpu) { struct rt6_info **rtp; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 19216c05e1b5..3b199f6963b9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -940,6 +940,20 @@ static int ip6_rt_type_to_error(u8 fib6_type) return fib6_prop[fib6_type]; } +static unsigned short fib6_info_dst_flags(struct rt6_info *rt) +{ + unsigned short flags = 0; + + if (rt->dst_nocount) + flags |= DST_NOCOUNT; + if (rt->dst_nopolicy) + flags |= DST_NOPOLICY; + if (rt->dst_host) + flags |= DST_HOST; + + return flags; +} + static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) { rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); @@ -964,6 +978,8 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) { + rt->dst.flags |= fib6_info_dst_flags(ort); + if (ort->rt6i_flags & RTF_REJECT) { ip6_rt_init_dst_reject(rt, ort); return; @@ -973,7 +989,6 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) rt->dst.output = ip6_output; if (ort->fib6_type == RTN_LOCAL) { - rt->dst.flags |= DST_HOST; rt->dst.input = ip6_input; } else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; @@ -1061,10 +1076,11 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, /* called with rcu_lock held */ static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt) { + unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev = rt->fib6_nh.nh_dev; struct rt6_info *nrt; - nrt = __ip6_dst_alloc(dev_net(dev), dev, 0); + nrt = __ip6_dst_alloc(dev_net(dev), dev, flags); if (nrt) ip6_rt_copy_init(nrt, rt); @@ -1232,12 +1248,13 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) { + unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev; struct rt6_info *pcpu_rt; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(rt); - pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags); + pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); if (!pcpu_rt) return NULL; @@ -2907,7 +2924,7 @@ static 
struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; if (rt->rt6i_dst.plen == 128) - rt->dst.flags |= DST_HOST; + rt->dst_host = true; #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); @@ -3523,10 +3540,12 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, if (!rt) return ERR_PTR(-ENOMEM); + rt->dst_nocount = true; + in6_dev_hold(idev); rt->rt6i_idev = idev; - rt->dst.flags |= DST_HOST; + rt->dst_host = true; rt->rt6i_protocol = RTPROT_KERNEL; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) { From 46c84432561e0769ddb389f682f56d2441090855 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:24 -0700 Subject: [PATCH 0371/1640] UPSTREAM: net/ipv6: introduce fib6_info struct and helpers Add fib6_info struct and alloc, destroy, hold and release helpers. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 55 +++++++++++++++++++++++++++++++++++++++ net/ipv6/ip6_fib.c | 60 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 22f8c2f64149..9f3d3f9ebceb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -41,6 +41,7 @@ #endif struct rt6_info; +struct fib6_info; struct fib6_config { u32 fc_table; @@ -135,6 +136,46 @@ struct fib6_nh { int nh_weight; }; +struct fib6_info { + struct fib6_table *rt6i_table; + struct fib6_info __rcu *rt6_next; + struct fib6_node __rcu *rt6i_node; + + /* Multipath routes: + * siblings is a list of fib6_info that have the same metric/weight, + * destination, but not the same gateway. nsiblings is just a cache + * to speed up lookup.
+ */ + struct list_head rt6i_siblings; + unsigned int rt6i_nsiblings; + + atomic_t rt6i_ref; + struct inet6_dev *rt6i_idev; + unsigned long expires; + struct dst_metrics *fib6_metrics; +#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] + + struct rt6key rt6i_dst; + u32 rt6i_flags; + struct rt6key rt6i_src; + struct rt6key rt6i_prefsrc; + + struct rt6_info * __percpu *rt6i_pcpu; + struct rt6_exception_bucket __rcu *rt6i_exception_bucket; + + u32 rt6i_metric; + u8 rt6i_protocol; + u8 fib6_type; + u8 exception_bucket_flushed:1, + should_flush:1, + dst_nocount:1, + dst_nopolicy:1, + dst_host:1, + unused:3; + + struct fib6_nh fib6_nh; +}; + struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; @@ -293,6 +334,20 @@ static inline void ip6_rt_put(struct rt6_info *rt) void rt6_free_pcpu(struct rt6_info *non_pcpu_rt); +struct rt6_info *fib6_info_alloc(gfp_t gfp_flags); +void fib6_info_destroy(struct rt6_info *f6i); + +static inline void fib6_info_hold(struct rt6_info *f6i) +{ + atomic_inc(&f6i->rt6i_ref); +} + +static inline void fib6_info_release(struct rt6_info *f6i) +{ + if (f6i && atomic_dec_and_test(&f6i->rt6i_ref)) + fib6_info_destroy(f6i); +} + static inline void rt6_hold(struct rt6_info *rt) { atomic_inc(&rt->rt6i_ref); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 26b0f8edf0aa..5922730c363b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -145,6 +145,66 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } +struct rt6_info *fib6_info_alloc(gfp_t gfp_flags) +{ + struct rt6_info *f6i; + + f6i = kzalloc(sizeof(*f6i), gfp_flags); + if (!f6i) + return NULL; + + f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); + if (!f6i->rt6i_pcpu) { + kfree(f6i); + return NULL; + } + + INIT_LIST_HEAD(&f6i->rt6i_siblings); + f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; + + atomic_inc(&f6i->rt6i_ref); + + return f6i; +} + +void fib6_info_destroy(struct rt6_info *f6i) +{ + struct rt6_exception_bucket *bucket; + + WARN_ON(f6i->rt6i_node); + + bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); + if (bucket) { + f6i->rt6i_exception_bucket = NULL; + kfree(bucket); + } + + if (f6i->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } + } + + if (f6i->rt6i_idev) + in6_dev_put(f6i->rt6i_idev); + if (f6i->fib6_nh.nh_dev) + dev_put(f6i->fib6_nh.nh_dev); + + kfree(f6i); +} +EXPORT_SYMBOL_GPL(fib6_info_destroy); + static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; From a582e9811c02be3050698d5fe7a273363110a7b0 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 8 Jan 2018 10:34:00 -0800 Subject: [PATCH 0372/1640] UPSTREAM: ipv6: remove null_entry before adding default route In the current code, when creating a new fib6 table, tb6_root.leaf gets initialized to net->ipv6.ip6_null_entry. If a default route is being added with rt->rt6i_metric = 0xffffffff, fib6_add() will add this route after net->ipv6.ip6_null_entry. As null_entry is shared, it could cause problems. In order to fix it, set fn->leaf to NULL before calling fib6_add_rt2node() when trying to add the first default route. And reset fn->leaf to null_entry when adding fails or when deleting the last default route.
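In rough outline (a sketch of the invariant, not the literal hunks, which appear below), the table root is special-cased so that its leaf is either the shared null_entry placeholder or a route that belongs to this table alone:

	/* Sketch: when the first default route is inserted, clear the
	 * placeholder so fib6_add_rt2node() does not chain the route
	 * behind the shared null_entry.
	 */
	if (fn->fn_flags & RTN_TL_ROOT &&
	    rcu_access_pointer(fn->leaf) == net->ipv6.ip6_null_entry)
		RCU_INIT_POINTER(fn->leaf, NULL);

	/* Sketch: on insert failure, or when the last default route is
	 * deleted, restore the placeholder instead of repairing the tree.
	 */
	if (fn->fn_flags & RTN_TL_ROOT)
		rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry);
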
syzkaller reported the following issue which is fixed by this commit: WARNING: suspicious RCU usage 4.15.0-rc5+ #171 Not tainted ----------------------------- net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 4 locks held by swapper/0/0: #0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline] #0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310 #1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline] #1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007 #2: (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560 #3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline] #3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948 stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585 fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701 fib6_clean_node+0x3aa/0x4f0 net/ipv6/ip6_fib.c:1892 fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815 fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863 fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933 __fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949 fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline] fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016 fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033 call_timer_fn+0x228/0x820 kernel/time/timer.c:1320 expire_timers kernel/time/timer.c:1357 [inline] __run_timers+0x7ee/0xb70 kernel/time/timer.c:1660 run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686 __do_softirq+0x2d7/0xb85 kernel/softirq.c:285 invoke_softirq kernel/softirq.c:365 [inline] irq_exit+0x1cc/0x200 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:540 [inline] smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052 apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904 Reported-by: syzbot Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5922730c363b..607066c1ad33 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -715,6 +715,11 @@ static struct fib6_node *fib6_add_1(struct net *net, if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); rt6_release(leaf); + /* remove null_entry in the root node */ + } else if (fn->fn_flags & RTN_TL_ROOT && + rcu_access_pointer(fn->leaf) == + net->ipv6.ip6_null_entry) { + RCU_INIT_POINTER(fn->leaf, NULL); } return fn; @@ -1309,13 +1314,17 @@ out: return err; failure: - /* fn->leaf could be NULL if fn is an intermediate node and we - * failed to add the new route to it in both subtree creation - * failure and fib6_add_rt2node() failure case. - * In both cases, fib6_repair_tree() should be called to fix - * fn->leaf. 
+ /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: + * 1. fn is an intermediate node and we failed to add the new + * route to it in both subtree creation failure and fib6_add_rt2node() + * failure case. + * 2. fn is the root node in the table and we fail to add the first + * default route to it. */ - if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) + if (fn && + (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || + (fn->fn_flags & RTN_TL_ROOT && + !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function @@ -1567,6 +1576,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_walker *w; int iter = 0; + /* Set fn->leaf to null_entry for root node. */ + if (fn->fn_flags & RTN_TL_ROOT) { + rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry); + return fn; + } + for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); @@ -1722,10 +1737,15 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, } read_unlock(&net->ipv6.fib6_walker_lock); - /* If it was last route, expunge its radix tree node */ + /* If it was last route, call fib6_repair_tree() to: + * 1. For root node, put back null_entry as how the table was created. + * 2. For other nodes, expunge its radix tree node. + */ if (!rcu_access_pointer(fn->leaf)) { - fn->fn_flags &= ~RTN_RTINFO; - net->ipv6.rt6_stats->fib_route_nodes--; + if (!(fn->fn_flags & RTN_TL_ROOT)) { + fn->fn_flags &= ~RTN_RTINFO; + net->ipv6.rt6_stats->fib_route_nodes--; + } fn = fib6_repair_tree(net, table, fn); } From a4464250b0d7e0212c1572ad49958457c3a7ab1e Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 18 Jan 2018 10:40:03 -0800 Subject: [PATCH 0373/1640] UPSTREAM: ipv6: don't let tb6_root node share routes with other node After commit 4512c43eac7e, if we add a route to the subtree of tb6_root which does not have any route attached to it yet, the current code will let tb6_root and the node in the subtree share the same route. This could cause problems because tb6_root has the RTN_RTINFO flag marked and the tree repair and clean-up code will not work properly. This commit makes sure tb6_root->leaf points back to null_entry instead of sharing a route with another node.
It fixes the following syzkaller reported issue: BUG: KASAN: use-after-free in ipv6_prefix_equal include/net/ipv6.h:540 [inline] BUG: KASAN: use-after-free in fib6_add_1+0x165f/0x1790 net/ipv6/ip6_fib.c:618 Read of size 8 at addr ffff8801bc043498 by task syz-executor5/19819 CPU: 1 PID: 19819 Comm: syz-executor5 Not tainted 4.15.0-rc7+ #186 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430 ipv6_prefix_equal include/net/ipv6.h:540 [inline] fib6_add_1+0x165f/0x1790 net/ipv6/ip6_fib.c:618 fib6_add+0x5fa/0x1540 net/ipv6/ip6_fib.c:1214 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1003 ip6_route_add+0x141/0x190 net/ipv6/route.c:2790 ipv6_route_ioctl+0x4db/0x6b0 net/ipv6/route.c:3299 inet6_ioctl+0xef/0x1e0 net/ipv6/af_inet6.c:520 sock_do_ioctl+0x65/0xb0 net/socket.c:958 sock_ioctl+0x2c2/0x440 net/socket.c:1055 vfs_ioctl fs/ioctl.c:46 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:686 SYSC_ioctl fs/ioctl.c:701 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692 entry_SYSCALL_64_fastpath+0x23/0x9a RIP: 0033:0x452ac9 RSP: 002b:00007fd42b321c58 EFLAGS: 00000212 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 000000000071bea0 RCX: 0000000000452ac9 RDX: 0000000020fd7000 RSI: 000000000000890b RDI: 0000000000000013 RBP: 000000000000049e R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000212 R12: 00000000006f4f70 R13: 00000000ffffffff R14: 00007fd42b3226d4 R15: 0000000000000000 Fixes: 4512c43eac7e ("ipv6: remove null_entry before adding default route") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 607066c1ad33..c9b0da473d8c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1270,8 +1270,14 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } if (!rcu_access_pointer(fn->leaf)) { - atomic_inc(&rt->rt6i_ref); - rcu_assign_pointer(fn->leaf, rt); + if (fn->fn_flags & RTN_TL_ROOT) { + /* put back null_entry for root node */ + rcu_assign_pointer(fn->leaf, + info->nl_net->ipv6.ip6_null_entry); + } else { + atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(fn->leaf, rt); + } } fn = sn; } From 87da362d5ef4739bc80f5526a59920cbcbf6de6c Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 3 Jan 2018 14:11:59 -0800 Subject: [PATCH 0374/1640] UPSTREAM: ipv6: fix general protection fault in fib6_add() In fib6_add(), pn could be NULL if fib6_add_1() failed to return a fib6 node. Checking pn != fn before accessing pn->leaf makes sure pn is not NULL. 
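Schematically (a sketch of the guard, not the full hunk, which appears below), all of the super-tree fixup now sits behind the check:

	/* Sketch: pn can be NULL when fib6_add_1() fails, so only look at
	 * pn->leaf once we know a distinct parent node exists.
	 */
	if (pn != fn) {
		struct rt6_info *pn_leaf =
			rcu_dereference_protected(pn->leaf,
						  lockdep_is_held(&table->tb6_lock));
		/* ... existing pn_leaf repair logic ... */
	}
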
This fixes the following GPF reported by syzkaller: general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 3201 Comm: syzkaller001778 Not tainted 4.15.0-rc5+ #151 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:fib6_add+0x736/0x15a0 net/ipv6/ip6_fib.c:1244 RSP: 0018:ffff8801c7626a70 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000020 RCX: ffffffff84794465 RDX: 0000000000000004 RSI: ffff8801d38935f0 RDI: 0000000000000282 RBP: ffff8801c7626da0 R08: 1ffff10038ec4c35 R09: 0000000000000000 R10: ffff8801c7626c68 R11: 0000000000000000 R12: 00000000fffffffe R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000009 FS: 0000000000000000(0000) GS:ffff8801db200000(0063) knlGS:0000000009b70840 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 CR2: 0000000020be1000 CR3: 00000001d585a006 CR4: 00000000001606f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1006 ip6_route_multipath_add+0xd14/0x16c0 net/ipv6/route.c:3833 inet6_rtm_newroute+0xdc/0x160 net/ipv6/route.c:3957 rtnetlink_rcv_msg+0x733/0x1020 net/core/rtnetlink.c:4411 netlink_rcv_skb+0x21e/0x460 net/netlink/af_netlink.c:2408 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4423 netlink_unicast_kernel net/netlink/af_netlink.c:1275 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1301 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1864 sock_sendmsg_nosec net/socket.c:636 [inline] sock_sendmsg+0xca/0x110 net/socket.c:646 sock_write_iter+0x31a/0x5d0 net/socket.c:915 call_write_iter include/linux/fs.h:1772 [inline] do_iter_readv_writev+0x525/0x7f0 fs/read_write.c:653 do_iter_write+0x154/0x540 fs/read_write.c:932 compat_writev+0x225/0x420 fs/read_write.c:1246 do_compat_writev+0x115/0x220 fs/read_write.c:1267 C_SYSC_writev fs/read_write.c:1278 [inline] compat_SyS_writev+0x26/0x30 fs/read_write.c:1274 do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline] do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389 entry_SYSENTER_compat+0x54/0x63 arch/x86/entry/entry_64_compat.S:125 Reported-by: syzbot Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Signed-off-by: Wei Wang Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c9b0da473d8c..2f2aa326e578 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1296,23 +1296,28 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. 
*/ - struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, - lockdep_is_held(&table->tb6_lock)); - if (pn != fn && pn_leaf == rt) { - pn_leaf = NULL; - RCU_INIT_POINTER(pn->leaf, NULL); - atomic_dec(&rt->rt6i_ref); - } - if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn_leaf = fib6_find_prefix(info->nl_net, table, pn); -#if RT6_DEBUG >= 2 - if (!pn_leaf) { - WARN_ON(!pn_leaf); - pn_leaf = info->nl_net->ipv6.ip6_null_entry; + if (pn != fn) { + struct rt6_info *pn_leaf = + rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + atomic_dec(&rt->rt6i_ref); } + if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, + pn); +#if RT6_DEBUG >= 2 + if (!pn_leaf) { + WARN_ON(!pn_leaf); + pn_leaf = + info->nl_net->ipv6.ip6_null_entry; + } #endif - atomic_inc(&pn_leaf->rt6i_ref); - rcu_assign_pointer(pn->leaf, pn_leaf); + atomic_inc(&pn_leaf->rt6i_ref); + rcu_assign_pointer(pn->leaf, pn_leaf); + } } #endif goto failure; From 23fe11cdd25e93b7d0ff0e631b6513d997bc90a5 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 23 Oct 2017 14:59:35 -0700 Subject: [PATCH 0375/1640] UPSTREAM: ipv6: add ip6_null_entry check in rt6_select() In rt6_select(), fn->leaf could be pointing to net->ipv6.ip6_null_entry. In this case, we should directly return instead of trying to carry on with the rest of the process. If not, we could crash at: spin_lock_bh(&leaf->rt6i_table->rt6_lock); because net->ipv6.ip6_null_entry does not have rt6i_table set. Syzkaller recently reported following issue on net-next: Use struct sctp_sack_info instead kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: sctp: [Deprecated]: syz-executor4 (pid 26496) Use of struct sctp_assoc_value in delayed_ack socket option. Use struct sctp_sack_info instead CPU: 1 PID: 26523 Comm: syz-executor6 Not tainted 4.14.0-rc4+ #85 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d147e3c0 task.stack: ffff8801a4328000 RIP: 0010:debug_spin_lock_before kernel/locking/spinlock_debug.c:83 [inline] RIP: 0010:do_raw_spin_lock+0x23/0x1e0 kernel/locking/spinlock_debug.c:112 RSP: 0018:ffff8801a432ed70 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: 0000000000000018 RCX: 0000000000000000 RDX: 0000000000000003 RSI: 0000000000000000 RDI: 000000000000001c RBP: ffff8801a432ed90 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: ffffffff8482b279 R12: ffff8801ce2ff3a0 sctp: [Deprecated]: syz-executor1 (pid 26546) Use of int in maxseg socket option. 
Use struct sctp_assoc_value instead R13: dffffc0000000000 R14: ffff8801d971e000 R15: ffff8801ce2ff0d8 FS: 00007f56e82f5700(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001ddbc22000 CR3: 00000001a4a04000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __raw_spin_lock_bh include/linux/spinlock_api_smp.h:136 [inline] _raw_spin_lock_bh+0x39/0x40 kernel/locking/spinlock.c:175 spin_lock_bh include/linux/spinlock.h:321 [inline] rt6_select net/ipv6/route.c:786 [inline] ip6_pol_route+0x1be3/0x3bd0 net/ipv6/route.c:1650 sctp: [Deprecated]: syz-executor1 (pid 26576) Use of int in maxseg socket option. Use struct sctp_assoc_value instead TCP: request_sock_TCPv6: Possible SYN flooding on port 20002. Sending cookies. Check SNMP counters. ip6_pol_route_output+0x4c/0x60 net/ipv6/route.c:1843 fib6_rule_lookup+0x9e/0x2a0 net/ipv6/ip6_fib.c:309 ip6_route_output_flags+0x1f1/0x2b0 net/ipv6/route.c:1871 ip6_route_output include/net/ip6_route.h:80 [inline] ip6_dst_lookup_tail+0x4ea/0x970 net/ipv6/ip6_output.c:953 ip6_dst_lookup_flow+0xc8/0x270 net/ipv6/ip6_output.c:1076 sctp_v6_get_dst+0x675/0x1c30 net/sctp/ipv6.c:274 sctp_transport_route+0xa8/0x430 net/sctp/transport.c:287 sctp_assoc_add_peer+0x4fe/0x1100 net/sctp/associola.c:656 __sctp_connect+0x251/0xc80 net/sctp/socket.c:1187 sctp_connect+0xb4/0xf0 net/sctp/socket.c:4209 inet_dgram_connect+0x16b/0x1f0 net/ipv4/af_inet.c:541 SYSC_connect+0x20a/0x480 net/socket.c:1642 SyS_connect+0x24/0x30 net/socket.c:1623 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3b199f6963b9..3897225a1528 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -770,7 +770,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, bool do_rr = false; int key_plen; - if (!leaf) + if (!leaf || leaf == net->ipv6.ip6_null_entry) return net->ipv6.ip6_null_entry; rt0 = rcu_dereference(fn->rr_ptr); From f282a61dfb0887979fa34d79619f44759a8366e8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:15 -0800 Subject: [PATCH 0376/1640] UPSTREAM: net/ipv6: Make rt6_multipath_hash similar to fib_multipath_hash Make rt6_multipath_hash more of a direct parallel to fib_multipath_hash and reduce stack and overhead in the process: get_hash_from_flowi6 is just a wrapper around __get_hash_from_flowi6 with another stack allocation for flow_keys. Move setting the addresses, protocol and label into rt6_multipath_hash and allow it to make the call to flow_hash_from_keys. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3897225a1528..997257a2fc84 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2041,13 +2041,21 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; + u32 mhash; + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (skb) { ip6_multipath_l3_keys(skb, &hash_keys, flkeys); - return flow_hash_from_keys(&hash_keys) >> 1; + } else { + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; + hash_keys.basic.ip_proto = fl6->flowi6_proto; } + mhash = flow_hash_from_keys(&hash_keys); - return get_hash_from_flowi6(fl6) >> 1; + return mhash >> 1; } void ip6_route_input(struct sk_buff *skb) From c3986d21f24b3b1cd6aaabcac78131c457f410fe Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 2 Nov 2017 17:14:05 +0100 Subject: [PATCH 0377/1640] UPSTREAM: ipv4: Send a netevent whenever multipath hash policy is changed Devices performing IPv4 forwarding need to update their multipath hash policy whenever it is changed. Inform these devices by generating a netevent. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/netevent.h | 1 + net/ipv4/sysctl_net_ipv4.c | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/net/netevent.h b/include/net/netevent.h index f728d9cad170..40e7bab68490 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -26,6 +26,7 @@ enum netevent_notif_type { NETEVENT_NEIGH_UPDATE = 1, /* arg is struct neighbour ptr */ NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ + NETEVENT_MULTIPATH_HASH_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 885499ea3242..08f7e2e76c6e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -26,6 +26,7 @@ #include #include #include +#include static int zero; static int one = 1; @@ -350,6 +351,23 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH +static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_fib_multipath_hash_policy); + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net); + + return ret; +} +#endif + static struct ctl_table ipv4_table[] = { { .procname = "tcp_retrans_collapse", @@ -1095,7 +1113,7 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_fib_multipath_hash_policy, .extra1 = &zero, .extra2 = &one, }, From 00cf2ef5670e6b30e054fa6a07b2f90fc6803f72 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:16 -0800 Subject: [PATCH 0378/1640] BACKPORT: net: Rename NETEVENT_MULTIPATH_HASH_UPDATE Rename 
NETEVENT_MULTIPATH_HASH_UPDATE to NETEVENT_IPV4_MPATH_HASH_UPDATE to denote it relates to a change in the IPv4 hash policy. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/netevent.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/netevent.h b/include/net/netevent.h index 40e7bab68490..baee605a94ab 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -26,7 +26,7 @@ enum netevent_notif_type { NETEVENT_NEIGH_UPDATE = 1, /* arg is struct neighbour ptr */ NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ - NETEVENT_MULTIPATH_HASH_UPDATE, /* arg is struct net ptr */ + NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 08f7e2e76c6e..34f02b25a084 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -362,7 +362,7 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net); + call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); return ret; } From 9ace8a6bf74e8ed237d60a3bd1a7fcc96b1f9217 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 30 Oct 2017 14:16:00 -0700 Subject: [PATCH 0379/1640] UPSTREAM: ipv6: Implement limits on Hop-by-Hop and Destination options RFC 8200 (IPv6) defines Hop-by-Hop options and Destination options extension headers. Both of these carry a list of TLVs which is only limited by the maximum length of the extension header (2048 bytes). By the spec a host must process all the TLVs in these options, however, these could be used as a fairly obvious denial of service attack. I think this could in fact be a significant DOS vector on the Internet, one mitigating factor might be that many FWs drop all packets with EH (and obviously this is only IPv6) so an Internet-wide attack might not be so effective (yet!). By my calculation, the worst-case packet with TLVs in a standard 1500 byte MTU packet that would be processed by the stack contains 1282 individual TLVs (including pad TLVs) or 724 two-byte TLVs. I wrote a quick test program that floods a whole bunch of these packets to a host and sure enough there is substantial time spent in ip6_parse_tlv. These packets contain nothing but unknown TLVs (that are ignored), TLV padding, and a bogus UDP header with zero payload length. 25.38% [kernel] [k] __fib6_clean_all 21.63% [kernel] [k] ip6_parse_tlv 4.21% [kernel] [k] __local_bh_enable_ip 2.18% [kernel] [k] ip6_pol_route.isra.39 1.98% [kernel] [k] fib6_walk_continue 1.88% [kernel] [k] _raw_write_lock_bh 1.65% [kernel] [k] dst_release This patch adds configurable limits to Destination and Hop-by-Hop options. There are three limits that may be set: - Limit the number of options in a Hop-by-Hop or Destination options extension header. - Limit the byte length of a Hop-by-Hop or Destination options extension header. - Disallow unrecognized options in a Hop-by-Hop or Destination options extension header.
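As a rough sketch of how the count limit can be enforced while walking the TLV list (variable names here are illustrative; the real hunks follow in the diff):

	/* Sketch: count non-padding TLVs and drop the packet once the
	 * configured maximum is exceeded.
	 */
	const unsigned char *nh = skb_network_header(skb);
	int tlv_count = 0;

	while (len > 0) {
		if (nh[off] != IPV6_TLV_PAD1 && nh[off] != IPV6_TLV_PADN &&
		    ++tlv_count > max_count)
			goto bad;	/* too many options: silently discard */
		/* ... existing per-TLV processing ... */
	}
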
The limits are set in corresponding sysctls: ipv6.sysctl.max_dst_opts_cnt ipv6.sysctl.max_hbh_opts_cnt ipv6.sysctl.max_dst_opts_len ipv6.sysctl.max_hbh_opts_len If a max_*_opts_cnt is less than zero then unknown TLVs are disallowed. The number of known TLVs that are allowed is the absolute value of this number. If a limit is exceeded when processing an extension header, the packet is dropped. Default values are set to 8 for options counts, and set to INT_MAX for maximum length. Note the choice to limit options to 8 is an arbitrary guess (roughly based on the fact that the stack supports three HBH options and just one destination option). These limits have been proposed in draft-ietf-6man-rfc6434-bis. Tested (by Martin Lau): I tested out 1 thread (i.e. one raw_udp process). I changed the net.ipv6.max_(dst|hbh)_opts_number between 8 and 2048. With the sysctls set to 2048, the softirq% is pegged at 100%. With 8, the softirq% is almost unnoticeable from mpstat. v2: - Code and documentation cleanup. - Change references from RFC2460 to RFC8200. - Add reference to RFC6434-bis where the limits will be in the standard. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 24 +++++++++ include/net/ipv6.h | 40 +++++++++++++++ include/net/netns/ipv6.h | 4 ++ net/ipv6/af_inet6.c | 4 ++ net/ipv6/exthdrs.c | 67 +++++++++++++++++++++----- net/ipv6/sysctl_net_ipv6.c | 32 ++++++++++++ 6 files changed, 159 insertions(+), 12 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index d2a40420317f..542df8e622e4 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1413,6 +1413,30 @@ mld_qrv - INTEGER Default: 2 (as specified by RFC3810 9.1) Minimum: 1 (as specified by RFC6636 4.5) +max_dst_opts_cnt - INTEGER + Maximum number of non-padding TLVs allowed in a Destination + options extension header. If this value is less than zero + then unknown options are disallowed and the number of known + TLVs allowed is the absolute value of this number. + Default: 8 + +max_hbh_opts_cnt - INTEGER + Maximum number of non-padding TLVs allowed in a Hop-by-Hop + options extension header. If this value is less than zero + then unknown options are disallowed and the number of known + TLVs allowed is the absolute value of this number. + Default: 8 + +max_dst_opts_len - INTEGER + Maximum length allowed for a Destination options extension + header. + Default: INT_MAX (unlimited) + +max_hbh_opts_len - INTEGER + Maximum length allowed for a Hop-by-Hop options extension + header. + Default: INT_MAX (unlimited) + IPv6 Fragmentation: ip6frag_high_thresh - INTEGER diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 384e71baa1b0..fa50206bee5e 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -51,6 +51,46 @@ #define IPV6_DEFAULT_HOPLIMIT 64 #define IPV6_DEFAULT_MCASTHOPS 1 +/* Limits on Hop-by-Hop and Destination options. + * + * Per RFC8200 there is no limit on the maximum number or lengths of options in + * Hop-by-Hop or Destination options other than that the packet must fit in an MTU. + * We allow configurable limits in order to mitigate potential denial of + * service attacks.
+ * + * There are three limits that may be set: + * - Limit the number of options in a Hop-by-Hop or Destination options + * extension header + * - Limit the byte length of a Hop-by-Hop or Destination options extension + * header + * - Disallow unknown options + * + * The limits are expressed in corresponding sysctls: + * + * ipv6.sysctl.max_dst_opts_cnt + * ipv6.sysctl.max_hbh_opts_cnt + * ipv6.sysctl.max_dst_opts_len + * ipv6.sysctl.max_hbh_opts_len + * + * max_*_opts_cnt is the number of TLVs that are allowed for Destination + * options or Hop-by-Hop options. If the number is less than zero then unknown + * TLVs are disallowed and the number of known options that are allowed is the + * absolute value. Setting the value to INT_MAX indicates no limit. + * + * max_*_opts_len is the length limit in bytes of a Destination or + * Hop-by-Hop options extension header. Setting the value to INT_MAX + * indicates no length limit. + * + * If a limit is exceeded when processing an extension header the packet is + * silently discarded. + */ + +/* Default limits for Hop-by-Hop and Destination options */ +#define IP6_DEFAULT_MAX_DST_OPTS_CNT 8 +#define IP6_DEFAULT_MAX_HBH_OPTS_CNT 8 +#define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ +#define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ + /* * Addr type * diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 2e020868ae67..75ddd8723894 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -38,6 +38,10 @@ struct netns_sysctl_ipv6 { int idgen_delay; int flowlabel_state_ranges; int flowlabel_reflect; + int max_dst_opts_cnt; + int max_hbh_opts_cnt; + int max_dst_opts_len; + int max_hbh_opts_len; }; struct netns_ipv6 { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index afb20eb37f54..b40e78547b60 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -870,6 +870,10 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.idgen_retries = 3; net->ipv6.sysctl.idgen_delay = 1 * HZ; net->ipv6.sysctl.flowlabel_state_ranges = 0; + net->ipv6.sysctl.max_dst_opts_cnt = IP6_DEFAULT_MAX_DST_OPTS_CNT; + net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT; + net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN; + net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN; atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 47a5f8f88c70..abb7e0f41990 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -74,8 +74,20 @@ struct tlvtype_proc { /* An unknown option is detected, decide what to do */ -static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) +static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff, + bool disallow_unknowns) { + if (disallow_unknowns) { + /* If unknown TLVs are disallowed by configuration + * then always silently drop packet. Note this also + * means no ICMP parameter problem is sent which + * could be a good property to mitigate a reflection DOS + * attack. 
+ */ + + goto drop; + } + switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { case 0: /* ignore */ return true; @@ -94,20 +106,30 @@ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) return false; } +drop: kfree_skb(skb); return false; } /* Parse tlv encoded option header (hop-by-hop or destination) */ -static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) +static bool ip6_parse_tlv(const struct tlvtype_proc *procs, + struct sk_buff *skb, + int max_count) { - const struct tlvtype_proc *curr; + int len = (skb_transport_header(skb)[1] + 1) << 3; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); - int len = (skb_transport_header(skb)[1] + 1) << 3; + const struct tlvtype_proc *curr; + bool disallow_unknowns = false; + int tlv_count = 0; int padlen = 0; + if (unlikely(max_count < 0)) { + disallow_unknowns = true; + max_count = -max_count; + } + if (skb_transport_offset(skb) + len > skb_headlen(skb)) goto bad; @@ -148,6 +170,11 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) default: /* Other TLV code so scan list */ if (optlen > len) goto bad; + + tlv_count++; + if (tlv_count > max_count) + goto bad; + for (curr = procs; curr->type >= 0; curr++) { if (curr->type == nh[off]) { /* type specific length/alignment @@ -158,10 +185,10 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) break; } } - if (curr->type < 0) { - if (ip6_tlvopt_unknown(skb, off) == 0) - return false; - } + if (curr->type < 0 && + !ip6_tlvopt_unknown(skb, off, disallow_unknowns)) + return false; + padlen = 0; break; } @@ -260,23 +287,31 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) __u16 dstbuf; #endif struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(skb->dev); + int extlen; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); +fail_and_free: kfree_skb(skb); return -1; } + extlen = (skb_transport_header(skb)[1] + 1) << 3; + if (extlen > net->ipv6.sysctl.max_dst_opts_len) + goto fail_and_free; + opt->lastopt = opt->dst1 = skb_network_header_len(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) dstbuf = opt->dst1; #endif - if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { - skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; + if (ip6_parse_tlv(tlvprocdestopt_lst, skb, + init_net.ipv6.sysctl.max_dst_opts_cnt)) { + skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) opt->nhoff = dstbuf; @@ -805,6 +840,8 @@ static const struct tlvtype_proc tlvprochopopt_lst[] = { int ipv6_parse_hopopts(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(skb->dev); + int extlen; /* * skb_network_header(skb) is equal to skb->data, and @@ -815,13 +852,19 @@ int ipv6_parse_hopopts(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || !pskb_may_pull(skb, (sizeof(struct ipv6hdr) + ((skb_transport_header(skb)[1] + 1) << 3)))) { +fail_and_free: kfree_skb(skb); return -1; } + extlen = (skb_transport_header(skb)[1] + 1) << 3; + if (extlen > net->ipv6.sysctl.max_hbh_opts_len) + goto fail_and_free; + opt->flags |= IP6SKB_HOPBYHOP; - if (ip6_parse_tlv(tlvprochopopt_lst, skb)) { - skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; + if (ip6_parse_tlv(tlvprochopopt_lst, skb, + 
init_net.ipv6.sysctl.max_hbh_opts_cnt)) { + skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); return 1; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index f7051ba5b8af..a789a8ac6a64 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -98,6 +98,34 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "max_dst_opts_number", + .data = &init_net.ipv6.sysctl.max_dst_opts_cnt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_hbh_opts_number", + .data = &init_net.ipv6.sysctl.max_hbh_opts_cnt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_dst_opts_length", + .data = &init_net.ipv6.sysctl.max_dst_opts_len, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_hbh_length", + .data = &init_net.ipv6.sysctl.max_hbh_opts_len, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; @@ -158,6 +186,10 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges; ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind; ipv6_table[9].data = &net->ipv6.sysctl.flowlabel_reflect; + ipv6_table[10].data = &net->ipv6.sysctl.max_dst_opts_cnt; + ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt; + ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; + ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) From 4c8727359315242fdc37b5ea74a3236fda8c5a51 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:18 -0800 Subject: [PATCH 0380/1640] UPSTREAM: net/ipv6: Add support for path selection using hash of 5-tuple Some operators prefer IPv6 path selection to use a standard 5-tuple hash rather than just an L3 hash with the flow label. To that end, add support to IPv6 for multipath hash policy similar to bf4e0a3db97eb ("net: ipv4: add support for ECMP hash policy choice"). The default is still L3, which covers source and destination addresses along with flow label and IPv6 protocol. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +++ include/net/ip6_route.h | 4 +- include/net/netevent.h | 1 + include/net/netns/ipv6.h | 1 + net/ipv6/icmp.c | 2 +- net/ipv6/route.c | 68 ++++++++++++++++++++------ net/ipv6/sysctl_net_ipv6.c | 27 ++++++++++ 7 files changed, 91 insertions(+), 19 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 542df8e622e4..c2ba72a09d43 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1390,6 +1390,13 @@ flowlabel_reflect - BOOLEAN FALSE: disabled Default: FALSE +fib_multipath_hash_policy - INTEGER + Controls which hash policy to use for multipath routes.
+ Default: 0 (Layer 3) + Possible values: + 0 - Layer 3 (source and destination addresses plus flow label) + 1 - Layer 4 (standard 5-tuple) + anycast_src_echo_reply - BOOLEAN Controls the use of anycast addresses as source addresses for ICMPv6 echo reply diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index d4b1fa72403c..75a56be092cc 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -130,8 +130,8 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, const struct sk_buff *skb, int flags); -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, - struct flow_keys *hkeys); +u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, + const struct sk_buff *skb, struct flow_keys *hkeys); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); diff --git a/include/net/netevent.h b/include/net/netevent.h index baee605a94ab..d9918261701c 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -27,6 +27,7 @@ enum netevent_notif_type { NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */ + NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 75ddd8723894..7e7b95f8cea1 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -28,6 +28,7 @@ struct netns_sysctl_ipv6 { int ip6_rt_gc_elasticity; int ip6_rt_mtu_expires; int ip6_rt_min_advmss; + int multipath_hash_policy; int flowlabel_consistency; int auto_flowlabels; int icmpv6_time; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index bf9539e328ad..041cc640ebe6 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -527,7 +527,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); - fl6.mp_hash = rt6_multipath_hash(&fl6, skb, NULL); + fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 997257a2fc84..ab9c4ddcc552 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -453,7 +453,8 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -static struct rt6_info *rt6_multipath_select(struct rt6_info *match, +static struct rt6_info *rt6_multipath_select(const struct net *net, + struct rt6_info *match, struct flowi6 *fl6, int oif, const struct sk_buff *skb, int strict) @@ -464,7 +465,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, * case it will always be non-zero. Otherwise now is the time to do it. 
*/ if (!fl6->mp_hash) - fl6->mp_hash = rt6_multipath_hash(fl6, skb, NULL); + fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) return match; @@ -1109,7 +1110,7 @@ restart: rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, + rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, skb, flags); } if (rt == net->ipv6.ip6_null_entry) { @@ -1860,7 +1861,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) - rt = rt6_multipath_select(rt, fl6, oif, skb, strict); + rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -2037,21 +2038,56 @@ out: } /* if skb is set it will be used and fl6 can be NULL */ -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, - struct flow_keys *flkeys) +u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, + const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; u32 mhash; - memset(&hash_keys, 0, sizeof(hash_keys)); - hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - if (skb) { - ip6_multipath_l3_keys(skb, &hash_keys, flkeys); - } else { - hash_keys.addrs.v6addrs.src = fl6->saddr; - hash_keys.addrs.v6addrs.dst = fl6->daddr; - hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; - hash_keys.basic.ip_proto = fl6->flowi6_proto; + switch (net->ipv6.sysctl.multipath_hash_policy) { + case 0: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (skb) { + ip6_multipath_l3_keys(skb, &hash_keys, flkeys); + } else { + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; + case 1: + if (skb) { + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; + struct flow_keys keys; + + /* short-circuit if we already have L4 hash present */ + if (skb->l4_hash) + return skb_get_hash_raw(skb) >> 1; + + memset(&hash_keys, 0, sizeof(hash_keys)); + + if (!flkeys) { + skb_flow_dissect_flow_keys(skb, &keys, flag); + flkeys = &keys; + } + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; + hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; + hash_keys.ports.src = flkeys->ports.src; + hash_keys.ports.dst = flkeys->ports.dst; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; + } else { + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.ports.src = fl6->fl6_sport; + hash_keys.ports.dst = fl6->fl6_dport; + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; } mhash = flow_hash_from_keys(&hash_keys); @@ -2082,7 +2118,7 @@ void ip6_route_input(struct sk_buff *skb) flkeys = &_flkeys; if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) - fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys); + fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); diff --git a/net/ipv6/sysctl_net_ipv6.c 
b/net/ipv6/sysctl_net_ipv6.c index a789a8ac6a64..6fbdef630152 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -16,14 +16,31 @@ #include #include #include +#include #ifdef CONFIG_NETLABEL #include #endif +static int zero; static int one = 1; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; +static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv6.sysctl.multipath_hash_policy); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); + + return ret; +} static struct ctl_table ipv6_table_template[] = { { @@ -126,6 +143,15 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "fib_multipath_hash_policy", + .data = &init_net.ipv6.sysctl.multipath_hash_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_rt6_multipath_hash_policy, + .extra1 = &zero, + .extra2 = &one, + }, { } }; @@ -190,6 +216,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt; ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; + ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) From d9f538f93a7a3df4de63f5b1e9065d5231c5546d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 09:56:52 -0700 Subject: [PATCH 0381/1640] UPSTREAM: ipv6: addrconf: cleanup locking in ipv6_add_addr ipv6_add_addr is called in process context with rtnl lock held (e.g., manual config of an address) or during softirq processing (e.g., autoconf and address from a router advertisement). Currently, ipv6_add_addr calls rcu_read_lock_bh shortly after entry and does not call unlock until exit, minus the call around the address validator notifier. Similarly, addrconf_hash_lock is taken after the validator notifier and held until exit. This forces the allocation of inet6_ifaddr to always be atomic. Refactor ipv6_add_addr as follows: 1. Add an input boolean to discriminate the call path (process context or softirq). This new flag controls whether the alloc can be done with GFP_KERNEL or GFP_ATOMIC. 2. Move the rcu_read_lock_bh and unlock calls only around functions that do rcu updates. 3. Remove the in6_dev_hold and put added by 3ad7d2468f79f ("Ipvlan should return an error when an address is already in use."). This was done presumably because rcu_read_unlock_bh needs to be called before calling the validator. Since rcu_read_lock is not needed before the validator runs, revert the hold and put added by 3ad7d2468f79f and only do the hold when setting ifp->idev. 4. Move the duplicate address check and insertion of new address in the global address hash into a helper. The helper is called after an ifa is allocated and filled in. This allows the ifa for manually configured addresses to be done with GFP_KERNEL and reduces the overall amount of time with rcu_read_lock held and hash table spinlock held. Signed-off-by: David Ahern Signed-off-by: David S.
Miller --- net/ipv6/addrconf.c | 104 +++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 44 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 751c35be2fd7..78e5e2c3ac3e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -955,18 +955,43 @@ static u32 inet6_addr_hash(const struct in6_addr *addr) return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT); } +static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) +{ + unsigned int hash; + int err = 0; + + spin_lock(&addrconf_hash_lock); + + /* Ignore adding duplicate addresses on an interface */ + if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev)) { + ADBG("ipv6_add_addr: already assigned\n"); + err = -EEXIST; + goto out; + } + + /* Add to big hash table */ + hash = inet6_addr_hash(&ifa->addr); + hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); + +out: + spin_unlock(&addrconf_hash_lock); + + return err; +} + /* On success it returns ifp with increased reference count */ static struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, const struct in6_addr *peer_addr, int pfxlen, - int scope, u32 flags, u32 valid_lft, u32 prefered_lft) + int scope, u32 flags, u32 valid_lft, u32 prefered_lft, + bool can_block) { + gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; - struct rt6_info *rt; + struct rt6_info *rt = NULL; struct in6_validator_info i6vi; - unsigned int hash; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -976,42 +1001,24 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, addr_type & IPV6_ADDR_LOOPBACK)) return ERR_PTR(-EADDRNOTAVAIL); - rcu_read_lock_bh(); - - in6_dev_hold(idev); - if (idev->dead) { err = -ENODEV; /*XXX*/ - goto out2; + goto out; } if (idev->cnf.disable_ipv6) { err = -EACCES; - goto out2; + goto out; } i6vi.i6vi_addr = *addr; i6vi.i6vi_dev = idev; - rcu_read_unlock_bh(); - err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi); - - rcu_read_lock_bh(); err = notifier_to_errno(err); - if (err) - goto out2; - - spin_lock(&addrconf_hash_lock); - - /* Ignore adding duplicate addresses on an interface */ - if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) { - ADBG("ipv6_add_addr: already assigned\n"); - err = -EEXIST; + if (err < 0) goto out; - } - - ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); + ifa = kzalloc(sizeof(*ifa), gfp_flags); if (!ifa) { ADBG("ipv6_add_addr: malloc failed\n"); err = -ENOBUFS; @@ -1021,6 +1028,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, rt = addrconf_dst_alloc(net, idev, addr, false); if (IS_ERR(rt)) { err = PTR_ERR(rt); + rt = NULL; goto out; } @@ -1051,16 +1059,21 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa->rt = rt; ifa->idev = idev; + in6_dev_hold(idev); + /* For caller */ refcount_set(&ifa->refcnt, 1); - /* Add to big hash table */ - hash = inet6_addr_hash(addr); + rcu_read_lock_bh(); - hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); - spin_unlock(&addrconf_hash_lock); + err = ipv6_add_addr_hash(idev->dev, ifa); + if (err < 0) { + rcu_read_unlock_bh(); + goto out; + } write_lock(&idev->lock); + /* Add to inet6_dev unicast addr list. 
*/ ipv6_link_dev_addr(idev, ifa); @@ -1071,21 +1084,23 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, in6_ifa_hold(ifa); write_unlock(&idev->lock); -out2: + rcu_read_unlock_bh(); - if (likely(err == 0)) - inet6addr_notifier_call_chain(NETDEV_UP, ifa); - else { - kfree(ifa); - in6_dev_put(idev); + inet6addr_notifier_call_chain(NETDEV_UP, ifa); +out: + if (unlikely(err < 0)) { + if (rt) + ip6_rt_put(rt); + if (ifa) { + if (ifa->idev) + in6_dev_put(ifa->idev); + kfree(ifa); + } ifa = ERR_PTR(err); } return ifa; -out: - spin_unlock(&addrconf_hash_lock); - goto out2; } enum cleanup_prefix_rt_t { @@ -1327,7 +1342,7 @@ retry: ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, ipv6_addr_scope(&addr), addr_flags, - tmp_valid_lft, tmp_prefered_lft); + tmp_valid_lft, tmp_prefered_lft, true); if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); @@ -2023,7 +2038,7 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen, scope, flags, valid_lft, - preferred_lft); + preferred_lft, false); if (IS_ERR(ifp2)) goto lock_errdad; @@ -2523,7 +2538,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, pinfo->prefix_len, addr_type&IPV6_ADDR_SCOPE_MASK, addr_flags, valid_lft, - prefered_lft); + prefered_lft, false); if (IS_ERR_OR_NULL(ifp)) return -1; @@ -2895,7 +2910,7 @@ static int inet6_addr_add(struct net *net, int ifindex, } ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags, - valid_lft, prefered_lft); + valid_lft, prefered_lft, true); if (!IS_ERR(ifp)) { if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { @@ -3010,7 +3025,8 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifp = ipv6_add_addr(idev, addr, NULL, plen, scope, IFA_F_PERMANENT, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, + true); if (!IS_ERR(ifp)) { spin_lock_bh(&ifp->lock); ifp->flags &= ~IFA_F_TENTATIVE; @@ -3113,7 +3129,7 @@ void addrconf_add_linklocal(struct inet6_dev *idev, #endif ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, - INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true); if (!IS_ERR(ifp)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); addrconf_dad_start(ifp); From b3bec3245b23139acd0c54bd6d805872a67b5b30 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Oct 2017 06:01:37 -0700 Subject: [PATCH 0382/1640] UPSTREAM: ipv6: avoid zeroing per cpu data again per cpu allocations are already zeroed, no need to clear them again. Fixes: d52d3997f843f ("ipv6: Create percpu rt6_info") Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Cc: Tejun Heo Acked-by: Tejun Heo Acked-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ab9c4ddcc552..8d9826b0521f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -365,17 +365,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, if (rt) { rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); - if (rt->rt6i_pcpu) { - int cpu; - - for_each_possible_cpu(cpu) { - struct rt6_info **p; - - p = per_cpu_ptr(rt->rt6i_pcpu, cpu); - /* no one shares rt */ - *p = NULL; - } - } else { + if (!rt->rt6i_pcpu) { dst_release_immediate(&rt->dst); return NULL; } From a73b6477c306722ed1f49759157e9a5b3a4c5a0b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:18 -0700 Subject: [PATCH 0383/1640] UPSTREAM: net/ipv6: Add fib6_null_entry ip6_null_entry will stay a dst based return for lookups that fail to match an entry. Add a new fib6_null_entry which constitutes the root node and leafs for fibs. Replace existing references to ip6_null_entry with the new fib6_null_entry when dealing with FIBs. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 3 +- net/ipv6/ip6_fib.c | 26 ++++++++--------- net/ipv6/route.c | 62 ++++++++++++++++++++++++++++------------ 3 files changed, 58 insertions(+), 33 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 7e7b95f8cea1..e997e8152b32 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -60,7 +60,8 @@ struct netns_ipv6 { #endif struct xt_table *ip6table_nat; #endif - struct rt6_info *ip6_null_entry; + struct rt6_info *fib6_null_entry; + struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 2f2aa326e578..fe3b12c1514b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -291,7 +291,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } @@ -429,7 +429,7 @@ struct fib6_dump_arg { static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) { - if (rt == arg->net->ipv6.ip6_null_entry) + if (rt == arg->net->ipv6.fib6_null_entry) return; call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); } @@ -718,7 +718,7 @@ static struct fib6_node *fib6_add_1(struct net *net, /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == - net->ipv6.ip6_null_entry) { + net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } @@ -1232,9 +1232,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!sfn) goto failure; - atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + atomic_inc(&info->nl_net->ipv6.fib6_null_entry->rt6i_ref); rcu_assign_pointer(sfn->leaf, - info->nl_net->ipv6.ip6_null_entry); + info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ @@ -1273,7 +1273,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, - info->nl_net->ipv6.ip6_null_entry); + info->nl_net->ipv6.fib6_null_entry); } else { atomic_inc(&rt->rt6i_ref); 
rcu_assign_pointer(fn->leaf, rt); @@ -1312,7 +1312,7 @@ out: if (!pn_leaf) { WARN_ON(!pn_leaf); pn_leaf = - info->nl_net->ipv6.ip6_null_entry; + info->nl_net->ipv6.fib6_null_entry; } #endif atomic_inc(&pn_leaf->rt6i_ref); @@ -1552,7 +1552,7 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, @@ -1589,7 +1589,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { - rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry); + rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } @@ -1634,7 +1634,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); - new_fn_leaf = net->ipv6.ip6_null_entry; + new_fn_leaf = net->ipv6.fib6_null_entry; } #endif atomic_inc(&new_fn_leaf->rt6i_ref); @@ -1784,7 +1784,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) return -ENOENT; } #endif - if (!fn || rt == net->ipv6.ip6_null_entry) + if (!fn || rt == net->ipv6.fib6_null_entry) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); @@ -2132,7 +2132,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); @@ -2144,7 +2144,7 @@ static int __net_init fib6_net_init(struct net *net) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8d9826b0521f..c755d0023618 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -279,6 +279,15 @@ static const u32 ip6_template_metrics[RTAX_MAX] = { [RTAX_HOPLIMIT - 1] = 0, }; +static const struct rt6_info fib6_null_entry_template = { + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, + .rt6i_metric = ~(u32)0, + .rt6i_ref = ATOMIC_INIT(1), + .fib6_type = RTN_UNREACHABLE, + .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, +}; + static const struct rt6_info ip6_null_entry_template = { .dst = { .__refcnt = ATOMIC_INIT(1), @@ -525,10 +534,10 @@ static inline struct rt6_info *rt6_device_match(struct net *net, return local; if (flags & RT6_LOOKUP_F_IFACE) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; } - return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; + return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? 
net->ipv6.fib6_null_entry : rt; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -761,8 +770,8 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, bool do_rr = false; int key_plen; - if (!leaf || leaf == net->ipv6.ip6_null_entry) - return net->ipv6.ip6_null_entry; + if (!leaf || leaf == net->ipv6.fib6_null_entry) + return net->ipv6.fib6_null_entry; rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) @@ -779,7 +788,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, key_plen = rt0->rt6i_src.plen; #endif if (fn->fn_bit != key_plen) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, &do_rr); @@ -800,7 +809,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, } } - return match ? match : net->ipv6.ip6_null_entry; + return match ? match : net->ipv6.fib6_null_entry; } static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) @@ -1095,7 +1104,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, restart: rt = rcu_dereference(fn->leaf); if (!rt) { - rt = net->ipv6.ip6_null_entry; + rt = net->ipv6.fib6_null_entry; } else { rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); @@ -1103,7 +1112,7 @@ restart: rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, skb, flags); } - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; @@ -1852,7 +1861,7 @@ redo_rt6_select: rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; @@ -1869,7 +1878,8 @@ redo_rt6_select: if (rt_cache) rt = rt_cache; - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { + rt = net->ipv6.ip6_null_entry; rcu_read_unlock(); dst_hold(&rt->dst); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); @@ -2448,13 +2458,13 @@ restart: } if (!rt) - rt = net->ipv6.ip6_null_entry; + rt = net->ipv6.fib6_null_entry; else if (rt->rt6i_flags & RTF_REJECT) { rt = net->ipv6.ip6_null_entry; goto out; } - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; @@ -3063,7 +3073,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) struct fib6_table *table; int err; - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { err = -ENOENT; goto out; } @@ -3093,7 +3103,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) struct fib6_table *table; int err = -ENOENT; - if (rt == net->ipv6.ip6_null_entry) + if (rt == net->ipv6.fib6_null_entry) goto out_put; table = rt->rt6i_table; spin_lock_bh(&table->tb6_lock); @@ -3615,7 +3625,7 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && - rt != net->ipv6.ip6_null_entry && + rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ @@ -3770,7 +3780,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.ip6_null_entry && rt->fib6_nh.nh_dev == 
arg->dev) { + rt->fib6_nh.nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -3854,7 +3864,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); - if (rt == net->ipv6.ip6_null_entry) + if (rt == net->ipv6.fib6_null_entry) return 0; switch (arg->event) { @@ -4642,7 +4652,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct net *net = arg->net; - if (rt == net->ipv6.ip6_null_entry) + if (rt == net->ipv6.fib6_null_entry) return 0; if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { @@ -4831,6 +4841,8 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == NETDEV_REGISTER) { + net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; + net->ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(dev); net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -4844,6 +4856,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ + in6_dev_put_clear(&net->ipv6.fib6_null_entry->rt6i_idev); in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); @@ -5033,11 +5046,17 @@ static int __net_init ip6_route_net_init(struct net *net) if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; + net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, + sizeof(*net->ipv6.fib6_null_entry), + GFP_KERNEL); + if (!net->ipv6.fib6_null_entry) + goto out_ip6_dst_entries; + net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), GFP_KERNEL); if (!net->ipv6.ip6_null_entry) - goto out_ip6_dst_entries; + goto out_fib6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); @@ -5084,6 +5103,8 @@ out_ip6_prohibit_entry: out_ip6_null_entry: kfree(net->ipv6.ip6_null_entry); #endif +out_fib6_null_entry: + kfree(net->ipv6.fib6_null_entry); out_ip6_dst_entries: dst_entries_destroy(&net->ipv6.ip6_dst_ops); out_ip6_dst_ops: @@ -5092,6 +5113,7 @@ out_ip6_dst_ops: static void __net_exit ip6_route_net_exit(struct net *net) { + kfree(net->ipv6.fib6_null_entry); kfree(net->ipv6.ip6_null_entry); #ifdef CONFIG_IPV6_MULTIPLE_TABLES kfree(net->ipv6.ip6_prohibit_entry); @@ -5162,6 +5184,8 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ + init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; + init_net.ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES From 4b7ce65896188aa990bca40df1e63b075092780b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Oct 2017 21:07:18 -0700 Subject: [PATCH 0384/1640] BACKPORT: ipv6: fix a BUG in rt6_get_pcpu_route() Ido reported the following splat and provided a patch.
[ 122.221814] BUG: using smp_processor_id() in preemptible [00000000] code: sshd/2672 [ 122.221845] caller is debug_smp_processor_id+0x17/0x20 [ 122.221866] CPU: 0 PID: 2672 Comm: sshd Not tainted 4.14.0-rc3-idosch-next-custom #639 [ 122.221880] Hardware name: Mellanox Technologies Ltd. MSN2100-CB2FO/SA001017, BIOS 5.6.5 06/07/2016 [ 122.221893] Call Trace: [ 122.221919] dump_stack+0xb1/0x10c [ 122.221946] ? _atomic_dec_and_lock+0x124/0x124 [ 122.221974] ? ___ratelimit+0xfe/0x240 [ 122.222020] check_preemption_disabled+0x173/0x1b0 [ 122.222060] debug_smp_processor_id+0x17/0x20 [ 122.222083] ip6_pol_route+0x1482/0x24a0 ... I believe we can simplify this code path a bit, since we no longer hold a read_lock and need to release it to avoid a deadlock. By disabling BH, we prevent code re-entry and ensure that rt6_get_pcpu_route()/rt6_make_pcpu_route() run on the same CPU. Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Reported-by: Ido Schimmel Signed-off-by: Eric Dumazet Tested-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv6/route.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c755d0023618..f7803233620f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1292,15 +1292,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, dst_hold(&pcpu_rt->dst); p = this_cpu_ptr(rt->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); - if (prev) { - /* If someone did it before us, return prev instead */ - /* release refcnt taken by ip6_rt_pcpu_alloc() */ - dst_release_immediate(&pcpu_rt->dst); - /* release refcnt taken by above dst_hold() */ - dst_release_immediate(&pcpu_rt->dst); - dst_hold(&prev->dst); - pcpu_rt = prev; - } + BUG_ON(prev); return pcpu_rt; } @@ -1934,31 +1926,25 @@ uncached_rt_out: struct rt6_info *pcpu_rt; dst_use_noref(&rt->dst, jiffies); + local_bh_disable(); pcpu_rt = rt6_get_pcpu_route(rt); - if (pcpu_rt) { - rcu_read_unlock(); - } else { + if (!pcpu_rt) { /* atomic_inc_not_zero() is needed when using rcu */ if (atomic_inc_not_zero(&rt->rt6i_ref)) { - /* We have to do the read_unlock first - * because rt6_make_pcpu_route() may trigger - * ip6_dst_gc() which will take the write_lock. - * - * No dst_hold() on rt is needed because grabbing + /* No dst_hold() on rt is needed because grabbing * rt->rt6i_ref makes sure rt can't be released. */ - rcu_read_unlock(); pcpu_rt = rt6_make_pcpu_route(net, rt); rt6_release(rt); } else { /* rt is already removed from tree */ - rcu_read_unlock(); pcpu_rt = net->ipv6.ip6_null_entry; dst_hold(&pcpu_rt->dst); } } - + local_bh_enable(); + rcu_read_unlock(); trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); return pcpu_rt; } From 21a0040f88cb788bf44730ed53d09ece198f34b2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Oct 2017 09:31:43 +0200 Subject: [PATCH 0385/1640] UPSTREAM: ipv6: let trace_fib6_table_lookup() dereference the fib table The perf traces for ipv6 routing code show a relevant cost around trace_fib6_table_lookup(), even if no trace is enabled. This is due to the fib6_table de-referencing currently performed by the caller. Let the tracing code pay this overhead instead, by passing the table pointer to the trace helper. This gives a small but measurable performance improvement under UDP flood. Signed-off-by: Paolo Abeni Acked-by: Steven Rostedt (VMware) Acked-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S.
Miller --- include/trace/events/fib6.h | 6 +++--- net/ipv6/route.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index d46e24702765..7e8d48a81b91 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -13,9 +13,9 @@ TRACE_EVENT(fib6_table_lookup, TP_PROTO(const struct net *net, const struct rt6_info *rt, - u32 tb_id, const struct flowi6 *flp), + struct fib6_table *table, const struct flowi6 *flp), - TP_ARGS(net, rt, tb_id, flp), + TP_ARGS(net, rt, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) @@ -35,7 +35,7 @@ TRACE_EVENT(fib6_table_lookup, TP_fast_assign( struct in6_addr *in6; - __entry->tb_id = tb_id; + __entry->tb_id = table->tb6_id; __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->tos = ip6_tclass(flp->flowlabel); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f7803233620f..3231bf79feaf 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1136,7 +1136,7 @@ restart: rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; } @@ -1874,14 +1874,14 @@ redo_rt6_select: rt = net->ipv6.ip6_null_entry; rcu_read_unlock(); dst_hold(&rt->dst); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (rt->rt6i_flags & RTF_CACHE) { if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !(rt->rt6i_flags & RTF_GATEWAY))) { @@ -1917,7 +1917,7 @@ redo_rt6_select: } uncached_rt_out: - trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, uncached_rt, table, fl6); return uncached_rt; } else { @@ -1945,7 +1945,7 @@ uncached_rt_out: } local_bh_enable(); rcu_read_unlock(); - trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, pcpu_rt, table, fl6); return pcpu_rt; } } @@ -2461,7 +2461,7 @@ out: rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return rt; }; From a7c821ac864e46649b1228620f01336d6aca6a79 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:23 -0700 Subject: [PATCH 0386/1640] UPSTREAM: net/ipv6: Cleanup exception and cache route handling IPv6 FIB will only contain FIB entries with exception routes added to the FIB entry. Once this transformation is complete, FIB lookups will return a fib6_info with the lookup functions still returning a dst based rt6_info. The current code uses rt6_info for both paths and overloads the rt6_info variable usually called 'rt'. This patch introduces a new 'f6i' variable name for the result of the FIB lookup and keeps 'rt' as the dst based return variable. 'f6i' becomes a fib6_info in a later patch which is why it is introduced as f6i now; avoids the additional churn in the later patch. In addition, remove RTF_CACHE and dst checks from fib6 add and delete since they can not happen now and will never happen after the data type flip. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 1 - net/ipv6/ip6_fib.c | 16 +---- net/ipv6/route.c | 142 ++++++++++++++++++++++------------------ 3 files changed, 81 insertions(+), 78 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 75a56be092cc..bc765fc881bb 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -105,7 +105,6 @@ int ip6_ins_rt(struct net *net, struct rt6_info *rt); int ip6_del_rt(struct net *net, struct rt6_info *rt); void rt6_flush_exceptions(struct rt6_info *rt); -int rt6_remove_exception_rt(struct rt6_info *rt); void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, unsigned long now); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index fe3b12c1514b..e64ad2b176f1 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1135,7 +1135,7 @@ add: static void fib6_start_gc(struct net *net, struct rt6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && - (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) + (rt->rt6i_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } @@ -1186,8 +1186,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) return -EINVAL; - if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE)) - return -EINVAL; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -1708,8 +1706,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, RT6_TRACE("fib6_del_route\n"); - WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); - /* Unlink it */ *rtp = rt->rt6_next; rt->rt6i_node = NULL; @@ -1778,21 +1774,11 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) struct rt6_info __rcu **rtp; struct rt6_info __rcu **rtp_next; -#if RT6_DEBUG >= 2 - if (rt->dst.obsolete > 0) { - WARN_ON(fn); - return -ENOENT; - } -#endif if (!fn || rt == net->ipv6.fib6_null_entry) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - /* remove cached dst from exception table */ - if (rt->rt6i_flags & RTF_CACHE) - return rt6_remove_exception_rt(rt); - /* * Walk the leaf entries looking for ourself */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3231bf79feaf..5dcabff7a687 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1009,8 +1009,8 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) BUG_ON(from->from); rt->rt6i_flags &= ~RTF_EXPIRES; - dst_hold(&from->dst); - rt->from = from; + if (dst_hold_safe(&from->dst)) + rt->from = from; dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); if (from->fib6_metrics != &dst_default_metrics) { rt->dst._metrics |= DST_METRICS_REFCOUNTED; @@ -1093,8 +1093,9 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, const struct sk_buff *skb, int flags) { - struct rt6_info *rt, *rt_cache; + struct rt6_info *f6i; struct fib6_node *fn; + struct rt6_info *rt; if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) flags &= ~RT6_LOOKUP_F_IFACE; @@ -1102,36 +1103,36 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - rt = rcu_dereference(fn->leaf); - if (!rt) { - rt = net->ipv6.fib6_null_entry; + f6i = rcu_dereference(fn->leaf); + if (!f6i) { + f6i = net->ipv6.fib6_null_entry; } else { - rt = rt6_device_match(net, rt, &fl6->saddr, + f6i = rt6_device_match(net, f6i, &fl6->saddr, fl6->flowi6_oif, flags); - if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, - 
skb, flags); + if (f6i->rt6i_nsiblings && fl6->flowi6_oif == 0) + f6i = rt6_multipath_select(net, f6i, fl6, + fl6->flowi6_oif, skb, flags); } - if (rt == net->ipv6.fib6_null_entry) { + if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } + /* Search through exception table */ - rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); - if (rt_cache) { - rt = rt_cache; + rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + if (rt) { if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); - } else if (dst_hold_safe(&rt->dst)) { - struct rt6_info *nrt; - - nrt = ip6_create_rt_rcu(rt); - dst_release(&rt->dst); - rt = nrt; - } else { + } else if (f6i == net->ipv6.fib6_null_entry) { rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); + } else { + rt = ip6_create_rt_rcu(f6i); + if (!rt) { + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } } rcu_read_unlock(); @@ -1214,9 +1215,6 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, * Clone the route. */ - if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = ort->from; - rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); rt = __ip6_dst_alloc(dev_net(dev), dev, 0); @@ -1440,11 +1438,6 @@ static int rt6_insert_exception(struct rt6_info *nrt, struct rt6_exception *rt6_ex; int err = 0; - /* ort can't be a cache or pcpu route */ - if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = ort->from; - WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); - spin_lock_bh(&rt6_exception_lock); if (ort->exception_bucket_flushed) { @@ -1583,7 +1576,7 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, } /* Remove the passed in cached rt from the hash table that contains it */ -int rt6_remove_exception_rt(struct rt6_info *rt) +static int rt6_remove_exception_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; struct rt6_info *from = rt->from; @@ -1833,7 +1826,8 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, const struct sk_buff *skb, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt, *rt_cache; + struct rt6_info *f6i; + struct rt6_info *rt; int strict = 0; strict |= flags & RT6_LOOKUP_F_IFACE; @@ -1850,10 +1844,10 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, oif = 0; redo_rt6_select: - rt = rt6_select(net, fn, oif, strict); - if (rt->rt6i_nsiblings) - rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); - if (rt == net->ipv6.fib6_null_entry) { + f6i = rt6_select(net, fn, oif, strict); + if (f6i->rt6i_nsiblings) + f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict); + if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; @@ -1865,18 +1859,17 @@ redo_rt6_select: } } - /*Search through exception table */ - rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); - if (rt_cache) - rt = rt_cache; - - if (rt == net->ipv6.fib6_null_entry) { + if (f6i == net->ipv6.fib6_null_entry) { rt = net->ipv6.ip6_null_entry; rcu_read_unlock(); dst_hold(&rt->dst); trace_fib6_table_lookup(net, rt, table, fl6); return rt; - } else if (rt->rt6i_flags & RTF_CACHE) { + } + + /*Search through exception table */ + rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + if (rt) { if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); @@ -1884,7 +1877,7 @@ redo_rt6_select: trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && - !(rt->rt6i_flags & 
RTF_GATEWAY))) { + !(f6i->rt6i_flags & RTF_GATEWAY))) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different @@ -1893,16 +1886,16 @@ redo_rt6_select: struct rt6_info *uncached_rt; - if (ip6_hold_safe(net, &rt, true)) { - dst_use_noref(&rt->dst, jiffies); + if (ip6_hold_safe(net, &f6i, true)) { + dst_use_noref(&f6i->dst, jiffies); } else { rcu_read_unlock(); - uncached_rt = rt; + uncached_rt = f6i; goto uncached_rt_out; } rcu_read_unlock(); - uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); + uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); dst_release(&rt->dst); if (uncached_rt) { @@ -1925,18 +1918,18 @@ uncached_rt_out: struct rt6_info *pcpu_rt; - dst_use_noref(&rt->dst, jiffies); + dst_use_noref(&f6i->dst, jiffies); local_bh_disable(); - pcpu_rt = rt6_get_pcpu_route(rt); + pcpu_rt = rt6_get_pcpu_route(f6i); if (!pcpu_rt) { /* atomic_inc_not_zero() is needed when using rcu */ - if (atomic_inc_not_zero(&rt->rt6i_ref)) { + if (atomic_inc_not_zero(&f6i->rt6i_ref)) { /* No dst_hold() on rt is needed because grabbing * rt->rt6i_ref makes sure rt can't be released. */ - pcpu_rt = rt6_make_pcpu_route(net, rt); - rt6_release(rt); + pcpu_rt = rt6_make_pcpu_route(net, f6i); + rt6_release(f6i); } else { /* rt is already removed from tree */ pcpu_rt = net->ipv6.ip6_null_entry; @@ -2396,7 +2389,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *rt, *rt_cache; + struct rt6_info *ret = NULL, *rt_cache; + struct rt6_info *rt; struct fib6_node *fn; /* Get the "current" route for this destination and @@ -2435,7 +2429,7 @@ restart: if (rt_cache && ipv6_addr_equal(&rdfl->gateway, &rt_cache->rt6i_gateway)) { - rt = rt_cache; + ret = rt_cache; break; } continue; @@ -2446,7 +2440,7 @@ restart: if (!rt) rt = net->ipv6.fib6_null_entry; else if (rt->rt6i_flags & RTF_REJECT) { - rt = net->ipv6.ip6_null_entry; + ret = net->ipv6.ip6_null_entry; goto out; } @@ -2457,12 +2451,15 @@ restart: } out: - ip6_hold_safe(net, &rt, true); + if (ret) + dst_hold(&ret->dst); + else + ret = ip6_create_rt_rcu(rt); rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table, fl6); - return rt; + trace_fib6_table_lookup(net, ret, table, fl6); + return ret; }; static struct dst_entry *ip6_route_redirect(struct net *net, @@ -3133,6 +3130,22 @@ out_put: return err; } +static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) +{ + int rc = -ESRCH; + + if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) + goto out; + + if (cfg->fc_flags & RTF_GATEWAY && + !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) + goto out; + if (dst_hold_safe(&rt->dst)) + rc = rt6_remove_exception_rt(rt); +out: + return rc; +} + static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { @@ -3157,11 +3170,16 @@ static int ip6_route_del(struct fib6_config *cfg, if (fn) { for_each_fib6_node_rt_rcu(fn) { if (cfg->fc_flags & RTF_CACHE) { + int rc; + rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, &cfg->fc_src); - if (!rt_cache) - continue; - rt = rt_cache; + if (rt_cache) { + rc = ip6_del_cached_rt(rt_cache, cfg); + if (rc != -ESRCH) + return rc; + } + continue; } if (cfg->fc_ifindex && (!rt->fib6_nh.nh_dev || @@ -3278,7 +3296,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NEIGH_UPDATE_F_ISROUTER)), NDISC_REDIRECT, &ndopts); - nrt = 
ip6_rt_cache_alloc(rt, &msg->dest, NULL); + nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL); if (!nrt) goto out; From 67904f700fca7eb90a52b4fac1c20a30a234bf80 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:22 -0700 Subject: [PATCH 0387/1640] BACKPORT: net/ipv6: Add gfp_flags to route add functions Most FIB entries can be added using memory allocated with GFP_KERNEL. Add gfp_flags to ip6_route_add and addrconf_dst_alloc. Code paths that can be reached from the packet path (e.g., ndisc and autoconfig) or atomic notifiers use GFP_ATOMIC; paths from user context (adding addresses and routes) use GFP_KERNEL. Change-Id: I88b1e9537e438ad243f6fb69796173c39c92e112 Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 6 ++++-- net/ipv6/addrconf.c | 40 ++++++++++++++++++++++++---------------- net/ipv6/anycast.c | 2 +- net/ipv6/route.c | 18 ++++++++++-------- 4 files changed, 39 insertions(+), 27 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index bc765fc881bb..c84c9896e300 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -100,7 +100,8 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); -int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); +int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct rt6_info *rt); int ip6_del_rt(struct net *net, struct rt6_info *rt); @@ -137,7 +138,8 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, - const struct in6_addr *addr, bool anycast); + const struct in6_addr *addr, bool anycast, + gfp_t gfp_flags); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 78e5e2c3ac3e..f1ad937c0ec2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1025,7 +1025,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - rt = addrconf_dst_alloc(net, idev, addr, false); + rt = addrconf_dst_alloc(net, idev, addr, false, gfp_flags); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -2325,7 +2325,7 @@ u32 addrconf_rt_table(const struct net_device *dev, u32 default_table) { static void addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, - unsigned long expires, u32 flags) + unsigned long expires, u32 flags, gfp_t gfp_flags) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? 
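The allocation-context rule above is mechanical: callers that may sleep pass GFP_KERNEL, while anything reachable from softirq/packet context or an atomic notifier passes GFP_ATOMIC. A minimal caller sketch (the helper and its flag are hypothetical; only the ip6_route_add() signature comes from this patch):

	static int example_install_route(struct fib6_config *cfg,
					 bool from_packet_path)
	{
		/* packet-path and atomic-notifier callers must not sleep */
		gfp_t gfp_flags = from_packet_path ? GFP_ATOMIC : GFP_KERNEL;

		return ip6_route_add(cfg, gfp_flags, NULL);
	}
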
: addrconf_rt_table(dev, RT6_TABLE_PREFIX), @@ -2350,7 +2350,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, cfg.fc_flags |= RTF_NONEXTHOP; #endif - ip6_route_add(&cfg, NULL); + ip6_route_add(&cfg, gfp_flags, NULL); } @@ -2407,7 +2407,7 @@ static void addrconf_add_mroute(struct net_device *dev) ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); - ip6_route_add(&cfg, NULL); + ip6_route_add(&cfg, GFP_ATOMIC, NULL); } static struct inet6_dev *addrconf_add_dev(struct net_device *dev) @@ -2700,7 +2700,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (dev->ip6_ptr->cnf.accept_ra_prefix_route) { addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, expires, flags); + dev, expires, flags, + GFP_ATOMIC); } } ip6_rt_put(rt); @@ -2915,7 +2916,7 @@ static int inet6_addr_add(struct net *net, int ifindex, if (!IS_ERR(ifp)) { if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, - expires, flags); + expires, flags, GFP_KERNEL); } /* @@ -3066,7 +3067,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) if (addr.s6_addr32[3]) { add_addr(idev, &addr, plen, scope); - addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags); + addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags, + GFP_ATOMIC); return; } @@ -3091,7 +3093,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) add_addr(idev, &addr, plen, flag); addrconf_prefix_route(&addr, plen, idev->dev, 0, - pflags); + pflags, GFP_ATOMIC); } } } @@ -3131,7 +3133,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true); if (!IS_ERR(ifp)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, + 0, 0, GFP_ATOMIC); addrconf_dad_start(ifp); in6_ifa_put(ifp); } @@ -3250,7 +3253,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) addrconf_add_linklocal(idev, &addr, IFA_F_STABLE_PRIVACY); else if (prefix_route) - addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + addrconf_prefix_route(&addr, 64, idev->dev, + 0, 0, GFP_KERNEL); break; case IN6_ADDR_GEN_MODE_EUI64: /* addrconf_add_linklocal also adds a prefix_route and we @@ -3260,7 +3264,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) addrconf_add_linklocal(idev, &addr, 0); else if (prefix_route) - addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + addrconf_prefix_route(&addr, 64, idev->dev, + 0, 0, GFP_ATOMIC); break; case IN6_ADDR_GEN_MODE_NONE: default: @@ -3368,7 +3373,8 @@ static int fixup_permanent_addr(struct net *net, if (!ifp->rt || !ifp->rt->rt6i_node) { struct rt6_info *rt, *prev; - rt = addrconf_dst_alloc(net, idev, &ifp->addr, false); + rt = addrconf_dst_alloc(net, idev, &ifp->addr, false, + GFP_ATOMIC); if (unlikely(IS_ERR(rt))) return PTR_ERR(rt); @@ -3383,7 +3389,7 @@ static int fixup_permanent_addr(struct net *net, if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, - idev->dev, 0, 0); + idev->dev, 0, 0, GFP_ATOMIC); } if (ifp->state == INET6_IFADDR_STATE_PREDAD) @@ -4611,8 +4617,9 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, ipv6_ifa_notify(0, ifp); if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, - expires, flags); + 
addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + ifp->idev->dev, expires, flags, + GFP_KERNEL); } else if (had_prefixroute) { enum cleanup_prefix_rt_t action; unsigned long rt_expires; @@ -5657,7 +5664,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) addrconf_join_anycast(ifp); if (!ipv6_addr_any(&ifp->peer_addr)) addrconf_prefix_route(&ifp->peer_addr, 128, - ifp->idev->dev, 0, 0); + ifp->idev->dev, 0, 0, + GFP_KERNEL); break; case RTM_DELADDR: if (ifp->idev->cnf.forwarding) diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index bfe331234729..a2fff55b1df8 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -271,7 +271,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } net = dev_net(idev->dev); - rt = addrconf_dst_alloc(net, idev, addr, true); + rt = addrconf_dst_alloc(net, idev, addr, true, GFP_ATOMIC); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto out; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5dcabff7a687..566706827eee 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2828,6 +2828,7 @@ out: } static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, + gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; @@ -3036,12 +3037,13 @@ out: return ERR_PTR(err); } -int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) +int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack) { struct rt6_info *rt; int err; - rt = ip6_route_info_create(cfg, extack); + rt = ip6_route_info_create(cfg, gfp_flags, extack); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -3388,7 +3390,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, if (!prefixlen) cfg.fc_flags |= RTF_DEFAULT; - ip6_route_add(&cfg, NULL); + ip6_route_add(&cfg, GFP_ATOMIC, NULL); return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); } @@ -3439,7 +3441,7 @@ struct rt6_info *rt6_add_dflt_router(struct net *net, cfg.fc_gateway = *gwaddr; - if (!ip6_route_add(&cfg, NULL)) { + if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { struct fib6_table *table; table = fib6_get_table(dev_net(dev), cfg.fc_table); @@ -3506,7 +3508,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); switch (cmd) { case SIOCADDRT: - err = ip6_route_add(&cfg, NULL); + err = ip6_route_add(&cfg, GFP_KERNEL, NULL); break; case SIOCDELRT: err = ip6_route_del(&cfg, NULL); @@ -3578,7 +3580,7 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, - bool anycast) + bool anycast, gfp_t gfp_flags) { u32 tb_id; struct net_device *dev = idev->dev; @@ -4253,7 +4255,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, r_cfg.fc_encap_type = nla_get_u16(nla); } - rt = ip6_route_info_create(&r_cfg, extack); + rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -4421,7 +4423,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (cfg.fc_mp) return ip6_route_multipath_add(&cfg, extack); else - return ip6_route_add(&cfg, extack); + return ip6_route_add(&cfg, GFP_KERNEL, extack); } static size_t rt6_nlmsg_size(struct rt6_info *rt) From 0c97cb0dd9c5a614faebf1fb3dff62c99dd67f48 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:25 -0700 Subject: [PATCH 0388/1640] BACKPORT: net/ipv6: separate handling of FIB entries from dst 
based routes Last step before flipping the data type for FIB entries: - use fib6_info_alloc to create FIB entries in ip6_route_info_create and addrconf_dst_alloc - use fib6_info_release in place of dst_release, ip6_rt_put and rt6_release - remove the dst_hold before calling __ip6_ins_rt or ip6_del_rt - when purging routes, drop per-cpu routes - replace inc and dec of rt6i_ref with fib6_info_hold and fib6_info_release - use rt->from since it points to the FIB entry - drop references to exception bucket, fib6_metrics and per-cpu from dst entries (those are relevant for fib entries only) Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 4 +- include/net/ip6_route.h | 3 +- net/ipv6/addrconf.c | 18 ++--- net/ipv6/anycast.c | 7 +- net/ipv6/ip6_fib.c | 55 +++++++++----- net/ipv6/ip6_output.c | 3 +- net/ipv6/ndisc.c | 6 +- net/ipv6/route.c | 161 ++++++++++++++-------------------------- 8 files changed, 111 insertions(+), 146 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 9f3d3f9ebceb..6eb744680eaa 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -316,9 +316,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) u32 cookie = 0; if (rt->from) - rt = rt->from; - - rt6_get_cookie_safe(rt, &cookie); + rt6_get_cookie_safe(rt->from, &cookie); return cookie; } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index c84c9896e300..82520275c978 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -114,8 +114,7 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = - rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; + struct inet6_dev *idev = rt ? rt->rt6i_idev : NULL; int err = 0; if (rt && rt->rt6i_prefsrc.plen) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f1ad937c0ec2..f06ddabf62d0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -925,7 +925,6 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) pr_warn("Freeing alive inet6 address %p\n", ifp); return; } - ip6_rt_put(ifp->rt); kfree_rcu(ifp, rcu); } @@ -1090,8 +1089,8 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, inet6addr_notifier_call_chain(NETDEV_UP, ifa); out: if (unlikely(err < 0)) { - if (rt) - ip6_rt_put(rt); + fib6_info_release(rt); + if (ifa) { if (ifa->idev) in6_dev_put(ifa->idev); @@ -1180,7 +1179,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r else { if (!(rt->rt6i_flags & RTF_EXPIRES)) fib6_set_expires(rt, expires); - ip6_rt_put(rt); + fib6_info_release(rt); } } } @@ -2380,8 +2379,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, continue; if ((rt->rt6i_flags & noflags) != 0) continue; - if (!dst_hold_safe(&rt->dst)) - rt = NULL; + fib6_info_hold(rt); break; } out: @@ -2704,7 +2702,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) GFP_ATOMIC); } } - ip6_rt_put(rt); + fib6_info_release(rt); } /* Try to figure out our local address for this prefix */ @@ -3384,7 +3382,7 @@ static int fixup_permanent_addr(struct net *net, ifp->rt = rt; spin_unlock(&ifp->lock); - ip6_rt_put(prev); + fib6_info_release(prev); } if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { @@ -5680,8 +5678,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ip6_del_rt(net, rt); } if (ifp->rt) { - if (dst_hold_safe(&ifp->rt->dst)) - ip6_del_rt(net, ifp->rt); + ip6_del_rt(net, ifp->rt); + 
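The replacement rules in the list above reduce to one refcount: a FIB entry is pinned with fib6_info_hold() for as long as it is used outside the table lock, and dropped with fib6_info_release(), which frees the entry via fib6_info_destroy() once rt6i_ref reaches zero. A condensed usage sketch (mirroring the lookup-path conversions later in this patch):

	struct rt6_info *f6i;

	rcu_read_lock();
	f6i = rt6_select(net, fn, oif, strict);
	fib6_info_hold(f6i);		/* pin the FIB entry */
	rcu_read_unlock();

	/* ... use f6i without the tree lock held ... */

	fib6_info_release(f6i);		/* may free via fib6_info_destroy() */
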
ifp->rt = NULL; } rt_genid_bump_ipv6(net); break; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index a2fff55b1df8..4e7507ebd84f 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -217,7 +217,7 @@ static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) { in6_dev_put(ac->aca_idev); - dst_release(&ac->aca_rt->dst); + fib6_info_release(ac->aca_rt); kfree(ac); } } @@ -235,6 +235,7 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, aca->aca_addr = *addr; in6_dev_hold(idev); aca->aca_idev = idev; + fib6_info_hold(rt); aca->aca_rt = rt; aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ @@ -278,7 +279,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } aca = aca_alloc(rt, addr); if (!aca) { - ip6_rt_put(rt); + fib6_info_release(rt); err = -ENOMEM; goto out; } @@ -334,7 +335,6 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) write_unlock_bh(&idev->lock); addrconf_leave_solict(idev, &aca->aca_addr); - dst_hold(&aca->aca_rt->dst); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); @@ -362,7 +362,6 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); - dst_hold(&aca->aca_rt->dst); ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e64ad2b176f1..b42be727a895 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -170,6 +170,7 @@ struct rt6_info *fib6_info_alloc(gfp_t gfp_flags) void fib6_info_destroy(struct rt6_info *f6i) { struct rt6_exception_bucket *bucket; + struct dst_metrics *m; WARN_ON(f6i->rt6i_node); @@ -201,6 +202,10 @@ void fib6_info_destroy(struct rt6_info *f6i) if (f6i->fib6_nh.nh_dev) dev_put(f6i->fib6_nh.nh_dev); + m = f6i->fib6_metrics; + if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) + kfree(m); + kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy); @@ -714,7 +719,7 @@ static struct fib6_node *fib6_add_1(struct net *net, /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); - rt6_release(leaf); + fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == @@ -898,12 +903,32 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); atomic_inc(&new_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_leaf); - rt6_release(rt); + fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } + + if (rt->rt6i_pcpu) { + int cpu; + + /* release the reference to this fib entry from + * all of its cached pcpu routes + */ + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(rt->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + fib6_info_release(pcpu_rt->from); + pcpu_rt->from = NULL; + } + } + } } } @@ -1100,7 +1125,7 @@ add: fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - rt6_release(iter); + fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ @@ -1116,7 +1141,7 @@ add: fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - rt6_release(iter); + fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } 
else { @@ -1184,9 +1209,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, int replace_required = 0; int sernum = fib6_new_sernum(info->nl_net); - if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) - return -EINVAL; - if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; @@ -1301,7 +1323,7 @@ out: if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); - atomic_dec(&rt->rt6i_ref); + fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, @@ -1313,7 +1335,7 @@ out: info->nl_net->ipv6.fib6_null_entry; } #endif - atomic_inc(&pn_leaf->rt6i_ref); + fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } @@ -1335,10 +1357,6 @@ failure: (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); - /* Always release dst as dst->__refcnt is guaranteed - * to be taken before entering this function - */ - dst_release_immediate(&rt->dst); return err; } @@ -1635,7 +1653,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, new_fn_leaf = net->ipv6.fib6_null_entry; } #endif - atomic_inc(&new_fn_leaf->rt6i_ref); + fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } @@ -1691,7 +1709,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, return pn; RCU_INIT_POINTER(pn->leaf, NULL); - rt6_release(pn_leaf); + fib6_info_release(pn_leaf); fn = pn; } } @@ -1761,7 +1779,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); - rt6_release(rt); + fib6_info_release(rt); } /* Need to own table->tb6_lock */ @@ -2246,9 +2264,8 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) dev = rt->fib6_nh.nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), - rt->dst.__use, rt->rt6i_flags, - dev ? dev->name : ""); + rt->rt6i_metric, atomic_read(&rt->rt6i_ref), 0, + rt->rt6i_flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8b7a28b83d87..0896f296b231 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1023,7 +1023,8 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, if (!had_dst) *dst = ip6_route_output(net, sk, fl6); rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; - err = ip6_route_get_saddr(net, rt, &fl6->daddr, + err = ip6_route_get_saddr(net, rt ? rt->from : NULL, + &fl6->daddr, sk ? 
inet6_sk(sk)->srcprefs : 0, &fl6->saddr); if (err) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 8aeb50ac4bd5..84f512570851 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1295,7 +1295,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); - ip6_rt_put(rt); + fib6_info_release(rt); return; } } @@ -1323,7 +1323,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); - ip6_rt_put(rt); + fib6_info_release(rt); return; } neigh->flags |= NTF_ROUTER; @@ -1509,7 +1509,7 @@ skip_routeinfo: ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: - ip6_rt_put(rt); + fib6_info_release(rt); if (neigh) neigh_release(neigh); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 566706827eee..11ad7ff02f31 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -347,13 +347,11 @@ static void rt6_info_init(struct rt6_info *rt) memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); INIT_LIST_HEAD(&rt->rt6i_siblings); INIT_LIST_HEAD(&rt->rt6i_uncached); - rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; } /* allocate dst with ip6_dst_ops */ -static struct rt6_info *__ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags) +struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, + int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, flags); @@ -365,35 +363,15 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, return rt; } - -struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags) -{ - struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); - - if (rt) { - rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); - if (!rt->rt6i_pcpu) { - dst_release_immediate(&rt->dst); - return NULL; - } - } - - return rt; -} EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct rt6_exception_bucket *bucket; struct rt6_info *from = rt->from; struct inet6_dev *idev; - struct dst_metrics *m; dst_destroy_metrics_generic(dst); - free_percpu(rt->rt6i_pcpu); rt6_uncached_list_del(rt); idev = rt->rt6i_idev; @@ -401,18 +379,9 @@ static void ip6_dst_destroy(struct dst_entry *dst) rt->rt6i_idev = NULL; in6_dev_put(idev); } - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1); - if (bucket) { - rt->rt6i_exception_bucket = NULL; - kfree(bucket); - } - - m = rt->fib6_metrics; - if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) - kfree(m); rt->from = NULL; - dst_release(&from->dst); + fib6_info_release(from); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -887,7 +856,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, else fib6_set_expires(rt, jiffies + HZ * lifetime); - ip6_rt_put(rt); + fib6_info_release(rt); } return 0; } @@ -1006,11 +975,9 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { - BUG_ON(from->from); - rt->rt6i_flags &= ~RTF_EXPIRES; - if (dst_hold_safe(&from->dst)) - rt->from = from; + fib6_info_hold(from); + rt->from = from; dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); if (from->fib6_metrics != &dst_default_metrics) { rt->dst._metrics |= DST_METRICS_REFCOUNTED; @@ -1080,7 +1047,7 @@ static struct rt6_info *ip6_create_rt_rcu(struct rt6_info 
*rt) struct net_device *dev = rt->fib6_nh.nh_dev; struct rt6_info *nrt; - nrt = __ip6_dst_alloc(dev_net(dev), dev, flags); + nrt = ip6_dst_alloc(dev_net(dev), dev, flags); if (nrt) ip6_rt_copy_init(nrt, rt); @@ -1199,8 +1166,6 @@ int ip6_ins_rt(struct net *net, struct rt6_info *rt) { struct nl_info info = { .nl_net = net, }; - /* Hold dst to account for the reference from the fib6 tree */ - dst_hold(&rt->dst); return __ip6_ins_rt(rt, &info, NULL); } @@ -1217,7 +1182,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); - rt = __ip6_dst_alloc(dev_net(dev), dev, 0); + rt = ip6_dst_alloc(dev_net(dev), dev, 0); rcu_read_unlock(); if (!rt) return NULL; @@ -1252,7 +1217,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) rcu_read_lock(); dev = ip6_rt_get_dev_rcu(rt); - pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, flags); + pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); if (!pcpu_rt) return NULL; @@ -1311,7 +1276,7 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, return; rt6_ex->rt6i->rt6i_node = NULL; hlist_del_rcu(&rt6_ex->hlist); - rt6_release(rt6_ex->rt6i); + ip6_rt_put(rt6_ex->rt6i); kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; @@ -1886,17 +1851,11 @@ redo_rt6_select: struct rt6_info *uncached_rt; - if (ip6_hold_safe(net, &f6i, true)) { - dst_use_noref(&f6i->dst, jiffies); - } else { - rcu_read_unlock(); - uncached_rt = f6i; - goto uncached_rt_out; - } + fib6_info_hold(f6i); rcu_read_unlock(); uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); - dst_release(&rt->dst); + fib6_info_release(f6i); if (uncached_rt) { /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() @@ -1909,7 +1868,6 @@ redo_rt6_select: dst_hold(&uncached_rt->dst); } -uncached_rt_out: trace_fib6_table_lookup(net, uncached_rt, table, fl6); return uncached_rt; @@ -1918,24 +1876,12 @@ uncached_rt_out: struct rt6_info *pcpu_rt; - dst_use_noref(&f6i->dst, jiffies); local_bh_disable(); pcpu_rt = rt6_get_pcpu_route(f6i); - if (!pcpu_rt) { - /* atomic_inc_not_zero() is needed when using rcu */ - if (atomic_inc_not_zero(&f6i->rt6i_ref)) { - /* No dst_hold() on rt is needed because grabbing - * rt->rt6i_ref makes sure rt can't be released. 
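The converted uncached path above now follows a simple hold/alloc/release pattern (condensed sketch): keep the entry alive across the RCU unlock, build the clone, then drop the temporary reference, since rt6_set_from() in ip6_rt_cache_alloc() takes the clone's own reference:

	fib6_info_hold(f6i);
	rcu_read_unlock();

	uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
	fib6_info_release(f6i);		/* the clone holds its own ref */
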
- */ - pcpu_rt = rt6_make_pcpu_route(net, f6i); - rt6_release(f6i); - } else { - /* rt is already removed from tree */ - pcpu_rt = net->ipv6.ip6_null_entry; - dst_hold(&pcpu_rt->dst); - } - } + if (!pcpu_rt) + pcpu_rt = rt6_make_pcpu_route(net, f6i); + local_bh_enable(); rcu_read_unlock(); trace_fib6_table_lookup(net, pcpu_rt, table, fl6); @@ -2179,11 +2125,26 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ +static bool fib6_check(struct rt6_info *f6i, u32 cookie) +{ + u32 rt_cookie = 0; + + if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) || + rt_cookie != cookie) + return false; + + if (fib6_check_expired(f6i)) + return false; + + return true; +} + static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { u32 rt_cookie = 0; - if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) + if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) || + rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) @@ -2196,7 +2157,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && - rt6_check(rt->from, cookie)) + fib6_check(rt->from, cookie)) return &rt->dst; else return NULL; @@ -2231,7 +2192,7 @@ static void ip6_negative_advice(struct sock *sk, dst_hold(dst); sk_dst_reset(sk); - ip6_del_rt(dev_net(dst->dev), rt); + rt6_remove_exception_rt(rt); } return; } @@ -2248,12 +2209,12 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) - ip6_del_rt(dev_net(rt->dst.dev), rt); - } else { + rt6_remove_exception_rt(rt); + } else if (rt->from) { struct fib6_node *fn; rcu_read_lock(); - fn = rcu_dereference(rt->rt6i_node); + fn = rcu_dereference(rt->from->rt6i_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) fn->fn_sernum = -1; rcu_read_unlock(); @@ -2914,13 +2875,13 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (!table) goto out; - rt = ip6_dst_alloc(net, NULL, - (cfg->fc_flags & RTF_ADDRCONF) ? 
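The dst-validation split above can be read as a single predicate (simplified sketch; the real rt6_check() also tolerates a NULL ->from by comparing the zero cookie):

	static bool example_dst_valid(struct rt6_info *rt, u32 cookie)
	{
		u32 rt_cookie = 0;

		/* the sernum cookie now lives in the owning FIB entry */
		if (!rt->from || !rt6_get_cookie_safe(rt->from, &rt_cookie))
			return false;

		return rt_cookie == cookie && !rt6_check_expired(rt);
	}
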
0 : DST_NOCOUNT); - - if (!rt) { - err = -ENOMEM; + err = -ENOMEM; + rt = fib6_info_alloc(gfp_flags); + if (!rt) goto out; - } + + if (cfg->fc_flags & RTF_ADDRCONF) + rt->dst_nocount = true; err = ip6_convert_metrics(net, rt, cfg); if (err < 0) @@ -2994,7 +2955,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (err) goto out; - rt->fib6_nh.nh_gw = rt->rt6i_gateway = cfg->fc_gateway; + rt->fib6_nh.nh_gw = cfg->fc_gateway; } err = -ENODEV; @@ -3019,7 +2980,7 @@ install_route: !netif_carrier_ok(dev)) rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); - rt->fib6_nh.nh_dev = rt->dst.dev = dev; + rt->fib6_nh.nh_dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -3031,9 +2992,8 @@ out: dev_put(dev); if (idev) in6_dev_put(idev); - if (rt) - dst_release_immediate(&rt->dst); + fib6_info_release(rt); return ERR_PTR(err); } @@ -3048,6 +3008,7 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, return PTR_ERR(rt); err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); + fib6_info_release(rt); return err; } @@ -3069,7 +3030,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) spin_unlock_bh(&table->tb6_lock); out: - ip6_rt_put(rt); + fib6_info_release(rt); return err; } @@ -3123,7 +3084,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) out_unlock: spin_unlock_bh(&table->tb6_lock); out_put: - ip6_rt_put(rt); + fib6_info_release(rt); if (skb) { rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, @@ -3194,8 +3155,7 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) continue; - if (!dst_hold_safe(&rt->dst)) - break; + fib6_info_hold(rt); rcu_read_unlock(); /* if gateway was specified only delete the one hop */ @@ -3586,7 +3546,7 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, struct net_device *dev = idev->dev; struct rt6_info *rt; - rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); + rt = fib6_info_alloc(gfp_flags); if (!rt) return ERR_PTR(-ENOMEM); @@ -3607,8 +3567,8 @@ struct rt6_info *addrconf_dst_alloc(struct net *net, } rt->fib6_nh.nh_gw = *addr; + dev_hold(dev); rt->fib6_nh.nh_dev = dev; - rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; tb_id = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL; @@ -4267,7 +4227,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); if (err) { - dst_release_immediate(&rt->dst); + fib6_info_release(rt); goto cleanup; } @@ -4283,6 +4243,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { err = __ip6_ins_rt(nh->rt6_info, info, extack); + fib6_info_release(nh->rt6_info); if (!err) { /* save reference to last route successfully inserted */ @@ -4339,7 +4300,7 @@ add_errout: cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { if (nh->rt6_info) - dst_release_immediate(&nh->rt6_info->dst); + fib6_info_release(nh->rt6_info); list_del(&nh->next); kfree(nh); } @@ -4771,14 +4732,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } - if (fibmatch && rt->from) { - struct rt6_info *ort = rt->from; - - dst_hold(&ort->dst); - ip6_rt_put(rt); - rt = ort; - } - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); @@ -4788,12 +4741,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb_dst_set(skb, &rt->dst); if (fibmatch) - err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, iif, + err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else - err = rt6_fill_node(net, skb, rt, dst, &fl6.daddr, &fl6.saddr, - iif, RTM_NEWROUTE, + err = rt6_fill_node(net, skb, rt->from, dst, + &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); if (err < 0) { From 85584f45c94cc9a878c00af99a2506e1213aa6e8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:12 +0200 Subject: [PATCH 0389/1640] UPSTREAM: ipv6: Teach tree walker to skip multipath routes As explained in previous patch, fib6_ifdown() needs to consider the state of all the sibling routes when a multipath route is traversed. This is done by evaluating all the siblings when the first sibling in a multipath route is traversed. If the multipath route does not need to be flushed (e.g., not all siblings are dead), then we should just skip the multipath route as our work is done. Have the tree walker jump to the last sibling when it is determined that the multipath route needs to be skipped. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b42be727a895..96f542ea06e9 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1960,7 +1960,7 @@ static int fib6_clean_node(struct fib6_walker *w) for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); - if (res < 0) { + if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { @@ -1973,6 +1973,12 @@ static int fib6_clean_node(struct fib6_walker *w) continue; } return 0; + } else if (res == -2) { + if (WARN_ON(!rt->rt6i_nsiblings)) + continue; + rt = list_last_entry(&rt->rt6i_siblings, + struct rt6_info, rt6i_siblings); + continue; } WARN_ON(res != 0); } @@ -1984,7 +1990,8 @@ static int fib6_clean_node(struct fib6_walker *w) * Convenient frontend to tree walker. * * func is called on each route. - * It may return -1 -> delete this route. + * It may return -2 -> skip multipath route. + * -1 -> delete this route. 
* 0 -> continue walking */ From 81171a90dafad3d3401e62353653e65f18ece342 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 13 Oct 2017 15:01:08 -0700 Subject: [PATCH 0390/1640] UPSTREAM: ipv6: check fn before doing FIB6_SUBTREE(fn) In fib6_locate(), we need to first make sure fn is not NULL before doing FIB6_SUBTREE(fn) to avoid crash. This fixes the following static checker warning: net/ipv6/ip6_fib.c:1462 fib6_locate() warn: variable dereferenced before check 'fn' (see line 1459) net/ipv6/ip6_fib.c 1458 if (src_len) { 1459 struct fib6_node *subtree = FIB6_SUBTREE(fn); ^^^^^^^^^^^^^^^^ We shifted this dereference 1460 1461 WARN_ON(saddr == NULL); 1462 if (fn && subtree) ^^ before the check for NULL. 1463 fn = fib6_locate_1(subtree, saddr, src_len, 1464 offsetof(struct rt6_info, rt6i_src) Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Reported-by: Dan Carpenter Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 96f542ea06e9..ea2483a9006b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1539,13 +1539,16 @@ struct fib6_node *fib6_locate(struct fib6_node *root, #ifdef CONFIG_IPV6_SUBTREES if (src_len) { - struct fib6_node *subtree = FIB6_SUBTREE(fn); - WARN_ON(saddr == NULL); - if (fn && subtree) - fn = fib6_locate_1(subtree, saddr, src_len, + if (fn) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + + if (subtree) { + fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct rt6_info, rt6i_src), exact_match); + } + } } #endif From 3048913beb582bc08bb521bf139eb849a91c325e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:21 -0700 Subject: [PATCH 0391/1640] UPSTREAM: net/ipv6: Create a neigh_lookup for FIB entries The router discovery code has a FIB entry and wants to validate the gateway has a neighbor entry. Refactor the existing dst_neigh_lookup for IPv6 and create a new function that takes the gateway and device and returns a neighbor entry. Use the new function in ndisc_router_discovery to validate the gateway. Signed-off-by: David Ahern Signed-off-by: David S. 
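With this refactor, a caller that holds only a FIB entry can validate its gateway's neighbour directly, without going through a dst. A usage sketch matching the ndisc conversion below:

	struct neighbour *neigh;

	neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, rt->fib6_nh.nh_dev,
				 NULL, &ipv6_hdr(skb)->saddr);
	if (!neigh)
		return;		/* gateway has no neighbour entry */
	neigh_release(neigh);
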
Miller --- include/net/ip6_route.h | 3 +++ net/ipv6/ndisc.c | 8 ++++++-- net/ipv6/route.c | 33 ++++++++++++++++++++------------- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 82520275c978..44c2bf841012 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -274,4 +274,7 @@ static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } +struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, + struct net_device *dev, struct sk_buff *skb, + const void *daddr); #endif diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 84f512570851..87d984a1b8f3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1290,7 +1290,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { - neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); + neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, + rt->fib6_nh.nh_dev, NULL, + &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", @@ -1318,7 +1320,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); + neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, + rt->fib6_nh.nh_dev, NULL, + &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 11ad7ff02f31..f3eb034ab24c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -184,12 +184,10 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } } -static inline const void *choose_neigh_daddr(struct rt6_info *rt, +static inline const void *choose_neigh_daddr(const struct in6_addr *p, struct sk_buff *skb, const void *daddr) { - struct in6_addr *p = &rt->rt6i_gateway; - if (!ipv6_addr_any(p)) return (const void *) p; else if (skb) @@ -197,18 +195,27 @@ static inline const void *choose_neigh_daddr(struct rt6_info *rt, return daddr; } -static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) +struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, + struct net_device *dev, + struct sk_buff *skb, + const void *daddr) { - struct rt6_info *rt = (struct rt6_info *) dst; struct neighbour *n; - daddr = choose_neigh_daddr(rt, skb, daddr); - n = __ipv6_neigh_lookup(dst->dev, daddr); + daddr = choose_neigh_daddr(gw, skb, daddr); + n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; - return neigh_create(&nd_tbl, daddr, dst->dev); + return neigh_create(&nd_tbl, daddr, dev); +} + +static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); + + return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr); } static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) @@ -216,7 +223,7 @@ static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) struct net_device *dev = dst->dev; struct rt6_info *rt = (struct rt6_info *)dst; - daddr = choose_neigh_daddr(rt, NULL, daddr); + daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); if (!daddr) return; if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) @@ -241,7 +248,7 @@ static struct dst_ops ip6_dst_ops_template = { .update_pmtu = 
ip6_rt_update_pmtu, .redirect = rt6_do_redirect, .local_out = __ip6_local_out, - .neigh_lookup = ip6_neigh_lookup, + .neigh_lookup = ip6_dst_neigh_lookup, .confirm_neigh = ip6_confirm_neigh, }; @@ -272,7 +279,7 @@ static struct dst_ops ip6_dst_blackhole_ops = { .update_pmtu = ip6_rt_blackhole_update_pmtu, .redirect = ip6_rt_blackhole_redirect, .cow_metrics = dst_cow_metrics_generic, - .neigh_lookup = ip6_neigh_lookup, + .neigh_lookup = ip6_dst_neigh_lookup, }; static const u32 ip6_template_metrics[RTAX_MAX] = { From add3aec6115adfd86650c8bd86d3e05141002a69 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:26 -0700 Subject: [PATCH 0392/1640] BACKPORT: net/ipv6: Flip FIB entries to fib6_info Convert all code paths referencing a FIB entry from rt6_info to fib6_info. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/if_inet6.h | 4 +- include/net/ip6_fib.h | 42 +++---- include/net/ip6_route.h | 28 ++--- include/net/netns/ipv6.h | 2 +- net/ipv6/addrconf.c | 20 +-- net/ipv6/anycast.c | 4 +- net/ipv6/ip6_fib.c | 116 +++++++++--------- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 259 ++++++++++++++++++++------------------- 9 files changed, 239 insertions(+), 238 deletions(-) diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index c8569e7a9f68..431ec8ec7a47 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -64,7 +64,7 @@ struct inet6_ifaddr { struct delayed_work dad_work; struct inet6_dev *idev; - struct rt6_info *rt; + struct fib6_info *rt; struct hlist_node addr_lst; struct list_head if_list; @@ -144,7 +144,7 @@ struct ipv6_ac_socklist { struct ifacaddr6 { struct in6_addr aca_addr; struct inet6_dev *aca_idev; - struct rt6_info *aca_rt; + struct fib6_info *aca_rt; struct ifacaddr6 *aca_next; int aca_users; refcount_t aca_refcnt; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6eb744680eaa..780230b77fd5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -78,12 +78,12 @@ struct fib6_node { #ifdef CONFIG_IPV6_SUBTREES struct fib6_node __rcu *subtree; #endif - struct rt6_info __rcu *leaf; + struct fib6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; - struct rt6_info __rcu *rr_ptr; + struct fib6_info __rcu *rr_ptr; struct rcu_head rcu; }; @@ -179,7 +179,7 @@ struct fib6_info { struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; - struct rt6_info *from; + struct fib6_info *from; /* * Tail elements of dst_entry (__refcnt etc.) 
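The end state of the flip, reduced to the two structures (sketch only, fields elided):

	struct fib6_info;		/* FIB entry: lives in the tree,
					 * refcounted via rt6i_ref */

	struct rt6_info {		/* dst-based route, data path */
		struct dst_entry	dst;
		struct fib6_info	*from;	/* pins the owning entry */
		/* ... */
	};
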
@@ -245,20 +245,20 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) return ((struct rt6_info *)dst)->rt6i_idev; } -static inline void fib6_clean_expires(struct rt6_info *f6i) +static inline void fib6_clean_expires(struct fib6_info *f6i) { f6i->rt6i_flags &= ~RTF_EXPIRES; f6i->expires = 0; } -static inline void fib6_set_expires(struct rt6_info *f6i, +static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { f6i->expires = expires; f6i->rt6i_flags |= RTF_EXPIRES; } -static inline bool fib6_check_expired(const struct rt6_info *f6i) +static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->rt6i_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); @@ -291,7 +291,7 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, +static inline bool rt6_get_cookie_safe(const struct fib6_info *rt, u32 *cookie) { struct fib6_node *fn; @@ -332,15 +332,15 @@ static inline void ip6_rt_put(struct rt6_info *rt) void rt6_free_pcpu(struct rt6_info *non_pcpu_rt); -struct rt6_info *fib6_info_alloc(gfp_t gfp_flags); -void fib6_info_destroy(struct rt6_info *f6i); +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags); +void fib6_info_destroy(struct fib6_info *f6i); -static inline void fib6_info_hold(struct rt6_info *f6i) +static inline void fib6_info_hold(struct fib6_info *f6i) { atomic_inc(&f6i->rt6i_ref); } -static inline void fib6_info_release(struct rt6_info *f6i) +static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && atomic_dec_and_test(&f6i->rt6i_ref)) fib6_info_destroy(f6i); @@ -373,7 +373,7 @@ enum fib6_walk_state { struct fib6_walker { struct list_head lh; struct fib6_node *root, *node; - struct rt6_info *leaf; + struct fib6_info *leaf; enum fib6_walk_state state; unsigned int skip; unsigned int count; @@ -437,7 +437,7 @@ typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ - struct rt6_info *rt; + struct fib6_info *rt; }; /* @@ -459,14 +459,14 @@ struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *saddr, int src_len, bool exact_match); -void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), +void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); -int fib6_add(struct fib6_node *root, struct rt6_info *rt, +int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack); -int fib6_del(struct rt6_info *rt, struct nl_info *info); +int fib6_del(struct fib6_info *rt, struct nl_info *info); -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, +void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); @@ -489,11 +489,11 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); -void fib6_update_sernum(struct net *net, struct rt6_info *rt); -void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); +void fib6_update_sernum(struct net *net, struct fib6_info *rt); +void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); -void fib6_metric_set(struct rt6_info *f6i, int metric, u32 
val); -static inline bool fib6_metric_locked(struct rt6_info *f6i, int metric) +void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val); +static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) { return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 44c2bf841012..a717d005a91a 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,7 +66,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) +static inline bool rt6_qualify_for_ecmp(const struct fib6_info *rt) { return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == RTF_GATEWAY; @@ -102,14 +102,14 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct net *net, struct rt6_info *rt); -int ip6_del_rt(struct net *net, struct rt6_info *rt); +int ip6_ins_rt(struct net *net, struct fib6_info *rt); +int ip6_del_rt(struct net *net, struct fib6_info *rt); -void rt6_flush_exceptions(struct rt6_info *rt); -void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, +void rt6_flush_exceptions(struct fib6_info *rt); +void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, unsigned long now); -static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, +static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *rt, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) @@ -136,9 +136,9 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, - const struct in6_addr *addr, bool anycast, - gfp_t gfp_flags); +struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, + const struct in6_addr *addr, bool anycast, + gfp_t gfp_flags); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); @@ -147,10 +147,10 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, * support functions for ND * */ -struct rt6_info *rt6_get_dflt_router(struct net *net, +struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev); -struct rt6_info *rt6_add_dflt_router(struct net *net, +struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); @@ -176,14 +176,14 @@ struct rt6_rtnl_dump_arg { struct net *net; }; -int rt6_dump_route(struct rt6_info *rt, void *p_arg); +int rt6_dump_route(struct fib6_info *rt, void *p_arg); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); -void rt6_multipath_rebalance(struct rt6_info *rt); +void rt6_multipath_rebalance(struct fib6_info *rt); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { @@ -266,7 +266,7 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info 
*rt, return daddr; } -static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) +static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && a->rt6i_idev == b->rt6i_idev && diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index e997e8152b32..a55d82a54ee9 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -60,7 +60,7 @@ struct netns_ipv6 { #endif struct xt_table *ip6table_nat; #endif - struct rt6_info *fib6_null_entry; + struct fib6_info *fib6_null_entry; struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f06ddabf62d0..35697e79653a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -176,7 +176,7 @@ static void addrconf_type_change(struct net_device *dev, unsigned long event); static int addrconf_ifdown(struct net_device *dev, int how); -static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, +static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, int plen, const struct net_device *dev, u32 flags, u32 noflags); @@ -989,7 +989,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct in6_validator_info i6vi; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -1167,7 +1167,7 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) static void cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) { - struct rt6_info *rt; + struct fib6_info *rt; rt = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, @@ -2353,13 +2353,13 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, } -static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, +static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, int plen, const struct net_device *dev, u32 flags, u32 noflags) { struct fib6_node *fn; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct fib6_table *table; u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_PREFIX); @@ -2654,7 +2654,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) */ if (pinfo->onlink) { - struct rt6_info *rt; + struct fib6_info *rt; unsigned long rt_expires; /* Avoid arithmetic overflow. Really, we could @@ -3369,7 +3369,7 @@ static int fixup_permanent_addr(struct net *net, * case regenerate the host route. 
*/ if (!ifp->rt || !ifp->rt->rt6i_node) { - struct rt6_info *rt, *prev; + struct fib6_info *rt, *prev; rt = addrconf_dst_alloc(net, idev, &ifp->addr, false, GFP_ATOMIC); @@ -3742,7 +3742,7 @@ restart: INIT_LIST_HEAD(&del_list); list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; bool keep; addrconf_del_dad_work(ifa); @@ -5670,7 +5670,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) addrconf_leave_anycast(ifp); addrconf_leave_solict(ifp->idev, &ifp->addr); if (!ipv6_addr_any(&ifp->peer_addr)) { - struct rt6_info *rt; + struct fib6_info *rt; rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, ifp->idev->dev, 0, 0); @@ -6026,7 +6026,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) list_for_each_entry(ifa, &idev->addr_list, if_list) { spin_lock(&ifa->lock); if (ifa->rt) { - struct rt6_info *rt = ifa->rt; + struct fib6_info *rt = ifa->rt; int cpu; rcu_read_lock(); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 4e7507ebd84f..ca0aef660af4 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -222,7 +222,7 @@ static void aca_put(struct ifacaddr6 *ac) } } -static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, +static struct ifacaddr6 *aca_alloc(struct fib6_info *rt, const struct in6_addr *addr) { struct inet6_dev *idev = rt->rt6i_idev; @@ -251,7 +251,7 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; - struct rt6_info *rt; + struct fib6_info *rt; struct net *net; int err; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ea2483a9006b..c54d28b68c46 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -43,7 +43,7 @@ static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; - int (*func)(struct rt6_info *, void *arg); + int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; }; @@ -54,7 +54,7 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static struct rt6_info *fib6_find_prefix(struct net *net, +static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, @@ -105,7 +105,7 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; -void fib6_update_sernum(struct net *net, struct rt6_info *rt) +void fib6_update_sernum(struct net *net, struct fib6_info *rt) { struct fib6_node *fn; @@ -145,9 +145,9 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -struct rt6_info *fib6_info_alloc(gfp_t gfp_flags) +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) { - struct rt6_info *f6i; + struct fib6_info *f6i; f6i = kzalloc(sizeof(*f6i), gfp_flags); if (!f6i) @@ -167,7 +167,7 @@ struct rt6_info *fib6_info_alloc(gfp_t gfp_flags) return f6i; } -void fib6_info_destroy(struct rt6_info *f6i) +void fib6_info_destroy(struct fib6_info *f6i) { struct rt6_exception_bucket *bucket; struct dst_metrics *m; @@ -404,7 +404,7 @@ unsigned int fib6_tables_seq_read(struct net *net) static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, - struct rt6_info *rt) + struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, @@ -415,7 +415,7 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, static int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, - 
struct rt6_info *rt, + struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { @@ -432,7 +432,7 @@ struct fib6_dump_arg { struct notifier_block *nb; }; -static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) +static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { if (rt == arg->net->ipv6.fib6_null_entry) return; @@ -441,7 +441,7 @@ static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) static int fib6_node_dump(struct fib6_walker *w) { - struct rt6_info *rt; + struct fib6_info *rt; for_each_fib6_walker_rt(w) fib6_rt_dump(rt, w->args); @@ -490,7 +490,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb) static int fib6_dump_node(struct fib6_walker *w) { int res; - struct rt6_info *rt; + struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args); @@ -507,7 +507,7 @@ static int fib6_dump_node(struct fib6_walker *w) */ if (rt->rt6i_nsiblings) rt = list_last_entry(&rt->rt6i_siblings, - struct rt6_info, + struct fib6_info, rt6i_siblings); } w->leaf = NULL; @@ -643,7 +643,7 @@ out: return res; } -void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val) +void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; @@ -690,7 +690,7 @@ static struct fib6_node *fib6_add_1(struct net *net, fn = root; do { - struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); @@ -884,7 +884,7 @@ insert_above: return ln; } -static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, +static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->rt6i_table; @@ -897,9 +897,9 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * to still alive ones. */ while (fn) { - struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *new_leaf; + struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); atomic_inc(&new_leaf->rt6i_ref); @@ -936,15 +936,15 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * Insert routing information in a node. */ -static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, +static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { - struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->rt6i_table->tb6_lock)); - struct rt6_info *iter = NULL; - struct rt6_info __rcu **ins; - struct rt6_info __rcu **fallback_ins = NULL; + struct fib6_info *iter = NULL; + struct fib6_info __rcu **ins; + struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -1036,7 +1036,7 @@ next_iter: /* Link this route to others same route. 
*/ if (rt->rt6i_nsiblings) { unsigned int rt6i_nsiblings; - struct rt6_info *sibling, *temp_sibling; + struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; @@ -1157,7 +1157,7 @@ add: return 0; } -static void fib6_start_gc(struct net *net, struct rt6_info *rt) +static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->rt6i_flags & RTF_EXPIRES)) @@ -1172,7 +1172,7 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } -static void __fib6_update_sernum_upto_root(struct rt6_info *rt, +static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, @@ -1187,7 +1187,7 @@ static void __fib6_update_sernum_upto_root(struct rt6_info *rt, } } -void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) +void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } @@ -1199,7 +1199,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) * Need to own table->tb6_lock */ -int fib6_add(struct fib6_node *root, struct rt6_info *rt, +int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->rt6i_table; @@ -1220,7 +1220,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fn = fib6_add_1(info->nl_net, table, root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, - offsetof(struct rt6_info, rt6i_dst), allow_create, + offsetof(struct fib6_info, rt6i_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); @@ -1261,7 +1261,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, sn = fib6_add_1(info->nl_net, table, sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src), + offsetof(struct fib6_info, rt6i_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { @@ -1280,7 +1280,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->rt6i_src.addr, rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src), + offsetof(struct fib6_info, rt6i_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { @@ -1317,7 +1317,7 @@ out: * super-tree leaf node we have to find a new one for it. 
*/ if (pn != fn) { - struct rt6_info *pn_leaf = + struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { @@ -1366,7 +1366,7 @@ failure: */ struct lookup_args { - int offset; /* key offset on rt6_info */ + int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; @@ -1404,7 +1404,7 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { - struct rt6_info *leaf = rcu_dereference(fn->leaf); + struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) @@ -1444,12 +1444,12 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad struct fib6_node *fn; struct lookup_args args[] = { { - .offset = offsetof(struct rt6_info, rt6i_dst), + .offset = offsetof(struct fib6_info, rt6i_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { - .offset = offsetof(struct rt6_info, rt6i_src), + .offset = offsetof(struct fib6_info, rt6i_src), .addr = saddr, }, #endif @@ -1485,7 +1485,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6_info *leaf = rcu_dereference(fn->leaf); + struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ @@ -1534,7 +1534,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct rt6_info, rt6i_dst), + offsetof(struct fib6_info, rt6i_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES @@ -1545,7 +1545,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, - offsetof(struct rt6_info, rt6i_src), + offsetof(struct fib6_info, rt6i_src), exact_match); } } @@ -1564,7 +1564,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, * */ -static struct rt6_info *fib6_find_prefix(struct net *net, +static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { @@ -1623,11 +1623,11 @@ static struct fib6_node *fib6_repair_tree(struct net *net, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *new_fn_leaf; + struct fib6_info *new_fn_leaf; RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; @@ -1718,10 +1718,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, - struct rt6_info __rcu **rtp, struct nl_info *info) + struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_walker *w; - struct rt6_info *rt = rcu_dereference_protected(*rtp, + struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; @@ -1742,7 +1742,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, /* Remove this entry from other siblings */ if (rt->rt6i_nsiblings) { - struct rt6_info *sibling, *next_sibling; + struct fib6_info *sibling, 
*next_sibling; list_for_each_entry_safe(sibling, next_sibling, &rt->rt6i_siblings, rt6i_siblings) @@ -1786,14 +1786,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, } /* Need to own table->tb6_lock */ -int fib6_del(struct rt6_info *rt, struct nl_info *info) +int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct fib6_table *table = rt->rt6i_table; struct net *net = info->nl_net; - struct rt6_info __rcu **rtp; - struct rt6_info __rcu **rtp_next; + struct fib6_info __rcu **rtp; + struct fib6_info __rcu **rtp_next; if (!fn || rt == net->ipv6.fib6_null_entry) return -ENOENT; @@ -1805,7 +1805,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { - struct rt6_info *cur = rcu_dereference_protected(*rtp, + struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { fib6_del_route(table, fn, rtp, info); @@ -1945,7 +1945,7 @@ static int fib6_walk(struct net *net, struct fib6_walker *w) static int fib6_clean_node(struct fib6_walker *w) { int res; - struct rt6_info *rt; + struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, @@ -1980,7 +1980,7 @@ static int fib6_clean_node(struct fib6_walker *w) if (WARN_ON(!rt->rt6i_nsiblings)) continue; rt = list_last_entry(&rt->rt6i_siblings, - struct rt6_info, rt6i_siblings); + struct fib6_info, rt6i_siblings); continue; } WARN_ON(res != 0); @@ -1999,7 +1999,7 @@ static int fib6_clean_node(struct fib6_walker *w) */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, - int (*func)(struct rt6_info *, void *arg), + int (*func)(struct fib6_info *, void *arg), int sernum, void *arg) { struct fib6_cleaner c; @@ -2017,7 +2017,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, } static void __fib6_clean_all(struct net *net, - int (*func)(struct rt6_info *, void *), + int (*func)(struct fib6_info *, void *), int sernum, void *arg) { struct fib6_table *table; @@ -2037,7 +2037,7 @@ static void __fib6_clean_all(struct net *net, rcu_read_unlock(); } -void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), +void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); @@ -2054,7 +2054,7 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static int fib6_age(struct rt6_info *rt, void *arg) +static int fib6_age(struct fib6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; @@ -2256,7 +2256,7 @@ struct ipv6_route_iter { static int ipv6_route_seq_show(struct seq_file *seq, void *v) { - struct rt6_info *rt = v; + struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; const struct net_device *dev; @@ -2348,14 +2348,14 @@ static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; - struct rt6_info *n; + struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (!v) goto iter_table; - n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next); + n = rcu_dereference_bh(((struct fib6_info *)v)->rt6_next); if (n) { ++*pos; return n; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 
87d984a1b8f3..2844f0c67080 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1169,7 +1169,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); struct neighbour *neigh = NULL; struct inet6_dev *in6_dev; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct net *net; int lifetime; struct ndisc_options ndopts; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f3eb034ab24c..dcff2470e151 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -98,24 +98,24 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, bool confirm_neigh); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static int rt6_score_route(struct rt6_info *rt, int oif, int strict); -static size_t rt6_nlmsg_size(struct rt6_info *rt); +static int rt6_score_route(struct fib6_info *rt, int oif, int strict); +static size_t rt6_nlmsg_size(struct fib6_info *rt); static int rt6_fill_node(struct net *net, struct sk_buff *skb, - struct rt6_info *rt, struct dst_entry *dst, + struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); -static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, +static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, struct in6_addr *daddr, struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO -static struct rt6_info *rt6_add_route_info(struct net *net, +static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); -static struct rt6_info *rt6_get_route_info(struct net *net, +static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev); @@ -286,7 +286,7 @@ static const u32 ip6_template_metrics[RTAX_MAX] = { [RTAX_HOPLIMIT - 1] = 0, }; -static const struct rt6_info fib6_null_entry_template = { +static const struct fib6_info fib6_null_entry_template = { .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32)0, @@ -375,7 +375,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct rt6_info *from = rt->from; + struct fib6_info *from = rt->from; struct inet6_dev *idev; dst_destroy_metrics_generic(dst); @@ -428,13 +428,13 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -static struct rt6_info *rt6_multipath_select(const struct net *net, - struct rt6_info *match, +static struct fib6_info *rt6_multipath_select(const struct net *net, + struct fib6_info *match, struct flowi6 *fl6, int oif, const struct sk_buff *skb, int strict) { - struct rt6_info *sibling, *next_sibling; + struct fib6_info *sibling, *next_sibling; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. @@ -465,14 +465,14 @@ static struct rt6_info *rt6_multipath_select(const struct net *net, * Route lookup. rcu_read_lock() should be held. 
*/ -static inline struct rt6_info *rt6_device_match(struct net *net, - struct rt6_info *rt, +static inline struct fib6_info *rt6_device_match(struct net *net, + struct fib6_info *rt, const struct in6_addr *saddr, int oif, int flags) { - struct rt6_info *local = NULL; - struct rt6_info *sprt; + struct fib6_info *local = NULL; + struct fib6_info *sprt; if (!oif && ipv6_addr_any(saddr) && !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) @@ -535,7 +535,7 @@ static void rt6_probe_deferred(struct work_struct *w) kfree(work); } -static void rt6_probe(struct rt6_info *rt) +static void rt6_probe(struct fib6_info *rt) { struct __rt6_probe_work *work; const struct in6_addr *nh_gw; @@ -588,7 +588,7 @@ out: rcu_read_unlock_bh(); } #else -static inline void rt6_probe(struct rt6_info *rt) +static inline void rt6_probe(struct fib6_info *rt) { } #endif @@ -596,7 +596,7 @@ static inline void rt6_probe(struct rt6_info *rt) /* * Default Router Selection (RFC 2461 6.3.6) */ -static inline int rt6_check_dev(struct rt6_info *rt, int oif) +static inline int rt6_check_dev(struct fib6_info *rt, int oif) { const struct net_device *dev = rt->fib6_nh.nh_dev; @@ -608,7 +608,7 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif) return 0; } -static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) +static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) { enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; @@ -640,8 +640,7 @@ static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) return ret; } -static int rt6_score_route(struct rt6_info *rt, int oif, - int strict) +static int rt6_score_route(struct fib6_info *rt, int oif, int strict) { int m; @@ -659,8 +658,8 @@ static int rt6_score_route(struct rt6_info *rt, int oif, return m; } -static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, - int *mpri, struct rt6_info *match, +static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, + int *mpri, struct fib6_info *match, bool *do_rr) { int m; @@ -699,13 +698,13 @@ out: return match; } -static struct rt6_info *find_rr_leaf(struct fib6_node *fn, - struct rt6_info *leaf, - struct rt6_info *rr_head, +static struct fib6_info *find_rr_leaf(struct fib6_node *fn, + struct fib6_info *leaf, + struct fib6_info *rr_head, u32 metric, int oif, int strict, bool *do_rr) { - struct rt6_info *rt, *match, *cont; + struct fib6_info *rt, *match, *cont; int mpri = -1; match = NULL; @@ -738,11 +737,11 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, return match; } -static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, +static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, int oif, int strict) { - struct rt6_info *leaf = rcu_dereference(fn->leaf); - struct rt6_info *match, *rt0; + struct fib6_info *leaf = rcu_dereference(fn->leaf); + struct fib6_info *match, *rt0; bool do_rr = false; int key_plen; @@ -770,7 +769,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, &do_rr); if (do_rr) { - struct rt6_info *next = rcu_dereference(rt0->rt6_next); + struct fib6_info *next = rcu_dereference(rt0->rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) @@ -788,7 +787,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, return match ? 
match : net->ipv6.fib6_null_entry; } -static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) +static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) { return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); } @@ -802,7 +801,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, struct in6_addr prefix_buf, *prefix; unsigned int pref; unsigned long lifetime; - struct rt6_info *rt; + struct fib6_info *rt; if (len < sizeof(struct route_info)) { return -EINVAL; @@ -874,7 +873,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, */ /* called with rcu_lock held */ -static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) +static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) { struct net_device *dev = rt->fib6_nh.nh_dev; @@ -916,7 +915,7 @@ static int ip6_rt_type_to_error(u8 fib6_type) return fib6_prop[fib6_type]; } -static unsigned short fib6_info_dst_flags(struct rt6_info *rt) +static unsigned short fib6_info_dst_flags(struct fib6_info *rt) { unsigned short flags = 0; @@ -930,7 +929,7 @@ static unsigned short fib6_info_dst_flags(struct rt6_info *rt) return flags; } -static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) +static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) { rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); @@ -952,7 +951,7 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort) } } -static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) +static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) { rt->dst.flags |= fib6_info_dst_flags(ort); @@ -980,7 +979,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort) rt->dst.lastuse = jiffies; } -static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) +static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; fib6_info_hold(from); @@ -992,7 +991,7 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) } } -static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) +static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) { ip6_rt_init_dst(rt, ort); @@ -1048,7 +1047,7 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, } /* called with rcu_lock held */ -static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt) +static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) { unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev = rt->fib6_nh.nh_dev; @@ -1067,7 +1066,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, const struct sk_buff *skb, int flags) { - struct rt6_info *f6i; + struct fib6_info *f6i; struct fib6_node *fn; struct rt6_info *rt; @@ -1155,7 +1154,7 @@ EXPORT_SYMBOL(rt6_lookup); * Caller must hold dst before calling it. 
*/ -static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, +static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { int err; @@ -1169,14 +1168,14 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, return err; } -int ip6_ins_rt(struct net *net, struct rt6_info *rt) +int ip6_ins_rt(struct net *net, struct fib6_info *rt) { struct nl_info info = { .nl_net = net, }; return __ip6_ins_rt(rt, &info, NULL); } -static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, +static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, const struct in6_addr *daddr, const struct in6_addr *saddr) { @@ -1216,7 +1215,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, return rt; } -static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) +static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) { unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev; @@ -1235,7 +1234,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) } /* It should be called with rcu_read_lock() acquired */ -static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) +static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) { struct rt6_info *pcpu_rt, **p; @@ -1249,7 +1248,7 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) } static struct rt6_info *rt6_make_pcpu_route(struct net *net, - struct rt6_info *rt) + struct fib6_info *rt) { struct rt6_info *pcpu_rt, *prev, **p; @@ -1391,7 +1390,7 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, return NULL; } -static unsigned int fib6_mtu(const struct rt6_info *rt) +static unsigned int fib6_mtu(const struct fib6_info *rt) { unsigned int mtu; @@ -1402,7 +1401,7 @@ static unsigned int fib6_mtu(const struct rt6_info *rt) } static int rt6_insert_exception(struct rt6_info *nrt, - struct rt6_info *ort) + struct fib6_info *ort) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; @@ -1488,7 +1487,7 @@ out: return err; } -void rt6_flush_exceptions(struct rt6_info *rt) +void rt6_flush_exceptions(struct fib6_info *rt) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1518,7 +1517,7 @@ out: /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ -static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, +static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, struct in6_addr *daddr, struct in6_addr *saddr) { @@ -1551,7 +1550,7 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, static int rt6_remove_exception_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; - struct rt6_info *from = rt->from; + struct fib6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; int err; @@ -1596,7 +1595,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; - struct rt6_info *from = rt->from; + struct fib6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -1626,7 +1625,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) rcu_read_unlock(); } -static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) +static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1668,7 +1667,7 @@ static bool rt6_mtu_change_route_allowed(struct 
inet6_dev *idev, } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, - struct rt6_info *rt, int mtu) + struct fib6_info *rt, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1698,7 +1697,7 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) -static void rt6_exceptions_clean_tohost(struct rt6_info *rt, +static void rt6_exceptions_clean_tohost(struct fib6_info *rt, struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; @@ -1764,7 +1763,7 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, gc_args->more++; } -void rt6_age_exceptions(struct rt6_info *rt, +void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, unsigned long now) { @@ -1798,7 +1797,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, const struct sk_buff *skb, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *f6i; + struct fib6_info *f6i; struct rt6_info *rt; int strict = 0; @@ -2132,7 +2131,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ -static bool fib6_check(struct rt6_info *f6i, u32 cookie) +static bool fib6_check(struct fib6_info *f6i, u32 cookie) { u32 rt_cookie = 0; @@ -2358,7 +2357,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; struct rt6_info *ret = NULL, *rt_cache; - struct rt6_info *rt; + struct fib6_info *rt; struct fib6_node *fn; /* Get the "current" route for this destination and @@ -2605,7 +2604,7 @@ out: atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); } -static int ip6_convert_metrics(struct net *net, struct rt6_info *rt, +static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, struct fib6_config *cfg) { int err = 0; @@ -2795,12 +2794,12 @@ out: return err; } -static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, +static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; struct fib6_table *table; @@ -3007,7 +3006,7 @@ out: int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { - struct rt6_info *rt; + struct fib6_info *rt; int err; rt = ip6_route_info_create(cfg, gfp_flags, extack); @@ -3020,7 +3019,7 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, return err; } -static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) +static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_table *table; @@ -3041,14 +3040,14 @@ out: return err; } -int ip6_del_rt(struct net *net, struct rt6_info *rt) +int ip6_del_rt(struct net *net, struct fib6_info *rt) { struct nl_info info = { .nl_net = net }; return __ip6_del_rt(rt, &info); } -static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) +static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) { struct nl_info *info = &cfg->fc_nlinfo; struct net *net = info->nl_net; @@ -3062,7 +3061,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) spin_lock_bh(&table->tb6_lock); if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { - struct rt6_info *sibling, 
*next_sibling; + struct fib6_info *sibling, *next_sibling; /* prefer to send a single notification with all hops */ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); @@ -3119,8 +3118,9 @@ out: static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt, *rt_cache; + struct rt6_info *rt_cache; struct fib6_table *table; + struct fib6_info *rt; struct fib6_node *fn; int err = -ESRCH; @@ -3296,14 +3296,14 @@ out: } #ifdef CONFIG_IPV6_ROUTE_INFO -static struct rt6_info *rt6_get_route_info(struct net *net, +static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO); struct fib6_node *fn; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct fib6_table *table; table = fib6_get_table(net, tb_id); @@ -3322,7 +3322,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, continue; if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) continue; - ip6_hold_safe(NULL, &rt, false); + fib6_info_hold(rt); break; } out: @@ -3330,7 +3330,7 @@ out: return rt; } -static struct rt6_info *rt6_add_route_info(struct net *net, +static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, @@ -3363,12 +3363,12 @@ static struct rt6_info *rt6_add_route_info(struct net *net, } #endif -struct rt6_info *rt6_get_dflt_router(struct net *net, +struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_MAIN); - struct rt6_info *rt; + struct fib6_info *rt; struct fib6_table *table; table = fib6_get_table(net, tb_id); @@ -3383,12 +3383,12 @@ struct rt6_info *rt6_get_dflt_router(struct net *net, break; } if (rt) - ip6_hold_safe(NULL, &rt, false); + fib6_info_hold(rt); rcu_read_unlock(); return rt; } -struct rt6_info *rt6_add_dflt_router(struct net *net, +struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) @@ -3419,7 +3419,7 @@ struct rt6_info *rt6_add_dflt_router(struct net *net, return rt6_get_dflt_router(net, gwaddr, dev); } -int rt6_addrconf_purge(struct rt6_info *rt, void *arg) { +int rt6_addrconf_purge(struct fib6_info *rt, void *arg) { if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) return -1; @@ -3544,14 +3544,14 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff * Allocate a dst for local (unicast / anycast) address. 
*/ -struct rt6_info *addrconf_dst_alloc(struct net *net, +struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags) { u32 tb_id; struct net_device *dev = idev->dev; - struct rt6_info *rt; + struct fib6_info *rt; rt = fib6_info_alloc(gfp_flags); if (!rt) @@ -3591,7 +3591,7 @@ struct arg_dev_net_ip { struct in6_addr *addr; }; -static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) +static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) { struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; struct net *net = ((struct arg_dev_net_ip *)arg)->net; @@ -3624,7 +3624,7 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) /* Remove routers and update dst entries when gateway turn into host. */ -static int fib6_clean_tohost(struct rt6_info *rt, void *arg) +static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; @@ -3655,9 +3655,9 @@ struct arg_netdev_event { }; }; -static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) +static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) { - struct rt6_info *iter; + struct fib6_info *iter; struct fib6_node *fn; fn = rcu_dereference_protected(rt->rt6i_node, @@ -3675,7 +3675,7 @@ static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) return NULL; } -static bool rt6_is_dead(const struct rt6_info *rt) +static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && @@ -3685,9 +3685,9 @@ static bool rt6_is_dead(const struct rt6_info *rt) return false; } -static int rt6_multipath_total_weight(const struct rt6_info *rt) +static int rt6_multipath_total_weight(const struct fib6_info *rt) { - struct rt6_info *iter; + struct fib6_info *iter; int total = 0; if (!rt6_is_dead(rt)) @@ -3701,7 +3701,7 @@ static int rt6_multipath_total_weight(const struct rt6_info *rt) return total; } -static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) +static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) { int upper_bound = -1; @@ -3713,9 +3713,9 @@ static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); } -static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) +static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) { - struct rt6_info *iter; + struct fib6_info *iter; int weight = 0; rt6_upper_bound_set(rt, &weight, total); @@ -3724,9 +3724,9 @@ static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) rt6_upper_bound_set(iter, &weight, total); } -void rt6_multipath_rebalance(struct rt6_info *rt) +void rt6_multipath_rebalance(struct fib6_info *rt) { - struct rt6_info *first; + struct fib6_info *first; int total; /* In case the entire multipath route was marked for flushing, @@ -3748,7 +3748,7 @@ void rt6_multipath_rebalance(struct rt6_info *rt) rt6_multipath_upper_bound_set(first, total); } -static int fib6_ifup(struct rt6_info *rt, void *p_arg) +static int fib6_ifup(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); @@ -3777,10 +3777,10 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } -static bool 
rt6_multipath_uses_dev(const struct rt6_info *rt, +static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { - struct rt6_info *iter; + struct fib6_info *iter; if (rt->fib6_nh.nh_dev == dev) return true; @@ -3791,19 +3791,19 @@ static bool rt6_multipath_uses_dev(const struct rt6_info *rt, return false; } -static void rt6_multipath_flush(struct rt6_info *rt) +static void rt6_multipath_flush(struct fib6_info *rt) { - struct rt6_info *iter; + struct fib6_info *iter; rt->should_flush = 1; list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) iter->should_flush = 1; } -static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, +static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, const struct net_device *down_dev) { - struct rt6_info *iter; + struct fib6_info *iter; unsigned int dead = 0; if (rt->fib6_nh.nh_dev == down_dev || @@ -3817,11 +3817,11 @@ static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, return dead; } -static void rt6_multipath_nh_flags_set(struct rt6_info *rt, +static void rt6_multipath_nh_flags_set(struct fib6_info *rt, const struct net_device *dev, unsigned int nh_flags) { - struct rt6_info *iter; + struct fib6_info *iter; if (rt->fib6_nh.nh_dev == dev) rt->fib6_nh.nh_flags |= nh_flags; @@ -3831,7 +3831,7 @@ static void rt6_multipath_nh_flags_set(struct rt6_info *rt, } /* called with write lock held for table with rt */ -static int fib6_ifdown(struct rt6_info *rt, void *p_arg) +static int fib6_ifdown(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; const struct net_device *dev = arg->dev; @@ -3898,7 +3898,7 @@ struct rt6_mtu_change_arg { unsigned int mtu; }; -static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) +static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; @@ -4091,7 +4091,7 @@ errout: } struct rt6_nh { - struct rt6_info *rt6_info; + struct fib6_info *fib6_info; struct fib6_config r_cfg; struct list_head next; }; @@ -4109,21 +4109,22 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) static int ip6_route_info_append(struct net *net, struct list_head *rt6_nh_list, - struct rt6_info *rt, struct fib6_config *r_cfg) + struct fib6_info *rt, + struct fib6_config *r_cfg) { struct rt6_nh *nh; int err = -EEXIST; list_for_each_entry(nh, rt6_nh_list, next) { - /* check if rt6_info already exists */ - if (rt6_duplicate_nexthop(nh->rt6_info, rt)) + /* check if fib6_info already exists */ + if (rt6_duplicate_nexthop(nh->fib6_info, rt)) return err; } nh = kzalloc(sizeof(*nh), GFP_KERNEL); if (!nh) return -ENOMEM; - nh->rt6_info = rt; + nh->fib6_info = rt; err = ip6_convert_metrics(net, rt, r_cfg); if (err) { kfree(nh); @@ -4135,8 +4136,8 @@ static int ip6_route_info_append(struct net *net, return 0; } -static void ip6_route_mpath_notify(struct rt6_info *rt, - struct rt6_info *rt_last, +static void ip6_route_mpath_notify(struct fib6_info *rt, + struct fib6_info *rt_last, struct nl_info *info, __u16 nlflags) { @@ -4148,7 +4149,7 @@ static void ip6_route_mpath_notify(struct rt6_info *rt, */ if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { rt = list_first_entry(&rt_last->rt6i_siblings, - struct rt6_info, + struct fib6_info, rt6i_siblings); } @@ -4172,11 +4173,11 @@ static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla, static int ip6_route_multipath_add(struct fib6_config *cfg, 
struct netlink_ext_ack *extack) { - struct rt6_info *rt_notif = NULL, *rt_last = NULL; + struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; struct fib6_config r_cfg; struct rtnexthop *rtnh; - struct rt6_info *rt; + struct fib6_info *rt; struct rt6_nh *err_nh; struct rt6_nh *nh, *nh_safe; __u16 nlflags; @@ -4196,7 +4197,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = (struct rtnexthop *)cfg->fc_mp; /* Parse a Multipath Entry and build a list (rt6_nh_list) of - * rt6_info structs per nexthop + * fib6_info structs per nexthop */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); @@ -4249,20 +4250,20 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { - err = __ip6_ins_rt(nh->rt6_info, info, extack); - fib6_info_release(nh->rt6_info); + err = __ip6_ins_rt(nh->fib6_info, info, extack); + fib6_info_release(nh->fib6_info); if (!err) { /* save reference to last route successfully inserted */ - rt_last = nh->rt6_info; + rt_last = nh->fib6_info; /* save reference to first route for notification */ if (!rt_notif) - rt_notif = nh->rt6_info; + rt_notif = nh->fib6_info; } - /* nh->rt6_info is used or freed at this point, reset to NULL*/ - nh->rt6_info = NULL; + /* nh->fib6_info is used or freed at this point, reset to NULL*/ + nh->fib6_info = NULL; if (err) { if (replace && nhn) ip6_print_replace_route_err(&rt6_nh_list); @@ -4306,8 +4307,8 @@ add_errout: cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { - if (nh->rt6_info) - fib6_info_release(nh->rt6_info); + if (nh->fib6_info) + fib6_info_release(nh->fib6_info); list_del(&nh->next); kfree(nh); } @@ -4394,7 +4395,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, return ip6_route_add(&cfg, GFP_KERNEL, extack); } -static size_t rt6_nlmsg_size(struct rt6_info *rt) +static size_t rt6_nlmsg_size(struct fib6_info *rt) { int nexthop_len = 0; @@ -4424,7 +4425,7 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) + nexthop_len; } -static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, +static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, unsigned int *flags, bool skip_oif) { if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) @@ -4461,7 +4462,7 @@ nla_put_failure: } /* add multipath next hop */ -static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) +static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) { const struct net_device *dev = rt->fib6_nh.nh_dev; struct rtnexthop *rtnh; @@ -4489,7 +4490,7 @@ nla_put_failure: } static int rt6_fill_node(struct net *net, struct sk_buff *skb, - struct rt6_info *rt, struct dst_entry *dst, + struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags) @@ -4579,7 +4580,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, * each as a nexthop within RTA_MULTIPATH. 
*/ if (rt->rt6i_nsiblings) { - struct rt6_info *sibling, *next_sibling; + struct fib6_info *sibling, *next_sibling; struct nlattr *mp; mp = nla_nest_start(skb, RTA_MULTIPATH); @@ -4621,7 +4622,7 @@ nla_put_failure: return -EMSGSIZE; } -int rt6_dump_route(struct rt6_info *rt, void *p_arg) +int rt6_dump_route(struct fib6_info *rt, void *p_arg) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct net *net = arg->net; @@ -4766,7 +4767,7 @@ errout: return err; } -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, +void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int nlm_flags) { struct sk_buff *skb; From f2f3e41b3db07f6aaa55ca11a0e97425f1590a67 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 10 Oct 2017 18:01:16 +0100 Subject: [PATCH 0393/1640] UPSTREAM: ipv6: fix dereference of rt6_ex before null check error Currently rt6_ex is being dereferenced before it is null checked hence there is a possible null dereference bug. Fix this by only dereferencing rt6_ex after it has been null checked. Detected by CoverityScan, CID#1457749 ("Dereference before null check") Fixes: 81eb8447daae ("ipv6: take care of rt6_stats") Signed-off-by: Colin Ian King Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dcff2470e151..7cec9f777a77 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1276,10 +1276,12 @@ static DEFINE_SPINLOCK(rt6_exception_lock); static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { - struct net *net = dev_net(rt6_ex->rt6i->dst.dev); + struct net *net; if (!bucket || !rt6_ex) return; + + net = dev_net(rt6_ex->rt6i->dst.dev); rt6_ex->rt6i->rt6i_node = NULL; hlist_del_rcu(&rt6_ex->hlist); ip6_rt_put(rt6_ex->rt6i); From 88dda0c6a82fa5aa82ad4d88eac19f0f22fb5b54 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:27 -0700 Subject: [PATCH 0394/1640] UPSTREAM: net/ipv6: Remove unused code and variables for rt6_info Drop unneeded elements from rt6_info struct and rearrange layout to something more relevant for the data path. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 62 +++-------------------------------------- net/ipv6/ip6_fib.c | 22 --------------- net/ipv6/route.c | 27 ++---------------- net/ipv6/xfrm6_policy.c | 2 -- 4 files changed, 6 insertions(+), 107 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 780230b77fd5..6b86c1739e68 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -178,58 +178,20 @@ struct fib6_info { struct rt6_info { struct dst_entry dst; - struct rt6_info __rcu *rt6_next; struct fib6_info *from; - /* - * Tail elements of dst_entry (__refcnt etc.) - * and these elements (rarely used in hot path) are in - * the same cache line. - */ - struct fib6_table *rt6i_table; - struct fib6_node __rcu *rt6i_node; - - struct in6_addr rt6i_gateway; - - /* Multipath routes: - * siblings is a list of rt6_info that have the the same metric/weight, - * destination, but not the same gateway. nsiblings is just a cache - * to speed up lookup. - */ - struct list_head rt6i_siblings; - unsigned int rt6i_nsiblings; - - atomic_t rt6i_ref; - - /* These are in a separate cache line. 
*/ - struct rt6key rt6i_dst ____cacheline_aligned_in_smp; - u32 rt6i_flags; + struct rt6key rt6i_dst; struct rt6key rt6i_src; + struct in6_addr rt6i_gateway; + struct inet6_dev *rt6i_idev; + u32 rt6i_flags; struct rt6key rt6i_prefsrc; struct list_head rt6i_uncached; struct uncached_list *rt6i_uncached_list; - struct inet6_dev *rt6i_idev; - struct rt6_info * __percpu *rt6i_pcpu; - struct rt6_exception_bucket __rcu *rt6i_exception_bucket; - - u32 rt6i_metric; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; - u8 rt6i_protocol; - u8 fib6_type; - u8 exception_bucket_flushed:1, - should_flush:1, - dst_nocount:1, - dst_nopolicy:1, - dst_host:1, - unused:3; - - unsigned long expires; - struct dst_metrics *fib6_metrics; -#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] - struct fib6_nh fib6_nh; }; #define for_each_fib6_node_rt_rcu(fn) \ @@ -330,8 +292,6 @@ static inline void ip6_rt_put(struct rt6_info *rt) dst_release(&rt->dst); } -void rt6_free_pcpu(struct rt6_info *non_pcpu_rt); - struct fib6_info *fib6_info_alloc(gfp_t gfp_flags); void fib6_info_destroy(struct fib6_info *f6i); @@ -346,20 +306,6 @@ static inline void fib6_info_release(struct fib6_info *f6i) fib6_info_destroy(f6i); } -static inline void rt6_hold(struct rt6_info *rt) -{ - atomic_inc(&rt->rt6i_ref); -} - -static inline void rt6_release(struct rt6_info *rt) -{ - if (atomic_dec_and_test(&rt->rt6i_ref)) { - rt6_free_pcpu(rt); - dst_dev_put(&rt->dst); - dst_release(&rt->dst); - } -} - enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c54d28b68c46..5031fb52710a 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -240,28 +240,6 @@ static void node_free(struct net *net, struct fib6_node *fn) net->ipv6.rt6_stats->fib_nodes--; } -void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) -{ - int cpu; - - if (!non_pcpu_rt->rt6i_pcpu) - return; - - for_each_possible_cpu(cpu) { - struct rt6_info **ppcpu_rt; - struct rt6_info *pcpu_rt; - - ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { - dst_dev_put(&pcpu_rt->dst); - dst_release(&pcpu_rt->dst); - *ppcpu_rt = NULL; - } - } -} -EXPORT_SYMBOL_GPL(rt6_free_pcpu); - static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7cec9f777a77..e65afabdc1a3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -305,10 +305,6 @@ static const struct rt6_info ip6_null_entry_template = { .output = ip6_pkt_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), - .rt6i_protocol = RTPROT_KERNEL, - .rt6i_metric = ~(u32) 0, - .rt6i_ref = ATOMIC_INIT(1), - .fib6_type = RTN_UNREACHABLE, }; #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -323,10 +319,6 @@ static const struct rt6_info ip6_prohibit_entry_template = { .output = ip6_pkt_prohibit_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), - .rt6i_protocol = RTPROT_KERNEL, - .rt6i_metric = ~(u32) 0, - .rt6i_ref = ATOMIC_INIT(1), - .fib6_type = RTN_PROHIBIT, }; static const struct rt6_info ip6_blk_hole_entry_template = { @@ -339,10 +331,6 @@ static const struct rt6_info ip6_blk_hole_entry_template = { .output = dst_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), - .rt6i_protocol = RTPROT_KERNEL, - .rt6i_metric = ~(u32) 0, - .rt6i_ref = ATOMIC_INIT(1), - .fib6_type = RTN_BLACKHOLE, }; #endif @@ -352,7 +340,6 @@ static void rt6_info_init(struct rt6_info *rt) struct dst_entry *dst = &rt->dst; memset(dst + 1, 0, 
sizeof(*rt) - sizeof(*dst)); - INIT_LIST_HEAD(&rt->rt6i_siblings); INIT_LIST_HEAD(&rt->rt6i_uncached); } @@ -1002,12 +989,10 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) rt->rt6i_gateway = ort->fib6_nh.nh_gw; rt->rt6i_flags = ort->rt6i_flags; rt6_set_from(rt, ort); - rt->rt6i_metric = ort->rt6i_metric; #ifdef CONFIG_IPV6_SUBTREES rt->rt6i_src = ort->rt6i_src; #endif rt->rt6i_prefsrc = ort->rt6i_prefsrc; - rt->rt6i_table = ort->rt6i_table; rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); } @@ -1195,7 +1180,6 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, ip6_rt_copy_init(rt, ort); rt->rt6i_flags |= RTF_CACHE; - rt->rt6i_metric = 0; rt->dst.flags |= DST_HOST; rt->rt6i_dst.addr = *daddr; rt->rt6i_dst.plen = 128; @@ -1228,7 +1212,6 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) if (!pcpu_rt) return NULL; ip6_rt_copy_init(pcpu_rt, rt); - pcpu_rt->rt6i_protocol = rt->rt6i_protocol; pcpu_rt->rt6i_flags |= RTF_PCPU; return pcpu_rt; } @@ -1282,9 +1265,8 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, return; net = dev_net(rt6_ex->rt6i->dst.dev); - rt6_ex->rt6i->rt6i_node = NULL; hlist_del_rcu(&rt6_ex->hlist); - ip6_rt_put(rt6_ex->rt6i); + dst_release(&rt6_ex->rt6i->dst); kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; @@ -1466,8 +1448,6 @@ static int rt6_insert_exception(struct rt6_info *nrt, } rt6_ex->rt6i = nrt; rt6_ex->stamp = jiffies; - atomic_inc(&nrt->rt6i_ref); - nrt->rt6i_node = ort->rt6i_node; hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); bucket->depth++; net->ipv6.rt6_stats->fib_rt_cache++; @@ -2117,7 +2097,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori rt->rt6i_idev = in6_dev_get(loopback_dev); rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; - rt->rt6i_metric = 0; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); #ifdef CONFIG_IPV6_SUBTREES @@ -2242,8 +2221,7 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { return !(rt->rt6i_flags & RTF_CACHE) && - (rt->rt6i_flags & RTF_PCPU || - rcu_access_pointer(rt->rt6i_node)); + (rt->rt6i_flags & RTF_PCPU || rt->from); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, @@ -3275,7 +3253,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; - nrt->rt6i_protocol = RTPROT_REDIRECT; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; /* No need to remove rt from the exception table if rt is diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index df717f9cbc02..1bad876be17f 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -113,8 +113,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, * it was magically lost, so this code needs audit */ xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL); - xdst->u.rt6.rt6i_metric = rt->rt6i_metric; - xdst->u.rt6.rt6i_node = rt->rt6i_node; xdst->route_cookie = rt6_get_cookie(rt); xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; From 5afa30771d156cb61cece58e1f787b45ed30685d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:38:59 -0700 Subject: [PATCH 0395/1640] BACKPORT: net/ipv6: Rename fib6_info struct elements Change the prefix for fib6_info struct elements from rt6i_ to fib6_. 
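The resulting convention, condensed to a representative subset of the
fields touched below (a sketch only; the full definitions are in the
diff):

	struct fib6_info {			/* FIB entry: fib6_ prefix */
		struct fib6_table		*fib6_table;
		struct fib6_node __rcu		*fib6_node;
		struct list_head		fib6_siblings;
		atomic_t			fib6_ref;
		u32				fib6_flags;
		u32				fib6_metric;
		/* still point at rt6_info, so the rt6i_ prefix stays */
		struct rt6_info * __percpu	*rt6i_pcpu;
		struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
	};

	struct rt6_info {			/* dst entry: rt6i_ prefix */
		struct dst_entry		dst;
		struct fib6_info		*from;
		struct rt6key			rt6i_dst;
		u32				rt6i_flags;
	};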
rt6i_pcpu and rt6i_exception_bucket are left as is given that they point to rt6_info entries. Rename only; not functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 38 +++--- include/net/ip6_route.h | 26 ++-- net/ipv6/addrconf.c | 24 ++-- net/ipv6/anycast.c | 8 +- net/ipv6/ip6_fib.c | 170 ++++++++++++------------- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 272 ++++++++++++++++++++-------------------- 7 files changed, 270 insertions(+), 270 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6b86c1739e68..2eef7cd81977 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -137,34 +137,34 @@ struct fib6_nh { }; struct fib6_info { - struct fib6_table *rt6i_table; + struct fib6_table *fib6_table; struct fib6_info __rcu *rt6_next; - struct fib6_node __rcu *rt6i_node; + struct fib6_node __rcu *fib6_node; /* Multipath routes: * siblings is a list of fib6_info that have the the same metric/weight, * destination, but not the same gateway. nsiblings is just a cache * to speed up lookup. */ - struct list_head rt6i_siblings; - unsigned int rt6i_nsiblings; + struct list_head fib6_siblings; + unsigned int fib6_nsiblings; - atomic_t rt6i_ref; - struct inet6_dev *rt6i_idev; + atomic_t fib6_ref; + struct inet6_dev *fib6_idev; unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] - struct rt6key rt6i_dst; - u32 rt6i_flags; - struct rt6key rt6i_src; - struct rt6key rt6i_prefsrc; + struct rt6key fib6_dst; + u32 fib6_flags; + struct rt6key fib6_src; + struct rt6key fib6_prefsrc; struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; - u32 rt6i_metric; - u8 rt6i_protocol; + u32 fib6_metric; + u8 fib6_protocol; u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, @@ -209,7 +209,7 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) static inline void fib6_clean_expires(struct fib6_info *f6i) { - f6i->rt6i_flags &= ~RTF_EXPIRES; + f6i->fib6_flags &= ~RTF_EXPIRES; f6i->expires = 0; } @@ -217,12 +217,12 @@ static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { f6i->expires = expires; - f6i->rt6i_flags |= RTF_EXPIRES; + f6i->fib6_flags |= RTF_EXPIRES; } static inline bool fib6_check_expired(const struct fib6_info *f6i) { - if (f6i->rt6i_flags & RTF_EXPIRES) + if (f6i->fib6_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); return false; } @@ -253,14 +253,14 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct fib6_info *rt, +static inline bool rt6_get_cookie_safe(const struct fib6_info *f6i, u32 *cookie) { struct fib6_node *fn; bool status = false; rcu_read_lock(); - fn = rcu_dereference(rt->rt6i_node); + fn = rcu_dereference(f6i->fib6_node); if (fn) { *cookie = fn->fn_sernum; @@ -297,12 +297,12 @@ void fib6_info_destroy(struct fib6_info *f6i); static inline void fib6_info_hold(struct fib6_info *f6i) { - atomic_inc(&f6i->rt6i_ref); + atomic_inc(&f6i->fib6_ref); } static inline void fib6_info_release(struct fib6_info *f6i) { - if (f6i && atomic_dec_and_test(&f6i->rt6i_ref)) + if (f6i && atomic_dec_and_test(&f6i->fib6_ref)) fib6_info_destroy(f6i); } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index a717d005a91a..b0e246bc4bf5 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,9 
+66,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static inline bool rt6_qualify_for_ecmp(const struct fib6_info *rt) +static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == RTF_GATEWAY; } @@ -102,23 +102,23 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct net *net, struct fib6_info *rt); -int ip6_del_rt(struct net *net, struct fib6_info *rt); +int ip6_ins_rt(struct net *net, struct fib6_info *f6i); +int ip6_del_rt(struct net *net, struct fib6_info *f6i); -void rt6_flush_exceptions(struct fib6_info *rt); -void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, +void rt6_flush_exceptions(struct fib6_info *f6i); +void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now); -static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *rt, +static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = rt ? rt->rt6i_idev : NULL; + struct inet6_dev *idev = f6i ? f6i->fib6_idev : NULL; int err = 0; - if (rt && rt->rt6i_prefsrc.plen) - *saddr = rt->rt6i_prefsrc.addr; + if (f6i && f6i->fib6_prefsrc.plen) + *saddr = f6i->fib6_prefsrc.addr; else err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, daddr, prefs, saddr); @@ -176,14 +176,14 @@ struct rt6_rtnl_dump_arg { struct net *net; }; -int rt6_dump_route(struct fib6_info *rt, void *p_arg); +int rt6_dump_route(struct fib6_info *f6i, void *p_arg); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); -void rt6_multipath_rebalance(struct fib6_info *rt); +void rt6_multipath_rebalance(struct fib6_info *f6i); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { @@ -269,7 +269,7 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && - a->rt6i_idev == b->rt6i_idev && + a->fib6_idev == b->fib6_idev && ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 35697e79653a..167deb682b3f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1167,19 +1167,19 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) static void cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) { - struct fib6_info *rt; + struct fib6_info *f6i; - rt = addrconf_get_prefix_route(&ifp->addr, + f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, 0, RTF_GATEWAY | RTF_DEFAULT); - if (rt) { + if (f6i) { if (del_rt) - ip6_del_rt(dev_net(ifp->idev->dev), rt); + ip6_del_rt(dev_net(ifp->idev->dev), f6i); else { - 
if (!(rt->rt6i_flags & RTF_EXPIRES)) - fib6_set_expires(rt, expires); - fib6_info_release(rt); + if (!(f6i->fib6_flags & RTF_EXPIRES)) + fib6_set_expires(f6i, expires); + fib6_info_release(f6i); } } } @@ -2375,9 +2375,9 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, for_each_fib6_node_rt_rcu(fn) { if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex) continue; - if ((rt->rt6i_flags & flags) != flags) + if ((rt->fib6_flags & flags) != flags) continue; - if ((rt->rt6i_flags & noflags) != 0) + if ((rt->fib6_flags & noflags) != 0) continue; fib6_info_hold(rt); break; @@ -3364,11 +3364,11 @@ static int fixup_permanent_addr(struct net *net, struct inet6_dev *idev, struct inet6_ifaddr *ifp) { - /* !rt6i_node means the host route was removed from the + /* !fib6_node means the host route was removed from the * FIB, for example, if 'lo' device is taken down. In that * case regenerate the host route. */ - if (!ifp->rt || !ifp->rt->rt6i_node) { + if (!ifp->rt || !ifp->rt->fib6_node) { struct fib6_info *rt, *prev; rt = addrconf_dst_alloc(net, idev, &ifp->addr, false, @@ -5651,7 +5651,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * host route, so nothing to insert. That will be fixed when * the device is brought up. */ - if (ifp->rt && !rcu_access_pointer(ifp->rt->rt6i_node)) { + if (ifp->rt && !rcu_access_pointer(ifp->rt->fib6_node)) { ip6_ins_rt(net, ifp->rt); } else if (!ifp->rt && (ifp->idev->dev->flags & IFF_UP)) { pr_warn("BUG: Address %pI6c on device %s is missing its host route.\n", diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index ca0aef660af4..da13bbfcba60 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -222,10 +222,10 @@ static void aca_put(struct ifacaddr6 *ac) } } -static struct ifacaddr6 *aca_alloc(struct fib6_info *rt, +static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, const struct in6_addr *addr) { - struct inet6_dev *idev = rt->rt6i_idev; + struct inet6_dev *idev = f6i->fib6_idev; struct ifacaddr6 *aca; aca = kzalloc(sizeof(*aca), GFP_ATOMIC); @@ -235,8 +235,8 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *rt, aca->aca_addr = *addr; in6_dev_hold(idev); aca->aca_idev = idev; - fib6_info_hold(rt); - aca->aca_rt = rt; + fib6_info_hold(f6i); + aca->aca_rt = f6i; aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5031fb52710a..c6a80a05749f 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -105,12 +105,12 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; -void fib6_update_sernum(struct net *net, struct fib6_info *rt) +void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; - fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + fn = rcu_dereference_protected(f6i->fib6_node, + lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) fn->fn_sernum = fib6_new_sernum(net); } @@ -159,10 +159,10 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) return NULL; } - INIT_LIST_HEAD(&f6i->rt6i_siblings); + INIT_LIST_HEAD(&f6i->fib6_siblings); f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; - atomic_inc(&f6i->rt6i_ref); + atomic_inc(&f6i->fib6_ref); return f6i; } @@ -172,7 +172,7 @@ void fib6_info_destroy(struct fib6_info *f6i) struct rt6_exception_bucket *bucket; struct dst_metrics *m; - WARN_ON(f6i->rt6i_node); + WARN_ON(f6i->fib6_node); bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); 
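	/* The '1' is the lockdep condition for rcu_dereference_protected():
	 * a constant true skips the check, which is safe here because the
	 * destructor runs only once the last fib6_ref has been dropped,
	 * so nothing else can reach the exception bucket.
	 */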
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 5031fb52710a..c6a80a05749f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -105,12 +105,12 @@ enum {
	FIB6_NO_SERNUM_CHANGE = 0,
 };
 
-void fib6_update_sernum(struct net *net, struct fib6_info *rt)
+void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
 {
	struct fib6_node *fn;
 
-	fn = rcu_dereference_protected(rt->rt6i_node,
-			lockdep_is_held(&rt->rt6i_table->tb6_lock));
+	fn = rcu_dereference_protected(f6i->fib6_node,
+			lockdep_is_held(&f6i->fib6_table->tb6_lock));
	if (fn)
		fn->fn_sernum = fib6_new_sernum(net);
 }
@@ -159,10 +159,10 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
		return NULL;
	}
 
-	INIT_LIST_HEAD(&f6i->rt6i_siblings);
+	INIT_LIST_HEAD(&f6i->fib6_siblings);
	f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
 
-	atomic_inc(&f6i->rt6i_ref);
+	atomic_inc(&f6i->fib6_ref);
 
	return f6i;
 }
@@ -172,7 +172,7 @@ void fib6_info_destroy(struct fib6_info *f6i)
	struct rt6_exception_bucket *bucket;
	struct dst_metrics *m;
 
-	WARN_ON(f6i->rt6i_node);
+	WARN_ON(f6i->fib6_node);
 
	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
	if (bucket) {
@@ -197,8 +197,8 @@ void fib6_info_destroy(struct fib6_info *f6i)
		}
	}
 
-	if (f6i->rt6i_idev)
-		in6_dev_put(f6i->rt6i_idev);
+	if (f6i->fib6_idev)
+		in6_dev_put(f6i->fib6_idev);
	if (f6i->fib6_nh.nh_dev)
		dev_put(f6i->fib6_nh.nh_dev);
 
@@ -401,7 +401,7 @@ static int call_fib6_entry_notifiers(struct net *net,
		.rt = rt,
	};
 
-	rt->rt6i_table->fib_seq++;
+	rt->fib6_table->fib_seq++;
	return call_fib6_notifiers(net, event_type, &info.info);
 }
 
@@ -483,10 +483,10 @@ static int fib6_dump_node(struct fib6_walker *w)
		 * last sibling of this route (no need to dump the
		 * sibling routes again)
		 */
-		if (rt->rt6i_nsiblings)
-			rt = list_last_entry(&rt->rt6i_siblings,
+		if (rt->fib6_nsiblings)
+			rt = list_last_entry(&rt->fib6_siblings,
					     struct fib6_info,
-					     rt6i_siblings);
+					     fib6_siblings);
	}
	w->leaf = NULL;
	return 0;
@@ -810,7 +810,7 @@ insert_above:
		RCU_INIT_POINTER(in->parent, pn);
		in->leaf = fn->leaf;
		atomic_inc(&rcu_dereference_protected(in->leaf,
-				lockdep_is_held(&table->tb6_lock))->rt6i_ref);
+				lockdep_is_held(&table->tb6_lock))->fib6_ref);
 
		/* update parent pointer */
		if (dir)
@@ -865,9 +865,9 @@ insert_above:
 static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
			  struct net *net)
 {
-	struct fib6_table *table = rt->rt6i_table;
+	struct fib6_table *table = rt->fib6_table;
 
-	if (atomic_read(&rt->rt6i_ref) != 1) {
+	if (atomic_read(&rt->fib6_ref) != 1) {
		/* This route is used as dummy address holder in some split
		 * nodes. It is not leaked, but it still holds other resources,
		 * which must be released in time. So, scan ascendant nodes
@@ -880,7 +880,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
			struct fib6_info *new_leaf;
			if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
				new_leaf = fib6_find_prefix(net, table, fn);
-				atomic_inc(&new_leaf->rt6i_ref);
+				atomic_inc(&new_leaf->fib6_ref);
 
				rcu_assign_pointer(fn->leaf, new_leaf);
				fib6_info_release(rt);
@@ -919,7 +919,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
			    struct nl_info *info,
			    struct netlink_ext_ack *extack)
 {
	struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
-				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
	struct fib6_info *iter = NULL;
	struct fib6_info __rcu **ins;
	struct fib6_info __rcu **fallback_ins = NULL;
@@ -939,12 +939,12 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 
	for (iter = leaf; iter;
	     iter = rcu_dereference_protected(iter->rt6_next,
-				lockdep_is_held(&rt->rt6i_table->tb6_lock))) {
+				lockdep_is_held(&rt->fib6_table->tb6_lock))) {
		/*
		 *	Search for duplicates
		 */
 
-		if (iter->rt6i_metric == rt->rt6i_metric) {
+		if (iter->fib6_metric == rt->fib6_metric) {
			/*
			 *	Same priority level
			 */
@@ -963,11 +963,11 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
			}
 
			if (rt6_duplicate_nexthop(iter, rt)) {
-				if (rt->rt6i_nsiblings)
-					rt->rt6i_nsiblings = 0;
-				if (!(iter->rt6i_flags & RTF_EXPIRES))
+				if (rt->fib6_nsiblings)
+					rt->fib6_nsiblings = 0;
+				if (!(iter->fib6_flags & RTF_EXPIRES))
					return -EEXIST;
-				if (!(rt->rt6i_flags & RTF_EXPIRES))
+				if (!(rt->fib6_flags & RTF_EXPIRES))
					fib6_clean_expires(iter);
				else
					fib6_set_expires(iter, rt->expires);
@@ -987,10 +987,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
			 */
			if (rt_can_ecmp &&
			    rt6_qualify_for_ecmp(iter))
-				rt->rt6i_nsiblings++;
+				rt->fib6_nsiblings++;
		}
 
-		if (iter->rt6i_metric > rt->rt6i_metric)
+		if (iter->fib6_metric > rt->fib6_metric)
			break;
 
 next_iter:
@@ -1003,7 +1003,7 @@ next_iter:
		 */
		ins = fallback_ins;
		iter = rcu_dereference_protected(*ins,
-				lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				lockdep_is_held(&rt->fib6_table->tb6_lock));
		found++;
	}
 
@@ -1012,34 +1012,34 @@ next_iter:
		fn->rr_ptr = NULL;
 
	/* Link this route to others same route. */
-	if (rt->rt6i_nsiblings) {
-		unsigned int rt6i_nsiblings;
+	if (rt->fib6_nsiblings) {
+		unsigned int fib6_nsiblings;
		struct fib6_info *sibling, *temp_sibling;
 
		/* Find the first route that have the same metric */
		sibling = leaf;
		while (sibling) {
-			if (sibling->rt6i_metric == rt->rt6i_metric &&
+			if (sibling->fib6_metric == rt->fib6_metric &&
			    rt6_qualify_for_ecmp(sibling)) {
-				list_add_tail(&rt->rt6i_siblings,
-					      &sibling->rt6i_siblings);
+				list_add_tail(&rt->fib6_siblings,
					      &sibling->fib6_siblings);
				break;
			}
			sibling = rcu_dereference_protected(sibling->rt6_next,
-				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
		}
		/* For each sibling in the list, increment the counter of
		 * siblings. BUG() if counters does not match, list of siblings
		 * is broken!
		 */
-		rt6i_nsiblings = 0;
+		fib6_nsiblings = 0;
		list_for_each_entry_safe(sibling, temp_sibling,
-					 &rt->rt6i_siblings, rt6i_siblings) {
-			sibling->rt6i_nsiblings++;
-			BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings);
-			rt6i_nsiblings++;
+					 &rt->fib6_siblings, fib6_siblings) {
+			sibling->fib6_nsiblings++;
+			BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
+			fib6_nsiblings++;
		}
-		BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
+		BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
		rt6_multipath_rebalance(temp_sibling);
	}
 
@@ -1060,8 +1060,8 @@ add:
			return err;
 
		rcu_assign_pointer(rt->rt6_next, iter);
-		atomic_inc(&rt->rt6i_ref);
-		rcu_assign_pointer(rt->rt6i_node, fn);
+		atomic_inc(&rt->fib6_ref);
+		rcu_assign_pointer(rt->fib6_node, fn);
		rcu_assign_pointer(*ins, rt);
		if (!info->skip_notify)
			inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
@@ -1088,8 +1088,8 @@ add:
		if (err)
			return err;
 
-		atomic_inc(&rt->rt6i_ref);
-		rcu_assign_pointer(rt->rt6i_node, fn);
+		atomic_inc(&rt->fib6_ref);
+		rcu_assign_pointer(rt->fib6_node, fn);
		rt->rt6_next = iter->rt6_next;
		rcu_assign_pointer(*ins, rt);
		if (!info->skip_notify)
@@ -1098,8 +1098,8 @@ add:
			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
			fn->fn_flags |= RTN_RTINFO;
		}
-		nsiblings = iter->rt6i_nsiblings;
-		iter->rt6i_node = NULL;
+		nsiblings = iter->fib6_nsiblings;
+		iter->fib6_node = NULL;
		fib6_purge_rt(iter, fn, info->nl_net);
		if (rcu_access_pointer(fn->rr_ptr) == iter)
			fn->rr_ptr = NULL;
@@ -1109,13 +1109,13 @@ add:
			/* Replacing an ECMP route, remove all siblings */
			ins = &rt->rt6_next;
			iter = rcu_dereference_protected(*ins,
-				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
			while (iter) {
-				if (iter->rt6i_metric > rt->rt6i_metric)
+				if (iter->fib6_metric > rt->fib6_metric)
					break;
				if (rt6_qualify_for_ecmp(iter)) {
					*ins = iter->rt6_next;
-					iter->rt6i_node = NULL;
+					iter->fib6_node = NULL;
					fib6_purge_rt(iter, fn, info->nl_net);
					if (rcu_access_pointer(fn->rr_ptr) == iter)
						fn->rr_ptr = NULL;
@@ -1126,7 +1126,7 @@ add:
					ins = &iter->rt6_next;
				}
				iter = rcu_dereference_protected(*ins,
-				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
			}
			WARN_ON(nsiblings != 0);
		}
@@ -1138,7 +1138,7 @@ add:
 static void fib6_start_gc(struct net *net, struct fib6_info *rt)
 {
	if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
-	    (rt->rt6i_flags & RTF_EXPIRES))
+	    (rt->fib6_flags & RTF_EXPIRES))
		mod_timer(&net->ipv6.ip6_fib_timer,
			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
 }
@@ -1153,15 +1153,15 @@ void fib6_force_start_gc(struct net *net)
 
 static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
					   int sernum)
 {
-	struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
-				lockdep_is_held(&rt->rt6i_table->tb6_lock));
+	struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
+				lockdep_is_held(&rt->fib6_table->tb6_lock));
 
	/* paired with smp_rmb() in rt6_get_cookie_safe() */
	smp_wmb();
	while (fn) {
		fn->fn_sernum = sernum;
		fn = rcu_dereference_protected(fn->parent,
-				lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}
 }
 
@@ -1180,7 +1180,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
 int fib6_add(struct fib6_node *root, struct fib6_info *rt,
	     struct nl_info *info, struct netlink_ext_ack *extack)
 {
-	struct fib6_table *table = rt->rt6i_table;
+	struct fib6_table *table = rt->fib6_table;
	struct fib6_node *fn, *pn = NULL;
	int err = -ENOMEM;
	int allow_create = 1;
@@ -1197,8 +1197,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
			pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
 
	fn = fib6_add_1(info->nl_net, table, root,
-			&rt->rt6i_dst.addr, rt->rt6i_dst.plen,
-			offsetof(struct fib6_info, rt6i_dst), allow_create,
+			&rt->fib6_dst.addr, rt->fib6_dst.plen,
+			offsetof(struct fib6_info, fib6_dst), allow_create,
			replace_required, extack);
	if (IS_ERR(fn)) {
		err = PTR_ERR(fn);
@@ -1209,7 +1209,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
	pn = fn;
 
 #ifdef CONFIG_IPV6_SUBTREES
-	if (rt->rt6i_src.plen) {
+	if (rt->fib6_src.plen) {
		struct fib6_node *sn;
 
		if (!rcu_access_pointer(fn->subtree)) {
@@ -1230,7 +1230,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
			if (!sfn)
				goto failure;
 
-			atomic_inc(&info->nl_net->ipv6.fib6_null_entry->rt6i_ref);
+			atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref);
			rcu_assign_pointer(sfn->leaf,
					   info->nl_net->ipv6.fib6_null_entry);
			sfn->fn_flags = RTN_ROOT;
@@ -1238,8 +1238,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 
			/* Now add the first leaf node to new subtree */
			sn = fib6_add_1(info->nl_net, table, sfn,
-					&rt->rt6i_src.addr, rt->rt6i_src.plen,
-					offsetof(struct fib6_info, rt6i_src),
+					&rt->fib6_src.addr, rt->fib6_src.plen,
+					offsetof(struct fib6_info, fib6_src),
					allow_create, replace_required, extack);
 
			if (IS_ERR(sn)) {
@@ -1257,8 +1257,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
			rcu_assign_pointer(fn->subtree, sfn);
		} else {
			sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
-					&rt->rt6i_src.addr, rt->rt6i_src.plen,
-					offsetof(struct fib6_info, rt6i_src),
+					&rt->fib6_src.addr, rt->fib6_src.plen,
+					offsetof(struct fib6_info, fib6_src),
					allow_create, replace_required, extack);
 
			if (IS_ERR(sn)) {
@@ -1273,7 +1273,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
				rcu_assign_pointer(fn->leaf,
					    info->nl_net->ipv6.fib6_null_entry);
			} else {
-				atomic_inc(&rt->rt6i_ref);
+				atomic_inc(&rt->fib6_ref);
				rcu_assign_pointer(fn->leaf, rt);
			}
		}
@@ -1422,12 +1422,12 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
	struct fib6_node *fn;
	struct lookup_args args[] = {
		{
-			.offset = offsetof(struct fib6_info, rt6i_dst),
+			.offset = offsetof(struct fib6_info, fib6_dst),
			.addr = daddr,
		},
 #ifdef CONFIG_IPV6_SUBTREES
		{
-			.offset = offsetof(struct fib6_info, rt6i_src),
+			.offset = offsetof(struct fib6_info, fib6_src),
			.addr = saddr,
		},
 #endif
@@ -1512,7 +1512,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
	struct fib6_node *fn;
 
	fn = fib6_locate_1(root, daddr, dst_len,
-			   offsetof(struct fib6_info, rt6i_dst),
+			   offsetof(struct fib6_info, fib6_dst),
			   exact_match);
 
 #ifdef CONFIG_IPV6_SUBTREES
@@ -1523,7 +1523,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
 
			if (subtree) {
				fn = fib6_locate_1(subtree, saddr, src_len,
-					   offsetof(struct fib6_info, rt6i_src),
+					   offsetof(struct fib6_info, fib6_src),
					   exact_match);
			}
		}
@@ -1707,7 +1707,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 
	/* Unlink it */
	*rtp = rt->rt6_next;
-	rt->rt6i_node = NULL;
+	rt->fib6_node = NULL;
	net->ipv6.rt6_stats->fib_rt_entries--;
	net->ipv6.rt6_stats->fib_discarded_routes++;
 
@@ -1719,14 +1719,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
		fn->rr_ptr = NULL;
 
	/* Remove this entry from other siblings */
-	if (rt->rt6i_nsiblings) {
+	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
 
		list_for_each_entry_safe(sibling, next_sibling,
-					 &rt->rt6i_siblings, rt6i_siblings)
-			sibling->rt6i_nsiblings--;
-		rt->rt6i_nsiblings = 0;
-		list_del_init(&rt->rt6i_siblings);
+					 &rt->fib6_siblings, fib6_siblings)
+			sibling->fib6_nsiblings--;
+		rt->fib6_nsiblings = 0;
+		list_del_init(&rt->fib6_siblings);
		rt6_multipath_rebalance(next_sibling);
	}
 
@@ -1766,9 +1766,9 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 /* Need to own table->tb6_lock */
 int fib6_del(struct fib6_info *rt, struct nl_info *info)
 {
-	struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
-			    lockdep_is_held(&rt->rt6i_table->tb6_lock));
-	struct fib6_table *table = rt->rt6i_table;
+	struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
+			    lockdep_is_held(&rt->fib6_table->tb6_lock));
+	struct fib6_table *table = rt->fib6_table;
	struct net *net = info->nl_net;
	struct fib6_info __rcu **rtp;
	struct fib6_info __rcu **rtp_next;
@@ -1948,17 +1948,17 @@ static int fib6_clean_node(struct fib6_walker *w)
 #if RT6_DEBUG >= 2
				pr_debug("%s: del failed: rt=%p@%p err=%d\n",
					 __func__, rt,
-					 rcu_access_pointer(rt->rt6i_node),
+					 rcu_access_pointer(rt->fib6_node),
					 res);
 #endif
				continue;
			}
			return 0;
		} else if (res == -2) {
-			if (WARN_ON(!rt->rt6i_nsiblings))
				continue;
-			rt = list_last_entry(&rt->rt6i_siblings,
-					     struct fib6_info, rt6i_siblings);
+			if (WARN_ON(!rt->fib6_nsiblings))
+				continue;
+			rt = list_last_entry(&rt->fib6_siblings,
					     struct fib6_info, fib6_siblings);
			continue;
		}
		WARN_ON(res != 0);
@@ -2042,7 +2042,7 @@ static int fib6_age(struct fib6_info *rt, void *arg)
	 *	Routes are expired even if they are in use.
	 */
 
-	if (rt->rt6i_flags & RTF_EXPIRES && rt->expires) {
+	if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
		if (time_after(now, rt->expires)) {
			RT6_TRACE("expiring %p\n", rt);
			return -1;
		}
@@ -2238,22 +2238,22 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
	struct ipv6_route_iter *iter = seq->private;
	const struct net_device *dev;
 
-	seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
+	seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
 
 #ifdef CONFIG_IPV6_SUBTREES
-	seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
+	seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
 #else
	seq_puts(seq, "00000000000000000000000000000000 00 ");
 #endif
-	if (rt->rt6i_flags & RTF_GATEWAY)
+	if (rt->fib6_flags & RTF_GATEWAY)
		seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw);
	else
		seq_puts(seq, "00000000000000000000000000000000");
 
	dev = rt->fib6_nh.nh_dev;
	seq_printf(seq, " %08x %08x %08x %08x %8s\n",
-		   rt->rt6i_metric, atomic_read(&rt->rt6i_ref), 0,
-		   rt->rt6i_flags, dev ? dev->name : "");
+		   rt->fib6_metric, atomic_read(&rt->fib6_ref), 0,
+		   rt->fib6_flags, dev ? dev->name : "");
	iter->w.leaf = NULL;
	return 0;
 }
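[The fib6_lookup()/fib6_locate() hunks above pass offsetof(struct fib6_info, fib6_dst) or fib6_src so a single tree walker can key on either prefix. A compact userspace sketch of reading a member through a stored offset; the types are local to the sketch, not the kernel's.]

#include <assert.h>
#include <stddef.h>

struct key { unsigned char addr[16]; int plen; };
struct entry { int dummy; struct key dst; struct key src; };

static const struct key *key_at(const struct entry *e, size_t offset)
{
	/* same trick as the kernel's lookup_args: the member lives at
	 * the object's base address plus the recorded offset */
	return (const struct key *)((const char *)e + offset);
}

int main(void)
{
	struct entry e = { .dst = { .plen = 64 }, .src = { .plen = 48 } };

	assert(key_at(&e, offsetof(struct entry, dst))->plen == 64);
	assert(key_at(&e, offsetof(struct entry, src))->plen == 48);
	return 0;
}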
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 2844f0c67080..96feb57d9ef8 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1332,7 +1332,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
			}
			neigh->flags |= NTF_ROUTER;
		} else if (rt) {
-			rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+			rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
		}
 
	if (rt)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e65afabdc1a3..f2c189f9cff7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -287,10 +287,10 @@ static const u32 ip6_template_metrics[RTAX_MAX] = {
 };
 
 static const struct fib6_info fib6_null_entry_template = {
-	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
-	.rt6i_protocol  = RTPROT_KERNEL,
-	.rt6i_metric	= ~(u32)0,
-	.rt6i_ref	= ATOMIC_INIT(1),
+	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.fib6_protocol  = RTPROT_KERNEL,
+	.fib6_metric	= ~(u32)0,
+	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
 };
@@ -432,8 +432,8 @@ static struct fib6_info *rt6_multipath_select(const struct net *net,
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;
 
-	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
-				 rt6i_siblings) {
+	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
+				 fib6_siblings) {
		int nh_upper_bound;
 
		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
@@ -475,12 +475,12 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
-				if (!sprt->rt6i_idev ||
-				    sprt->rt6i_idev->dev->ifindex != oif) {
+				if (!sprt->fib6_idev ||
+				    sprt->fib6_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
-					    local->rt6i_idev->dev->ifindex == oif)
+					    local->fib6_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
@@ -537,7 +537,7 @@ static void rt6_probe(struct fib6_info *rt)
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
-	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
+	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;
 
	nh_gw = &rt->fib6_nh.nh_gw;
@@ -553,7 +553,7 @@ static void rt6_probe(struct fib6_info *rt)
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
-			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
+			       rt->fib6_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
@@ -590,7 +590,7 @@ static inline int rt6_check_dev(struct fib6_info *rt, int oif)
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
-	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
+	    rt->fib6_idev && rt->fib6_idev->dev->ifindex == oif)
		return 1;
	return 0;
 }
@@ -600,8 +600,8 @@ static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;
 
-	if (rt->rt6i_flags & RTF_NONEXTHOP ||
-	    !(rt->rt6i_flags & RTF_GATEWAY))
+	if (rt->fib6_flags & RTF_NONEXTHOP ||
+	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;
 
	rcu_read_lock_bh();
@@ -635,7 +635,7 @@ static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
 #ifdef CONFIG_IPV6_ROUTER_PREF
-	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
+	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
 #endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
@@ -651,7 +651,7 @@ static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
 {
	int m;
	bool match_do_rr = false;
-	struct inet6_dev *idev = rt->rt6i_idev;
+	struct inet6_dev *idev = rt->fib6_idev;
 
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;
@@ -697,7 +697,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
-		if (rt->rt6i_metric != metric) {
+		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}
@@ -707,7 +707,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
 
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
-		if (rt->rt6i_metric != metric) {
+		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}
@@ -744,30 +744,30 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
-	key_plen = rt0->rt6i_dst.plen;
+	key_plen = rt0->fib6_dst.plen;
 #ifdef CONFIG_IPV6_SUBTREES
-	if (rt0->rt6i_src.plen)
-		key_plen = rt0->rt6i_src.plen;
+	if (rt0->fib6_src.plen)
+		key_plen = rt0->fib6_src.plen;
 #endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;
 
-	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
+	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);
 
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->rt6_next);
 
		/* no entries matched; do round-robin */
-		if (!next || next->rt6i_metric != rt0->rt6i_metric)
+		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;
 
		if (next != rt0) {
-			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
+			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
-			if (next->rt6i_node)
+			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
-			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
+			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}
 
@@ -776,7 +776,7 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
 
 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
 {
-	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
+	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
 }
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
@@ -840,8 +840,8 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
-		rt->rt6i_flags = RTF_ROUTEINFO |
-				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+		rt->fib6_flags = RTF_ROUTEINFO |
+				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 
	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
@@ -864,13 +864,13 @@ static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
 {
	struct net_device *dev = rt->fib6_nh.nh_dev;
 
-	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
+	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
-		    !rt6_need_strict(&rt->rt6i_dst.addr))
+		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
@@ -942,7 +942,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
 {
	rt->dst.flags |= fib6_info_dst_flags(ort);
 
-	if (ort->rt6i_flags & RTF_REJECT) {
+	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}
@@ -952,7 +952,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
 
	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
-	} else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) {
+	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
@@ -982,17 +982,17 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 {
	ip6_rt_init_dst(rt, ort);
 
-	rt->rt6i_dst = ort->rt6i_dst;
-	rt->rt6i_idev = ort->rt6i_idev;
+	rt->rt6i_dst = ort->fib6_dst;
+	rt->rt6i_idev = ort->fib6_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
-	rt->rt6i_flags = ort->rt6i_flags;
+	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
 #ifdef CONFIG_IPV6_SUBTREES
-	rt->rt6i_src = ort->rt6i_src;
+	rt->rt6i_src = ort->fib6_src;
 #endif
-	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
+	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
 }
 
@@ -1067,7 +1067,7 @@ restart:
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
-		if (f6i->rt6i_nsiblings && fl6->flowi6_oif == 0)
+		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
@@ -1145,7 +1145,7 @@ static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
	int err;
	struct fib6_table *table;
 
-	table = rt->rt6i_table;
+	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);
@@ -1185,8 +1185,8 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
	rt->rt6i_dst.plen = 128;
 
	if (!rt6_is_gw_or_nonexthop(ort)) {
-		if (ort->rt6i_dst.plen != 128 &&
-		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
+		if (ort->fib6_dst.plen != 128 &&
+		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
 #ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
@@ -1378,7 +1378,7 @@ static unsigned int fib6_mtu(const struct fib6_info *rt)
 {
	unsigned int mtu;
 
-	mtu = rt->fib6_pmtu ? : rt->rt6i_idev->cnf.mtu6;
+	mtu = rt->fib6_pmtu ? : rt->fib6_idev->cnf.mtu6;
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
 
	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
@@ -1419,14 +1419,14 @@ static int rt6_insert_exception(struct rt6_info *nrt,
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
-	if (ort->rt6i_src.plen)
+	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
 #endif
 
	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
-	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
+	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
@@ -1460,9 +1460,9 @@ out:
 
	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
-		spin_lock_bh(&ort->rt6i_table->tb6_lock);
+		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
-		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
+		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}
 
@@ -1517,7 +1517,7 @@ static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
-	if (rt->rt6i_src.plen)
+	if (rt->fib6_src.plen)
		src_key = saddr;
 #endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
@@ -1554,7 +1554,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
-	if (from->rt6i_src.plen)
+	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
 #endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
@@ -1595,7 +1595,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
-	if (from->rt6i_src.plen)
+	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
 #endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
@@ -1798,7 +1798,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
-	if (f6i->rt6i_nsiblings)
+	if (f6i->fib6_nsiblings)
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
@@ -1830,7 +1830,7 @@ redo_rt6_select:
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
-			    !(f6i->rt6i_flags & RTF_GATEWAY))) {
+			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
@@ -2201,7 +2201,7 @@ static void ip6_link_failure(struct sk_buff *skb)
			struct fib6_node *fn;
 
			rcu_read_lock();
-			fn = rcu_dereference(rt->from->rt6i_node);
+			fn = rcu_dereference(rt->from->fib6_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
@@ -2358,9 +2358,9 @@ restart:
			continue;
		if (fib6_check_expired(rt))
			continue;
-		if (rt->rt6i_flags & RTF_REJECT)
+		if (rt->fib6_flags & RTF_REJECT)
			break;
-		if (!(rt->rt6i_flags & RTF_GATEWAY))
+		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
@@ -2386,7 +2386,7 @@ restart:
 
	if (!rt)
		rt = net->ipv6.fib6_null_entry;
-	else if (rt->rt6i_flags & RTF_REJECT) {
+	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}
@@ -2881,7 +2881,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
-	rt->rt6i_protocol = cfg->fc_protocol;
+	rt->fib6_protocol = cfg->fc_protocol;
 
	addr_type = ipv6_addr_type(&cfg->fc_dst);
 
@@ -2896,17 +2896,17 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}
 
-	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
-	rt->rt6i_dst.plen = cfg->fc_dst_len;
-	if (rt->rt6i_dst.plen == 128)
+	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
+	rt->fib6_dst.plen = cfg->fc_dst_len;
+	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;
 
 #ifdef CONFIG_IPV6_SUBTREES
-	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
-	rt->rt6i_src.plen = cfg->fc_src_len;
+	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
+	rt->fib6_src.plen = cfg->fc_src_len;
 #endif
 
-	rt->rt6i_metric = cfg->fc_metric;
+	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;
 
	rt->fib6_type = cfg->fc_type;
@@ -2932,7 +2932,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
				goto out;
			}
		}
-		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
+		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}
 
@@ -2954,21 +2954,21 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
			err = -EINVAL;
			goto out;
		}
-		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
-		rt->rt6i_prefsrc.plen = 128;
+		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
+		rt->fib6_prefsrc.plen = 128;
	} else
-		rt->rt6i_prefsrc.plen = 0;
+		rt->fib6_prefsrc.plen = 0;
 
-	rt->rt6i_flags = cfg->fc_flags;
+	rt->fib6_flags = cfg->fc_flags;
 
 install_route:
-	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
+	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;
-	rt->rt6i_idev = idev;
-	rt->rt6i_table = table;
+	rt->fib6_idev = idev;
+	rt->fib6_table = table;
 
	cfg->fc_nlinfo.nl_net = dev_net(dev);
 
@@ -3010,7 +3010,7 @@ static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
		goto out;
	}
 
-	table = rt->rt6i_table;
+	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);
@@ -3037,10 +3037,10 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
-	table = rt->rt6i_table;
+	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
 
-	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
+	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;
 
		/* prefer to send a single notification with all hops */
@@ -3058,8 +3058,8 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
		}
 
		list_for_each_entry_safe(sibling, next_sibling,
-					 &rt->rt6i_siblings,
-					 rt6i_siblings) {
+					 &rt->fib6_siblings,
+					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
@@ -3138,9 +3138,9 @@ static int ip6_route_del(struct fib6_config *cfg,
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
-			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
+			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
-			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
+			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			fib6_info_hold(rt);
			rcu_read_unlock();
@@ -3297,7 +3297,7 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex)
			continue;
-		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
+		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
@@ -3357,7 +3357,7 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
-		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
+		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
@@ -3399,8 +3399,8 @@ struct fib6_info *rt6_add_dflt_router(struct net *net,
 }
 
 int rt6_addrconf_purge(struct fib6_info *rt, void *arg)
 {
-	if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
-	    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2))
+	if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
+	    (!rt->fib6_idev || rt->fib6_idev->cnf.accept_ra != 2))
		return -1;
	return 0;
 }
@@ -3539,26 +3539,26 @@ struct fib6_info *addrconf_dst_alloc(struct net *net,
	rt->dst_nocount = true;
 
	in6_dev_hold(idev);
-	rt->rt6i_idev = idev;
+	rt->fib6_idev = idev;
 
	rt->dst_host = true;
-	rt->rt6i_protocol = RTPROT_KERNEL;
-	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
+	rt->fib6_protocol = RTPROT_KERNEL;
+	rt->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		rt->fib6_type = RTN_ANYCAST;
-		rt->rt6i_flags |= RTF_ANYCAST;
+		rt->fib6_flags |= RTF_ANYCAST;
	} else {
		rt->fib6_type = RTN_LOCAL;
-		rt->rt6i_flags |= RTF_LOCAL;
+		rt->fib6_flags |= RTF_LOCAL;
	}
 
	rt->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	rt->fib6_nh.nh_dev = dev;
-	rt->rt6i_dst.addr = *addr;
-	rt->rt6i_dst.plen = 128;
+	rt->fib6_dst.addr = *addr;
+	rt->fib6_dst.plen = 128;
 
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
-	rt->rt6i_table = fib6_get_table(net, tb_id);
+	rt->fib6_table = fib6_get_table(net, tb_id);
 
	return rt;
 }
@@ -3578,10 +3578,10 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
 
	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
-	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
+	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
-		rt->rt6i_prefsrc.plen = 0;
+		rt->fib6_prefsrc.plen = 0;
		/* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
@@ -3607,7 +3607,7 @@ static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
 {
	struct in6_addr *gateway = (struct in6_addr *)arg;
 
-	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
+	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}
@@ -3639,16 +3639,16 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
	struct fib6_info *iter;
	struct fib6_node *fn;
 
-	fn = rcu_dereference_protected(rt->rt6i_node,
-			lockdep_is_held(&rt->rt6i_table->tb6_lock));
+	fn = rcu_dereference_protected(rt->fib6_node,
+			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
-			lockdep_is_held(&rt->rt6i_table->tb6_lock));
+			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
-		if (iter->rt6i_metric == rt->rt6i_metric &&
+		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->rt6_next,
-				lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}
 
	return NULL;
@@ -3658,7 +3658,7 @@ static bool rt6_is_dead(const struct fib6_info *rt)
 {
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
-	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
+	     rt->fib6_idev->cnf.ignore_routes_with_linkdown))
		return true;
 
	return false;
@@ -3672,7 +3672,7 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt)
	if (!rt6_is_dead(rt))
		total += rt->fib6_nh.nh_weight;
 
-	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
+	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->fib6_nh.nh_weight;
	}
@@ -3699,7 +3699,7 @@ static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
 
	rt6_upper_bound_set(rt, &weight, total);
 
-	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
 }
 
@@ -3712,7 +3712,7 @@ void rt6_multipath_rebalance(struct fib6_info *rt)
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
-	if (!rt->rt6i_nsiblings || rt->should_flush)
+	if (!rt->fib6_nsiblings || rt->should_flush)
		return;
 
	/* During lookup routes are evaluated in order, so we need to
@@ -3763,7 +3763,7 @@ static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
	if (rt->fib6_nh.nh_dev == dev)
		return true;
-	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			return true;
 
@@ -3775,7 +3775,7 @@ static void rt6_multipath_flush(struct fib6_info *rt)
	struct fib6_info *iter;
 
	rt->should_flush = 1;
-	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
 }
 
@@ -3788,7 +3788,7 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
	if (rt->fib6_nh.nh_dev == down_dev ||
	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		dead++;
-	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == down_dev ||
		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
			dead++;
@@ -3804,7 +3804,7 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
 
	if (rt->fib6_nh.nh_dev == dev)
		rt->fib6_nh.nh_flags |= nh_flags;
-	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			iter->fib6_nh.nh_flags |= nh_flags;
 }
@@ -3825,13 +3825,13 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
-		if (!rt->rt6i_nsiblings)
+		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;
 
			count = rt6_multipath_dead_count(rt, dev);
-			if (rt->rt6i_nsiblings + 1 == count) {
+			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
@@ -3843,7 +3843,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.nh_dev != dev ||
-		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
+		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
@@ -4126,10 +4126,10 @@ static void ip6_route_mpath_notify(struct fib6_info *rt,
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
-	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
-		rt = list_first_entry(&rt_last->rt6i_siblings,
+	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
+		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
-				      rt6i_siblings);
+				      fib6_siblings);
	}
 
	if (rt)
@@ -4378,13 +4378,13 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
 {
	int nexthop_len = 0;
 
-	if (rt->rt6i_nsiblings) {
+	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
 
-		nexthop_len *= rt->rt6i_nsiblings;
+		nexthop_len *= rt->fib6_nsiblings;
	}
 
	return NLMSG_ALIGN(sizeof(struct rtmsg))
@@ -4412,11 +4412,11 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
 
	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;
-		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+		if (rt->fib6_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}
 
-	if (rt->rt6i_flags & RTF_GATEWAY) {
+	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}
@@ -4486,11 +4486,11 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
-	rtm->rtm_dst_len = rt->rt6i_dst.plen;
-	rtm->rtm_src_len = rt->rt6i_src.plen;
+	rtm->rtm_dst_len = rt->fib6_dst.plen;
+	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
-	if (rt->rt6i_table)
-		table = rt->rt6i_table->tb6_id;
+	if (rt->fib6_table)
+		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
@@ -4500,9 +4500,9 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
-	rtm->rtm_protocol = rt->rt6i_protocol;
+	rtm->rtm_protocol = rt->fib6_protocol;
 
-	if (rt->rt6i_flags & RTF_CACHE)
+	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;
 
	if (dest) {
@@ -4510,7 +4510,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
-		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
+		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
			goto nla_put_failure;
 #ifdef CONFIG_IPV6_SUBTREES
	if (src) {
@@ -4518,12 +4518,12 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
-		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
+		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
 #endif
	if (iif) {
 #ifdef CONFIG_IPV6_MROUTE
-		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
+		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);
 
			if (err == 0)
@@ -4541,9 +4541,9 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			goto nla_put_failure;
	}
 
-	if (rt->rt6i_prefsrc.plen) {
+	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
-		saddr_buf = rt->rt6i_prefsrc.addr;
+		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}
@@ -4552,13 +4552,13 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;
 
-	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
+	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;
 
	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
-	if (rt->rt6i_nsiblings) {
+	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;
 
@@ -4570,7 +4570,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			goto nla_put_failure;
 
		list_for_each_entry_safe(sibling, next_sibling,
-					 &rt->rt6i_siblings, rt6i_siblings) {
+					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}
@@ -4581,7 +4581,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			goto nla_put_failure;
	}
 
-	if (rt->rt6i_flags & RTF_EXPIRES) {
+	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}
@@ -4589,7 +4589,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;
 
-	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
+	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;
 
@@ -4614,7 +4614,7 @@ int rt6_dump_route(struct fib6_info *rt, void *p_arg)
 
		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
-		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
+		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
@@ -4788,7 +4788,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
 
	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
-		net->ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(dev);
+		net->ipv6.fib6_null_entry->fib6_idev = in6_dev_get(dev);
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -4802,7 +4802,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
-		in6_dev_put_clear(&net->ipv6.fib6_null_entry->rt6i_idev);
+		in6_dev_put_clear(&net->ipv6.fib6_null_entry->fib6_idev);
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
@@ -5131,7 +5131,7 @@ void __init ip6_route_init_special_entries(void)
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
-	init_net.ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+	init_net.ipv6.fib6_null_entry->fib6_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
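[The rt6_multipath_select() hunk in this patch compares the flow hash against each sibling's nh_upper_bound, which is hash-threshold weighted ECMP: every nexthop owns a contiguous slice of the hash space and the first bound that covers the hash wins. A userspace sketch of that selection under assumed thresholds (types and the 0..255 hash space are local to the sketch):]

#include <assert.h>
#include <stddef.h>

struct nh { int upper_bound; };	/* cumulative hash threshold */

/* pick the first nexthop whose threshold covers the flow hash,
 * mirroring the mp_hash <= nh_upper_bound test in the patch */
static size_t select_path(const struct nh *nhs, size_t n, int mp_hash)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (mp_hash <= nhs[i].upper_bound)
			return i;
	return n - 1;	/* last hop catches the remainder */
}

int main(void)
{
	/* two hops weighted 1:3 over a hash space of 0..255 */
	const struct nh nhs[] = { { .upper_bound = 63 }, { .upper_bound = 255 } };

	assert(select_path(nhs, 2, 10) == 0);
	assert(select_path(nhs, 2, 64) == 1);
	assert(select_path(nhs, 2, 255) == 1);
	return 0;
}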
From 19f191b1881093c6b9c0564ffdb1d03104a22dc9 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Wed, 18 Apr 2018 15:39:01 -0700
Subject: [PATCH 0396/1640] UPSTREAM: net/ipv6: Remove aca_idev

aca_idev has only 1 user - inet6_fill_ifacaddr - and it only
wants the device index which can be extracted from the fib6_info
nexthop.

Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 include/net/if_inet6.h | 1 -
 include/net/ip6_fib.h  | 5 +++++
 net/ipv6/addrconf.c    | 3 ++-
 net/ipv6/anycast.c     | 4 ----
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 431ec8ec7a47..b23bc5d6efd9 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -143,7 +143,6 @@ struct ipv6_ac_socklist {
 
 struct ifacaddr6 {
	struct in6_addr		aca_addr;
-	struct inet6_dev	*aca_idev;
	struct fib6_info	*aca_rt;
	struct ifacaddr6	*aca_next;
	int			aca_users;
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 2eef7cd81977..c83a2568f77c 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -412,6 +412,11 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
	     struct nl_info *info, struct netlink_ext_ack *extack);
 int fib6_del(struct fib6_info *rt, struct nl_info *info);
 
+static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
+{
+	return f6i->fib6_nh.nh_dev;
+}
+
 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int flags);
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 167deb682b3f..5996f57ed2a5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4844,9 +4844,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
 static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
				u32 portid, u32 seq, int event, unsigned int flags)
 {
+	struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
+	int ifindex = dev ? dev->ifindex : 1;
	struct nlmsghdr *nlh;
	u8 scope = RT_SCOPE_UNIVERSE;
-	int ifindex = ifaca->aca_idev->dev->ifindex;
 
	if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
		scope = RT_SCOPE_SITE;
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index da13bbfcba60..59dbda22da92 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -216,7 +216,6 @@ static void aca_get(struct ifacaddr6 *aca)
 static void aca_put(struct ifacaddr6 *ac)
 {
	if (refcount_dec_and_test(&ac->aca_refcnt)) {
-		in6_dev_put(ac->aca_idev);
		fib6_info_release(ac->aca_rt);
		kfree(ac);
	}
@@ -225,7 +224,6 @@ static void aca_put(struct ifacaddr6 *ac)
 static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
				   const struct in6_addr *addr)
 {
-	struct inet6_dev *idev = f6i->fib6_idev;
	struct ifacaddr6 *aca;
 
	aca = kzalloc(sizeof(*aca), GFP_ATOMIC);
@@ -233,8 +231,6 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
		return NULL;
 
	aca->aca_addr = *addr;
-	in6_dev_hold(idev);
-	aca->aca_idev = idev;
	fib6_info_hold(f6i);
	aca->aca_rt = f6i;
	aca->aca_users = 1;
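[inet6_fill_ifacaddr() above now derives the ifindex from the route's nexthop device, falling back to 1 (loopback) when there is none. A minimal sketch of that accessor-plus-fallback pattern; the structs here are stand-ins, not kernel types.]

#include <assert.h>
#include <stddef.h>

struct net_device_model { int ifindex; };
struct nh_model { struct net_device_model *nh_dev; };
struct f6i_model { struct nh_model fib6_nh; };

/* mirrors the shape of fib6_info_nh_dev() */
static struct net_device_model *nh_dev_of(const struct f6i_model *f6i)
{
	return f6i->fib6_nh.nh_dev;
}

int main(void)
{
	struct net_device_model eth0 = { .ifindex = 2 };
	struct f6i_model with_dev = { .fib6_nh = { .nh_dev = &eth0 } };
	struct f6i_model without = { .fib6_nh = { .nh_dev = NULL } };
	struct net_device_model *dev;
	int ifindex;

	dev = nh_dev_of(&with_dev);
	ifindex = dev ? dev->ifindex : 1;
	assert(ifindex == 2);

	dev = nh_dev_of(&without);
	ifindex = dev ? dev->ifindex : 1;	/* loopback fallback */
	assert(ifindex == 1);
	return 0;
}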
From c10c07d8467662c16af2260b6e37fc9db4256a78 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Wed, 18 Apr 2018 15:39:00 -0700
Subject: [PATCH 0397/1640] BACKPORT: net/ipv6: Rename addrconf_dst_alloc

addrconf_dst_alloc now returns a fib6_info. Update the name and
its users to reflect the change. Rename only; no functional change
intended.

Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 include/net/ip6_route.h |  2 +-
 net/ipv6/addrconf.c     | 26 ++++++++++++------------
 net/ipv6/anycast.c      | 14 ++++++-------
 net/ipv6/route.c        | 44 ++++++++++++++++++++---------------------
 4 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index b0e246bc4bf5..19104376de73 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -136,7 +136,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
 
 void fib6_force_start_gc(struct net *net);
 
-struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev,
+struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev,
				     const struct in6_addr *addr, bool anycast,
				     gfp_t gfp_flags);
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 5996f57ed2a5..bb849fc2b848 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -989,7 +989,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
	gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC;
	struct net *net = dev_net(idev->dev);
	struct inet6_ifaddr *ifa = NULL;
-	struct fib6_info *rt = NULL;
+	struct fib6_info *f6i = NULL;
	struct in6_validator_info i6vi;
	int err = 0;
	int addr_type = ipv6_addr_type(addr);
@@ -1024,16 +1024,16 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
		goto out;
	}
 
-	rt = addrconf_dst_alloc(net, idev, addr, false, gfp_flags);
-	if (IS_ERR(rt)) {
-		err = PTR_ERR(rt);
-		rt = NULL;
+	f6i = addrconf_f6i_alloc(net, idev, addr, false, gfp_flags);
+	if (IS_ERR(f6i)) {
+		err = PTR_ERR(f6i);
+		f6i = NULL;
		goto out;
	}
 
	if (net->ipv6.devconf_all->disable_policy ||
	    idev->cnf.disable_policy)
-		rt->dst_nopolicy = true;
+		f6i->dst_nopolicy = true;
 
	neigh_parms_data_state_setall(idev->nd_parms);
 
@@ -1055,7 +1055,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
	ifa->cstamp = ifa->tstamp = jiffies;
	ifa->tokenized = false;
 
-	ifa->rt = rt;
+	ifa->rt = f6i;
 
	ifa->idev = idev;
	in6_dev_hold(idev);
@@ -1089,7 +1089,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
	inet6addr_notifier_call_chain(NETDEV_UP, ifa);
 out:
	if (unlikely(err < 0)) {
-		fib6_info_release(rt);
+		fib6_info_release(f6i);
 
		if (ifa) {
			if (ifa->idev)
@@ -3369,17 +3369,17 @@ static int fixup_permanent_addr(struct net *net,
	 * case regenerate the host route.
	 */
	if (!ifp->rt || !ifp->rt->fib6_node) {
-		struct fib6_info *rt, *prev;
+		struct fib6_info *f6i, *prev;
 
-		rt = addrconf_dst_alloc(net, idev, &ifp->addr, false,
-					GFP_ATOMIC);
-		if (unlikely(IS_ERR(rt)))
-			return PTR_ERR(rt);
+		f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false,
+					 GFP_ATOMIC);
+		if (unlikely(IS_ERR(f6i)))
+			return PTR_ERR(f6i);
 
		/* ifp->rt can be accessed outside of rtnl */
		spin_lock(&ifp->lock);
		prev = ifp->rt;
-		ifp->rt = rt;
+		ifp->rt = f6i;
		spin_unlock(&ifp->lock);
 
		fib6_info_release(prev);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 59dbda22da92..36f388b5922a 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -247,7 +247,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
 int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 {
	struct ifacaddr6 *aca;
-	struct fib6_info *rt;
+	struct fib6_info *f6i;
	struct net *net;
	int err;
 
@@ -268,14 +268,14 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
	}
 
	net = dev_net(idev->dev);
-	rt = addrconf_dst_alloc(net, idev, addr, true, GFP_ATOMIC);
-	if (IS_ERR(rt)) {
-		err = PTR_ERR(rt);
+	f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC);
+	if (IS_ERR(f6i)) {
+		err = PTR_ERR(f6i);
		goto out;
	}
-	aca = aca_alloc(rt, addr);
+	aca = aca_alloc(f6i, addr);
	if (!aca) {
-		fib6_info_release(rt);
+		fib6_info_release(f6i);
		err = -ENOMEM;
		goto out;
	}
@@ -289,7 +289,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
	aca_get(aca);
	write_unlock_bh(&idev->lock);
 
-	ip6_ins_rt(net, rt);
+	ip6_ins_rt(net, f6i);
 
	addrconf_join_solict(idev->dev, &aca->aca_addr);
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f2c189f9cff7..f33fa1fbc271 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3523,44 +3523,44 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff
 *	Allocate a dst for local (unicast / anycast) address.
 */
 
-struct fib6_info *addrconf_dst_alloc(struct net *net,
-				     struct inet6_dev *idev,
-				     const struct in6_addr *addr,
-				     bool anycast, gfp_t gfp_flags)
+struct fib6_info *addrconf_f6i_alloc(struct net *net,
+				     struct inet6_dev *idev,
+				     const struct in6_addr *addr,
+				     bool anycast, gfp_t gfp_flags)
 {
	u32 tb_id;
	struct net_device *dev = idev->dev;
-	struct fib6_info *rt;
+	struct fib6_info *f6i;
 
-	rt = fib6_info_alloc(gfp_flags);
-	if (!rt)
+	f6i = fib6_info_alloc(gfp_flags);
+	if (!f6i)
		return ERR_PTR(-ENOMEM);
 
-	rt->dst_nocount = true;
+	f6i->dst_nocount = true;
 
	in6_dev_hold(idev);
-	rt->fib6_idev = idev;
+	f6i->fib6_idev = idev;
 
-	rt->dst_host = true;
-	rt->fib6_protocol = RTPROT_KERNEL;
-	rt->fib6_flags = RTF_UP | RTF_NONEXTHOP;
+	f6i->dst_host = true;
+	f6i->fib6_protocol = RTPROT_KERNEL;
+	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
-		rt->fib6_type = RTN_ANYCAST;
-		rt->fib6_flags |= RTF_ANYCAST;
+		f6i->fib6_type = RTN_ANYCAST;
+		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
-		rt->fib6_type = RTN_LOCAL;
-		rt->fib6_flags |= RTF_LOCAL;
+		f6i->fib6_type = RTN_LOCAL;
+		f6i->fib6_flags |= RTF_LOCAL;
	}
 
-	rt->fib6_nh.nh_gw = *addr;
+	f6i->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
-	rt->fib6_nh.nh_dev = dev;
-	rt->fib6_dst.addr = *addr;
-	rt->fib6_dst.plen = 128;
+	f6i->fib6_nh.nh_dev = dev;
+	f6i->fib6_dst.addr = *addr;
+	f6i->fib6_dst.plen = 128;
 
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
-	rt->fib6_table = fib6_get_table(net, tb_id);
+	f6i->fib6_table = fib6_get_table(net, tb_id);
 
-	return rt;
+	return f6i;
 }
 
 /* remove deleted ip from prefsrc entries */
From 6d188ede215bfe2498fc67b83e0fb0a719d3a530 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Wed, 18 Apr 2018 15:39:02 -0700
Subject: [PATCH 0398/1640] UPSTREAM: net/ipv6: Remove unnecessary checks on
 fib6_idev

Prior to 4832c30d5458 ("net: ipv6: put host and anycast routes on
device with address") host routes and anycast routes were installed
with the device set to loopback (or VRF device once that feature was
added). In the older code dst.dev was set to loopback (needed for
packet tx) and rt6i_idev was used to denote the actual interface.

Commit 4832c30d5458 changed the code to have dst.dev pointing to the
real device with the switch to lo or vrf device done on dst clones.
As a consequence of this change a couple of device checks during
route lookups are no longer needed. Remove them.

Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 net/ipv6/route.c | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f33fa1fbc271..d24a53691302 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -458,7 +458,6 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
						 int oif,
						 int flags)
 {
-	struct fib6_info *local = NULL;
	struct fib6_info *sprt;
 
	if (!oif && ipv6_addr_any(saddr) &&
@@ -474,17 +473,6 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
-			if (dev->flags & IFF_LOOPBACK) {
-				if (!sprt->fib6_idev ||
-				    sprt->fib6_idev->dev->ifindex != oif) {
-					if (flags & RT6_LOOKUP_F_IFACE)
-						continue;
-					if (local &&
-					    local->fib6_idev->dev->ifindex == oif)
-						continue;
-				}
-				local = sprt;
-			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
@@ -492,13 +480,8 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
		}
	}
 
-	if (oif) {
-		if (local)
-			return local;
-
-		if (flags & RT6_LOOKUP_F_IFACE)
-			return net->ipv6.fib6_null_entry;
-	}
+	if (oif && flags & RT6_LOOKUP_F_IFACE)
+		return net->ipv6.fib6_null_entry;
 
	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
 }
@@ -589,9 +572,6 @@ static inline int rt6_check_dev(struct fib6_info *rt, int oif)
	if (!oif || dev->ifindex == oif)
		return 2;
-	if ((dev->flags & IFF_LOOPBACK) &&
-	    rt->fib6_idev && rt->fib6_idev->dev->ifindex == oif)
-		return 1;
	return 0;
 }
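[After the patch above, the oif handling in rt6_device_match() reduces to: an exact device match wins, otherwise a strict-interface lookup (RT6_LOOKUP_F_IFACE) fails outright. A userspace sketch of that reduced decision; the flag value and the "first route" fallback are simplifications local to the sketch, not the kernel's full logic.]

#include <assert.h>
#include <stddef.h>

#define LOOKUP_F_IFACE 0x1

struct rt_model { int ifindex; };

/* returns the matching route, or NULL standing in for the null entry */
static const struct rt_model *device_match(const struct rt_model *rts,
					   size_t n, int oif, int flags)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (oif && rts[i].ifindex == oif)
			return &rts[i];

	if (oif && (flags & LOOKUP_F_IFACE))
		return NULL;		/* strict lookup: no fallback */
	return n ? &rts[0] : NULL;	/* loose lookup keeps a candidate */
}

int main(void)
{
	const struct rt_model rts[] = { { .ifindex = 2 }, { .ifindex = 3 } };

	assert(device_match(rts, 2, 3, 0) == &rts[1]);
	assert(device_match(rts, 2, 9, LOOKUP_F_IFACE) == NULL);
	assert(device_match(rts, 2, 9, 0) == &rts[0]);
	return 0;
}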
Miller --- include/net/ip6_route.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 19104376de73..def67a3e06c6 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -114,14 +114,15 @@ static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = f6i ? f6i->fib6_idev : NULL; int err = 0; - if (f6i && f6i->fib6_prefsrc.plen) + if (f6i && f6i->fib6_prefsrc.plen) { *saddr = f6i->fib6_prefsrc.addr; - else - err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, - daddr, prefs, saddr); + } else { + struct net_device *dev = f6i ? fib6_info_nh_dev(f6i) : NULL; + + err = ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr); + } return err; } From 80c3d34b13b3d3ea9ff6fa2a122a810f2a3aa61b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:04 -0700 Subject: [PATCH 0400/1640] UPSTREAM: net/ipv6: Remove compare of fib6_idev from rt6_duplicate_nexthop After 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") the comparison of idev does not add value since it correlates to the nexthop device which is already compared. Remove the idev comparison. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index def67a3e06c6..2ba3d74ad731 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -270,7 +270,6 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && - a->fib6_idev == b->fib6_idev && ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } From d378f3f0b412ae3fb32168d1dd3af5688e87541b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:05 -0700 Subject: [PATCH 0401/1640] BACKPORT: net/ipv6: Remove fib6_idev fib6_idev can be obtained from __in6_dev_get on the nexthop device rather than caching it in the fib6_info. Remove it. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 1 - net/ipv6/ip6_fib.c | 2 -- net/ipv6/route.c | 66 ++++++++++++++++++++++++++++++------------- 3 files changed, 47 insertions(+), 22 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c83a2568f77c..ec50ce6c5266 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -150,7 +150,6 @@ struct fib6_info { unsigned int fib6_nsiblings; atomic_t fib6_ref; - struct inet6_dev *fib6_idev; unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c6a80a05749f..d8f95b5c0b06 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -197,8 +197,6 @@ void fib6_info_destroy(struct fib6_info *f6i) } } - if (f6i->fib6_idev) - in6_dev_put(f6i->fib6_idev); if (f6i->fib6_nh.nh_dev) dev_put(f6i->fib6_nh.nh_dev); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d24a53691302..d040a9ca46c3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -528,15 +528,17 @@ static void rt6_probe(struct fib6_info *rt) rcu_read_lock_bh(); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { + struct inet6_dev *idev; + if (neigh->nud_state & NUD_VALID) goto out; + idev = __in6_dev_get(dev); work = NULL; write_lock(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, - neigh->updated + - rt->fib6_idev->cnf.rtr_probe_interval)) { + neigh->updated + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); @@ -625,18 +627,32 @@ static int rt6_score_route(struct fib6_info *rt, int oif, int strict) return m; } +/* called with rc_read_lock held */ +static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i) +{ + const struct net_device *dev = fib6_info_nh_dev(f6i); + bool rc = false; + + if (dev) { + const struct inet6_dev *idev = __in6_dev_get(dev); + + rc = !!idev->cnf.ignore_routes_with_linkdown; + } + + return rc; +} + static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, int *mpri, struct fib6_info *match, bool *do_rr) { int m; bool match_do_rr = false; - struct inet6_dev *idev = rt->fib6_idev; if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) goto out; - if (idev->cnf.ignore_routes_with_linkdown && + if (fib6_ignore_linkdown(rt) && rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; @@ -960,12 +976,12 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) { + struct net_device *dev = fib6_info_nh_dev(ort); + ip6_rt_init_dst(rt, ort); rt->rt6i_dst = ort->fib6_dst; - rt->rt6i_idev = ort->fib6_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); + rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; rt->rt6i_gateway = ort->fib6_nh.nh_gw; rt->rt6i_flags = ort->fib6_flags; rt6_set_from(rt, ort); @@ -1358,7 +1374,18 @@ static unsigned int fib6_mtu(const struct fib6_info *rt) { unsigned int mtu; - mtu = rt->fib6_pmtu ? 
: rt->fib6_idev->cnf.mtu6; + if (rt->fib6_pmtu) { + mtu = rt->fib6_pmtu; + } else { + struct net_device *dev = fib6_info_nh_dev(rt); + struct inet6_dev *idev; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + } + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); @@ -2947,11 +2974,13 @@ install_route: rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); rt->fib6_nh.nh_dev = dev; - rt->fib6_idev = idev; rt->fib6_table = table; cfg->fc_nlinfo.nl_net = dev_net(dev); + if (idev) + in6_dev_put(idev); + return rt; out: if (dev) @@ -3379,8 +3408,11 @@ struct fib6_info *rt6_add_dflt_router(struct net *net, } int rt6_addrconf_purge(struct fib6_info *rt, void *arg) { + struct net_device *dev = fib6_info_nh_dev(rt); + struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; + if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && - (!rt->fib6_idev || rt->fib6_idev->cnf.accept_ra != 2)) + (!idev || idev->cnf.accept_ra != 2)) return -1; return 0; } @@ -3517,10 +3549,6 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, return ERR_PTR(-ENOMEM); f6i->dst_nocount = true; - - in6_dev_hold(idev); - f6i->fib6_idev = idev; - f6i->dst_host = true; f6i->fib6_protocol = RTPROT_KERNEL; f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; @@ -3638,7 +3666,7 @@ static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && - rt->fib6_idev->cnf.ignore_routes_with_linkdown)) + fib6_ignore_linkdown(rt))) return true; return false; @@ -4392,8 +4420,11 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; - if (rt->fib6_idev->cnf.ignore_routes_with_linkdown) + + rcu_read_lock(); + if (fib6_ignore_linkdown(rt)) *flags |= RTNH_F_DEAD; + rcu_read_unlock(); } if (rt->fib6_flags & RTF_GATEWAY) { @@ -4768,7 +4799,6 @@ static int ip6_route_dev_notify(struct notifier_block *this, if (event == NETDEV_REGISTER) { net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; - net->ipv6.fib6_null_entry->fib6_idev = in6_dev_get(dev); net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -4782,7 +4812,6 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ - in6_dev_put_clear(&net->ipv6.fib6_null_entry->fib6_idev); in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); @@ -5111,7 +5140,6 @@ void __init ip6_route_init_special_entries(void) * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; - init_net.ipv6.fib6_null_entry->fib6_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES From 37e6fda0855a44e5d98783f32d2675bee5211695 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:37:57 -0700 Subject: [PATCH 0402/1640] UPSTREAM: net/ipv6: Clean up rt expires helpers rt6_clean_expires and rt6_set_expires are no longer used. Removed them. 
rt6_update_expires has 1 caller in route.c, so move it from the header. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 21 --------------------- net/ipv6/route.c | 9 +++++++++ 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index ec50ce6c5266..7792326996bd 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -226,27 +226,6 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) return false; } -static inline void rt6_clean_expires(struct rt6_info *rt) -{ - rt->rt6i_flags &= ~RTF_EXPIRES; - rt->dst.expires = 0; -} - -static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires) -{ - rt->dst.expires = expires; - rt->rt6i_flags |= RTF_EXPIRES; -} - -static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) -{ - if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) - rt0->dst.expires = rt0->from->expires; - - dst_set_expires(&rt0->dst, timeout); - rt0->rt6i_flags |= RTF_EXPIRES; -} - /* Function to safely get fn->sernum for passed in rt * and store result in passed in cookie. * Return true if we can get cookie safely diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d040a9ca46c3..6984c52f943c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2216,6 +2216,15 @@ static void ip6_link_failure(struct sk_buff *skb) } } +static void rt6_update_expires(struct rt6_info *rt0, int timeout) +{ + if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) + rt0->dst.expires = rt0->from->expires; + + dst_set_expires(&rt0->dst, timeout); + rt0->rt6i_flags |= RTF_EXPIRES; +} + static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) { struct net *net = dev_net(rt->dst.dev); From c1c9a9654a9f7cd2b830702cf6360d27f66ab3ec Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:37:58 -0700 Subject: [PATCH 0403/1640] BACKPORT: net/ipv6: Rename rt6_get_cookie_safe rt6_get_cookie_safe takes a fib6_info and checks the sernum of the node. Update the name to reflect its purpose. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 6 +++--- net/ipv6/route.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 7792326996bd..f443b965189e 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -231,8 +231,8 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct fib6_info *f6i, - u32 *cookie) +static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, + u32 *cookie) { struct fib6_node *fn; bool status = false; @@ -256,7 +256,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) u32 cookie = 0; if (rt->from) - rt6_get_cookie_safe(rt->from, &cookie); + fib6_get_cookie_safe(rt->from, &cookie); return cookie; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6984c52f943c..4b4a6dd19a61 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2123,7 +2123,7 @@ static bool fib6_check(struct fib6_info *f6i, u32 cookie) { u32 rt_cookie = 0; - if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) || + if ((f6i && !fib6_get_cookie_safe(f6i, &rt_cookie)) || rt_cookie != cookie) return false; @@ -2137,7 +2137,7 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { u32 rt_cookie = 0; - if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) || + if ((rt->from && !fib6_get_cookie_safe(rt->from, &rt_cookie)) || rt_cookie != cookie) return NULL; From 6e9d260f8d82186123532858950508260331bf47 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:38:00 -0700 Subject: [PATCH 0404/1640] BACKPORT: net/ipv6: Move rcu locking to callers of fib6_get_cookie_safe A later patch protects 'from' in rt6_info and this simplifies the locking needed by it. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 6 ++++-- net/ipv6/route.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index f443b965189e..16b7db054191 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -237,7 +237,6 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, struct fib6_node *fn; bool status = false; - rcu_read_lock(); fn = rcu_dereference(f6i->fib6_node); if (fn) { @@ -247,7 +246,6 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, status = true; } - rcu_read_unlock(); return status; } @@ -255,9 +253,13 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) { u32 cookie = 0; + rcu_read_lock(); + if (rt->from) fib6_get_cookie_safe(rt->from, &cookie); + rcu_read_unlock(); + return cookie; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4b4a6dd19a61..a8583cf19df5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2159,9 +2159,12 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { + struct dst_entry *dst_ret; struct rt6_info *rt; - rt = (struct rt6_info *) dst; + rt = container_of(dst, struct rt6_info, dst); + + rcu_read_lock(); /* All IPV6 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down @@ -2170,9 +2173,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - return rt6_dst_from_check(rt, cookie); + dst_ret = rt6_dst_from_check(rt, cookie); else - return rt6_check(rt, cookie); + dst_ret = rt6_check(rt, cookie); + + rcu_read_unlock(); + + return dst_ret; } static void ip6_negative_advice(struct sock *sk, From a6957c5ba782058a918fac7f8b28ed8441285a28 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:38:01 -0700 Subject: [PATCH 0405/1640] UPSTREAM: net/ipv6: Move release of fib6_info from pcpu routes to helper Code move only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/ip6_fib.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d8f95b5c0b06..dd5a5154d753 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -860,6 +860,27 @@ insert_above: return ln; } +static void fib6_drop_pcpu_from(struct fib6_info *f6i, + const struct fib6_table *table) +{ + int cpu; + + /* release the reference to this fib entry from + * all of its cached pcpu routes + */ + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + fib6_info_release(pcpu_rt->from); + pcpu_rt->from = NULL; + } + } +} + static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { @@ -887,24 +908,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, lockdep_is_held(&table->tb6_lock)); } - if (rt->rt6i_pcpu) { - int cpu; - - /* release the reference to this fib entry from - * all of its cached pcpu routes - */ - for_each_possible_cpu(cpu) { - struct rt6_info **ppcpu_rt; - struct rt6_info *pcpu_rt; - - ppcpu_rt = per_cpu_ptr(rt->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { - fib6_info_release(pcpu_rt->from); - pcpu_rt->from = NULL; - } - } - } + if (rt->rt6i_pcpu) + fib6_drop_pcpu_from(rt, table); } } From 779d4bba7a03c9643521afe83567ebf94dfb2671 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:37:59 -0700 Subject: [PATCH 0406/1640] UPSTREAM: net/ipv6: Move rcu_read_lock to callers of ip6_rt_cache_alloc A later patch protects 'from' in rt6_info and this simplifies the locking needed by it. With the move, the fib6_info_hold for the uncached_rt is no longer needed since the rcu_lock is still held. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a8583cf19df5..b02d99da9088 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1167,10 +1167,8 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, * Clone the route. */ - rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); rt = ip6_dst_alloc(dev_net(dev), dev, 0); - rcu_read_unlock(); if (!rt) return NULL; @@ -1843,14 +1841,11 @@ redo_rt6_select: * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. 
*/ - struct rt6_info *uncached_rt; - fib6_info_hold(f6i); - rcu_read_unlock(); - uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); - fib6_info_release(f6i); + + rcu_read_unlock(); if (uncached_rt) { /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() @@ -2285,7 +2280,9 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, } else if (daddr) { struct rt6_info *nrt6; + rcu_read_lock(); nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr); + rcu_read_unlock(); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); if (rt6_insert_exception(nrt6, rt6->from)) @@ -3270,7 +3267,9 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NEIGH_UPDATE_F_ISROUTER)), NDISC_REDIRECT, &ndopts); + rcu_read_lock(); nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL); + rcu_read_unlock(); if (!nrt) goto out; From d122cae6c2411b86c7e57522e52ccadc5d1392f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:38:02 -0700 Subject: [PATCH 0407/1640] BACKPORT: net/ipv6: Make from in rt6_info rcu protected When a dst entry is created from a fib entry, the 'from' in rt6_info is set to the fib entry. The 'from' reference is used most notably for cookie checking - making sure stale dst entries are updated if the fib entry is changed. When a fib entry is deleted, the pcpu routes on it are walked releasing the fib6_info reference. This is needed for the fib6_info cleanup to happen and to make sure all device references are released in a timely manner. There is a race window when a FIB entry is deleted and the 'from' on the pcpu route is dropped and the pcpu route hits a cookie check. Handle this race using rcu on from. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 8 ++-- net/ipv6/ip6_fib.c | 8 +++- net/ipv6/ip6_output.c | 9 +++- net/ipv6/route.c | 96 +++++++++++++++++++++++++++++++------------ 4 files changed, 87 insertions(+), 34 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 16b7db054191..32d31defafe8 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -177,7 +177,7 @@ struct fib6_info { struct rt6_info { struct dst_entry dst; - struct fib6_info *from; + struct fib6_info __rcu *from; struct rt6key rt6i_dst; struct rt6key rt6i_src; @@ -251,12 +251,14 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, static inline u32 rt6_get_cookie(const struct rt6_info *rt) { + struct fib6_info *from; u32 cookie = 0; rcu_read_lock(); - if (rt->from) - fib6_get_cookie_safe(rt->from, &cookie); + from = rcu_dereference(rt->from); + if (from) + fib6_get_cookie_safe(from, &cookie); rcu_read_unlock(); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index dd5a5154d753..8971ed34108e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -875,8 +875,12 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { - fib6_info_release(pcpu_rt->from); - pcpu_rt->from = NULL; + struct fib6_info *from; + + from = rcu_dereference_protected(pcpu_rt->from, + lockdep_is_held(&table->tb6_lock)); + rcu_assign_pointer(pcpu_rt->from, NULL); + fib6_info_release(from); } } } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0896f296b231..ccfede2994c7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1017,16 +1017,21 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, * that's why we try it again later. 
*/ if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { + struct fib6_info *from; struct rt6_info *rt; bool had_dst = *dst != NULL; if (!had_dst) *dst = ip6_route_output(net, sk, fl6); rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; - err = ip6_route_get_saddr(net, rt ? rt->from : NULL, - &fl6->daddr, + + rcu_read_lock(); + from = rt ? rcu_dereference(rt->from) : NULL; + err = ip6_route_get_saddr(net, from, &fl6->daddr, sk ? inet6_sk(sk)->srcprefs : 0, &fl6->saddr); + rcu_read_unlock(); + if (err) goto out_err_release; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b02d99da9088..21494792950c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -362,7 +362,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct fib6_info *from = rt->from; + struct fib6_info *from; struct inet6_dev *idev; dst_destroy_metrics_generic(dst); @@ -374,8 +374,11 @@ static void ip6_dst_destroy(struct dst_entry *dst) in6_dev_put(idev); } - rt->from = NULL; + rcu_read_lock(); + from = rcu_dereference(rt->from); + rcu_assign_pointer(rt->from, NULL); fib6_info_release(from); + rcu_read_unlock(); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -405,12 +408,16 @@ static bool __rt6_check_expired(const struct rt6_info *rt) static bool rt6_check_expired(const struct rt6_info *rt) { + struct fib6_info *from; + + from = rcu_dereference(rt->from); + if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, rt->dst.expires)) return true; - } else if (rt->from) { + } else if (from) { return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || - fib6_check_expired(rt->from); + fib6_check_expired(from); } return false; } @@ -966,7 +973,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; fib6_info_hold(from); - rt->from = from; + rcu_assign_pointer(rt->from, from); dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); if (from->fib6_metrics != &dst_default_metrics) { rt->dst._metrics |= DST_METRICS_REFCOUNTED; @@ -2128,11 +2135,13 @@ static bool fib6_check(struct fib6_info *f6i, u32 cookie) return true; } -static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) +static struct dst_entry *rt6_check(struct rt6_info *rt, + struct fib6_info *from, + u32 cookie) { u32 rt_cookie = 0; - if ((rt->from && !fib6_get_cookie_safe(rt->from, &rt_cookie)) || + if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || rt_cookie != cookie) return NULL; @@ -2142,11 +2151,13 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) return &rt->dst; } -static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) +static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, + struct fib6_info *from, + u32 cookie) { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && - fib6_check(rt->from, cookie)) + fib6_check(from, cookie)) return &rt->dst; else return NULL; @@ -2155,6 +2166,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct dst_entry *dst_ret; + struct fib6_info *from; struct rt6_info *rt; rt = container_of(dst, struct rt6_info, dst); @@ -2166,11 +2178,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * into this function always. 
*/ - if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - dst_ret = rt6_dst_from_check(rt, cookie); + from = rcu_dereference(rt->from); + + if (from && (rt->rt6i_flags & RTF_PCPU || + unlikely(!list_empty(&rt->rt6i_uncached)))) + dst_ret = rt6_dst_from_check(rt, from, cookie); else - dst_ret = rt6_check(rt, cookie); + dst_ret = rt6_check(rt, from, cookie); rcu_read_unlock(); @@ -2206,13 +2220,17 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) rt6_remove_exception_rt(rt); - } else if (rt->from) { + } else { + struct fib6_info *from; struct fib6_node *fn; rcu_read_lock(); - fn = rcu_dereference(rt->from->fib6_node); - if (fn && (rt->rt6i_flags & RTF_DEFAULT)) - fn->fn_sernum = -1; + from = rcu_dereference(rt->from); + if (from) { + fn = rcu_dereference(from->fib6_node); + if (fn && (rt->rt6i_flags & RTF_DEFAULT)) + fn->fn_sernum = -1; + } rcu_read_unlock(); } } @@ -2220,8 +2238,15 @@ static void ip6_link_failure(struct sk_buff *skb) static void rt6_update_expires(struct rt6_info *rt0, int timeout) { - if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) - rt0->dst.expires = rt0->from->expires; + if (!(rt0->rt6i_flags & RTF_EXPIRES)) { + struct fib6_info *from; + + rcu_read_lock(); + from = rcu_dereference(rt0->from); + if (from) + rt0->dst.expires = from->expires; + rcu_read_unlock(); + } dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; @@ -2238,8 +2263,14 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { + bool from_set; + + rcu_read_lock(); + from_set = !!rcu_dereference(rt->from); + rcu_read_unlock(); + return !(rt->rt6i_flags & RTF_CACHE) && - (rt->rt6i_flags & RTF_PCPU || rt->from); + (rt->rt6i_flags & RTF_PCPU || from_set); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, @@ -2278,16 +2309,18 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (rt6->rt6i_flags & RTF_CACHE) rt6_update_exception_stamp_rt(rt6); } else if (daddr) { + struct fib6_info *from; struct rt6_info *nrt6; rcu_read_lock(); - nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr); - rcu_read_unlock(); + from = rcu_dereference(rt6->from); + nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - if (rt6_insert_exception(nrt6, rt6->from)) + if (rt6_insert_exception(nrt6, from)) dst_release_immediate(&nrt6->dst); } + rcu_read_unlock(); } } @@ -3186,6 +3219,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu struct ndisc_options ndopts; struct inet6_dev *in6_dev; struct neighbour *neigh; + struct fib6_info *from; struct rd_msg *msg; int optlen, on_link; u8 *lladdr; @@ -3268,7 +3302,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NDISC_REDIRECT, &ndopts); rcu_read_lock(); - nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL); + from = rcu_dereference(rt->from); + nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); rcu_read_unlock(); if (!nrt) goto out; @@ -4657,6 +4692,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; int err, iif = 0, oif = 0; + struct fib6_info *from; struct dst_entry *dst; struct rt6_info *rt; struct sk_buff *skb; @@ -4753,15 +4789,21 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, } skb_dst_set(skb, 
&rt->dst); + + rcu_read_lock(); + from = rcu_dereference(rt->from); + if (fibmatch) - err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif, + err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else - err = rt6_fill_node(net, skb, rt->from, dst, - &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE, + err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, + &fl6.saddr, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); + rcu_read_unlock(); + if (err < 0) { kfree_skb(skb); goto errout; From 4d556b0eacbfa8ada2ec5b27b043ff8006fc1948 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 4 May 2018 13:54:24 -0700 Subject: [PATCH 0408/1640] UPSTREAM: net/ipv6: rename rt6_next to fib6_next This slipped through the cracks in the followup set to the fib6_info flip. Rename rt6_next to fib6_next. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 6 +++--- net/ipv6/ip6_fib.c | 26 +++++++++++++------------- net/ipv6/route.c | 12 ++++++------ 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 32d31defafe8..848402bf2396 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -138,7 +138,7 @@ struct fib6_nh { struct fib6_info { struct fib6_table *fib6_table; - struct fib6_info __rcu *rt6_next; + struct fib6_info __rcu *fib6_next; struct fib6_node __rcu *fib6_node; /* Multipath routes: @@ -195,11 +195,11 @@ struct rt6_info { #define for_each_fib6_node_rt_rcu(fn) \ for (rt = rcu_dereference((fn)->leaf); rt; \ - rt = rcu_dereference(rt->rt6_next)) + rt = rcu_dereference(rt->fib6_next)) #define for_each_fib6_walker_rt(w) \ for (rt = (w)->leaf; rt; \ - rt = rcu_dereference_protected(rt->rt6_next, 1)) + rt = rcu_dereference_protected(rt->fib6_next, 1)) static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 8971ed34108e..d0567d4e0118 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -945,7 +945,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, ins = &fn->leaf; for (iter = leaf; iter; - iter = rcu_dereference_protected(iter->rt6_next, + iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates @@ -1001,7 +1001,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, break; next_iter: - ins = &iter->rt6_next; + ins = &iter->fib6_next; } if (fallback_ins && !found) { @@ -1032,7 +1032,7 @@ next_iter: &sibling->fib6_siblings); break; } - sibling = rcu_dereference_protected(sibling->rt6_next, + sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); } /* For each sibling in the list, increment the counter of @@ -1066,7 +1066,7 @@ add: if (err) return err; - rcu_assign_pointer(rt->rt6_next, iter); + rcu_assign_pointer(rt->fib6_next, iter); atomic_inc(&rt->fib6_ref); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); @@ -1097,7 +1097,7 @@ add: atomic_inc(&rt->fib6_ref); rcu_assign_pointer(rt->fib6_node, fn); - rt->rt6_next = iter->rt6_next; + rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); @@ -1114,14 +1114,14 @@ add: if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ - ins = &rt->rt6_next; + ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, 
lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { - *ins = iter->rt6_next; + *ins = iter->fib6_next; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) @@ -1130,7 +1130,7 @@ add: nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { - ins = &iter->rt6_next; + ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -1713,7 +1713,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, RT6_TRACE("fib6_del_route\n"); /* Unlink it */ - *rtp = rt->rt6_next; + *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; @@ -1742,7 +1742,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rcu_dereference_protected(rt->rt6_next, + w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; @@ -1796,7 +1796,7 @@ int fib6_del(struct fib6_info *rt, struct nl_info *info) fib6_del_route(table, fn, rtp, info); return 0; } - rtp_next = &cur->rt6_next; + rtp_next = &cur->fib6_next; } return -ENOENT; } @@ -2274,7 +2274,7 @@ static int ipv6_route_yield(struct fib6_walker *w) do { iter->w.leaf = rcu_dereference_protected( - iter->w.leaf->rt6_next, + iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) @@ -2340,7 +2340,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = rcu_dereference_bh(((struct fib6_info *)v)->rt6_next); + n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next); if (n) { ++*pos; return n; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 21494792950c..9df0e6dc0fa3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -471,7 +471,7 @@ static inline struct fib6_info *rt6_device_match(struct net *net, !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) return rt; - for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) { const struct net_device *dev = sprt->fib6_nh.nh_dev; if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) @@ -699,7 +699,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) { if (rt->fib6_metric != metric) { cont = rt; break; @@ -709,7 +709,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn, } for (rt = leaf; rt && rt != rr_head; - rt = rcu_dereference(rt->rt6_next)) { + rt = rcu_dereference(rt->fib6_next)) { if (rt->fib6_metric != metric) { cont = rt; break; @@ -721,7 +721,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn, if (match || !cont) return match; - for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next)) + for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -759,7 +759,7 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, &do_rr); if (do_rr) { - struct fib6_info *next = rcu_dereference(rt0->rt6_next); + struct fib6_info *next = rcu_dereference(rt0->fib6_next); /* no entries matched; 
do round-robin */ if (!next || next->fib6_metric != rt0->fib6_metric) @@ -3705,7 +3705,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; - iter = rcu_dereference_protected(iter->rt6_next, + iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); } From 221bc4560e40b6984c6bf969a3b8e672a42c9abb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:19 -0700 Subject: [PATCH 0409/1640] UPSTREAM: net/ipv6: Rename fib6_lookup to fib6_node_lookup Rename fib6_lookup to fib6_node_lookup to better reflect what it returns. The fib6_lookup name will be used in a later patch for an IPv6 equivalent to IPv4's fib_lookup. Signed-off-by: David Ahern Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/ip6_fib.h | 6 +++--- net/ipv6/ip6_fib.c | 14 ++++++++------ net/ipv6/route.c | 8 ++++---- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 848402bf2396..5a2a70f3ae29 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -378,9 +378,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); -struct fib6_node *fib6_lookup(struct fib6_node *root, - const struct in6_addr *daddr, - const struct in6_addr *saddr); +struct fib6_node *fib6_node_lookup(struct fib6_node *root, + const struct in6_addr *daddr, + const struct in6_addr *saddr); struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d0567d4e0118..959886055841 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1355,8 +1355,8 @@ struct lookup_args { const struct in6_addr *addr; /* search key */ }; -static struct fib6_node *fib6_lookup_1(struct fib6_node *root, - struct lookup_args *args) +static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, + struct lookup_args *args) { struct fib6_node *fn; __be32 dir; @@ -1401,7 +1401,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; - sfn = fib6_lookup_1(subtree, args + 1); + sfn = fib6_node_lookup_1(subtree, + args + 1); if (!sfn) goto backtrack; fn = sfn; @@ -1423,8 +1424,9 @@ backtrack: /* called with rcu_read_lock() held */ -struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, - const struct in6_addr *saddr) +struct fib6_node *fib6_node_lookup(struct fib6_node *root, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { @@ -1443,7 +1445,7 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad } }; - fn = fib6_lookup_1(root, daddr ? args : args + 1); + fn = fib6_node_lookup_1(root, daddr ? 
args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9df0e6dc0fa3..d89f87da9b83 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1009,7 +1009,7 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn, pn = rcu_dereference(fn->parent); sn = FIB6_SUBTREE(pn); if (sn && sn != fn) - fn = fib6_lookup(sn, NULL, saddr); + fn = fib6_node_lookup(sn, NULL, saddr); else fn = pn; if (fn->fn_flags & RTN_RTINFO) @@ -1062,7 +1062,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, flags &= ~RT6_LOOKUP_F_IFACE; rcu_read_lock(); - fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); + fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: f6i = rcu_dereference(fn->leaf); if (!f6i) { @@ -1802,7 +1802,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_lock(); - fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); + fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) @@ -2404,7 +2404,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, */ rcu_read_lock(); - fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); + fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) From b1a58aaf2e25437f601fdcb904c9950f53444992 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:20 -0700 Subject: [PATCH 0410/1640] UPSTREAM: net/ipv6: Rename rt6_multipath_select Rename rt6_multipath_select to fib6_multipath_select and export it. A later patch wants access to it similar to IPv4's fib_select_path. Signed-off-by: David Ahern Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- include/net/ip6_fib.h | 5 +++++ net/ipv6/route.c | 17 +++++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 5a2a70f3ae29..ba694322e215 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -378,6 +378,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); +struct fib6_info *fib6_multipath_select(const struct net *net, + struct fib6_info *match, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, int strict); + struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d89f87da9b83..631b10e39dc5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -422,11 +422,11 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -static struct fib6_info *rt6_multipath_select(const struct net *net, - struct fib6_info *match, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, - int strict) +struct fib6_info *fib6_multipath_select(const struct net *net, + struct fib6_info *match, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, + int strict) { struct fib6_info *sibling, *next_sibling; @@ -1071,8 +1071,9 @@ restart: f6i = rt6_device_match(net, f6i, &fl6->saddr, fl6->flowi6_oif, flags); if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) - f6i = rt6_multipath_select(net, f6i, fl6, - fl6->flowi6_oif, skb, flags); + f6i = fib6_multipath_select(net, f6i, fl6, + fl6->flowi6_oif, skb, + flags); } if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); @@ -1811,7 +1812,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: f6i = rt6_select(net, fn, oif, strict); if (f6i->fib6_nsiblings) - f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict); + f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) From 4730d66d8a025c00ef0803f19131ed29bf9ee2e0 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:21 -0700 Subject: [PATCH 0411/1640] UPSTREAM: net/ipv6: Extract table lookup from ip6_pol_route ip6_pol_route is used for ingress and egress FIB lookups. Refactor it moving the table lookup into a separate fib6_table_lookup that can be invoked separately and export the new function. ip6_pol_route now calls fib6_table_lookup and uses the result to generate a dst based rt6_info. Signed-off-by: David Ahern Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- include/net/ip6_fib.h | 4 ++++ net/ipv6/route.c | 39 +++++++++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index ba694322e215..3d7d31a75bc7 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -378,6 +378,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); +/* called with rcu lock held; caller needs to select path */ +struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int strict); + struct fib6_info *fib6_multipath_select(const struct net *net, struct fib6_info *match, struct flowi6 *fl6, int oif, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 631b10e39dc5..f21c1dae0f94 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1787,21 +1787,12 @@ void rt6_age_exceptions(struct fib6_info *rt, spin_unlock_bh(&rt6_exception_lock); } -struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, - const struct sk_buff *skb, int flags) +/* must be called with rcu lock held */ +struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int strict) { struct fib6_node *fn, *saved_fn; struct fib6_info *f6i; - struct rt6_info *rt; - int strict = 0; - - strict |= flags & RT6_LOOKUP_F_IFACE; - strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; - if (net->ipv6.devconf_all->forwarding == 0) - strict |= RT6_LOOKUP_F_REACHABLE; - - rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; @@ -1811,8 +1802,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: f6i = rt6_select(net, fn, oif, strict); - if (f6i->fib6_nsiblings) - f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -1825,6 +1814,28 @@ redo_rt6_select: } } + return f6i; +} + +struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, + const struct sk_buff *skb, int flags) +{ + struct fib6_info *f6i; + struct rt6_info *rt; + int strict = 0; + + strict |= flags & RT6_LOOKUP_F_IFACE; + strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; + if (net->ipv6.devconf_all->forwarding == 0) + strict |= RT6_LOOKUP_F_REACHABLE; + + rcu_read_lock(); + + f6i = fib6_table_lookup(net, table, oif, fl6, strict); + if (f6i->fib6_nsiblings) + f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); + if (f6i == net->ipv6.fib6_null_entry) { rt = net->ipv6.ip6_null_entry; rcu_read_unlock(); From 31602326925a8fc8652c6d2737c30c9bd0379770 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:23 -0700 Subject: [PATCH 0412/1640] UPSTREAM: net/ipv6: Add fib6_lookup Add IPv6 equivalent to fib_lookup. Does a fib lookup, including rules, but returns a FIB entry, fib6_info, rather than a dst based rt6_info. fib6_lookup is anywhere from 140% (MULTIPLE_TABLES config disabled) to 60% faster than any of the dst based lookup methods (without custom rules) and 25% faster with custom rules (e.g., l3mdev rule). Since the lookup function has a completely different signature, fib6_rule_action is split into 2 paths: the existing one is renamed __fib6_rule_action and a new one for the fib6_info path is added.
fib6_rule_action decides which to call based on the lookup_ptr. If it is fib6_table_lookup then the new path is taken. Caller must hold rcu lock as no reference is taken on the returned fib entry. Signed-off-by: David Ahern Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/ip6_fib.h | 6 +++ net/ipv6/fib6_rules.c | 86 ++++++++++++++++++++++++++++++++++++++++++- net/ipv6/ip6_fib.c | 7 ++++ 3 files changed, 97 insertions(+), 2 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 3d7d31a75bc7..47229f226e3d 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -378,6 +378,12 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); +/* called with rcu lock held; can return error pointer + * caller needs to select path + */ +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags); + /* called with rcu lock held; caller needs to select path */ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int strict); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index da08885a2d32..bd519be3f833 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net) return fib_rules_seq_read(net, AF_INET6); } +/* called with rcu lock held; no reference taken on fib6_info */ +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags) +{ + struct fib6_info *f6i; + int err; + + if (net->ipv6.fib6_has_custom_rules) { + struct fib_lookup_arg arg = { + .lookup_ptr = fib6_table_lookup, + .lookup_data = &oif, + .flags = FIB_LOOKUP_NOREF, + }; + + l3mdev_update_flow(net, flowi6_to_flowi(fl6)); + + err = fib_rules_lookup(net->ipv6.fib6_rules_ops, + flowi6_to_flowi(fl6), flags, &arg); + if (err) + return ERR_PTR(err); + + f6i = arg.result ? 
: net->ipv6.fib6_null_entry; + } else { + f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, + oif, fl6, flags); + if (!f6i || f6i == net->ipv6.fib6_null_entry) + f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl, + oif, fl6, flags); + } + + return f6i; +} + struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) @@ -121,8 +154,48 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags, return 0; } -static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, - int flags, struct fib_lookup_arg *arg) +static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + struct flowi6 *flp6 = &flp->u.ip6; + struct net *net = rule->fr_net; + struct fib6_table *table; + struct fib6_info *f6i; + int err = -EAGAIN, *oif; + u32 tb_id; + + switch (rule->action) { + case FR_ACT_TO_TBL: + break; + case FR_ACT_UNREACHABLE: + return -ENETUNREACH; + case FR_ACT_PROHIBIT: + return -EACCES; + case FR_ACT_BLACKHOLE: + default: + return -EINVAL; + } + + tb_id = fib_rule_get_table(rule, arg); + table = fib6_get_table(net, tb_id); + if (!table) + return -EAGAIN; + + oif = (int *)arg->lookup_data; + f6i = fib6_table_lookup(net, table, *oif, flp6, flags); + if (f6i != net->ipv6.fib6_null_entry) { + err = fib6_rule_saddr(net, rule, flags, flp6, + fib6_info_nh_dev(f6i)); + + if (likely(!err)) + arg->result = f6i; + } + + return err; +} + +static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) { struct flowi6 *flp6 = &flp->u.ip6; struct rt6_info *rt = NULL; @@ -186,6 +259,15 @@ out: return err; } +static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + if (arg->lookup_ptr == fib6_table_lookup) + return fib6_rule_action_alt(rule, flp, flags, arg); + + return __fib6_rule_action(rule, flp, flags, arg); +} + static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { struct rt6_info *rt = (struct rt6_info *) arg->result; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 959886055841..4b8acfda091c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -354,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, return &rt->dst; } +/* called with rcu lock held; no reference taken on fib6_info */ +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags) +{ + return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags); +} + static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); From a78ae2d4024d57d0284a954b02d4476d385fa55f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:25 -0700 Subject: [PATCH 0413/1640] BACKPORT: net/ipv6: Add fib lookup stubs for use in bpf helper Add stubs to retrieve a handle to an IPv6 FIB table, fib6_get_table, a stub to do a lookup in a specific table, fib6_table_lookup, and a stub for a full route lookup. The stubs are needed for core bpf code to handle the case when the IPv6 module is not builtin. Signed-off-by: David Ahern Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- include/net/addrconf.h | 14 ++++++++++++++ net/ipv6/addrconf_core.c | 31 +++++++++++++++++++++++++++++++ net/ipv6/af_inet6.c | 4 ++++ 3 files changed, 49 insertions(+) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index d06ff0183766..2692a90f8ff8 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -227,6 +227,20 @@ struct ipv6_stub { const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst); + + struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); + struct fib6_info *(*fib6_lookup)(struct net *net, int oif, + struct flowi6 *fl6, int flags); + struct fib6_info *(*fib6_table_lookup)(struct net *net, + struct fib6_table *table, + int oif, struct flowi6 *fl6, + int flags); + struct fib6_info *(*fib6_multipath_select)(const struct net *net, + struct fib6_info *f6i, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, + int strict); + void (*udpv6_encap_enable)(void); void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index f5a267972c57..2476b24e23c5 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -134,8 +134,39 @@ static struct dst_entry *eafnosupport_ipv6_dst_lookup_flow(struct net *net, return ERR_PTR(-EAFNOSUPPORT); } +static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id) +{ + return NULL; +} + +static struct fib6_info * +eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int flags) +{ + return NULL; +} + +static struct fib6_info * +eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags) +{ + return NULL; +} + +static struct fib6_info * +eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, int strict) +{ + return f6i; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow, + .fib6_get_table = eafnosupport_fib6_get_table, + .fib6_table_lookup = eafnosupport_fib6_table_lookup, + .fib6_lookup = eafnosupport_fib6_lookup, + .fib6_multipath_select = eafnosupport_fib6_multipath_select, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b40e78547b60..b5521c82b108 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -922,6 +922,10 @@ static const struct ipv6_stub ipv6_stub_impl = { .ipv6_sock_mc_join = ipv6_sock_mc_join, .ipv6_sock_mc_drop = ipv6_sock_mc_drop, .ipv6_dst_lookup_flow = ip6_dst_lookup_flow, + .fib6_get_table = fib6_get_table, + .fib6_table_lookup = fib6_table_lookup, + .fib6_lookup = fib6_lookup, + .fib6_multipath_select = fib6_multipath_select, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, From cd3924760090fd8be7916c4708bd3c3053dc76f1 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:10 +0100 Subject: [PATCH 0414/1640] BACKPORT: ipv6: make ip6_dst_mtu_forward inline Just like ip_dst_mtu_maybe_forward(), to avoid a dependency with ipv6.ko. 
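Because the helper is now a static inline in ip6_route.h, a caller built outside ipv6.ko can compute the forward-path MTU without any module symbol dependency. A minimal sketch of such a caller; example_pkt_fits() is an invented name for illustration and is not part of this patch:

#include <linux/skbuff.h>
#include <net/dst.h>
#include <net/ip6_route.h>

/* Hypothetical caller: check a forwarded packet against the path MTU.
 * ip6_dst_mtu_forward() prefers a locked RTAX_MTU metric on the dst and
 * otherwise falls back to the egress device's mtu6 under RCU.
 */
static bool example_pkt_fits(const struct sk_buff *skb)
{
	unsigned int mtu = ip6_dst_mtu_forward(skb_dst(skb));

	/* IPv6 routers never fragment forwarded packets; the in-tree
	 * forward path sends ICMPv6 Packet Too Big above this limit.
	 */
	return skb->len <= mtu;
}

The ip6_pkt_too_big() check visible in the hunk below starts from the same length comparison.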
Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/ip6_route.h | 21 +++++++++++++++++++++ net/ipv6/ip6_output.c | 21 --------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 2ba3d74ad731..95d46c833884 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -274,6 +274,27 @@ static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info * !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } +static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) +{ + struct inet6_dev *idev; + unsigned int mtu; + + if (dst_metric_locked(dst, RTAX_MTU)) { + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + return mtu; + } + + mtu = IPV6_MIN_MTU; + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + + return mtu; +} + struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ccfede2994c7..32438bcd3f77 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -425,27 +425,6 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } -static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) -{ - unsigned int mtu; - struct inet6_dev *idev; - - if (dst_metric_locked(dst, RTAX_MTU)) { - mtu = dst_metric_raw(dst, RTAX_MTU); - if (mtu) - return mtu; - } - - mtu = IPV6_MIN_MTU; - rcu_read_lock(); - idev = __in6_dev_get(dst->dev); - if (idev) - mtu = idev->cnf.mtu6; - rcu_read_unlock(); - - return mtu; -} - static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) From 43a932678d888b3a53652fe235acfad79aeb4f96 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 21 May 2018 09:08:14 -0700 Subject: [PATCH 0415/1640] UPSTREAM: net/ipv6: Add helper to return path MTU based on fib result Determine path MTU from a FIB lookup result. Logic is based on ip6_dst_mtu_forward plus lookup of nexthop exception. Add ip6_mtu_from_fib6 to ipv6_stubs to handle access by core bpf code.
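Core code gets at the new helper through the ipv6_stub indirection, so the call works whether or not the IPv6 module is loaded; the eafnosupport stub added below simply returns 0. A rough sketch of a caller under stated assumptions: example_fib6_path_mtu() is an invented name, and the fib6_multipath_select() step a real caller may need is omitted here:

#include <linux/err.h>
#include <linux/rcupdate.h>
#include <net/addrconf.h>
#include <net/flow.h>
#include <net/ip6_fib.h>
#include <net/net_namespace.h>

/* Hypothetical: resolve a flow to a FIB entry and query its path MTU.
 * fib6_lookup() takes no reference on the returned fib6_info, so the
 * lookup and the MTU read must share one rcu_read_lock() section.
 */
static u32 example_fib6_path_mtu(struct net *net, int oif, struct flowi6 *fl6)
{
	struct fib6_info *f6i;
	u32 mtu = 0;

	rcu_read_lock();
	f6i = ipv6_stub->fib6_lookup(net, oif, fl6, 0);
	if (!IS_ERR_OR_NULL(f6i) && f6i != net->ipv6.fib6_null_entry)
		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, &fl6->daddr,
						   &fl6->saddr);
	rcu_read_unlock();

	return mtu;	/* 0 means no usable route/MTU was found */
}

Returning 0 covers both a failed lookup and the module-not-loaded case, so a caller needs no special-casing between the two.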
Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- include/net/addrconf.h | 2 ++ include/net/ip6_fib.h | 6 +++++ include/net/ip6_route.h | 3 +++ net/ipv6/addrconf_core.c | 8 +++++++ net/ipv6/af_inet6.c | 1 + net/ipv6/route.c | 48 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 68 insertions(+) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 2692a90f8ff8..607a81a1e961 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -240,6 +240,8 @@ struct ipv6_stub { struct flowi6 *fl6, int oif, const struct sk_buff *skb, int strict); + u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr); void (*udpv6_encap_enable)(void); void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 47229f226e3d..795fc1b305f5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -414,6 +414,12 @@ static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) return f6i->fib6_nh.nh_dev; } +static inline +struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) +{ + return f6i->fib6_nh.nh_lwtstate; +} + void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 95d46c833884..670b3fc067ac 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -295,6 +295,9 @@ static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) return mtu; } +u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr); + struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr); diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 2476b24e23c5..c1dedcfbcae1 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -161,12 +161,20 @@ eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i, return f6i; } +static u32 +eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr) +{ + return 0; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow, .fib6_get_table = eafnosupport_fib6_get_table, .fib6_table_lookup = eafnosupport_fib6_table_lookup, .fib6_lookup = eafnosupport_fib6_lookup, .fib6_multipath_select = eafnosupport_fib6_multipath_select, + .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b5521c82b108..efa8462cfae1 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -926,6 +926,7 @@ static const struct ipv6_stub ipv6_stub_impl = { .fib6_table_lookup = fib6_table_lookup, .fib6_lookup = fib6_lookup, .fib6_multipath_select = fib6_multipath_select, + .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f21c1dae0f94..24187cb37449 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2584,6 +2584,54 @@ out: return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } +/* MTU selection: + * 1. mtu on route is locked - use it + * 2. mtu from nexthop exception + * 3. 
mtu from egress device + * + * based on ip6_dst_mtu_forward and exception logic of + * rt6_find_cached_rt; called with rcu_read_lock + */ +u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct in6_addr *src_key; + struct inet6_dev *idev; + u32 mtu = 0; + + if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { + mtu = f6i->fib6_pmtu; + if (mtu) + goto out; + } + + src_key = NULL; +#ifdef CONFIG_IPV6_SUBTREES + if (f6i->fib6_src.plen) + src_key = saddr; +#endif + + bucket = rcu_dereference(f6i->rt6i_exception_bucket); + rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); + if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) + mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); + + if (likely(!mtu)) { + struct net_device *dev = fib6_info_nh_dev(f6i); + + mtu = IPV6_MIN_MTU; + idev = __in6_dev_get(dev); + if (idev && idev->cnf.mtu6 > mtu) + mtu = idev->cnf.mtu6; + } + + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); +out: + return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); +} + struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6) { From 10f19b558f2b928602a5e645f2115696d9186eb2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 Jun 2018 05:24:31 -0700 Subject: [PATCH 0416/1640] UPSTREAM: net/ipv6: respect rcu grace period before freeing fib6_info syzbot reported use after free that is caused by fib6_info being freed without a proper RCU grace period. CPU: 0 PID: 1407 Comm: udevd Not tainted 4.17.0+ #39 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433 __read_once_size include/linux/compiler.h:188 [inline] find_rr_leaf net/ipv6/route.c:705 [inline] rt6_select net/ipv6/route.c:761 [inline] fib6_table_lookup+0x12b7/0x14d0 net/ipv6/route.c:1823 ip6_pol_route+0x1c2/0x1020 net/ipv6/route.c:1856 ip6_pol_route_output+0x54/0x70 net/ipv6/route.c:2082 fib6_rule_lookup+0x211/0x6d0 net/ipv6/fib6_rules.c:122 ip6_route_output_flags+0x2c5/0x350 net/ipv6/route.c:2110 ip6_route_output include/net/ip6_route.h:82 [inline] icmpv6_xrlim_allow net/ipv6/icmp.c:211 [inline] icmp6_send+0x147c/0x2da0 net/ipv6/icmp.c:535 icmpv6_send+0x17a/0x300 net/ipv6/ip6_icmp.c:43 ip6_link_failure+0xa5/0x790 net/ipv6/route.c:2244 dst_link_failure include/net/dst.h:427 [inline] ndisc_error_report+0xd1/0x1c0 net/ipv6/ndisc.c:695 neigh_invalidate+0x246/0x550 net/core/neighbour.c:892 neigh_timer_handler+0xaf9/0xde0 net/core/neighbour.c:978 call_timer_fn+0x230/0x940 kernel/time/timer.c:1326 expire_timers kernel/time/timer.c:1363 [inline] __run_timers+0x79e/0xc50 kernel/time/timer.c:1666 run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 [inline] irq_exit+0x1d1/0x200 kernel/softirq.c:404 exiting_irq arch/x86/include/asm/apic.h:527 [inline] smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863 RIP: 0010:strlen+0x5e/0xa0 lib/string.c:482 Code: 24 00 74 3b 48 bb 00 00 00 00 00 fc ff df 4c 89 e0 48 83 c0 01 48 89 c2 48 89 c1 48 c1 ea 03 83 e1 07 0f b6 14 1a 38 ca 7f 04 
<84> d2 75 23 80 38 00 75 de 48 83 c4 08 4c 29 e0 5b 41 5c 5d c3 48 RSP: 0018:ffff8801af117850 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 RAX: ffff880197f53bd0 RBX: dffffc0000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff81c5b06c RDI: ffff880197f53bc0 RBP: ffff8801af117868 R08: ffff88019a976540 R09: 0000000000000000 R10: ffff88019a976540 R11: 0000000000000000 R12: ffff880197f53bc0 R13: ffff880197f53bc0 R14: ffffffff899e4e90 R15: ffff8801d91c6a00 strlen include/linux/string.h:267 [inline] getname_kernel+0x24/0x370 fs/namei.c:218 open_exec+0x17/0x70 fs/exec.c:882 load_elf_binary+0x968/0x5610 fs/binfmt_elf.c:780 search_binary_handler+0x17d/0x570 fs/exec.c:1653 exec_binprm fs/exec.c:1695 [inline] __do_execve_file.isra.35+0x16fe/0x2710 fs/exec.c:1819 do_execveat_common fs/exec.c:1866 [inline] do_execve fs/exec.c:1883 [inline] __do_sys_execve fs/exec.c:1964 [inline] __se_sys_execve fs/exec.c:1959 [inline] __x64_sys_execve+0x8f/0xc0 fs/exec.c:1959 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f1576a46207 Code: 77 19 f4 48 89 d7 44 89 c0 0f 05 48 3d 00 f0 ff ff 76 e0 f7 d8 64 41 89 01 eb d8 f7 d8 64 41 89 01 eb df b8 3b 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 02 f3 c3 48 8b 15 00 8c 2d 00 f7 d8 64 89 02 RSP: 002b:00007ffff2784568 EFLAGS: 00000202 ORIG_RAX: 000000000000003b RAX: ffffffffffffffda RBX: 00000000ffffffff RCX: 00007f1576a46207 RDX: 0000000001215b10 RSI: 00007ffff2784660 RDI: 00007ffff2785670 RBP: 0000000000625500 R08: 000000000000589c R09: 000000000000589c R10: 0000000000000000 R11: 0000000000000202 R12: 0000000001215b10 R13: 0000000000000007 R14: 0000000001204250 R15: 0000000000000005 Allocated by task 12188: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553 kmem_cache_alloc_trace+0x152/0x780 mm/slab.c:3620 kmalloc include/linux/slab.h:513 [inline] kzalloc include/linux/slab.h:706 [inline] fib6_info_alloc+0xbb/0x280 net/ipv6/ip6_fib.c:152 ip6_route_info_create+0x782/0x2b50 net/ipv6/route.c:3013 ip6_route_add+0x23/0xb0 net/ipv6/route.c:3154 ipv6_route_ioctl+0x5a5/0x760 net/ipv6/route.c:3660 inet6_ioctl+0x100/0x1f0 net/ipv6/af_inet6.c:546 sock_do_ioctl+0xe4/0x3e0 net/socket.c:973 sock_ioctl+0x30d/0x680 net/socket.c:1097 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:500 [inline] do_vfs_ioctl+0x1cf/0x16f0 fs/ioctl.c:684 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701 __do_sys_ioctl fs/ioctl.c:708 [inline] __se_sys_ioctl fs/ioctl.c:706 [inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:706 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 1402: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kfree+0xd9/0x260 mm/slab.c:3813 fib6_info_destroy+0x29b/0x350 net/ipv6/ip6_fib.c:207 fib6_info_release include/net/ip6_fib.h:286 [inline] __ip6_del_rt_siblings net/ipv6/route.c:3235 [inline] ip6_route_del+0x11c4/0x13b0 net/ipv6/route.c:3316 ipv6_route_ioctl+0x616/0x760 net/ipv6/route.c:3663 inet6_ioctl+0x100/0x1f0 net/ipv6/af_inet6.c:546 sock_do_ioctl+0xe4/0x3e0 net/socket.c:973 sock_ioctl+0x30d/0x680 net/socket.c:1097 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:500 [inline] do_vfs_ioctl+0x1cf/0x16f0 fs/ioctl.c:684 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701 __do_sys_ioctl fs/ioctl.c:708 [inline] __se_sys_ioctl fs/ioctl.c:706 
[inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:706 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff8801b5df2580 which belongs to the cache kmalloc-256 of size 256 The buggy address is located 8 bytes inside of 256-byte region [ffff8801b5df2580, ffff8801b5df2680) The buggy address belongs to the page: page:ffffea0006d77c80 count:1 mapcount:0 mapping:ffff8801da8007c0 index:0xffff8801b5df2e40 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffffea0006c5cc48 ffffea0007363308 ffff8801da8007c0 raw: ffff8801b5df2e40 ffff8801b5df2080 0000000100000006 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801b5df2480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8801b5df2500: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc > ffff8801b5df2580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8801b5df2600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8801b5df2680: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb Fixes: a64efe142f5e ("net/ipv6: introduce fib6_info struct and helpers") Signed-off-by: Eric Dumazet Cc: David Ahern Reported-by: syzbot+9e6d75e3edef427ee888@syzkaller.appspotmail.com Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 5 +++-- net/ipv6/ip6_fib.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 795fc1b305f5..067d6732ebc6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -173,6 +173,7 @@ struct fib6_info { unused:3; struct fib6_nh fib6_nh; + struct rcu_head rcu; }; struct rt6_info { @@ -275,7 +276,7 @@ static inline void ip6_rt_put(struct rt6_info *rt) } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags); -void fib6_info_destroy(struct fib6_info *f6i); +void fib6_info_destroy_rcu(struct rcu_head *head); static inline void fib6_info_hold(struct fib6_info *f6i) { @@ -285,7 +286,7 @@ static inline void fib6_info_hold(struct fib6_info *f6i) static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && atomic_dec_and_test(&f6i->fib6_ref)) - fib6_info_destroy(f6i); + call_rcu(&f6i->rcu, fib6_info_destroy_rcu); } enum fib6_walk_state { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 4b8acfda091c..4cb3f1247419 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -167,8 +167,9 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) return f6i; } -void fib6_info_destroy(struct fib6_info *f6i) +void fib6_info_destroy_rcu(struct rcu_head *head) { + struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); struct rt6_exception_bucket *bucket; struct dst_metrics *m; @@ -206,7 +207,7 @@ void fib6_info_destroy(struct fib6_info *f6i) kfree(f6i); } -EXPORT_SYMBOL_GPL(fib6_info_destroy); +EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { From 4f1beed48ddb1e683065cdb3cefa1e9dd46d93f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 23 Apr 2018 11:32:07 -0700 Subject: [PATCH 0417/1640] UPSTREAM: net/ipv6: Fix missing rcu dereferences on from kbuild test robot reported 2 uses of rt->from not properly accessed using rcu_dereference: 1. add rcu_dereference_protected to rt6_remove_exception_rt and make sure it is always called with rcu lock held. 2. 
change rt6_do_redirect to take a reference on 'from' when it is accessed the first time, so it can be used a second time outside of the lock. Fixes: a68886a69180 ("net/ipv6: Make from in rt6_info rcu protected") Reported-by: kbuild test robot Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 24187cb37449..ec35c2670616 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1545,11 +1545,13 @@ static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, static int rt6_remove_exception_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; - struct fib6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; + struct fib6_info *from; int err; + from = rcu_dereference_protected(rt->from, + lockdep_is_held(&rt6_exception_lock)); if (!from || !(rt->rt6i_flags | RTF_CACHE)) return -EINVAL; @@ -2229,6 +2231,7 @@ static void ip6_link_failure(struct sk_buff *skb) rt = (struct rt6_info *) skb_dst(skb); if (rt) { + rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) rt6_remove_exception_rt(rt); @@ -2236,15 +2239,14 @@ static void ip6_link_failure(struct sk_buff *skb) struct fib6_info *from; struct fib6_node *fn; - rcu_read_lock(); from = rcu_dereference(rt->from); if (from) { fn = rcu_dereference(from->fib6_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) fn->fn_sernum = -1; } - rcu_read_unlock(); } + rcu_read_unlock(); } } @@ -3363,8 +3365,10 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu rcu_read_lock(); from = rcu_dereference(rt->from); - nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); + fib6_info_hold(from); rcu_read_unlock(); + + nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); if (!nrt) goto out; @@ -3378,7 +3382,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * a cached route because rt6_insert_exception() will * takes care of it */ - if (rt6_insert_exception(nrt, rt->from)) { + if (rt6_insert_exception(nrt, from)) { dst_release_immediate(&nrt->dst); goto out; } @@ -3390,6 +3394,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); out: + fib6_info_release(from); neigh_release(neigh); } From a5d74cd8a8d47ef86641ec220286c7a19f03e394 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Sat, 21 Jul 2018 20:56:32 -0700 Subject: [PATCH 0418/1640] BACKPORT: ipv6: use fib6_info_hold_safe() when necessary In the code path where only the rcu read lock is held, e.g. in the route lookup code path, it is not safe to directly call fib6_info_hold(), because the fib6_info may already have been deleted but still be visible during the rcu grace period. Holding a reference to it could cause a double free and crash the kernel. This patch adds a new function, fib6_info_hold_safe(), and replaces fib6_info_hold() in all necessary places, as sketched below. Syzbot reported 3 crash traces because of this.
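Before the first reported trace, the essence of the fix as a simplified sketch (not a hunk from the patch; the wrapper name is hypothetical). Under rcu_read_lock(), a refcount that has already dropped to zero means the entry is being freed, so the hold must fail rather than resurrect it:

        #include <net/flow.h>
        #include <net/ip6_fib.h>

        /* sketch of the try-hold pattern the patch enforces inside ipv6:
         * fib6_info_hold_safe() wraps atomic_inc_not_zero() and refuses
         * once fib6_ref has reached 0
         */
        static struct fib6_info *example_lookup_hold(struct net *net, int oif,
                                                     struct flowi6 *fl6)
        {
                struct fib6_info *f6i;

                rcu_read_lock();
                f6i = fib6_lookup(net, oif, fl6, 0);
                if (f6i && !fib6_info_hold_safe(f6i))
                        f6i = NULL;     /* already dying; treat as not found */
                rcu_read_unlock();

                return f6i;     /* caller pairs a hold with fib6_info_release() */
        }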
One of them is: 8021q: adding VLAN 0 to HW filter on device team0 IPv6: ADDRCONF(NETDEV_CHANGE): team0: link becomes ready dst_release: dst:(____ptrval____) refcnt:-1 dst_release: dst:(____ptrval____) refcnt:-2 WARNING: CPU: 1 PID: 4845 at include/net/dst.h:239 dst_hold include/net/dst.h:239 [inline] WARNING: CPU: 1 PID: 4845 at include/net/dst.h:239 ip6_setup_cork+0xd66/0x1830 net/ipv6/ip6_output.c:1204 dst_release: dst:(____ptrval____) refcnt:-1 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 4845 Comm: syz-executor493 Not tainted 4.18.0-rc3+ #10 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 panic+0x238/0x4e7 kernel/panic.c:184 dst_release: dst:(____ptrval____) refcnt:-2 dst_release: dst:(____ptrval____) refcnt:-3 __warn.cold.8+0x163/0x1ba kernel/panic.c:536 dst_release: dst:(____ptrval____) refcnt:-4 report_bug+0x252/0x2d0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:178 [inline] do_error_trap+0x1fc/0x4d0 arch/x86/kernel/traps.c:296 dst_release: dst:(____ptrval____) refcnt:-5 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:316 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992 RIP: 0010:dst_hold include/net/dst.h:239 [inline] RIP: 0010:ip6_setup_cork+0xd66/0x1830 net/ipv6/ip6_output.c:1204 Code: c1 ed 03 89 9d 18 ff ff ff 48 b8 00 00 00 00 00 fc ff df 41 c6 44 05 00 f8 e9 2d 01 00 00 4c 8b a5 c8 fe ff ff e8 1a f6 e6 fa <0f> 0b e9 6a fc ff ff e8 0e f6 e6 fa 48 8b 85 d0 fe ff ff 48 8d 78 RSP: 0018:ffff8801a8fcf178 EFLAGS: 00010293 RAX: ffff8801a8eba5c0 RBX: 0000000000000000 RCX: ffffffff869511e6 RDX: 0000000000000000 RSI: ffffffff869515b6 RDI: 0000000000000005 RBP: ffff8801a8fcf2c8 R08: ffff8801a8eba5c0 R09: ffffed0035ac8338 R10: ffffed0035ac8338 R11: ffff8801ad6419c3 R12: ffff8801a8fcf720 R13: ffff8801a8fcf6a0 R14: ffff8801ad6419c0 R15: ffff8801ad641980 ip6_make_skb+0x2c8/0x600 net/ipv6/ip6_output.c:1768 udpv6_sendmsg+0x2c90/0x35f0 net/ipv6/udp.c:1376 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:641 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:651 ___sys_sendmsg+0x51d/0x930 net/socket.c:2125 __sys_sendmmsg+0x240/0x6f0 net/socket.c:2220 __do_sys_sendmmsg net/socket.c:2249 [inline] __se_sys_sendmmsg net/socket.c:2246 [inline] __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2246 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x446ba9 Code: e8 cc bb 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fb39a469da8 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 00000000006dcc54 RCX: 0000000000446ba9 RDX: 00000000000000b8 RSI: 0000000020001b00 RDI: 0000000000000003 RBP: 00000000006dcc50 R08: 00007fb39a46a700 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 45c828efc7a64843 R13: e6eeb815b9d8a477 R14: 5068caf6f713c6fc R15: 0000000000000001 Dumping ftrace buffer: (ftrace buffer empty) Kernel Offset: disabled Rebooting in 86400 seconds.. 
Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Reported-by: syzbot+902e2a1bcd4f7808cef5@syzkaller.appspotmail.com Reported-by: syzbot+8ae62d67f647abeeceb9@syzkaller.appspotmail.com Reported-by: syzbot+3f08feb14086930677d0@syzkaller.appspotmail.com Signed-off-by: Wei Wang Acked-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 5 +++++ net/ipv6/addrconf.c | 3 ++- net/ipv6/route.c | 37 +++++++++++++++++++++++++++++-------- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 067d6732ebc6..16195ce9dbc1 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -283,6 +283,11 @@ static inline void fib6_info_hold(struct fib6_info *f6i) atomic_inc(&f6i->fib6_ref); } +static inline bool fib6_info_hold_safe(struct fib6_info *f6i) +{ + return atomic_inc_not_zero(&f6i->fib6_ref); +} + static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && atomic_dec_and_test(&f6i->fib6_ref)) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index bb849fc2b848..9d1078c70de0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2379,7 +2379,8 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, continue; if ((rt->fib6_flags & noflags) != 0) continue; - fib6_info_hold(rt); + if (!fib6_info_hold_safe(rt)) + continue; break; } out: diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ec35c2670616..709a84f6e367 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -969,10 +969,10 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) rt->dst.lastuse = jiffies; } +/* Caller must already hold reference to @from */ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; - fib6_info_hold(from); rcu_assign_pointer(rt->from, from); dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); if (from->fib6_metrics != &dst_default_metrics) { @@ -981,6 +981,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) } } +/* Caller must already hold reference to @ort */ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) { struct net_device *dev = fib6_info_nh_dev(ort); @@ -1041,9 +1042,14 @@ static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) struct net_device *dev = rt->fib6_nh.nh_dev; struct rt6_info *nrt; + if (!fib6_info_hold_safe(rt)) + return NULL; + nrt = ip6_dst_alloc(dev_net(dev), dev, flags); if (nrt) ip6_rt_copy_init(nrt, rt); + else + fib6_info_release(rt); return nrt; } @@ -1175,10 +1181,15 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, * Clone the route. 
*/ + if (!fib6_info_hold_safe(ort)) + return NULL; + dev = ip6_rt_get_dev_rcu(ort); rt = ip6_dst_alloc(dev_net(dev), dev, 0); - if (!rt) + if (!rt) { + fib6_info_release(ort); return NULL; + } ip6_rt_copy_init(rt, ort); rt->rt6i_flags |= RTF_CACHE; @@ -1207,12 +1218,17 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) struct net_device *dev; struct rt6_info *pcpu_rt; + if (!fib6_info_hold_safe(rt)) + return NULL; + rcu_read_lock(); dev = ip6_rt_get_dev_rcu(rt); pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); - if (!pcpu_rt) + if (!pcpu_rt) { + fib6_info_release(rt); return NULL; + } ip6_rt_copy_init(pcpu_rt, rt); pcpu_rt->rt6i_flags |= RTF_PCPU; return pcpu_rt; @@ -2466,7 +2482,7 @@ restart: out: if (ret) - dst_hold(&ret->dst); + ip6_hold_safe(net, &ret, true); else ret = ip6_create_rt_rcu(rt); @@ -3259,7 +3275,8 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) continue; - fib6_info_hold(rt); + if (!fib6_info_hold_safe(rt)) + continue; rcu_read_unlock(); /* if gateway was specified only delete the one hop */ @@ -3365,6 +3382,9 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu rcu_read_lock(); from = rcu_dereference(rt->from); + /* This fib6_info_hold() is safe here because we hold reference to rt + * and rt already holds reference to fib6_info. + */ fib6_info_hold(from); rcu_read_unlock(); @@ -3425,7 +3445,8 @@ static struct fib6_info *rt6_get_route_info(struct net *net, continue; if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) continue; - fib6_info_hold(rt); + if (!fib6_info_hold_safe(rt)) + continue; break; } out: @@ -3485,8 +3506,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) break; } - if (rt) - fib6_info_hold(rt); + if (rt && !fib6_info_hold_safe(rt)) + rt = NULL; rcu_read_unlock(); return rt; } From f139583a283cbfe3ea996a1d3199a98f78dd74c5 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 12 Oct 2018 16:22:47 +0200 Subject: [PATCH 0419/1640] UPSTREAM: ipv6: rate-limit probes for neighbourless routes When commit 270972554c91 ("[IPV6]: ROUTE: Add Router Reachability Probing (RFC4191).") introduced router probing, the rt6_probe() function required that a neighbour entry existed. This neighbour entry is used to record the timestamp of the last probe via the ->updated field. Later, commit 2152caea7196 ("ipv6: Do not depend on rt->n in rt6_probe().") removed the requirement for a neighbour entry. Neighbourless routes skip the interval check and are not rate-limited. This patch adds rate-limiting for neighbourless routes, by recording the timestamp of the last probe in the fib6_info itself. Fixes: 2152caea7196 ("ipv6: Do not depend on rt->n in rt6_probe().") Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 4 ++++ net/ipv6/route.c | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 16195ce9dbc1..af90ab64b97f 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -162,6 +162,10 @@ struct fib6_info { struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; +#ifdef CONFIG_IPV6_ROUTER_PREF + unsigned long last_probe; +#endif + u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 709a84f6e367..603cc06378fd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -514,10 +514,11 @@ static void rt6_probe_deferred(struct work_struct *w) static void rt6_probe(struct fib6_info *rt) { - struct __rt6_probe_work *work; + struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; struct neighbour *neigh; struct net_device *dev; + struct inet6_dev *idev; /* * Okay, this does not seem to be appropriate @@ -533,15 +534,12 @@ static void rt6_probe(struct fib6_info *rt) nh_gw = &rt->fib6_nh.nh_gw; dev = rt->fib6_nh.nh_dev; rcu_read_lock_bh(); + idev = __in6_dev_get(dev); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { - struct inet6_dev *idev; - if (neigh->nud_state & NUD_VALID) goto out; - idev = __in6_dev_get(dev); - work = NULL; write_lock(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, @@ -551,11 +549,13 @@ static void rt6_probe(struct fib6_info *rt) __neigh_set_probe_once(neigh); } write_unlock(&neigh->lock); - } else { + } else if (time_after(jiffies, rt->last_probe + + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } if (work) { + rt->last_probe = jiffies; INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; dev_hold(dev); From d7045aa4c21da6eb43880884f2f6ee8b48969156 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 May 2019 19:39:52 -0700 Subject: [PATCH 0420/1640] UPSTREAM: ipv6: prevent possible fib6 leaks [ Upstream commit 61fb0d01680771f72cc9d39783fb2c122aaad51e ] At ipv6 route dismantle, fib6_drop_pcpu_from() is responsible for finding all percpu routes and setting their ->from pointer to NULL, so that fib6_ref can reach its expected value (1). The problem right now is that other cpus can still catch the route being deleted, since there is no rcu grace period between the route deletion and the call to fib6_drop_pcpu_from(). This can leak the fib6 and associated resources, since no notifier will take care of removing the last reference(s). I decided to add another boolean (fib6_destroying) instead of reusing/renaming exception_bucket_flushed to ease stable backports, and to properly document the memory barriers used to implement this fix. This patch has been co-developed with Wei Wang. Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Wei Wang Cc: David Ahern Cc: Martin Lau Acked-by: Wei Wang Acked-by: Martin KaFai Lau Reviewed-by: David Ahern Signed-off-by: David S.
Miller Signed-off-by: Greg Kroah-Hartman --- include/net/ip6_fib.h | 3 ++- net/ipv6/ip6_fib.c | 12 +++++++++--- net/ipv6/route.c | 7 +++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index af90ab64b97f..791fb63b6fe5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -174,7 +174,8 @@ struct fib6_info { dst_nocount:1, dst_nopolicy:1, dst_host:1, - unused:3; + fib6_destroying:1, + unused:2; struct fib6_nh fib6_nh; struct rcu_head rcu; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 4cb3f1247419..ad21f51d7ec9 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -873,6 +873,12 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, { int cpu; + /* Make sure rt6_make_pcpu_route() wont add other percpu routes + * while we are cleaning them here. + */ + f6i->fib6_destroying = 1; + mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + /* release the reference to this fib entry from * all of its cached pcpu routes */ @@ -898,6 +904,9 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, { struct fib6_table *table = rt->fib6_table; + if (rt->rt6i_pcpu) + fib6_drop_pcpu_from(rt, table); + if (atomic_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, @@ -919,9 +928,6 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } - - if (rt->rt6i_pcpu) - fib6_drop_pcpu_from(rt, table); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 603cc06378fd..0244412181d8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1264,6 +1264,13 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); + if (rt->fib6_destroying) { + struct fib6_info *from; + + from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); + fib6_info_release(from); + } + return pcpu_rt; } From 35c531a0bc0d8e548ea691d86d77899e247055bd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Jan 2022 09:41:12 -0800 Subject: [PATCH 0421/1640] UPSTREAM: ipv6: annotate accesses to fn->fn_sernum commit aafc2e3285c2d7a79b7ee15221c19fbeca7b1509 upstream. struct fib6_node's fn_sernum field can be read while other threads change it. Add READ_ONCE()/WRITE_ONCE() annotations. 
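The annotation pattern, reduced to a hedged sketch with illustrative function names (the real hunks appear below): the writer and the lockless reader are paired so the compiler can neither tear nor refetch the access, and KCSAN treats the race as intentional.

        #include <net/ip6_fib.h>

        /* sketch: fn->fn_sernum is written under tb6_lock but read locklessly */
        static void example_set_sernum(struct fib6_node *fn, int sernum)
        {
                WRITE_ONCE(fn->fn_sernum, sernum);      /* writer side */
        }

        static int example_read_sernum(const struct fib6_node *fn)
        {
                return READ_ONCE(fn->fn_sernum);        /* lockless reader */
        }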
Do not change existing smp barriers in fib6_get_cookie_safe() and __fib6_update_sernum_upto_root() syzbot reported: BUG: KCSAN: data-race in fib6_clean_node / inet6_csk_route_socket write to 0xffff88813df62e2c of 4 bytes by task 1920 on cpu 1: fib6_clean_node+0xc2/0x260 net/ipv6/ip6_fib.c:2178 fib6_walk_continue+0x38e/0x430 net/ipv6/ip6_fib.c:2112 fib6_walk net/ipv6/ip6_fib.c:2160 [inline] fib6_clean_tree net/ipv6/ip6_fib.c:2240 [inline] __fib6_clean_all+0x1a9/0x2e0 net/ipv6/ip6_fib.c:2256 fib6_flush_trees+0x6c/0x80 net/ipv6/ip6_fib.c:2281 rt_genid_bump_ipv6 include/net/net_namespace.h:488 [inline] addrconf_dad_completed+0x57f/0x870 net/ipv6/addrconf.c:4230 addrconf_dad_work+0x908/0x1170 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x1bf/0x1e0 kernel/kthread.c:359 ret_from_fork+0x1f/0x30 read to 0xffff88813df62e2c of 4 bytes by task 15701 on cpu 0: fib6_get_cookie_safe include/net/ip6_fib.h:285 [inline] rt6_get_cookie include/net/ip6_fib.h:306 [inline] ip6_dst_store include/net/ip6_route.h:234 [inline] inet6_csk_route_socket+0x352/0x3c0 net/ipv6/inet6_connection_sock.c:109 inet6_csk_xmit+0x91/0x1e0 net/ipv6/inet6_connection_sock.c:121 __tcp_transmit_skb+0x1323/0x1840 net/ipv4/tcp_output.c:1402 tcp_transmit_skb net/ipv4/tcp_output.c:1420 [inline] tcp_write_xmit+0x1450/0x4460 net/ipv4/tcp_output.c:2680 __tcp_push_pending_frames+0x68/0x1c0 net/ipv4/tcp_output.c:2864 tcp_push+0x2d9/0x2f0 net/ipv4/tcp.c:725 mptcp_push_release net/mptcp/protocol.c:1491 [inline] __mptcp_push_pending+0x46c/0x490 net/mptcp/protocol.c:1578 mptcp_sendmsg+0x9ec/0xa50 net/mptcp/protocol.c:1764 inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:643 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] kernel_sendmsg+0x97/0xd0 net/socket.c:745 sock_no_sendpage+0x84/0xb0 net/core/sock.c:3086 inet_sendpage+0x9d/0xc0 net/ipv4/af_inet.c:834 kernel_sendpage+0x187/0x200 net/socket.c:3492 sock_sendpage+0x5a/0x70 net/socket.c:1007 pipe_to_sendpage+0x128/0x160 fs/splice.c:364 splice_from_pipe_feed fs/splice.c:418 [inline] __splice_from_pipe+0x207/0x500 fs/splice.c:562 splice_from_pipe fs/splice.c:597 [inline] generic_splice_sendpage+0x94/0xd0 fs/splice.c:746 do_splice_from fs/splice.c:767 [inline] direct_splice_actor+0x80/0xa0 fs/splice.c:936 splice_direct_to_actor+0x345/0x650 fs/splice.c:891 do_splice_direct+0x106/0x190 fs/splice.c:979 do_sendfile+0x675/0xc40 fs/read_write.c:1245 __do_sys_sendfile64 fs/read_write.c:1310 [inline] __se_sys_sendfile64 fs/read_write.c:1296 [inline] __x64_sys_sendfile64+0x102/0x140 fs/read_write.c:1296 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000026f -> 0x00000271 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 15701 Comm: syz-executor.2 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 The Fixes tag I chose is probably arbitrary, I do not think we need to backport this patch to older kernels. 
Fixes: c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220120174112.1126644-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- include/net/ip6_fib.h | 2 +- net/ipv6/ip6_fib.c | 23 +++++++++++++---------- net/ipv6/route.c | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 791fb63b6fe5..d5a9edc09638 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -246,7 +246,7 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, fn = rcu_dereference(f6i->fib6_node); if (fn) { - *cookie = fn->fn_sernum; + *cookie = READ_ONCE(fn->fn_sernum); /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */ smp_rmb(); status = true; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ad21f51d7ec9..92914be99e9f 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -112,7 +112,7 @@ void fib6_update_sernum(struct net *net, struct fib6_info *f6i) fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) - fn->fn_sernum = fib6_new_sernum(net); + WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* @@ -540,12 +540,13 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; - cb->args[5] = w->root->fn_sernum; + cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { - if (cb->args[5] != w->root->fn_sernum) { + int sernum = READ_ONCE(w->root->fn_sernum); + if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ - cb->args[5] = w->root->fn_sernum; + cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; @@ -1180,7 +1181,7 @@ static void __fib6_update_sernum_upto_root(struct fib6_info *rt, /* paired with smp_rmb() in rt6_get_cookie_safe() */ smp_wmb(); while (fn) { - fn->fn_sernum = sernum; + WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } @@ -1953,8 +1954,8 @@ static int fib6_clean_node(struct fib6_walker *w) }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && - w->node->fn_sernum != c->sernum) - w->node->fn_sernum = c->sernum; + READ_ONCE(w->node->fn_sernum) != c->sernum) + WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); @@ -2309,7 +2310,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; - iter->sernum = iter->w.root->fn_sernum; + iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } @@ -2337,8 +2338,10 @@ static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { - if (iter->sernum != iter->w.root->fn_sernum) { - iter->sernum = iter->w.root->fn_sernum; + int sernum = READ_ONCE(iter->w.root->fn_sernum); + + if (iter->sernum != sernum) { + iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0244412181d8..a27cb39379e5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2266,7 +2266,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (from) { fn = rcu_dereference(from->fib6_node); if (fn && (rt->rt6i_flags & 
RTF_DEFAULT)) - fn->fn_sernum = -1; + WRITE_ONCE(fn->fn_sernum, -1); } } rcu_read_unlock(); From b9c2710c810c9e9aa4481d1a3436307941175e3a Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Mon, 16 Apr 2018 13:42:16 -0400 Subject: [PATCH 0422/1640] BACKPORT: ipv6: Count interface receive statistics on the ingress netdev The statistics such as InHdrErrors should be counted on the ingress netdev rather than on the dev from the dst, which is the egress. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- include/net/addrconf.h | 14 +++++++++ net/ipv6/exthdrs.c | 55 +++++++++++++-------------------- net/ipv6/ip6_input.c | 2 +- net/ipv6/ip6_output.c | 18 +++++------ net/ipv6/reassembly.c | 4 +-- net/ipv6/route.c | 3 +- net/netfilter/ipvs/ip_vs_xmit.c | 5 +-- 7 files changed, 50 insertions(+), 51 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 607a81a1e961..8aa6c6496dc9 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -330,6 +330,20 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->ip6_ptr); } +/** + * __in6_dev_get_safely - get inet6_dev pointer from netdevice + * @dev: network device + * + * This is a safer version of __in6_dev_get + */ +static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev) +{ + if (likely(dev)) + return rcu_dereference_rtnl(dev->ip6_ptr); + else + return NULL; +} + /** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index abb7e0f41990..6d2429b0376d 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -282,6 +282,7 @@ static const struct tlvtype_proc tlvprocdestopt_lst[] = { static int ipv6_destopt_rcv(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) __u16 dstbuf; @@ -293,7 +294,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + __IP6_INC_STATS(dev_net(dst->dev), idev, IPSTATS_MIB_INHDRERRORS); fail_and_free: kfree_skb(skb); @@ -321,8 +322,7 @@ fail_and_free: return 1; } - __IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return -1; } @@ -418,8 +418,7 @@ looped_back: } if (hdr->segments_left >= (hdr->hdrlen >> 1)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); @@ -458,8 +457,7 @@ looped_back: if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); @@ -483,10 +481,10 @@ looped_back: /* called with rcu_read_lock() */ static int ipv6_rthdr_rcv(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; struct in6_addr daddr; - struct inet6_dev *idev; int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; @@ -500,8 +498,7 @@ static int 
ipv6_rthdr_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -510,8 +507,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || skb->pkt_type != PACKET_HOST) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -529,7 +525,7 @@ looped_back: * processed by own */ if (!addr) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -555,8 +551,7 @@ looped_back: goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -574,8 +569,7 @@ looped_back: n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); @@ -611,14 +605,12 @@ looped_back: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -629,8 +621,7 @@ looped_back: } if (ipv6_addr_is_multicast(addr)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -649,8 +640,7 @@ looped_back: if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); @@ -665,7 +655,7 @@ looped_back: return -1; unknown_rh: - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; @@ -757,34 +747,31 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); struct net *net = ipv6_skb_net(skb); u32 pkt_len; if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", nh[optoff+1]); - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); goto drop; } pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { - 
__IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); return false; } if (ipv6_hdr(skb)->payload_len) { - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INTRUNCATEDPKTS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 9d3bfaec452b..d4dd28afbd6a 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -347,7 +347,7 @@ int ip6_mc_input(struct sk_buff *skb) bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), - ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST, + __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); hdr = ipv6_hdr(skb); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 32438bcd3f77..4282229e6519 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -445,6 +445,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) int ip6_forward(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); @@ -464,8 +465,7 @@ int ip6_forward(struct sk_buff *skb) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -496,8 +496,7 @@ int ip6_forward(struct sk_buff *skb) /* Force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -ETIMEDOUT; @@ -510,15 +509,13 @@ int ip6_forward(struct sk_buff *skb) if (proxied > 0) return ip6_input(skb); else if (proxied < 0) { - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } } if (!xfrm6_route_forward(skb)) { - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } dst = skb_dst(skb); @@ -575,8 +572,7 @@ int ip6_forward(struct sk_buff *skb) /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INTOOBIGERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); @@ -600,7 +596,7 @@ int ip6_forward(struct sk_buff *skb) ip6_forward_finish); error: - __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); drop: kfree_skb(skb); return -EINVAL; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b2f7a335a12b..60dfd0d11851 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -375,7 +375,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) spin_unlock(&fq->q.lock); inet_frag_put(&fq->q); if (prob_offset) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, 
__in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); /* icmpv6_param_prob() calls kfree_skb(skb) */ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset); @@ -388,7 +388,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return -1; fail_hdr: - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); return -1; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a27cb39379e5..d8cc24ed39fb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3637,7 +3637,8 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); if (type == IPV6_ADDR_ANY) { - IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + IP6_INC_STATS(dev_net(dst->dev), + __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INADDRERRORS); break; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 8792cad28e29..f6fec281ba45 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -266,12 +266,13 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, /* check and decrement ttl */ if (ipv6_hdr(skb)->hop_limit <= 1) { + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); + /* Force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return false; } From 7d6e5eb305b3581204159d6b9b8ea1b400176fcd Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:26 -0700 Subject: [PATCH 0423/1640] BACKPORT: bpf: Provide helper to do forwarding lookups in kernel FIB table Provide a helper for doing a FIB and neighbor lookup in the kernel tables from an XDP program. The helper provides a fastpath for forwarding packets. If the packet is a local delivery or for any reason is not a simple lookup and forward, the packet continues up the stack. If it is to be forwarded, the forwarding can be done directly if the neighbor is already known. If the neighbor does not exist, the first few packets go up the stack for neighbor resolution. Once resolved, the xdp program provides the fast path. On successful lookup the nexthop dmac, current device smac and egress device index are returned. The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6 are implemented in this patch. The API includes layer 4 parameters if the XDP program chooses to do deep packet inspection, to allow comparing against ACLs implemented as FIB rules. Header rewrite is left to the XDP program. The lookup takes 2 flags:
- BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes straight to the table associated with the device (expert setting for those looking to maximize throughput)
- BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective. Default is an ingress lookup.
Initial performance numbers collected by Jesper, forwarded packets/sec:

              Full stack   XDP FIB lookup   XDP Direct lookup
      IPv4     1,947,969        7,074,156           7,415,333
      IPv6     1,728,000        6,165,504           7,262,720

These numbers are single CPU core forwarding on a Broadwell E5-1650 v4 @ 3.60GHz.
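For orientation, a minimal hypothetical XDP user of the helper (a sketch, not part of the patch; header availability depends on the build environment, and a real forwarder would parse the packet, fill the address fields, and rewrite MACs before redirecting). Under this patch's API, a positive return value is the egress ifindex and 0 means "continue up the stack":

        #include <linux/bpf.h>
        #include <bpf/bpf_helpers.h>

        #ifndef AF_INET
        #define AF_INET 2       /* from socket headers; defined to keep the sketch self-contained */
        #endif

        SEC("xdp")
        int xdp_fwd_example(struct xdp_md *ctx)
        {
                struct bpf_fib_lookup params = {};
                int rc;

                params.family  = AF_INET;       /* a real program fills ipv4_src/dst,
                                                 * tot_len, etc. from the packet */
                params.ifindex = ctx->ingress_ifindex;

                rc = bpf_fib_lookup(ctx, &params, sizeof(params), 0);
                if (rc > 0)                     /* forwarding path fully resolved */
                        return bpf_redirect(rc, 0);     /* real code copies
                                                         * params.smac/dmac into
                                                         * the frame first */
                return XDP_PASS;                /* rc == 0: let the stack handle it */
        }

        char _license[] SEC("license") = "GPL"; /* the helper is gpl_only */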
Signed-off-by: David Ahern Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 78 ++++++++++++ net/core/filter.c | 267 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 345 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ced5d8725f88..f007f93f4810 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1828,6 +1828,33 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * + * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * Description + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packet is to be + * forwarded, the neighbor tables are searched for the nexthop. + * If successful (ie., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst, + * ipv6_dst or mpls_out based on family, smac is set to mac + * address of egress device, dmac is set to nexthop mac address, + * rt_metric is set to metric from route. + * + * *plen* argument is the size of the passed in struct. + * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags: + * + * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs + * full lookup using FIB rules + * **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress + * perspective (default is ingress) + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. + * + * Return + * Egress device index on success, 0 if packet needs to continue + * up the stack for further processing or a negative error in case + * of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2378,4 +2405,55 @@ struct bpf_raw_tracepoint_args { __u64 args[0]; }; +/* DIRECT: Skip the FIB rules and go to FIB table associated with device + * OUTPUT: Do lookup from egress perspective; default is ingress + */ +#define BPF_FIB_LOOKUP_DIRECT BIT(0) +#define BPF_FIB_LOOKUP_OUTPUT BIT(1) + +struct bpf_fib_lookup { + /* input */ + __u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS */ + + /* set if lookup is to consider L4 data - e.g., FIB rules */ + __u8 l4_protocol; + __be16 sport; + __be16 dport; + + /* total length of packet from network header - used for MTU check */ + __u16 tot_len; + __u32 ifindex; /* L3 device index for lookup */ + + union { + /* inputs to lookup */ + __u8 tos; /* AF_INET */ + __be32 flowlabel; /* AF_INET6 */ + + /* output: metric of fib result */ + __u32 rt_metric; + }; + + union { + __be32 mpls_in; + __be32 ipv4_src; + __u32 ipv6_src[4]; /* in6_addr; network order */ + }; + + /* input to bpf_fib_lookup, *dst is destination address. 
+ * output: bpf_fib_lookup sets to gateway address + */ + union { + /* return for MPLS lookups */ + __be32 mpls_out[4]; /* support up to 4 labels */ + __be32 ipv4_dst; + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index ec5cf41e9392..8467c6250a61 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -60,6 +60,10 @@ #include #include #include +#include +#include +#include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -4046,6 +4050,265 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { }; #endif +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, + const struct neighbour *neigh, + const struct net_device *dev) +{ + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); + params->h_vlan_TCI = 0; + params->h_vlan_proto = 0; + + return dev->ifindex; +} +#endif + +#if IS_ENABLED(CONFIG_INET) +static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags) +{ + struct in_device *in_dev; + struct neighbour *neigh; + struct net_device *dev; + struct fib_result res; + struct fib_nh *nh; + struct flowi4 fl4; + int err; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + /* verify forwarding is enabled on this interface */ + in_dev = __in_dev_get_rcu(dev); + if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) + return 0; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl4.flowi4_iif = 1; + fl4.flowi4_oif = params->ifindex; + } else { + fl4.flowi4_iif = params->ifindex; + fl4.flowi4_oif = 0; + } + fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; + + fl4.flowi4_proto = params->l4_protocol; + fl4.daddr = params->ipv4_dst; + fl4.saddr = params->ipv4_src; + fl4.fl4_sport = params->sport; + fl4.fl4_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; + struct fib_table *tb; + + tb = fib_get_table(net, tbid); + if (unlikely(!tb)) + return 0; + + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); + } else { + fl4.flowi4_mark = 0; + fl4.flowi4_secid = 0; + fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_uid = sock_net_uid(net, NULL); + + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); + } + + if (err || res.type != RTN_UNICAST) + return 0; + + if (res.fi->fib_nhs > 1) + fib_select_path(net, &res, &fl4, NULL); + + nh = &res.fi->fib_nh[res.nh_sel]; + + /* do not handle lwt encaps right now */ + if (nh->nh_lwtstate) + return 0; + + dev = nh->nh_dev; + if (unlikely(!dev)) + return 0; + + if (nh->nh_gw) + params->ipv4_dst = nh->nh_gw; + + params->rt_metric = res.fi->fib_priority; + + /* xdp and cls_bpf programs are run in RCU-bh so + * rcu_read_lock_bh is not needed here + */ + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); + if (neigh) + return bpf_fib_set_fwd_params(params, neigh, dev); + + return 0; +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags) +{ + struct in6_addr *src = (struct in6_addr *) params->ipv6_src; + struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; + struct neighbour *neigh; + struct net_device *dev; + struct inet6_dev *idev; + struct fib6_info *f6i; + struct flowi6 fl6; + int strict = 0; + int oif; + + /* link local addresses are never forwarded */ + if (rt6_need_strict(dst) || rt6_need_strict(src)) + return 0; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + idev = __in6_dev_get_safely(dev); + if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) + return 0; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl6.flowi6_iif = 1; + oif = fl6.flowi6_oif = params->ifindex; + } else { + oif = fl6.flowi6_iif = params->ifindex; + fl6.flowi6_oif = 0; + strict = RT6_LOOKUP_F_HAS_SADDR; + } + fl6.flowlabel = params->flowlabel; + fl6.flowi6_scope = 0; + fl6.flowi6_flags = 0; + fl6.mp_hash = 0; + + fl6.flowi6_proto = params->l4_protocol; + fl6.daddr = *dst; + fl6.saddr = *src; + fl6.fl6_sport = params->sport; + fl6.fl6_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib6_table *tb; + + tb = ipv6_stub->fib6_get_table(net, tbid); + if (unlikely(!tb)) + return 0; + + f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); + } else { + fl6.flowi6_mark = 0; + fl6.flowi6_secid = 0; + fl6.flowi6_tun_key.tun_id = 0; + fl6.flowi6_uid = sock_net_uid(net, NULL); + + f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict); + } + + if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) + return 0; + + if (unlikely(f6i->fib6_flags & RTF_REJECT || + f6i->fib6_type != RTN_UNICAST)) + return 0; + + if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) + f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, + fl6.flowi6_oif, NULL, + strict); + + if (f6i->fib6_nh.nh_lwtstate) + return 0; + + if (f6i->fib6_flags & RTF_GATEWAY) + *dst = f6i->fib6_nh.nh_gw; + + dev = f6i->fib6_nh.nh_dev; + params->rt_metric = f6i->fib6_metric; + + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is + * not needed here. 
Cannot use __ipv6_neigh_lookup_noref here + * because we need to get nd_tbl via the stub + */ + neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, + ndisc_hashfn, dst, dev); + if (neigh) + return bpf_fib_set_fwd_params(params, neigh, dev); + + return 0; +} +#endif + +BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + if (plen < sizeof(*params)) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, + flags); +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, + flags); +#endif + } + return 0; +} + +static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { + .func = bpf_xdp_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + if (plen < sizeof(*params)) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags); +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags); +#endif + } + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { + .func = bpf_skb_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -4199,6 +4462,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif + case BPF_FUNC_fib_lookup: + return &bpf_skb_fib_lookup_proto; default: return bpf_base_func_proto(func_id); } @@ -4224,6 +4489,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; + case BPF_FUNC_fib_lookup: + return &bpf_xdp_fib_lookup_proto; default: return bpf_base_func_proto(func_id); } From 9d381cb5af7be13a3926a5b1eed12bb987804658 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 7 May 2018 10:50:48 -0700 Subject: [PATCH 0424/1640] BACKPORT: bpf: enable stackmap with build_id in nmi context Currently, we cannot parse build_id in nmi context because of up_read(&current->mm->mmap_sem); this makes stackmap with build_id less useful. This patch enables parsing build_id in nmi by putting the up_read() call in irq_work. To avoid memory allocation in nmi context, we use a per-cpu variable for the irq_work. As a result, only one irq_work per cpu is allowed. If the irq_work is in use, we fall back to only reporting ips.
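In outline, the deferral pattern this patch introduces looks like the sketch below. This is illustrative only, not the patch code: the deferred_up_read names are hypothetical stand-ins, while DEFINE_PER_CPU(), init_irq_work(), irq_work_queue() and this_cpu_ptr() are the kernel APIs the patch actually uses (assumes <linux/irq_work.h>, <linux/percpu.h> and <linux/rwsem.h>):

    /* One deferred up_read() slot per CPU; NMI code queues it, IRQ context runs it. */
    struct deferred_up_read {
            struct irq_work irq_work;
            struct rw_semaphore *sem;
    };
    static DEFINE_PER_CPU(struct deferred_up_read, deferred_up_read_slot);

    static void deferred_up_read_fn(struct irq_work *entry)
    {
            struct deferred_up_read *w =
                    container_of(entry, struct deferred_up_read, irq_work);

            up_read(w->sem);        /* safe here: runs in IRQ context, not NMI */
            w->sem = NULL;
    }

    /* Called from NMI context instead of calling up_read(sem) directly. */
    static bool defer_up_read(struct rw_semaphore *sem)
    {
            struct deferred_up_read *w = this_cpu_ptr(&deferred_up_read_slot);

            if (w->irq_work.flags & IRQ_WORK_BUSY)
                    return false;   /* slot in use; caller falls back to ips only */

            w->sem = sem;
            irq_work_queue(&w->irq_work);
            return true;
    }

Each slot's irq_work must be initialized once with init_irq_work(&slot->irq_work, deferred_up_read_fn); in the patch below this is what the stack_map_init() initcall does for the real up_read_work variable.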
Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Peter Zijlstra Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- init/Kconfig | 1 + kernel/bpf/stackmap.c | 59 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index f9efd266f4be..82d5912872c6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1501,6 +1501,7 @@ config EVENTFD config BPF_SYSCALL bool "Enable bpf() system call" select BPF + select IRQ_WORK default n help Enable the bpf() system call that allows to manipulate eBPF diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index cec0f0a5d87b..35ce9851c1cc 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -32,6 +33,23 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; +/* irq_work to run up_read() for build_id lookup in nmi context */ +struct stack_map_irq_work { + struct irq_work irq_work; + struct rw_semaphore *sem; +}; + +static void do_up_read(struct irq_work *entry) +{ + struct stack_map_irq_work *work; + + work = container_of(entry, struct stack_map_irq_work, irq_work); + up_read(work->sem); + work->sem = NULL; +} + +static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); + static inline bool stack_map_use_build_id(struct bpf_map *map) { return (map->map_flags & BPF_F_STACK_BUILD_ID); @@ -273,17 +291,27 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, { int i; struct vm_area_struct *vma; + bool in_nmi_ctx = in_nmi(); + bool irq_work_busy = false; + struct stack_map_irq_work *work; + + if (in_nmi_ctx) { + work = this_cpu_ptr(&up_read_work); + if (work->irq_work.flags & IRQ_WORK_BUSY) + /* cannot queue more up_read, fallback */ + irq_work_busy = true; + } /* - * We cannot do up_read() in nmi context, so build_id lookup is - * only supported for non-nmi events. If at some point, it is - * possible to run find_vma() without taking the semaphore, we - * would like to allow build_id lookup in nmi context. + * We cannot do up_read() in nmi context. To do build_id lookup + * in nmi context, we need to run up_read() in irq_work. We use + * a percpu variable to do the irq_work. If the irq_work is + * already used by another lookup, we fall back to report ips. * * Same fallback is used for kernel stack (!user) on a stackmap * with build_id. 
*/ - if (!user || !current || !current->mm || in_nmi() || + if (!user || !current || !current->mm || irq_work_busy || down_read_trylock(&current->mm->mmap_sem) == 0) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { @@ -305,7 +333,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; } - up_read(&current->mm->mmap_sem); + + if (!in_nmi_ctx) { + up_read(&current->mm->mmap_sem); + } else { + work->sem = &current->mm->mmap_sem; + irq_work_queue(&work->irq_work); + } } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, @@ -581,3 +615,16 @@ const struct bpf_map_ops stack_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, }; + +static int __init stack_map_init(void) +{ + int cpu; + struct stack_map_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&up_read_work, cpu); + init_irq_work(&work->irq_work, do_up_read); + } + return 0; +} +subsys_initcall(stack_map_init); From 8ea0c6120c252bc8b8176b7b5456d9350edf43f0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 14 May 2018 10:00:16 -0700 Subject: [PATCH 0425/1640] UPSTREAM: bpf: sockmap, refactor sockmap routines to work with hashmap This patch only refactors the existing sockmap code. This will allow much of the psock initialization code path and bpf helper code to work for both sockmap bpf map types: the currently supported array-backed type and the new hash-backed map type, sockhash. Most of the fallout comes from three changes: - Pushing bpf programs into an independent structure so we can use it from the htab struct in the next patch. - Generalizing helpers to use void *key instead of the hardcoded u32. - Instead of passing map/key through the metadata, we now do the lookup inline. This avoids storing the key in the metadata, which will be useful when keys can be longer than 4 bytes. We rename the sk pointers to sk_redir at this point as well to avoid any confusion between the current sk pointer and the redirect pointer sk_redir. Signed-off-by: John Fastabend Acked-by: David S.
Miller Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 3 +- include/net/tcp.h | 3 +- kernel/bpf/sockmap.c | 148 ++++++++++++++++++++++++----------------- net/core/filter.c | 31 +++------ 4 files changed, 98 insertions(+), 87 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 1a8d1b7b22c6..bf488c014966 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -595,9 +595,8 @@ struct sk_msg_buff { int sg_end; struct scatterlist sg_data[MAX_SKB_FRAGS]; bool sg_copy[MAX_SKB_FRAGS]; - __u32 key; __u32 flags; - struct bpf_map *map; + struct sock *sk_redir; struct sk_buff *skb; struct list_head list; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index c7a7519666fb..f09047cc016e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -883,9 +883,8 @@ struct tcp_skb_cb { #endif } header; /* For incoming skbs */ struct { - __u32 key; __u32 flags; - struct bpf_map *map; + struct sock *sk_redir; void *data_end; } bpf; }; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 7974bc4c1789..f8431a1e2865 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -48,14 +48,18 @@ #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) -struct bpf_stab { - struct bpf_map map; - struct sock **sock_map; +struct bpf_sock_progs { struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; }; +struct bpf_stab { + struct bpf_map map; + struct sock **sock_map; + struct bpf_sock_progs progs; +}; + enum smap_psock_state { SMAP_TX_RUNNING, }; @@ -461,7 +465,7 @@ static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) { return ((_rc == SK_PASS) ? - (md->map ? __SK_REDIRECT : __SK_PASS) : + (md->sk_redir ? __SK_REDIRECT : __SK_PASS) : __SK_DROP); } @@ -1092,7 +1096,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) * when we orphan the skb so that we don't have the possibility * to reference a stale map. */ - TCP_SKB_CB(skb)->bpf.map = NULL; + TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; bpf_compute_data_end_sk_skb(skb); preempt_disable(); @@ -1102,7 +1106,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) /* Moving return codes from UAPI namespace into internal namespace */ return rc == SK_PASS ? - (TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) : + (TCP_SKB_CB(skb)->bpf.sk_redir ? 
__SK_REDIRECT : __SK_PASS) : __SK_DROP; } @@ -1375,7 +1379,6 @@ static int smap_init_sock(struct smap_psock *psock, } static void smap_init_progs(struct smap_psock *psock, - struct bpf_stab *stab, struct bpf_prog *verdict, struct bpf_prog *parse) { @@ -1453,14 +1456,13 @@ static void smap_gc_work(struct work_struct *w) kfree(psock); } -static struct smap_psock *smap_init_psock(struct sock *sock, - struct bpf_stab *stab) +static struct smap_psock *smap_init_psock(struct sock *sock, int node) { struct smap_psock *psock; psock = kzalloc_node(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN, - stab->map.numa_node); + node); if (!psock) return ERR_PTR(-ENOMEM); @@ -1665,40 +1667,26 @@ out: * - sock_map must use READ_ONCE and (cmp)xchg operations * - BPF verdict/parse programs must use READ_ONCE and xchg operations */ -static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 flags) + +static int __sock_map_ctx_update_elem(struct bpf_map *map, + struct bpf_sock_progs *progs, + struct sock *sock, + struct sock **map_link, + void *key) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct smap_psock_map_entry *e = NULL; struct bpf_prog *verdict, *parse, *tx_msg; - struct sock *osock, *sock; + struct smap_psock_map_entry *e = NULL; struct smap_psock *psock; - u32 i = *(u32 *)key; bool new = false; int err; - if (unlikely(flags > BPF_EXIST)) - return -EINVAL; - - if (unlikely(i >= stab->map.max_entries)) - return -E2BIG; - - sock = READ_ONCE(stab->sock_map[i]); - if (flags == BPF_EXIST && !sock) - return -ENOENT; - else if (flags == BPF_NOEXIST && sock) - return -EEXIST; - - sock = skops->sk; - /* 1. If sock map has BPF programs those will be inherited by the * sock being added. If the sock is already attached to BPF programs * this results in an error. */ - verdict = READ_ONCE(stab->bpf_verdict); - parse = READ_ONCE(stab->bpf_parse); - tx_msg = READ_ONCE(stab->bpf_tx_msg); + verdict = READ_ONCE(progs->bpf_verdict); + parse = READ_ONCE(progs->bpf_parse); + tx_msg = READ_ONCE(progs->bpf_tx_msg); if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation @@ -1706,11 +1694,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, * we increment the refcnt. If this is the case abort with an * error. */ - verdict = bpf_prog_inc_not_zero(stab->bpf_verdict); + verdict = bpf_prog_inc_not_zero(progs->bpf_verdict); if (IS_ERR(verdict)) return PTR_ERR(verdict); - parse = bpf_prog_inc_not_zero(stab->bpf_parse); + parse = bpf_prog_inc_not_zero(progs->bpf_parse); if (IS_ERR(parse)) { bpf_prog_put(verdict); return PTR_ERR(parse); @@ -1718,7 +1706,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } if (tx_msg) { - tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + tx_msg = bpf_prog_inc_not_zero(progs->bpf_tx_msg); if (IS_ERR(tx_msg)) { if (verdict) bpf_prog_put(verdict); @@ -1751,7 +1739,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } } else { - psock = smap_init_psock(sock, stab); + psock = smap_init_psock(sock, map->numa_node); if (IS_ERR(psock)) { err = PTR_ERR(psock); goto out_progs; @@ -1766,7 +1754,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = -ENOMEM; goto out_progs; } - e->entry = &stab->sock_map[i]; /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. 
@@ -1783,7 +1770,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = smap_init_sock(psock, sock); if (err) goto out_free; - smap_init_progs(psock, stab, verdict, parse); + smap_init_progs(psock, verdict, parse); smap_start_sock(psock, sock); } @@ -1792,19 +1779,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, * it with. Because we can only have a single set of programs if * old_sock has a strp we can stop it. */ - list_add_tail(&e->list, &psock->maps); - write_unlock_bh(&sock->sk_callback_lock); - - osock = xchg(&stab->sock_map[i], sock); - if (osock) { - struct smap_psock *opsock = smap_psock_sk(osock); - - write_lock_bh(&osock->sk_callback_lock); - smap_list_remove(opsock, &stab->sock_map[i]); - smap_release_sock(opsock, osock); - write_unlock_bh(&osock->sk_callback_lock); + if (map_link) { + e->entry = map_link; + list_add_tail(&e->list, &psock->maps); } - return 0; + write_unlock_bh(&sock->sk_callback_lock); + return err; out_free: smap_release_sock(psock, sock); out_progs: @@ -1819,23 +1799,69 @@ out_progs: return err; } -int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) +static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_sock_progs *progs = &stab->progs; + struct sock *osock, *sock; + u32 i = *(u32 *)key; + int err; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (unlikely(i >= stab->map.max_entries)) + return -E2BIG; + + sock = READ_ONCE(stab->sock_map[i]); + if (flags == BPF_EXIST && !sock) + return -ENOENT; + else if (flags == BPF_NOEXIST && sock) + return -EEXIST; + + sock = skops->sk; + err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i], + key); + if (err) + goto out; + + osock = xchg(&stab->sock_map[i], sock); + if (osock) { + struct smap_psock *opsock = smap_psock_sk(osock); + + write_lock_bh(&osock->sk_callback_lock); + smap_list_remove(opsock, &stab->sock_map[i]); + smap_release_sock(opsock, osock); + write_unlock_bh(&osock->sk_callback_lock); + } +out: + return 0; +} + +int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) +{ + struct bpf_sock_progs *progs; struct bpf_prog *orig; - if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP)) + if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + progs = &stab->progs; + } else { return -EINVAL; + } switch (type) { case BPF_SK_MSG_VERDICT: - orig = xchg(&stab->bpf_tx_msg, prog); + orig = xchg(&progs->bpf_tx_msg, prog); break; case BPF_SK_SKB_STREAM_PARSER: - orig = xchg(&stab->bpf_parse, prog); + orig = xchg(&progs->bpf_parse, prog); break; case BPF_SK_SKB_STREAM_VERDICT: - orig = xchg(&stab->bpf_verdict, prog); + orig = xchg(&progs->bpf_verdict, prog); break; default: return -EOPNOTSUPP; @@ -1884,16 +1910,18 @@ static int sock_map_update_elem(struct bpf_map *map, static void sock_map_release(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_sock_progs *progs; struct bpf_prog *orig; - orig = xchg(&stab->bpf_parse, NULL); + progs = &stab->progs; + orig = xchg(&progs->bpf_parse, NULL); if (orig) bpf_prog_put(orig); - orig = xchg(&stab->bpf_verdict, NULL); + orig = xchg(&progs->bpf_verdict, NULL); if (orig) bpf_prog_put(orig); - orig = xchg(&stab->bpf_tx_msg, NULL); + orig = xchg(&progs->bpf_tx_msg, NULL); if (orig) bpf_prog_put(orig); } diff --git 
a/net/core/filter.c b/net/core/filter.c index 8467c6250a61..1a575b6cb349 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2095,9 +2095,10 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - tcb->bpf.key = key; tcb->bpf.flags = flags; - tcb->bpf.map = map; + tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; return SK_PASS; } @@ -2105,16 +2106,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct sock *do_sk_redirect_map(struct sk_buff *skb) { struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - struct sock *sk = NULL; - if (tcb->bpf.map) { - sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key); - - tcb->bpf.key = 0; - tcb->bpf.map = NULL; - } - - return sk; + return tcb->bpf.sk_redir; } static const struct bpf_func_proto bpf_sk_redirect_map_proto = { @@ -2134,25 +2127,17 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - msg->key = key; msg->flags = flags; - msg->map = map; + msg->sk_redir = __sock_map_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; return SK_PASS; } struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) { - struct sock *sk = NULL; - - if (msg->map) { - sk = __sock_map_lookup_elem(msg->map, msg->key); - - msg->key = 0; - msg->map = NULL; - } - - return sk; + return msg->sk_redir; } static const struct bpf_func_proto bpf_msg_redirect_map_proto = { From 9d85befb3201b11f067d8c09ee0e49018bc7cf92 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 14 May 2018 10:00:17 -0700 Subject: [PATCH 0426/1640] BACKPORT: bpf: sockmap, add hash map support Sockmap is currently backed by an array and enforces keys to be four bytes. This works well for many use cases and was originally modeled after devmap, which also uses four-byte keys. However, this has become limiting in larger use cases where a hash would be more appropriate. For example, users may want to use the 5-tuple of the socket as the lookup key. To support this, add hash support. Signed-off-by: John Fastabend Acked-by: David S.
Miller Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 8 + include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 49 +++- kernel/bpf/core.c | 1 + kernel/bpf/sockmap.c | 494 ++++++++++++++++++++++++++++++++++++-- kernel/bpf/verifier.c | 14 +- net/core/filter.c | 58 +++++ 7 files changed, 607 insertions(+), 18 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dbb94e4ad302..5d1ec9c4152c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -685,6 +685,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); +struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) @@ -692,6 +693,12 @@ static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) return NULL; } +static inline struct sock *__sock_hash_lookup_elem(struct bpf_map *map, + void *key) +{ + return NULL; +} + static inline int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) @@ -742,6 +749,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; +extern const struct bpf_func_proto bpf_sock_hash_update_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index d7df1b323082..b67f8793de0d 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -47,6 +47,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f007f93f4810..9bf03b0467a6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, + BPF_MAP_TYPE_SOCKHASH, }; enum bpf_prog_type { @@ -1828,7 +1829,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. @@ -1855,6 +1855,53 @@ union bpf_attr { * Egress device index on success, 0 if packet needs to continue * up the stack for further processing or a negative error in case * of failure. + * + * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. 
+ * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a9ddc4c43f19..5e3e6ac4d71e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1848,6 +1848,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; +const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index f8431a1e2865..b13db9894262 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -60,6 +60,28 @@ struct bpf_stab { struct bpf_sock_progs progs; }; +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + +struct bpf_htab { + struct bpf_map map; + struct bucket *buckets; + atomic_t count; + u32 n_buckets; + u32 elem_size; + struct bpf_sock_progs progs; +}; + +struct htab_elem { + struct rcu_head rcu; + struct hlist_node hash_node; + u32 hash; + struct sock *sk; + char key[0]; +}; + enum smap_psock_state { SMAP_TX_RUNNING, }; @@ -67,6 +89,8 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; struct sock **entry; + struct htab_elem *hash_link; + struct bpf_htab *htab; }; struct smap_psock { @@ -195,6 +219,12 @@ out: rcu_read_unlock(); } +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + atomic_dec(&htab->count); + kfree_rcu(l, rcu); +} + static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); @@ -231,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout) } list_for_each_entry_safe(e,
tmp, &psock->maps, list) { - osk = cmpxchg(e->entry, sk, NULL); - if (osk == sk) { - list_del(&e->list); - smap_release_sock(psock, sk); + if (e->entry) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } else { + hlist_del_rcu(&e->hash_link->hash_node); + smap_release_sock(psock, e->hash_link->sk); + free_htab_elem(e->htab, e->hash_link); } } write_unlock_bh(&sk->sk_callback_lock); @@ -1530,12 +1566,14 @@ free_stab: return ERR_PTR(err); } -static void smap_list_remove(struct smap_psock *psock, struct sock **entry) +static void smap_list_remove(struct smap_psock *psock, + struct sock **entry, + struct htab_elem *hash_link) { struct smap_psock_map_entry *e, *tmp; list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) { + if (e->entry == entry || e->hash_link == hash_link) { list_del(&e->list); break; } @@ -1573,7 +1611,7 @@ static void sock_map_free(struct bpf_map *map) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, &stab->sock_map[i]); + smap_list_remove(psock, &stab->sock_map[i], NULL); smap_release_sock(psock, sock); } write_unlock_bh(&sock->sk_callback_lock); @@ -1632,7 +1670,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (psock->bpf_parse) smap_stop_sock(psock, sock); - smap_list_remove(psock, &stab->sock_map[k]); + smap_list_remove(psock, &stab->sock_map[k], NULL); smap_release_sock(psock, sock); out: write_unlock_bh(&sock->sk_callback_lock); @@ -1749,10 +1787,12 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, new = true; } - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) { - err = -ENOMEM; - goto out_progs; + if (map_link) { + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) { + err = -ENOMEM; + goto out_progs; + } } /* 3. 
At this point we have a reference to a valid psock that is @@ -1786,6 +1826,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, write_unlock_bh(&sock->sk_callback_lock); return err; out_free: + kfree(e); smap_release_sock(psock, sock); out_progs: if (verdict) @@ -1832,7 +1873,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - smap_list_remove(opsock, &stab->sock_map[i]); + smap_list_remove(opsock, &stab->sock_map[i], NULL); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); } @@ -1849,6 +1890,10 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) struct bpf_stab *stab = container_of(map, struct bpf_stab, map); progs = &stab->progs; + } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + progs = &htab->progs; } else { return -EINVAL; } @@ -1909,11 +1954,19 @@ static int sock_map_update_elem(struct bpf_map *map, static void sock_map_release(struct bpf_map *map) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_sock_progs *progs; struct bpf_prog *orig; - progs = &stab->progs; + if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + progs = &stab->progs; + } else { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + progs = &htab->progs; + } + orig = xchg(&progs->bpf_parse, NULL); if (orig) bpf_prog_put(orig); @@ -1926,6 +1979,390 @@ static void sock_map_release(struct bpf_map *map) bpf_prog_put(orig); } +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int i, err; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + err = bpf_tcp_ulp_register(); + if (err && err != -EEXIST) + return ERR_PTR(err); + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&htab->map, attr); + + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8); + err = -EINVAL; + if (htab->n_buckets == 0 || + htab->n_buckets > U32_MAX / sizeof(struct bucket)) + goto free_htab; + + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) + goto free_htab; + + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; + + err = -ENOMEM; + htab->buckets = bpf_map_area_alloc( + htab->n_buckets * sizeof(struct bucket), + htab->map.numa_node); + if (!htab->buckets) + goto free_htab; + + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } + + return &htab->map; +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + +static void sock_hash_free(struct bpf_map *map) +{ + struct bpf_htab *htab = 
container_of(map, struct bpf_htab, map); + int i; + + synchronize_rcu(); + + /* At this point no update, lookup or delete operations can happen. + * However, be aware we can still get socket state event updates + * and data ready callbacks that reference the psock from sk_user_data. + * Also psock worker threads are still in-flight. So smap_release_sock + * will only free the psock after cancel_sync on the worker threads + * and a grace period expires to ensure the psock is really safe to remove. + */ + rcu_read_lock(); + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_head *head = select_bucket(htab, i); + struct hlist_node *n; + struct htab_elem *l; + + hlist_for_each_entry_safe(l, n, head, hash_node) { + struct sock *sock = l->sk; + struct smap_psock *psock; + + hlist_del_rcu(&l->hash_node); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + /* This check handles a racing sock event that can get + * the sk_callback_lock before this case but after xchg + * causing the refcnt to hit zero and sock user data + * (psock) to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, NULL, l); + smap_release_sock(psock, sock); + } + write_unlock_bh(&sock->sk_callback_lock); + kfree(l); + } + } + rcu_read_unlock(); + bpf_map_area_free(htab->buckets); + kfree(htab); +} + +static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, + void *key, u32 key_size, u32 hash, + struct sock *sk, + struct htab_elem *old_elem) +{ + struct htab_elem *l_new; + + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + if (!old_elem) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + } + l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); + if (!l_new) + return ERR_PTR(-ENOMEM); + + memcpy(l_new->key, key, key_size); + l_new->sk = sk; + l_new->hash = hash; + return l_new; +} + +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, + u32 hash, void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) { + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + } + + return NULL; +} + +static inline u32 htab_map_hash(const void *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +static int sock_hash_get_next_key(struct bpf_map *map, + void *key, void *next_key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l, *next_l; + struct hlist_head *h; + u32 hash, key_size; + int i = 0; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + if (!key) + goto find_first_elem; + hash = htab_map_hash(key, key_size); + h = select_bucket(htab, hash); + + l = lookup_elem_raw(h, hash, key, key_size); + if (!l) + goto find_first_elem; + next_l = hlist_entry_safe( + rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + struct htab_elem, hash_node); + if (next_l) { + memcpy(next_key, next_l->key, key_size); + return 0; + } + + /* no more elements in this hash list, go to the next bucket */ + i = hash & (htab->n_buckets - 1); + i++; + +find_first_elem: + /* iterate over buckets */ + for (; i < htab->n_buckets; i++) { + h = select_bucket(htab, i); + + /* pick first element in the bucket */ + next_l = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(h)), + struct htab_elem, hash_node); + if (next_l) { + /* if it's not empty, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + } + + /* iterated over all buckets and all elements */ + return
-ENOENT; +} + +static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_sock_progs *progs = &htab->progs; + struct htab_elem *l_new = NULL, *l_old; + struct smap_psock_map_entry *e = NULL; + struct hlist_head *head; + struct smap_psock *psock; + u32 key_size, hash; + struct sock *sock; + struct bucket *b; + int err; + + sock = skops->sk; + + if (sock->sk_type != SOCK_STREAM || + sock->sk_protocol != IPPROTO_TCP) + return -EOPNOTSUPP; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) + return -ENOMEM; + + WARN_ON_ONCE(!rcu_read_lock_held()); + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key); + if (err) + goto err; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_bh(&b->lock); + l_old = lookup_elem_raw(head, hash, key, key_size); + if (l_old && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto bucket_err; + } + if (!l_old && map_flags == BPF_EXIST) { + err = -ENOENT; + goto bucket_err; + } + + l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old); + if (IS_ERR(l_new)) { + err = PTR_ERR(l_new); + goto bucket_err; + } + + psock = smap_psock_sk(sock); + if (unlikely(!psock)) { + err = -EINVAL; + goto bucket_err; + } + + e->hash_link = l_new; + e->htab = container_of(map, struct bpf_htab, map); + list_add_tail(&e->list, &psock->maps); + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + psock = smap_psock_sk(l_old->sk); + + hlist_del_rcu(&l_old->hash_node); + smap_list_remove(psock, NULL, l_old); + smap_release_sock(psock, l_old->sk); + free_htab_elem(htab, l_old); + } + raw_spin_unlock_bh(&b->lock); + return 0; +bucket_err: + raw_spin_unlock_bh(&b->lock); +err: + kfree(e); + psock = smap_psock_sk(sock); + if (psock) + smap_release_sock(psock, sock); + return err; +} + +static int sock_hash_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_sock_ops_kern skops; + u32 fd = *(u32 *)value; + struct socket *socket; + int err; + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + skops.sk = socket->sk; + if (!skops.sk) { + fput(socket->file); + return -EINVAL; + } + + err = sock_hash_ctx_update_elem(&skops, map, key, flags); + fput(socket->file); + return err; +} + +static int sock_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + u32 hash, key_size; + int ret = -ENOENT; + + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, hash, key, key_size); + if (l) { + struct sock *sock = l->sk; + struct smap_psock *psock; + + hlist_del_rcu(&l->hash_node); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. 
+ */ + if (likely(psock)) { + smap_list_remove(psock, NULL, l); + smap_release_sock(psock, sock); + } + write_unlock_bh(&sock->sk_callback_lock); + free_htab_elem(htab, l); + ret = 0; + } + raw_spin_unlock_bh(&b->lock); + return ret; +} + +struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + u32 key_size, hash; + struct bucket *b; + struct sock *sk; + + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, hash, key, key_size); + sk = l ? l->sk : NULL; + raw_spin_unlock_bh(&b->lock); + return sk; +} + const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -1936,6 +2373,15 @@ const struct bpf_map_ops sock_map_ops = { .map_release_uref = sock_map_release, }; +const struct bpf_map_ops sock_hash_ops = { + .map_alloc = sock_hash_alloc, + .map_free = sock_hash_free, + .map_lookup_elem = sock_map_lookup, + .map_get_next_key = sock_hash_get_next_key, + .map_update_elem = sock_hash_update_elem, + .map_delete_elem = sock_hash_delete_elem, +}; + BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { @@ -1953,3 +2399,21 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; + +BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); +} + +const struct bpf_func_proto bpf_sock_hash_update_proto = { + .func = bpf_sock_hash_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a463ae91d3c5..9fd14dee215e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2153,6 +2153,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_map) goto error; break; + case BPF_MAP_TYPE_SOCKHASH: + if (func_id != BPF_FUNC_sk_redirect_hash && + func_id != BPF_FUNC_sock_hash_update && + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_hash) + goto error; + break; default: break; } @@ -2190,11 +2197,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_sk_redirect_map: case BPF_FUNC_msg_redirect_map: + case BPF_FUNC_sock_map_update: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; - case BPF_FUNC_sock_map_update: - if (map->map_type != BPF_MAP_TYPE_SOCKMAP) + case BPF_FUNC_sk_redirect_hash: + case BPF_FUNC_msg_redirect_hash: + case BPF_FUNC_sock_hash_update: + if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; default: diff --git a/net/core/filter.c b/net/core/filter.c index 1a575b6cb349..4bf09bc57978 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2086,6 +2086,33 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + /* If user passes invalid input 
drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + + return SK_PASS; +} + +static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { @@ -2120,6 +2147,31 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, + struct bpf_map *, map, void *, key, u64, flags) +{ + /* If user passes invalid input drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + msg->flags = flags; + msg->sk_redir = __sock_hash_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; + + return SK_PASS; +} + +static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, struct bpf_map *, map, u32, key, u64, flags) { @@ -4520,6 +4572,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; + case BPF_FUNC_sock_hash_update: + return &bpf_sock_hash_update_proto; default: return bpf_base_func_proto(func_id); } @@ -4531,6 +4585,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; + case BPF_FUNC_msg_redirect_hash: + return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: @@ -4562,6 +4618,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; + case BPF_FUNC_sk_redirect_hash: + return &bpf_sk_redirect_hash_proto; default: return bpf_base_func_proto(func_id); } From 25d93a4fb43ad3c3602b33c9ff497ea69003d11a Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 16 May 2018 22:27:41 +0200 Subject: [PATCH 0427/1640] UPSTREAM: bpf: add __printf verification to bpf_verifier_vlog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __printf is useful to verify format and arguments. The ‘bpf_verifier_vlog’ function is used twice in verifier.c; in both cases the caller function already uses the __printf gcc attribute.
Remove the following warning, triggered with W=1: kernel/bpf/verifier.c:176:2: warning: function might be possible candidate for ‘gnu_printf’ format attribute [-Wsuggest-attribute=format] Signed-off-by: Mathieu Malaterre Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cf59ae681941..a8583be65b86 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -218,8 +218,8 @@ struct bpf_verifier_env { u32 subprog_cnt; }; -void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, - va_list args); +__printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, + const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); From 281ead8a55d2b047cd0b9d6942558c2aa04d598e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 16 May 2018 14:06:26 -0700 Subject: [PATCH 0428/1640] UPSTREAM: bpf: fix sock hashmap kmalloc warning syzbot reported a kernel warning below: WARNING: CPU: 0 PID: 4499 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 4499 Comm: syz-executor050 Not tainted 4.17.0-rc3+ #9 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 panic+0x22f/0x4de kernel/panic.c:184 __warn.cold.8+0x163/0x1b3 kernel/panic.c:536 report_bug+0x252/0x2d0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:178 [inline] do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992 RIP: 0010:kmalloc_slab+0x56/0x70 mm/slab_common.c:996 RSP: 0018:ffff8801d907fc58 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff8801aeecb280 RCX: ffffffff8185ebd7 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000ffffffe1 RBP: ffff8801d907fc58 R08: ffff8801adb5e1c0 R09: ffffed0035a84700 R10: ffffed0035a84700 R11: ffff8801ad423803 R12: ffff8801aeecb280 R13: 00000000fffffff4 R14: ffff8801ad891a00 R15: 00000000014200c0 __do_kmalloc mm/slab.c:3713 [inline] __kmalloc+0x25/0x760 mm/slab.c:3727 kmalloc include/linux/slab.h:517 [inline] map_get_next_key+0x24a/0x640 kernel/bpf/syscall.c:858 __do_sys_bpf kernel/bpf/syscall.c:2131 [inline] __se_sys_bpf kernel/bpf/syscall.c:2096 [inline] __x64_sys_bpf+0x354/0x4f0 kernel/bpf/syscall.c:2096 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe The test case is against a sock hashmap with a key size of 0xffffffe1. Such a large key size will cause the below code in function sock_hash_alloc() to overflow and produce a smaller elem_size; hence, map creation will be successful. htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); Later, when map_get_next_key is called and the kernel fails to allocate the key, it will issue the above warning. Similar to hashtab, ensure the key size is at most MAX_BPF_STACK for a successful map creation.
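To make the overflow concrete: with key_size = 0xffffffe1, round_up(key_size, 8) = 0xffffffe8, and adding sizeof(struct htab_elem) wraps the 32-bit elem_size around to a tiny value. A minimal user-space sketch of the same arithmetic (the 48-byte header size is an assumed stand-in for sizeof(struct htab_elem), which depends on the kernel config):

    #include <stdint.h>
    #include <stdio.h>

    /* same as the kernel's round_up() for a power-of-two alignment */
    static uint32_t round_up_u32(uint32_t x, uint32_t a)
    {
            return (x + a - 1) & ~(a - 1);
    }

    int main(void)
    {
            uint32_t key_size = 0xffffffe1;  /* from the syzbot reproducer */
            uint32_t hdr = 48;               /* assumed sizeof(struct htab_elem) */
            uint32_t elem_size = hdr + round_up_u32(key_size, 8);

            /* wraps around: prints 24 instead of ~4GB */
            printf("elem_size = %u\n", elem_size);
            return 0;
    }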
Fixes: 81110384441a ("bpf: sockmap, add hash map support") Reported-by: syzbot+e4566d29080e7f3460ff@syzkaller.appspotmail.com Signed-off-by: Yonghong Song Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index b13db9894262..76c202109083 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1993,6 +1993,12 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + if (attr->key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + return ERR_PTR(-E2BIG); + err = bpf_tcp_ulp_register(); if (err && err != -EEXIST) return ERR_PTR(err); From 9aca8d6c91da625b130049f51ac8d55f064dda3f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 16 May 2018 16:38:14 -0700 Subject: [PATCH 0429/1640] UPSTREAM: bpf: sockmap, on update propagate errors back to userspace When an error happens in the update sockmap element logic also pass the err up to the user. Fixes: e5cd3abcb31a ("bpf: sockmap, refactor sockmap routines to work with hashmap") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 76c202109083..1cf530860c84 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1878,7 +1878,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, write_unlock_bh(&osock->sk_callback_lock); } out: - return 0; + return err; } int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) From a77a4b17220f628a2ede2ce814676d5ab8802a6e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 17 May 2018 09:08:43 -0500 Subject: [PATCH 0430/1640] UPSTREAM: bpf: sockmap, fix uninitialized variable There is a potential execution path in which variable err is returned without being properly initialized previously. Fix this by initializing variable err to 0. Addresses-Coverity-ID: 1468964 ("Uninitialized scalar variable") Fixes: e5cd3abcb31a ("bpf: sockmap, refactor sockmap routines to work with hashmap") Signed-off-by: Gustavo A. R. Silva Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 1cf530860c84..8327798a9cfd 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1716,7 +1716,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, struct smap_psock_map_entry *e = NULL; struct smap_psock *psock; bool new = false; - int err; + int err = 0; /* 1. If sock map has BPF programs those will be inherited by the * sock being added. If the sock is already attached to BPF programs From 13ad1af0953163ae65619d27cd849cc2aab4381e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 17 May 2018 09:11:02 -0500 Subject: [PATCH 0431/1640] UPSTREAM: bpf: sockmap, fix double-free `e' is being freed twice. Fix this by removing one of the kfree() calls. Addresses-Coverity-ID: 1468983 ("Double free") Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: Gustavo A. R. 
Silva Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 8327798a9cfd..fbf810224376 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1826,7 +1826,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, write_unlock_bh(&sock->sk_callback_lock); return err; out_free: - kfree(e); smap_release_sock(psock, sock); out_progs: if (verdict) From 58b5ca182316ea1e7ba6a2a8af6363fe6a63f31b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 17 May 2018 14:06:35 -0700 Subject: [PATCH 0432/1640] UPSTREAM: bpf: sockmap update rollback on error can incorrectly dec prog refcnt If the user were to only attach one of the parse or verdict programs then it is possible a subsequent sockmap update could incorrectly decrement the refcnt on the program. This happens because in the rollback logic, after an error, we have to decrement the program reference count when it has been incremented. However, we only increment the program reference count if the user has both a verdict and a parse program. The reason for this is because, at least at the moment, both are required for any one to be meaningful. The problem fixed here is that in the rollback path we decrement the program refcnt even if only one exists. But we never incremented the refcnt in the first place, creating an imbalance. This patch fixes the error path to handle this case. Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support") Reported-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index fbf810224376..1439dbebea78 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1746,10 +1746,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, if (tx_msg) { tx_msg = bpf_prog_inc_not_zero(progs->bpf_tx_msg); if (IS_ERR(tx_msg)) { - if (verdict) - bpf_prog_put(verdict); - if (parse) + if (parse && verdict) { bpf_prog_put(parse); + bpf_prog_put(verdict); + } return PTR_ERR(tx_msg); } } @@ -1828,10 +1828,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, out_free: smap_release_sock(psock, sock); out_progs: - if (verdict) - bpf_prog_put(verdict); - if (parse) + if (parse && verdict) { bpf_prog_put(parse); + bpf_prog_put(verdict); + } if (tx_msg) bpf_prog_put(tx_msg); write_unlock_bh(&sock->sk_callback_lock); From 69ca700ae3cc018ddb6b6a1c8f5156f47a7c8a68 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 17 May 2018 14:06:40 -0700 Subject: [PATCH 0433/1640] BACKPORT: bpf: parse and verdict prog attach may race with bpf map update In the sockmap design, BPF programs (SK_SKB_STREAM_PARSER, SK_SKB_STREAM_VERDICT and SK_MSG_VERDICT) are attached to the sockmap map type, and when a sock is added to the map the programs are used by the socket. However, sockmap updates from both userspace and BPF programs can happen concurrently with the attach and detach of these programs. To resolve this we use bpf_prog_inc_not_zero and a READ_ONCE() primitive to ensure the program pointer is not refetched and possibly NULL'd before the refcnt increment.
All of this happens inside an RCU critical section, so although the pointer reference in the map object may be NULL'd (by a concurrent detach operation), the reference taken via READ_ONCE() will not be free'd until after the grace period. This ensures the object returned by READ_ONCE() is valid throughout the RCU critical section and safe to use there, even though we "know" it may be free'd shortly afterwards. Daniel spotted a case in the sock update API where, instead of using the READ_ONCE() program reference, we used the pointer from the original map, stab->bpf_{verdict|parse|txmsg}. The problem with this is that the logic checks the object returned from READ_ONCE() for NULL and then tries to take a reference on the object again, but via the above map pointer, which may have already been NULL'd by a parallel detach operation. If this happened, bpf_prog_inc_not_zero() could dereference a NULL pointer. Fix this by using the variable returned by READ_ONCE(), which is the one actually checked for NULL. Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support") Reported-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 1439dbebea78..0df81d447777 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1732,11 +1732,11 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, * we increment the refcnt. If this is the case abort with an * error. */ - verdict = bpf_prog_inc_not_zero(progs->bpf_verdict); + verdict = bpf_prog_inc_not_zero(verdict); if (IS_ERR(verdict)) return PTR_ERR(verdict); - parse = bpf_prog_inc_not_zero(progs->bpf_parse); + parse = bpf_prog_inc_not_zero(parse); if (IS_ERR(parse)) { bpf_prog_put(verdict); return PTR_ERR(parse); @@ -1744,7 +1744,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, } if (tx_msg) { - tx_msg = bpf_prog_inc_not_zero(progs->bpf_tx_msg); + tx_msg = bpf_prog_inc_not_zero(tx_msg); if (IS_ERR(tx_msg)) { if (parse && verdict) { bpf_prog_put(parse); From db21060d45f07e5544616fe9f6c492501a2fa1d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 18 May 2018 14:00:21 +0200 Subject: [PATCH 0434/1640] BACKPORT: xsk: clean up SPDX headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up the SPDX-License-Identifier lines and remove licensing leftovers. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 13 ++----------- include/uapi/linux/if_xdp.h | 13 ++----------- kernel/bpf/xskmap.c | 9 --------- net/xdp/xdp_umem.c | 9 --------- net/xdp/xdp_umem.h | 13 ++----------- net/xdp/xdp_umem_props.h | 13 ++----------- net/xdp/xsk.c | 9 --------- net/xdp/xsk_queue.c | 9 --------- net/xdp/xsk_queue.h | 13 ++----------- 9 files changed, 10 insertions(+), 91 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index ce3a2ab16b8f..ec369686c88f 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * AF_XDP internal functions +/* SPDX-License-Identifier: GPL-2.0 */ +/* AF_XDP internal functions * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation.
- * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _LINUX_XDP_SOCK_H diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index e5091881f776..2c2d12750bed 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -1,17 +1,8 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note - * +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* * if_xdp: XDP socket user-space interface * Copyright(c) 2018 Intel Corporation. * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * * Author(s): Björn Töpel * Magnus Karlsson */ diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index cb3a12137404..b3c557476a8d 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -1,15 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* XSKMAP used for AF_XDP sockets * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 9bac1ad570fa..1a25690f9721 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -1,15 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index c7378a11721f..c205d53ca479 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * XDP user-space packet buffer +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. 
*/ #ifndef XDP_UMEM_H_ diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h index 77fb5daf29f3..2cf8ec485fd2 100644 --- a/net/xdp/xdp_umem_props.h +++ b/net/xdp/xdp_umem_props.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * XDP user-space packet buffer +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef XDP_UMEM_PROPS_H_ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b931a0db5588..5f88fda93219 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -5,15 +5,6 @@ * applications. * Copyright(c) 2018 Intel Corporation. * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * * Author(s): Björn Töpel * Magnus Karlsson */ diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index d012e5e23591..9f605d22dad4 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -1,15 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 0a9b92b4f93a..7ca9ce45c342 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * XDP user-space ring structure +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _LINUX_XSK_QUEUE_H From ec168075ea7bccb685e4b513f5a8bd306706fb74 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 17 May 2018 14:16:58 -0700 Subject: [PATCH 0435/1640] UPSTREAM: bpf: allow sk_msg programs to read sock fields Currently sk_msg programs only have access to the raw data. However, it is often useful when building policies to have the policies specific to the socket endpoint. 
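For illustration, a policy of that shape might look like the sketch below (the SEC() annotation and the AF_INET value 2 follow common libbpf conventions and are assumptions, not part of this patch):

	/* Drop anything that is not IPv4 traffic to local port 80. */
	SEC("sk_msg")
	int msg_policy(struct sk_msg_md *msg)
	{
		if (msg->family != 2 /* AF_INET */)
			return SK_DROP;
		if (msg->local_port != 80)	/* host byte order */
			return SK_DROP;
		return SK_PASS;
	}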
This allows using the socket tuple as input into filters, etc. This patch adds ctx access to the sock fields. Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 8 +++ kernel/bpf/sockmap.c | 1 + net/core/filter.c | 114 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 121 insertions(+), 3 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index bf488c014966..8f1ca945a8f9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -597,6 +597,7 @@ struct sk_msg_buff { bool sg_copy[MAX_SKB_FRAGS]; __u32 flags; struct sock *sk_redir; + struct sock *sk; struct sk_buff *skb; struct list_head list; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9bf03b0467a6..358822001008 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2229,6 +2229,14 @@ enum sk_action { struct sk_msg_md { void *data; void *data_end; + + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ }; #define BPF_TAG_SIZE 8 diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0df81d447777..ed0603f86df2 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -523,6 +523,7 @@ static unsigned int smap_do_tx_msg(struct sock *sk, } bpf_compute_data_pointers_sg(md); + md->sk = sk; rc = (*prog->bpf_func)(md, prog->insnsi); psock->apply_bytes = md->apply_bytes; diff --git a/net/core/filter.c b/net/core/filter.c index 4bf09bc57978..f78c52f68128 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5166,18 +5166,23 @@ static bool sk_msg_is_valid_access(int off, int size, switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; + if (size != sizeof(__u64)) + return false; break; case offsetof(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; + if (size != sizeof(__u64)) + return false; break; + default: + if (size != sizeof(__u32)) + return false; } if (off < 0 || off >= sizeof(struct sk_msg_md)) return false; if (off % size != 0) return false; - if (size != sizeof(__u64)) - return false; return true; } @@ -5853,7 +5858,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, local_ip4): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), @@ -6170,6 +6176,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct sk_msg_md, data): @@ -6182,6 +6189,107 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, data_end)); break; + case offsetof(struct sk_msg_md, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct 
sk_msg_md, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct sk_msg_md, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct sk_msg_md, remote_ip6[0]) ... + offsetof(struct sk_msg_md, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, local_ip6[0]) ... + offsetof(struct sk_msg_md, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, remote_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct sk_msg_md, local_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; } return insn - insn_buf; From 6d121f28f44f6b9b9e924f5418845f8bc419ff2b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 21 May 2018 09:08:15 -0700 Subject: [PATCH 0436/1640] UPSTREAM: bpf: Add mtu checking to FIB forwarding helper Add check that egress MTU can handle packet to be forwarded. If the MTU is less than the packet length, return 0 meaning the packet is expected to continue up the stack for help - eg., fragmenting the packet or sending an ICMP. The XDP path needs to leverage the FIB entry for an MTU on the route spec or an exception entry for a given destination. The skb path lets is_skb_forwardable decide if the packet can be sent. 
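To sketch how an XDP forwarder consumes this behaviour (a simplification; header parsing and the L2 rewrite are elided, and the return-value convention is the one visible in the diff below, where a positive value is the egress ifindex):

	/* With this patch a 0 return also covers "packet exceeds the
	 * egress MTU", so the program simply falls back to the stack.
	 */
	void *data     = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct bpf_fib_lookup params = {};
	int rc;

	params.family  = 2 /* AF_INET */;
	params.ifindex = ctx->ingress_ifindex;
	params.tot_len = data_end - data; /* compared against the route MTU */
	/* ... fill ipv4_src/ipv4_dst from the parsed headers ... */

	rc = bpf_fib_lookup(ctx, &params, sizeof(params), 0);
	if (rc <= 0)
		return XDP_PASS; /* no route, error, or MTU too small */

	/* rewrite MACs from params.smac/params.dmac, then: */
	return bpf_redirect(params.ifindex, 0);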
Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- net/core/filter.c | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index f78c52f68128..e9cacc225437 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4103,7 +4103,7 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, #if IS_ENABLED(CONFIG_INET) static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, - u32 flags) + u32 flags, bool check_mtu) { struct in_device *in_dev; struct neighbour *neigh; @@ -4112,6 +4112,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct fib_nh *nh; struct flowi4 fl4; int err; + u32 mtu; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -4163,6 +4164,12 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (res.fi->fib_nhs > 1) fib_select_path(net, &res, &fl4, NULL); + if (check_mtu) { + mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); + if (params->tot_len > mtu) + return 0; + } + nh = &res.fi->fib_nh[res.nh_sel]; /* do not handle lwt encaps right now */ @@ -4191,7 +4198,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, #if IS_ENABLED(CONFIG_IPV6) static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, - u32 flags) + u32 flags, bool check_mtu) { struct in6_addr *src = (struct in6_addr *) params->ipv6_src; struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; @@ -4202,6 +4209,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct flowi6 fl6; int strict = 0; int oif; + u32 mtu; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) @@ -4264,6 +4272,12 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl6.flowi6_oif, NULL, strict); + if (check_mtu) { + mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); + if (params->tot_len > mtu) + return 0; + } + if (f6i->fib6_nh.nh_lwtstate) return 0; @@ -4296,12 +4310,12 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, #if IS_ENABLED(CONFIG_INET) case AF_INET: return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, - flags); + flags, true); #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, - flags); + flags, true); #endif } return 0; @@ -4320,20 +4334,34 @@ static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { + struct net *net = dev_net(skb->dev); + int index = 0; + if (plen < sizeof(*params)) return -EINVAL; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags); + index = bpf_ipv4_fib_lookup(net, params, flags, false); + break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags); + index = bpf_ipv6_fib_lookup(net, params, flags, false); + break; #endif } - return -ENOTSUPP; + + if (index > 0) { + struct net_device *dev; + + dev = dev_get_by_index_rcu(net, index); + if (!is_skb_forwardable(dev, skb)) + index = 0; + } + + return index; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { From d5bfa9776129d755d9d60b28a1b7159684020fad Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 15:03:31 -0700 
Subject: [PATCH 0437/1640] UPSTREAM: bpf: Expose check_uarg_tail_zero() This patch exposes check_uarg_tail_zero() which will be reused by a later BTF patch. Its name is changed to bpf_check_uarg_tail_zero(). Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 14 +++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5d1ec9c4152c..16a8a63d03b6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -468,6 +468,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_get_file_flag(int flags); +int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size, + size_t actual_size); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1675ed8bd7a3..b639ff2728d3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -66,9 +66,9 @@ static const struct bpf_map_ops * const bpf_map_types[] = { * copy_from_user() call. However, this is not a concern since this function is * meant to be a future-proofing of bits. */ -static int check_uarg_tail_zero(void __user *uaddr, - size_t expected_size, - size_t actual_size) +int bpf_check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) { unsigned char __user *addr; unsigned char __user *end; @@ -1922,7 +1922,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, u32 ulen; int err; - err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -2022,7 +2022,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, u32 info_len = attr->info.info_len; int err; - err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -2063,7 +2063,7 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, u32 info_len = attr->info.info_len; int err; - err = check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); if (err) return err; @@ -2135,7 +2135,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) return -EPERM; - err = check_uarg_tail_zero(uattr, sizeof(attr), size); + err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; size = min_t(u32, size, sizeof(attr)); From bec4802a155bdee778435a5e3d8c438ddb72715d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 14:57:18 -0700 Subject: [PATCH 0438/1640] UPSTREAM: bpf: btf: Change how section is supported in btf_header There are currently unused section descriptions in the btf_header. Those sections are there to support future BTF use cases. For example, the func section (func_off) is meant to support function signatures (e.g. the BPF prog function signature). Instead of spelling out all potential sections up-front in the btf_header, this patch changes btf_header so that extending it (e.g. adding a section) is possible later.
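Concretely, the extension rule can be sketched as follows (a simplification of the btf_parse_hdr() logic in the diff below; uhdr names the user pointer and is illustrative):

	/* A newer userspace may pass a larger header, as long as every
	 * byte the kernel does not know about is zero.
	 */
	const struct btf_header __user *uhdr = btf_data;
	u32 hdr_len, hdr_copy;
	int err;

	if (get_user(hdr_len, &uhdr->hdr_len))
		return -EFAULT;

	/* non-zero bytes beyond sizeof(struct btf_header) -> -E2BIG */
	err = bpf_check_uarg_tail_zero(btf_data, sizeof(struct btf_header),
				       hdr_len);
	if (err)
		return err;

	hdr_copy = min_t(u32, hdr_len, sizeof(struct btf_header));
	if (copy_from_user(&btf->hdr, btf_data, hdr_copy))
		return -EFAULT;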
The unused ones can be removed for now and added back later. This patch: 1. adds a hdr_len to the btf_header. It will allow adding sections (and other info like parent_label and parent_name) later. The check is similar to the existing bpf_attr: if a user passes in a longer hdr_len, the kernel ensures the extra trailing bytes are 0. 2. allows the section order in the BTF object to be different from its sec_off order in btf_header. 3. follows each sec_off with a sec_len; there must be no gaps or overlaps among sections. The string section is guaranteed to be at the end due to the 4-byte alignment requirement of the type section. The above changes allow enough flexibility to add new sections (and other info) to the btf_header later. This patch also removes an unnecessary !err check at the end of btf_parse(). Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 8 +- kernel/bpf/btf.c | 209 +++++++++++++++++++++++++++++---------- 2 files changed, 160 insertions(+), 57 deletions(-) diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index bcb56ee47014..4fa479741a02 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -12,15 +12,11 @@ struct btf_header { __u16 magic; __u8 version; __u8 flags; - - __u32 parent_label; - __u32 parent_name; + __u32 hdr_len; /* All offsets are in bytes relative to the end of this header */ - __u32 label_off; /* offset of label section */ - __u32 object_off; /* offset of data object section*/ - __u32 func_off; /* offset of function section */ __u32 type_off; /* offset of type section */ + __u32 type_len; /* length of type section */ __u32 str_off; /* offset of string section */ __u32 str_len; /* length of string section */ }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ded10ab47b8a..75da6cbae47d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -184,15 +185,13 @@ static DEFINE_IDR(btf_idr); static DEFINE_SPINLOCK(btf_idr_lock); struct btf { - union { - struct btf_header *hdr; - void *data; - }; + void *data; struct btf_type **types; u32 *resolved_ids; u32 *resolved_sizes; const char *strings; void *nohdr_data; + struct btf_header hdr; u32 nr_types; u32 types_size; u32 data_size; @@ -228,6 +227,11 @@ enum resolve_mode { #define MAX_RESOLVE_DEPTH 32 +struct btf_sec_info { + u32 off; + u32 len; +}; + struct btf_verifier_env { struct btf *btf; u8 *visit_states; @@ -418,14 +422,14 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { return !BTF_STR_TBL_ELF_ID(offset) && - BTF_STR_OFFSET(offset) < btf->hdr->str_len; + BTF_STR_OFFSET(offset) < btf->hdr.str_len; } static const char *btf_name_by_offset(const struct btf *btf, u32 offset) { if (!BTF_STR_OFFSET(offset)) return "(anon)"; - else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len) + else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len) return &btf->strings[BTF_STR_OFFSET(offset)]; else return "(invalid-name-offset)"; @@ -536,7 +540,8 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, __btf_verifier_log(log, "\n"); } -static void btf_verifier_log_hdr(struct btf_verifier_env *env) +static void btf_verifier_log_hdr(struct btf_verifier_env *env, + u32 btf_data_size) { struct bpf_verifier_log *log = &env->log; const struct btf *btf = env->btf; @@ -545,19 +550,16 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env) if
(!bpf_verifier_log_needed(log)) return; - hdr = btf->hdr; + hdr = &btf->hdr; __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); __btf_verifier_log(log, "version: %u\n", hdr->version); __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); - __btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label); - __btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name); - __btf_verifier_log(log, "label_off: %u\n", hdr->label_off); - __btf_verifier_log(log, "object_off: %u\n", hdr->object_off); - __btf_verifier_log(log, "func_off: %u\n", hdr->func_off); + __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len); __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); + __btf_verifier_log(log, "type_len: %u\n", hdr->type_len); __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); - __btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size); + __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size); } static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) @@ -1754,9 +1756,9 @@ static int btf_check_all_metas(struct btf_verifier_env *env) struct btf_header *hdr; void *cur, *end; - hdr = btf->hdr; + hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; - end = btf->nohdr_data + hdr->str_off; + end = btf->nohdr_data + hdr->type_len; env->log_type_id = 1; while (cur < end) { @@ -1866,8 +1868,20 @@ static int btf_check_all_types(struct btf_verifier_env *env) static int btf_parse_type_sec(struct btf_verifier_env *env) { + const struct btf_header *hdr = &env->btf->hdr; int err; + /* Type section must align to 4 bytes */ + if (hdr->type_off & (sizeof(u32) - 1)) { + btf_verifier_log(env, "Unaligned type_off"); + return -EINVAL; + } + + if (!hdr->type_len) { + btf_verifier_log(env, "No type found"); + return -EINVAL; + } + err = btf_check_all_metas(env); if (err) return err; @@ -1881,10 +1895,15 @@ static int btf_parse_str_sec(struct btf_verifier_env *env) struct btf *btf = env->btf; const char *start, *end; - hdr = btf->hdr; + hdr = &btf->hdr; start = btf->nohdr_data + hdr->str_off; end = start + hdr->str_len; + if (end != btf->data + btf->data_size) { + btf_verifier_log(env, "String section is not at the end"); + return -EINVAL; + } + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || start[0] || end[-1]) { btf_verifier_log(env, "Invalid string section"); @@ -1896,20 +1915,122 @@ static int btf_parse_str_sec(struct btf_verifier_env *env) return 0; } -static int btf_parse_hdr(struct btf_verifier_env *env) +static const size_t btf_sec_info_offset[] = { + offsetof(struct btf_header, type_off), + offsetof(struct btf_header, str_off), +}; + +static int btf_sec_info_cmp(const void *a, const void *b) +{ + const struct btf_sec_info *x = a; + const struct btf_sec_info *y = b; + + return (int)(x->off - y->off) ? 
: (int)(x->len - y->len); +} + +static int btf_check_sec_info(struct btf_verifier_env *env, + u32 btf_data_size) +{ + const unsigned int nr_secs = ARRAY_SIZE(btf_sec_info_offset); + struct btf_sec_info secs[nr_secs]; + u32 total, expected_total, i; + const struct btf_header *hdr; + const struct btf *btf; + + btf = env->btf; + hdr = &btf->hdr; + + /* Populate the secs from hdr */ + for (i = 0; i < nr_secs; i++) + secs[i] = *(struct btf_sec_info *)((void *)hdr + + btf_sec_info_offset[i]); + + sort(secs, nr_secs, sizeof(struct btf_sec_info), + btf_sec_info_cmp, NULL); + + /* Check for gaps and overlap among sections */ + total = 0; + expected_total = btf_data_size - hdr->hdr_len; + for (i = 0; i < nr_secs; i++) { + if (expected_total < secs[i].off) { + btf_verifier_log(env, "Invalid section offset"); + return -EINVAL; + } + if (total < secs[i].off) { + /* gap */ + btf_verifier_log(env, "Unsupported section found"); + return -EINVAL; + } + if (total > secs[i].off) { + btf_verifier_log(env, "Section overlap found"); + return -EINVAL; + } + if (expected_total - total < secs[i].len) { + btf_verifier_log(env, + "Total section length too long"); + return -EINVAL; + } + total += secs[i].len; + } + + /* There is data other than hdr and known sections */ + if (expected_total != total) { + btf_verifier_log(env, "Unsupported section found"); + return -EINVAL; + } + + return 0; +} + +static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, + u32 btf_data_size) { const struct btf_header *hdr; - struct btf *btf = env->btf; - u32 meta_left; + u32 hdr_len, hdr_copy; + /* + * Minimal part of the "struct btf_header" that + * contains the hdr_len. + */ + struct btf_min_header { + u16 magic; + u8 version; + u8 flags; + u32 hdr_len; + } __user *min_hdr; + struct btf *btf; + int err; - if (btf->data_size < sizeof(*hdr)) { + btf = env->btf; + min_hdr = btf_data; + + if (btf_data_size < sizeof(*min_hdr)) { + btf_verifier_log(env, "hdr_len not found"); + return -EINVAL; + } + + if (get_user(hdr_len, &min_hdr->hdr_len)) + return -EFAULT; + + if (btf_data_size < hdr_len) { btf_verifier_log(env, "btf_header not found"); return -EINVAL; } - btf_verifier_log_hdr(env); + err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); + if (err) { + if (err == -E2BIG) + btf_verifier_log(env, "Unsupported btf_header"); + return err; + } + + hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); + if (copy_from_user(&btf->hdr, btf_data, hdr_copy)) + return -EFAULT; + + hdr = &btf->hdr; + + btf_verifier_log_hdr(env, btf_data_size); - hdr = btf->hdr; if (hdr->magic != BTF_MAGIC) { btf_verifier_log(env, "Invalid magic"); return -EINVAL; @@ -1925,26 +2046,14 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return -ENOTSUPP; } - meta_left = btf->data_size - sizeof(*hdr); - if (!meta_left) { + if (btf_data_size == hdr->hdr_len) { btf_verifier_log(env, "No data"); return -EINVAL; } - if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off || - /* Type section must align to 4 bytes */ - hdr->type_off & (sizeof(u32) - 1)) { - btf_verifier_log(env, "Invalid type_off"); - return -EINVAL; - } - - if (meta_left < hdr->str_off || - meta_left - hdr->str_off < hdr->str_len) { - btf_verifier_log(env, "Invalid str_off or str_len"); - return -EINVAL; - } - - btf->nohdr_data = btf->hdr + 1; + err = btf_check_sec_info(env, btf_data_size); + if (err) + return err; return 0; } @@ -1987,6 +2096,11 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, err = -ENOMEM; goto errout; } + env->btf = 
btf; + + err = btf_parse_hdr(env, btf_data, btf_data_size); + if (err) + goto errout; data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { @@ -1996,18 +2110,13 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, btf->data = data; btf->data_size = btf_data_size; + btf->nohdr_data = btf->data + btf->hdr.hdr_len; if (copy_from_user(data, btf_data, btf_data_size)) { err = -EFAULT; goto errout; } - env->btf = btf; - - err = btf_parse_hdr(env); - if (err) - goto errout; - err = btf_parse_str_sec(env); if (err) goto errout; @@ -2016,16 +2125,14 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, if (err) goto errout; - if (!err && log->level && bpf_verifier_log_full(log)) { + if (log->level && bpf_verifier_log_full(log)) { err = -ENOSPC; goto errout; } - if (!err) { - btf_verifier_env_free(env); - refcount_set(&btf->refcnt, 1); - return btf; - } + btf_verifier_env_free(env); + refcount_set(&btf->refcnt, 1); + return btf; errout: btf_verifier_env_free(env); From 0bda63646e1dca95f83e14740d2886e0e70fde62 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 14:57:19 -0700 Subject: [PATCH 0439/1640] UPSTREAM: bpf: btf: Check array->index_type Instead of ignoring the array->index_type field, enforce that it must be a BTF_KIND_INT of size 1/2/4/8 bytes. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 80 +++++++++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 75da6cbae47d..e388a6598de2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -443,6 +443,28 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) return btf->types[type_id]; } +/* + * Regular int is not a bit field and it must be either + * u8/u16/u32/u64. + */ +static bool btf_type_int_is_regular(const struct btf_type *t) +{ + u16 nr_bits, nr_bytes; + u32 int_data; + + int_data = btf_type_int(t); + nr_bits = BTF_INT_BITS(int_data); + nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + if (BITS_PER_BYTE_MASKED(nr_bits) || + BTF_INT_OFFSET(int_data) || + (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { + return false; + } + + return true; +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { @@ -1308,14 +1330,16 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return -EINVAL; } - /* We are a little forgiving on array->index_type since - * the kernel is not using it. - */ - /* Array elem cannot be in type void, - * so !array->type is not allowed. + /* Array elem type and index type cannot be in type void, + * so !array->type and !array->index_type are not allowed.
*/ if (!array->type || BTF_TYPE_PARENT(array->type)) { - btf_verifier_log_type(env, t, "Invalid type_id"); + btf_verifier_log_type(env, t, "Invalid elem"); + return -EINVAL; + } + + if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) { + btf_verifier_log_type(env, t, "Invalid index"); return -EINVAL; } @@ -1328,11 +1352,32 @@ static int btf_array_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { const struct btf_array *array = btf_type_array(v->t); - const struct btf_type *elem_type; - u32 elem_type_id = array->type; + const struct btf_type *elem_type, *index_type; + u32 elem_type_id, index_type_id; struct btf *btf = env->btf; u32 elem_size; + /* Check array->index_type */ + index_type_id = array->index_type; + index_type = btf_type_by_id(btf, index_type_id); + if (btf_type_is_void_or_null(index_type)) { + btf_verifier_log_type(env, v->t, "Invalid index"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, index_type) && + !env_type_is_resolved(env, index_type_id)) + return env_stack_push(env, index_type, index_type_id); + + index_type = btf_type_id_size(btf, &index_type_id, NULL); + if (!index_type || !btf_type_is_int(index_type) || + !btf_type_int_is_regular(index_type)) { + btf_verifier_log_type(env, v->t, "Invalid index"); + return -EINVAL; + } + + /* Check array->type */ + elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); if (btf_type_is_void_or_null(elem_type)) { btf_verifier_log_type(env, v->t, @@ -1350,22 +1395,9 @@ static int btf_array_resolve(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_is_int(elem_type)) { - int int_type_data = btf_type_int(elem_type); - u16 nr_bits = BTF_INT_BITS(int_type_data); - u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); - - /* Put more restriction on array of int. The int cannot - * be a bit field and it must be either u8/u16/u32/u64. - */ - if (BITS_PER_BYTE_MASKED(nr_bits) || - BTF_INT_OFFSET(int_type_data) || - (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { - btf_verifier_log_type(env, v->t, - "Invalid array of int"); - return -EINVAL; - } + if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) { + btf_verifier_log_type(env, v->t, "Invalid array of int"); + return -EINVAL; } if (array->nelems && elem_size > U32_MAX / array->nelems) { From 377546b8622c1313506449df4c3a37ee8882ce21 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 14:57:20 -0700 Subject: [PATCH 0440/1640] UPSTREAM: bpf: btf: Remove unused bits from uapi/linux/btf.h This patch does the following: 1. Limit BTF_MAX_TYPES and BTF_MAX_NAME_OFFSET to 64k. We can raise it later. 2. Remove BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID. They are currently encoded at the highest bit of a u32; they can go because the current use case does not require supporting a parent type (i.e. a type_id referring to a type in another BTF file), nor does it support referring to a string in ELF. The BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID checks are replaced by BTF_TYPE_ID_CHECK and BTF_STR_OFFSET_CHECK which are defined in btf.c instead of uapi/linux/btf.h. 3. Limit BTF_INFO_KIND from 5 bits to 4 bits, which is enough. There is unused bit headroom if we ever need it later. 4. The root bit in BTF_INFO is also removed because it is not used in the current use case. 5. Remove BTF_INT_VARARGS since the func type is not supported now. The BTF_INT_ENCODING is limited to 4 bits instead of 8 bits.
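As a concrete aside, decoding under the tightened layout looks like this (a worked example using the macros from this patch; BTF_KIND_INT's numeric value comes from the existing uapi header and is assumed here):

	__u32 info = (BTF_KIND_INT << 24) | 0;	/* kind in bits 24-27, vlen 0 */
	__u8  kind = BTF_INFO_KIND(info);	/* ((info) >> 24) & 0x0f -> BTF_KIND_INT */
	__u16 vlen = BTF_INFO_VLEN(info);	/* (info) & 0xffff -> 0 */

	/* descriptor u32 for a plain signed 32-bit int */
	__u32 int_data = (BTF_INT_SIGNED << 24) | (0 << 16) | 32;
	__u8  enc  = BTF_INT_ENCODING(int_data);	/* BTF_INT_SIGNED */
	__u8  off  = BTF_INT_OFFSET(int_data);		/* 0 */
	__u16 bits = BTF_INT_BITS(int_data);		/* 32 */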
The above can be added back later because the verifier ensures the unused bits are zeros. Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 29 +++++++--------------- kernel/bpf/btf.c | 52 +++++++++++++++++++++++++++------------- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 4fa479741a02..0b5ddbe135a4 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -22,28 +22,19 @@ struct btf_header { }; /* Max # of type identifier */ -#define BTF_MAX_TYPE 0x7fffffff +#define BTF_MAX_TYPE 0x0000ffff /* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x7fffffff +#define BTF_MAX_NAME_OFFSET 0x0000ffff /* Max # of struct/union/enum members or func args */ #define BTF_MAX_VLEN 0xffff -/* The type id is referring to a parent BTF */ -#define BTF_TYPE_PARENT(id) (((id) >> 31) & 0x1) -#define BTF_TYPE_ID(id) ((id) & BTF_MAX_TYPE) - -/* String is in the ELF string section */ -#define BTF_STR_TBL_ELF_ID(ref) (((ref) >> 31) & 0x1) -#define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) - struct btf_type { __u32 name_off; /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused - * bits 24-28: kind (e.g. int, ptr, array...etc) - * bits 29-30: unused - * bits 31: root + * bits 24-27: kind (e.g. int, ptr, array...etc) + * bits 28-31: unused */ __u32 info; /* "size" is used by INT, ENUM, STRUCT and UNION. @@ -58,8 +49,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) -#define BTF_INFO_ISROOT(info) (!!(((info) >> 24) & 0x80)) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_KIND_UNKN 0 /* Unknown */ @@ -84,15 +74,14 @@ struct btf_type { /* BTF_KIND_INT is followed by a u32 and the following * is the 32 bits arrangement: */ -#define BTF_INT_ENCODING(VAL) (((VAL) & 0xff000000) >> 24) +#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) #define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) /* Attributes stored in the BTF_INT_ENCODING */ -#define BTF_INT_SIGNED 0x1 -#define BTF_INT_CHAR 0x2 -#define BTF_INT_BOOL 0x4 -#define BTF_INT_VARARGS 0x8 +#define BTF_INT_SIGNED (1 << 0) +#define BTF_INT_CHAR (1 << 1) +#define BTF_INT_BOOL (1 << 2) /* BTF_KIND_ENUM is followed by multiple "struct btf_enum". * The exact number of btf_enum is stored in the vlen (of the diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e388a6598de2..9cbeabb5aca3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -163,13 +163,16 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) +#define BTF_INFO_MASK 0x0f00ffff +#define BTF_INT_MASK 0x0fffffff +#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) +#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) + /* 16MB for 64k structs and each has 16 members and * a few MB spaces for the string section. * The hard limit is S32_MAX. */ #define BTF_MAX_SIZE (16 * 1024 * 1024) -/* 64k. We can raise it later. The hard limit is S32_MAX. 
*/ -#define BTF_MAX_NR_TYPES 65535 #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ @@ -383,8 +386,6 @@ static const char *btf_int_encoding_str(u8 encoding) return "CHAR"; else if (encoding == BTF_INT_BOOL) return "BOOL"; - else if (encoding == BTF_INT_VARARGS) - return "VARARGS"; else return "UNKN"; } @@ -421,16 +422,16 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { - return !BTF_STR_TBL_ELF_ID(offset) && - BTF_STR_OFFSET(offset) < btf->hdr.str_len; + return BTF_STR_OFFSET_VALID(offset) && + offset < btf->hdr.str_len; } static const char *btf_name_by_offset(const struct btf *btf, u32 offset) { - if (!BTF_STR_OFFSET(offset)) + if (!offset) return "(anon)"; - else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len) - return &btf->strings[BTF_STR_OFFSET(offset)]; + else if (offset < btf->hdr.str_len) + return &btf->strings[offset]; else return "(invalid-name-offset)"; } @@ -598,13 +599,13 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) struct btf_type **new_types; u32 expand_by, new_size; - if (btf->types_size == BTF_MAX_NR_TYPES) { + if (btf->types_size == BTF_MAX_TYPE) { btf_verifier_log(env, "Exceeded max num of types"); return -E2BIG; } expand_by = max_t(u32, btf->types_size >> 2, 16); - new_size = min_t(u32, BTF_MAX_NR_TYPES, + new_size = min_t(u32, BTF_MAX_TYPE, btf->types_size + expand_by); new_types = kvzalloc(new_size * sizeof(*new_types), @@ -934,6 +935,12 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, } int_data = btf_type_int(t); + if (int_data & ~BTF_INT_MASK) { + btf_verifier_log_basic(env, t, "Invalid int_data:%x", + int_data); + return -EINVAL; + } + nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); if (nr_bits > BITS_PER_U64) { @@ -947,12 +954,17 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* + * Only one of the encoding bits is allowed and it + * should be sufficient for the pretty print purpose (i.e. decoding). + * Multiple bits can be allowed later if it is found + * to be insufficient. + */ encoding = BTF_INT_ENCODING(int_data); if (encoding && encoding != BTF_INT_SIGNED && encoding != BTF_INT_CHAR && - encoding != BTF_INT_BOOL && - encoding != BTF_INT_VARARGS) { + encoding != BTF_INT_BOOL) { btf_verifier_log_type(env, t, "Unsupported encoding"); return -ENOTSUPP; } @@ -1126,7 +1138,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (BTF_TYPE_PARENT(t->type)) { + if (!BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; } @@ -1333,12 +1345,12 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, /* Array elem type and index type cannot be in type void, * so !array->type and !array->index_type are not allowed. 
*/ - if (!array->type || BTF_TYPE_PARENT(array->type)) { + if (!array->type || !BTF_TYPE_ID_VALID(array->type)) { btf_verifier_log_type(env, t, "Invalid elem"); return -EINVAL; } - if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) { + if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) { btf_verifier_log_type(env, t, "Invalid index"); return -EINVAL; } @@ -1507,7 +1519,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, } /* A member cannot be in type void */ - if (!member->type || BTF_TYPE_PARENT(member->type)) { + if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { btf_verifier_log_member(env, t, member, "Invalid type_id"); return -EINVAL; @@ -1760,6 +1772,12 @@ static s32 btf_check_meta(struct btf_verifier_env *env, } meta_left -= sizeof(*t); + if (t->info & ~BTF_INFO_MASK) { + btf_verifier_log(env, "[%u] Invalid btf_info:%x", + env->log_type_id, t->info); + return -EINVAL; + } + if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { btf_verifier_log(env, "[%u] Invalid kind:%u", From 7f82a95bf2a5619c886cc192fd220fef5e68451d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 14:57:21 -0700 Subject: [PATCH 0441/1640] UPSTREAM: bpf: btf: Rename btf_key_id and btf_value_id in bpf_map_info In "struct bpf_map_info", the name "btf_id", "btf_key_id" and "btf_value_id" could cause confusion because the "id" of "btf_id" means the BPF obj id given to the BTF object while "btf_key_id" and "btf_value_id" means the BTF type id within that BTF object. To make it clear, btf_key_id and btf_value_id are renamed to btf_key_type_id and btf_value_type_id. Suggested-by: Daniel Borkmann Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 4 ++-- include/uapi/linux/bpf.h | 8 ++++---- kernel/bpf/arraymap.c | 2 +- kernel/bpf/syscall.c | 18 +++++++++--------- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 16a8a63d03b6..42b29e54b8f5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -74,8 +74,8 @@ struct bpf_map { u32 pages; u32 id; int numa_node; - u32 btf_key_id; - u32 btf_value_id; + u32 btf_key_type_id; + u32 btf_value_type_id; struct btf *btf; bool unpriv_array; /* 55 bytes hole */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 358822001008..31367bec76ca 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -284,8 +284,8 @@ union bpf_attr { char map_name[BPF_OBJ_NAME_LEN]; __u32 map_ifindex; /* ifindex of netdev to create on */ __u32 btf_fd; /* fd pointing to a BTF type data */ - __u32 btf_key_id; /* BTF type_id of the key */ - __u32 btf_value_id; /* BTF type_id of the value */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -2272,8 +2272,8 @@ struct bpf_map_info { __u64 netns_dev; __u64 netns_ino; __u32 btf_id; - __u32 btf_key_id; - __u32 btf_value_id; + __u32 btf_key_type_id; + __u32 btf_value_type_id; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7fa9205844d7..0bd1006f6063 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -352,7 +352,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, } seq_printf(m, "%u: ", *(u32 *)key); - btf_type_seq_show(map->btf, map->btf_value_id, value, m); + 
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_puts(m, "\n"); rcu_read_unlock(); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b639ff2728d3..7a00dceb1ac6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -423,7 +423,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD btf_value_id +#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -458,10 +458,10 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->usercnt, 1); if (bpf_map_support_seq_show(map) && - (attr->btf_key_id || attr->btf_value_id)) { + (attr->btf_key_type_id || attr->btf_value_type_id)) { struct btf *btf; - if (!attr->btf_key_id || !attr->btf_value_id) { + if (!attr->btf_key_type_id || !attr->btf_value_type_id) { err = -EINVAL; goto free_map_nouncharge; } @@ -472,16 +472,16 @@ static int map_create(union bpf_attr *attr) goto free_map_nouncharge; } - err = map->ops->map_check_btf(map, btf, attr->btf_key_id, - attr->btf_value_id); + err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); if (err) { btf_put(btf); goto free_map_nouncharge; } map->btf = btf; - map->btf_key_id = attr->btf_key_id; - map->btf_value_id = attr->btf_value_id; + map->btf_key_type_id = attr->btf_key_type_id; + map->btf_value_type_id = attr->btf_value_type_id; } err = security_bpf_map_alloc(map); @@ -2038,8 +2038,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, if (map->btf) { info.btf_id = btf_id(map->btf); - info.btf_key_id = map->btf_key_id; - info.btf_value_id = map->btf_value_id; + info.btf_key_type_id = map->btf_key_type_id; + info.btf_value_type_id = map->btf_value_type_id; } if (bpf_map_is_dev_bound(map)) { From df5891fe935d2fa45d58056fc78007be1f323ccf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 21 May 2018 19:22:29 -0700 Subject: [PATCH 0442/1640] UPSTREAM: umh: introduce fork_usermode_blob() helper Introduce helper: int fork_usermode_blob(void *data, size_t len, struct umh_info *info); struct umh_info { struct file *pipe_to_umh; struct file *pipe_from_umh; pid_t pid; }; that GPLed kernel modules (signed or unsigned) can use to execute part of their own data as a swappable user mode process. The kernel will do: - allocate a unique file in tmpfs - populate that file with [data, data + len] bytes - user-mode-helper code will do_execve that file and, before the process starts, the kernel will create two unix pipes for bidirectional communication between kernel module and umh - close tmpfs file, effectively deleting it - the fork_usermode_blob will return zero on success and populate 'struct umh_info' with two unix pipes and the pid of the user process As the first step in the development of the bpfilter project the fork_usermode_blob() helper is introduced to allow user mode code to be invoked from a kernel module. The idea is that the user mode code plus the normal kernel module code are built as part of the kernel build and installed as a traditional kernel module into the distro-specified location, such that from a distribution point of view there is no difference between regular kernel modules and kernel modules + umh code. Such modules can be signed, modprobed, rmmod'd, etc. The use of this new helper by a kernel module doesn't make it special in any way from a kernel and user space tooling point of view.
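A sketch of module-side usage (the _binary_* style symbol names follow the objcopy-embedding scheme the bpfilter patch below uses and are assumptions here, not part of this patch):

	extern char bpfilter_umh_start;
	extern char bpfilter_umh_end;

	static struct umh_info info;

	static int __init load_umh(void)
	{
		int err;

		err = fork_usermode_blob(&bpfilter_umh_start,
					 &bpfilter_umh_end - &bpfilter_umh_start,
					 &info);
		if (err)
			return err;
		/* info.pid, info.pipe_to_umh and info.pipe_from_umh are now
		 * valid; the module owns the umh and must kill it and close
		 * the pipes when it is no longer needed.
		 */
		return 0;
	}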
Such an approach enables the kernel to delegate functionality traditionally done by kernel modules into user space processes (either root or !root) and reduces the security attack surface of the new code. Buggy umh code would crash the user process, but not the kernel. Another advantage is that the umh code of the kernel module can be debugged and tested out of user space (e.g. opening the possibility to run clang sanitizers, fuzzers or user space test suites on the umh code). In the case of the bpfilter project, such an architecture allows the complex control plane to live in user space while the bpf-based data plane stays in the kernel. Since a umh can crash, be oom-ed by the kernel, or be killed by the admin, the kernel module that uses it (like bpfilter) needs to manage the lifetime of the umh on its own via the two unix pipes and the pid of the umh. The exit code of such a kernel module should kill the umh it started, so that rmmod of the kernel module will clean up the corresponding umh; just like a kernel module that does kmalloc() should kfree() in its exit code. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- fs/exec.c | 38 +++++++++--- include/linux/binfmts.h | 1 + include/linux/umh.h | 12 ++++ kernel/umh.c | 125 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 164 insertions(+), 12 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index a16a118d9f4b..bb0b9ed6f037 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1863,14 +1863,13 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execveat_common(int fd, struct filename *filename, - struct user_arg_ptr argv, - struct user_arg_ptr envp, - int flags) +static int __do_execve_file(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags, struct file *file) { char *pathbuf = NULL; struct linux_binprm *bprm; - struct file *file; struct files_struct *displaced; int retval; @@ -1909,7 +1908,8 @@ static int do_execveat_common(int fd, struct filename *filename, check_unsafe_exec(bprm); current->in_execve = 1; - file = do_open_execat(fd, filename, flags); + if (!file) + file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1925,7 +1925,9 @@ static int do_execveat_common(int fd, struct filename *filename, sched_exec(); bprm->file = file; - if (fd == AT_FDCWD || filename->name[0] == '/') { + if (!filename) { + bprm->filename = "none"; + } else if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') @@ -2006,7 +2008,8 @@ static int do_execveat_common(int fd, struct filename *filename, task_numa_free(current, false); free_bprm(bprm); kfree(pathbuf); - putname(filename); + if (filename) + putname(filename); if (displaced) put_files_struct(displaced); return retval; @@ -2029,10 +2032,27 @@ out_files: if (displaced) reset_files_struct(displaced); out_ret: - putname(filename); + if (filename) + putname(filename); return retval; } +static int do_execveat_common(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags) +{ + return __do_execve_file(fd, filename, argv, envp, flags, NULL); +} + +int do_execve_file(struct file *file, void *__argv, void *__envp) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + + return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file); +} + int do_execve(struct filename *filename, const
char __user *const __user *__argv, const char __user *const __user *__envp) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index b0abe21d6cc9..c783a7b9f284 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -147,5 +147,6 @@ extern int do_execveat(int, struct filename *, const char __user * const __user *, const char __user * const __user *, int); +int do_execve_file(struct file *file, void *__argv, void *__envp); #endif /* _LINUX_BINFMTS_H */ diff --git a/include/linux/umh.h b/include/linux/umh.h index 244aff638220..5c812acbb80a 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -22,8 +22,10 @@ struct subprocess_info { const char *path; char **argv; char **envp; + struct file *file; int wait; int retval; + pid_t pid; int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; @@ -38,6 +40,16 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); +struct subprocess_info *call_usermodehelper_setup_file(struct file *file, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *), void *data); +struct umh_info { + struct file *pipe_to_umh; + struct file *pipe_from_umh; + pid_t pid; +}; +int fork_usermode_blob(void *data, size_t len, struct umh_info *info); + extern int call_usermodehelper_exec(struct subprocess_info *info, int wait); diff --git a/kernel/umh.c b/kernel/umh.c index 9c41f05f969b..f520f1800ebc 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include @@ -106,9 +108,13 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); + if (sub_info->file) + retval = do_execve_file(sub_info->file, + sub_info->argv, sub_info->envp); + else + retval = do_execve(getname_kernel(sub_info->path), + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); out: sub_info->retval = retval; /* @@ -194,6 +200,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work) if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); + } else { + sub_info->pid = pid; } } } @@ -402,6 +410,117 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, } EXPORT_SYMBOL(call_usermodehelper_setup); +struct subprocess_info *call_usermodehelper_setup_file(struct file *file, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *info), void *data) +{ + struct subprocess_info *sub_info; + + sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL); + if (!sub_info) + return NULL; + + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); + sub_info->path = "none"; + sub_info->file = file; + sub_info->init = init; + sub_info->cleanup = cleanup; + sub_info->data = data; + return sub_info; +} + +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +{ + struct umh_info *umh_info = info->data; + struct file *from_umh[2]; + struct file *to_umh[2]; + int err; + + /* create pipe to send data to umh */ + err = create_pipe_files(to_umh, 0); + if (err) + return err; + err = replace_fd(0, to_umh[0], 0); + fput(to_umh[0]); + if (err < 0) { + 
fput(to_umh[1]); + return err; + } + + /* create pipe to receive data from umh */ + err = create_pipe_files(from_umh, 0); + if (err) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + return err; + } + err = replace_fd(1, from_umh[1], 0); + fput(from_umh[1]); + if (err < 0) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + fput(from_umh[0]); + return err; + } + + umh_info->pipe_to_umh = to_umh[1]; + umh_info->pipe_from_umh = from_umh[0]; + return 0; +} + +static void umh_save_pid(struct subprocess_info *info) +{ + struct umh_info *umh_info = info->data; + + umh_info->pid = info->pid; +} + +/** + * fork_usermode_blob - fork a blob of bytes as a usermode process + * @data: a blob of bytes that can be do_execv-ed as a file + * @len: length of the blob + * @info: information about usermode process (shouldn't be NULL) + * + * Returns either negative error or zero which indicates success + * in executing a blob of bytes as a usermode process. In such + * case 'struct umh_info *info' is populated with two pipes + * and a pid of the process. The caller is responsible for health + * check of the user process, killing it via pid, and closing the + * pipes when user process is no longer needed. + */ +int fork_usermode_blob(void *data, size_t len, struct umh_info *info) +{ + struct subprocess_info *sub_info; + struct file *file; + ssize_t written; + loff_t pos = 0; + int err; + + file = shmem_kernel_file_setup("", len, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + written = kernel_write(file, data, len, &pos); + if (written != len) { + err = written; + if (err >= 0) + err = -ENOMEM; + goto out; + } + + err = -ENOMEM; + sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, + umh_save_pid, info); + if (!sub_info) + goto out; + + err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); +out: + fput(file); + return err; +} +EXPORT_SYMBOL_GPL(fork_usermode_blob); + /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocess

From a3d80bdd10f22cc65a17e8488f21e17f68ac7d0d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 21 May 2018 19:22:30 -0700
Subject: [PATCH 0443/1640] UPSTREAM: net: add skeleton of bpfilter kernel module

bpfilter.ko consists of bpfilter_kern.c (normal kernel module code) and user mode helper code that is embedded into bpfilter.ko

The steps to build bpfilter.ko are the following:
- main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
- with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file is converted into the bpfilter_umh.o object file with _binary_net_bpfilter_bpfilter_umh_start and _end symbols Example:
$ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
0000000000004cf8 T _binary_net_bpfilter_bpfilter_umh_end
0000000000004cf8 A _binary_net_bpfilter_bpfilter_umh_size
0000000000000000 T _binary_net_bpfilter_bpfilter_umh_start
- bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko

bpfilter_kern.c is normal kernel module code that calls the fork_usermode_blob() helper to execute part of its own data as a user mode process.

Notice that the _binary_net_bpfilter_bpfilter_umh_start - end blob is placed into the .init.rodata section, so it's freed as soon as the __init function of bpfilter.ko has finished.

As part of __init, bpfilter.ko performs a first request/reply round trip via the two unix pipes provided by the fork_usermode_blob() helper to make sure that the umh is healthy. If it is not, the module kills it via its pid.
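Because the protocol is just fixed-size structs over stdin/stdout, the umh binary can also be exercised entirely from user space. A throwaway harness along these lines (illustrative only; the struct layout mirrors msgfmt.h below, and the harness is assumed to be wired to bpfilter_umh through a pair of pipes or FIFOs) sends the same cmd == 0 health check:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    struct mbox_request { uint64_t addr; uint32_t len; uint32_t is_set; uint32_t cmd; uint32_t pid; };
    struct mbox_reply { uint32_t status; };

    int main(void)
    {
    	struct mbox_request req;
    	struct mbox_reply reply;

    	memset(&req, 0, sizeof(req)); /* cmd == 0 is the health check */
    	if (write(1, &req, sizeof(req)) != (ssize_t)sizeof(req))
    		return 1;
    	if (read(0, &reply, sizeof(reply)) != (ssize_t)sizeof(reply))
    		return 1;
    	fprintf(stderr, "status %d\n", (int)reply.status);
    	return reply.status != 0;
    }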
Later, bpfilter_process_sockopt() will be called from the bpfilter hooks in get/setsockopt() to pass iptables commands into the umh via bpfilter.ko. If the admin does 'rmmod bpfilter', the __exit code of bpfilter.ko will kill the umh as well.

Signed-off-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 include/linux/bpfilter.h | 15 +++++ include/uapi/linux/bpfilter.h | 21 +++++++ net/Kconfig | 2 + net/Makefile | 1 + net/bpfilter/Kconfig | 16 +++++ net/bpfilter/Makefile | 30 +++++++++ net/bpfilter/bpfilter_kern.c | 111 ++++++++++++++++++++++++++++++++++ net/bpfilter/main.c | 63 +++++++++++++++++++ net/bpfilter/msgfmt.h | 17 ++++++ net/ipv4/Makefile | 2 + net/ipv4/bpfilter/Makefile | 2 + net/ipv4/bpfilter/sockopt.c | 42 +++++++++++++ net/ipv4/ip_sockglue.c | 17 ++++++ 13 files changed, 339 insertions(+) create mode 100644 include/linux/bpfilter.h create mode 100644 include/uapi/linux/bpfilter.h create mode 100644 net/bpfilter/Kconfig create mode 100644 net/bpfilter/Makefile create mode 100644 net/bpfilter/bpfilter_kern.c create mode 100644 net/bpfilter/main.c create mode 100644 net/bpfilter/msgfmt.h create mode 100644 net/ipv4/bpfilter/Makefile create mode 100644 net/ipv4/bpfilter/sockopt.c
diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
new file mode 100644
index 000000000000..687b1760bb9f
--- /dev/null
+++ b/include/linux/bpfilter.h
@@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BPFILTER_H +#define _LINUX_BPFILTER_H + +#include + +struct sock; +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval, + unsigned int optlen); +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval, + int *optlen); +extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname, + char __user *optval, + unsigned int optlen, bool is_set); +#endif
diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
new file mode 100644
index 000000000000..2ec3cc99ea4c
--- /dev/null
+++ b/include/uapi/linux/bpfilter.h
@@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _UAPI_LINUX_BPFILTER_H +#define _UAPI_LINUX_BPFILTER_H + +#include + +enum { + BPFILTER_IPT_SO_SET_REPLACE = 64, + BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65, + BPFILTER_IPT_SET_MAX, +}; + +enum { + BPFILTER_IPT_SO_GET_INFO = 64, + BPFILTER_IPT_SO_GET_ENTRIES = 65, + BPFILTER_IPT_SO_GET_REVISION_MATCH = 66, + BPFILTER_IPT_SO_GET_REVISION_TARGET = 67, + BPFILTER_IPT_GET_MAX, +}; + +#endif /* _UAPI_LINUX_BPFILTER_H */
diff --git a/net/Kconfig b/net/Kconfig
index 21f4031e486d..7cce8b19b11a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -200,6 +200,8 @@ source "net/bridge/netfilter/Kconfig" endif +source "net/bpfilter/Kconfig" + source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/rds/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index cf01cae6495d..fa37df4ebe66 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX_SCM) += unix/ obj-$(CONFIG_NET) += ipv6/ +obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
new file mode 100644
index 000000000000..60725c5f79db
--- /dev/null
+++ b/net/bpfilter/Kconfig
@@ -0,0 +1,16 @@ +menuconfig BPFILTER + bool "BPF based packet filtering framework (BPFILTER)" + default n + depends on NET && BPF + help + This builds experimental bpfilter framework that is aiming to + provide netfilter compatible functionality via
BPF + +if BPFILTER +config BPFILTER_UMH + tristate "bpfilter kernel module with user mode helper" + default m + help + This builds bpfilter kernel module with embedded user mode helper +endif + diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile new file mode 100644 index 000000000000..2af752c8ef5e --- /dev/null +++ b/net/bpfilter/Makefile @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the Linux BPFILTER layer. +# + +hostprogs-y := bpfilter_umh +bpfilter_umh-objs := main.o +HOSTCFLAGS += -I. -Itools/include/ +ifeq ($(CONFIG_BPFILTER_UMH), y) +# builtin bpfilter_umh should be compiled with -static +# since rootfs isn't mounted at the time of __init +# function is called and do_execv won't find elf interpreter +HOSTLDFLAGS += -static +endif + +# a bit of elf magic to convert bpfilter_umh binary into a binary blob +# inside bpfilter_umh.o elf file referenced by +# _binary_net_bpfilter_bpfilter_umh_start symbol +# which bpfilter_kern.c passes further into umh blob loader at run-time +quiet_cmd_copy_umh = GEN $@ + cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \ + $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \ + -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \ + --rename-section .data=.init.rodata $< $@ + +$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh + $(call cmd,copy_umh) + +obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o +bpfilter-objs += bpfilter_kern.o bpfilter_umh.o diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c new file mode 100644 index 000000000000..7596314b61c7 --- /dev/null +++ b/net/bpfilter/bpfilter_kern.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include "msgfmt.h" + +#define UMH_start _binary_net_bpfilter_bpfilter_umh_start +#define UMH_end _binary_net_bpfilter_bpfilter_umh_end + +extern char UMH_start; +extern char UMH_end; + +static struct umh_info info; +/* since ip_getsockopt() can run in parallel, serialize access to umh */ +static DEFINE_MUTEX(bpfilter_lock); + +static void shutdown_umh(struct umh_info *info) +{ + struct task_struct *tsk; + + tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID); + if (tsk) + force_sig(SIGKILL, tsk); + fput(info->pipe_to_umh); + fput(info->pipe_from_umh); +} + +static void __stop_umh(void) +{ + if (bpfilter_process_sockopt) { + bpfilter_process_sockopt = NULL; + shutdown_umh(&info); + } +} + +static void stop_umh(void) +{ + mutex_lock(&bpfilter_lock); + __stop_umh(); + mutex_unlock(&bpfilter_lock); +} + +static int __bpfilter_process_sockopt(struct sock *sk, int optname, + char __user *optval, + unsigned int optlen, bool is_set) +{ + struct mbox_request req; + struct mbox_reply reply; + loff_t pos; + ssize_t n; + int ret; + + req.is_set = is_set; + req.pid = current->pid; + req.cmd = optname; + req.addr = (long)optval; + req.len = optlen; + mutex_lock(&bpfilter_lock); + n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos); + if (n != sizeof(req)) { + pr_err("write fail %zd\n", n); + __stop_umh(); + ret = -EFAULT; + goto out; + } + pos = 0; + n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos); + if (n != sizeof(reply)) { + pr_err("read fail %zd\n", n); + __stop_umh(); + ret = -EFAULT; + goto out; + } + ret = reply.status; +out: + mutex_unlock(&bpfilter_lock); + return ret; +} + +static int __init load_umh(void) +{ + int err; + + /* fork usermode process */ + err = fork_usermode_blob(&UMH_start, &UMH_end - 
&UMH_start, &info); + if (err) + return err; + pr_info("Loaded bpfilter_umh pid %d\n", info.pid); + + /* health check that usermode process started correctly */ + if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) { + stop_umh(); + return -EFAULT; + } + bpfilter_process_sockopt = &__bpfilter_process_sockopt; + return 0; +} + +static void __exit fini_umh(void) +{ + stop_umh(); +} +module_init(load_umh); +module_exit(fini_umh); +MODULE_LICENSE("GPL"); diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c new file mode 100644 index 000000000000..81bbc1684896 --- /dev/null +++ b/net/bpfilter/main.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "include/uapi/linux/bpf.h" +#include +#include "msgfmt.h" + +int debug_fd; + +static int handle_get_cmd(struct mbox_request *cmd) +{ + switch (cmd->cmd) { + case 0: + return 0; + default: + break; + } + return -ENOPROTOOPT; +} + +static int handle_set_cmd(struct mbox_request *cmd) +{ + return -ENOPROTOOPT; +} + +static void loop(void) +{ + while (1) { + struct mbox_request req; + struct mbox_reply reply; + int n; + + n = read(0, &req, sizeof(req)); + if (n != sizeof(req)) { + dprintf(debug_fd, "invalid request %d\n", n); + return; + } + + reply.status = req.is_set ? + handle_set_cmd(&req) : + handle_get_cmd(&req); + + n = write(1, &reply, sizeof(reply)); + if (n != sizeof(reply)) { + dprintf(debug_fd, "reply failed %d\n", n); + return; + } + } +} + +int main(void) +{ + debug_fd = open("/dev/console", 00000002 | 00000100); + dprintf(debug_fd, "Started bpfilter\n"); + loop(); + close(debug_fd); + return 0; +} diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h new file mode 100644 index 000000000000..98d121c62945 --- /dev/null +++ b/net/bpfilter/msgfmt.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_BPFILTER_MSGFMT_H +#define _NET_BPFILTER_MSGFMT_H + +struct mbox_request { + __u64 addr; + __u32 len; + __u32 is_set; + __u32 cmd; + __u32 pid; +}; + +struct mbox_reply { + __u32 status; +}; + +#endif diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 30ee969c5957..398cc0a301b1 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -16,6 +16,8 @@ obj-y := route.o inetpeer.o protocol.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ metrics.o +obj-$(CONFIG_BPFILTER) += bpfilter/ + obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile new file mode 100644 index 000000000000..ce262d76cc48 --- /dev/null +++ b/net/ipv4/bpfilter/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_BPFILTER) += sockopt.o + diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c new file mode 100644 index 000000000000..42a96d2d8d05 --- /dev/null +++ b/net/ipv4/bpfilter/sockopt.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +int (*bpfilter_process_sockopt)(struct sock *sk, int optname, + char __user *optval, + unsigned int optlen, bool is_set); +EXPORT_SYMBOL_GPL(bpfilter_process_sockopt); + +int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval, + unsigned int optlen, bool is_set) +{ + if (!bpfilter_process_sockopt) { + int err = request_module("bpfilter"); + + if (err) + return err; + if (!bpfilter_process_sockopt) + return -ECHILD; + } + return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set); +} + +int 
bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, + unsigned int optlen) +{ + return bpfilter_mbox_request(sk, optname, optval, optlen, true); +} + +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, + int __user *optlen) +{ + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + return bpfilter_mbox_request(sk, optname, optval, len, false); +} diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d1081eac3b49..d9c17b50b6f3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -47,6 +47,8 @@ #include #include +#include + /* * SOL_IP control messages. */ @@ -1246,6 +1248,11 @@ int ip_setsockopt(struct sock *sk, int level, return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); +#ifdef CONFIG_BPFILTER + if (optname >= BPFILTER_IPT_SO_SET_REPLACE && + optname < BPFILTER_IPT_SET_MAX) + err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); +#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && @@ -1554,6 +1561,11 @@ int ip_getsockopt(struct sock *sk, int level, int err; err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); +#ifdef CONFIG_BPFILTER + if (optname >= BPFILTER_IPT_SO_GET_INFO && + optname < BPFILTER_IPT_GET_MAX) + err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); +#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && @@ -1586,6 +1598,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, err = do_ip_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); +#ifdef CONFIG_BPFILTER + if (optname >= BPFILTER_IPT_SO_GET_INFO && + optname < BPFILTER_IPT_GET_MAX) + err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); +#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && From d9be4330a58c8fd23c3f5373f7cfe45cde97a384 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 May 2018 11:32:36 -0700 Subject: [PATCH 0444/1640] UPSTREAM: bpf: btf: Avoid variable length array Sparse warning: kernel/bpf/btf.c:1985:34: warning: Variable length array is used. This patch directly uses ARRAY_SIZE(). 
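Reduced to a standalone user space example (illustrative only, not kernel code), the change amounts to sizing the array directly from the static table, which is a constant expression and therefore not a VLA:

    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    static const int offsets[] = { 4, 8, 12 };

    int main(void)
    {
    	int secs[ARRAY_SIZE(offsets)]; /* fixed size, no VLA */
    	unsigned int i;

    	for (i = 0; i < ARRAY_SIZE(offsets); i++)
    		secs[i] = offsets[i] * 2;
    	printf("%d\n", secs[2]);
    	return 0;
    }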
Fixes: f80442a4cd18 ("bpf: btf: Change how section is supported in btf_header")
Signed-off-by: Martin KaFai Lau
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/btf.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9cbeabb5aca3..7e90fd13b5b5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1981,8 +1981,7 @@ static int btf_sec_info_cmp(const void *a, const void *b) static int btf_check_sec_info(struct btf_verifier_env *env, u32 btf_data_size) { - const unsigned int nr_secs = ARRAY_SIZE(btf_sec_info_offset); - struct btf_sec_info secs[nr_secs]; + struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)]; u32 total, expected_total, i; const struct btf_header *hdr; const struct btf *btf;
@@ -1991,17 +1990,17 @@ static int btf_check_sec_info(struct btf_verifier_env *env, hdr = &btf->hdr; /* Populate the secs from hdr */ - for (i = 0; i < nr_secs; i++) + for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) secs[i] = *(struct btf_sec_info *)((void *)hdr + btf_sec_info_offset[i]); - sort(secs, nr_secs, sizeof(struct btf_sec_info), - btf_sec_info_cmp, NULL); + sort(secs, ARRAY_SIZE(btf_sec_info_offset), + sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL); /* Check for gaps and overlap among sections */ total = 0; expected_total = btf_data_size - hdr->hdr_len; - for (i = 0; i < nr_secs; i++) { + for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) { if (expected_total < secs[i].off) { btf_verifier_log(env, "Invalid section offset"); return -EINVAL;

From 4f85a976c10894de29298bda0fdc6ecdc6bb928f Mon Sep 17 00:00:00 2001
From: Sandipan Das
Date: Thu, 24 May 2018 12:26:45 +0530
Subject: [PATCH 0445/1640] UPSTREAM: bpf: support 64-bit offsets for bpf function calls

The imm field of a bpf instruction is a signed 32-bit integer. For JITed bpf-to-bpf function calls, it holds the offset of the start address of the callee's JITed image from __bpf_call_base. For some architectures, such as powerpc64, this offset may be as large as 64 bits and cannot be accommodated in the imm field without truncation.

We resolve this by:
[1] Additionally using the auxiliary data of each function to keep a list of start addresses of the JITed images for all functions determined by the verifier.
[2] Retaining the subprog id inside the off field of the call instructions and using it to index into the list mentioned above and look up the callee's address.

To make sure that the existing JIT compilers continue to work without requiring changes, we keep the imm field as it is.
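As a hedged sketch (not part of this patch) of how an arch JIT can consume this, the callee's image address is simply looked up through the aux data, with the old imm-relative computation as the conceptual fallback:

    /* illustrative only; relies on the fields introduced by this patch */
    static u64 call_target(const struct bpf_prog *prog,
    		       const struct bpf_insn *insn)
    {
    	if (prog->aux->func_cnt) /* multi-function program */
    		return (u64)(unsigned long)
    			prog->aux->func[insn->off]->bpf_func;
    	/* single-function case: imm is relative to __bpf_call_base */
    	return (u64)(unsigned long)__bpf_call_base + insn->imm;
    }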
Signed-off-by: Sandipan Das
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/verifier.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9fd14dee215e..f8c79232f266 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5724,11 +5724,24 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->src_reg != BPF_PSEUDO_CALL) continue; subprog = insn->off; - insn->off = 0; insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) func[subprog]->bpf_func - __bpf_call_base; } + + /* we use the aux data to keep a list of the start addresses + * of the JITed images for each function in the program + * + * for some architectures, such as powerpc64, the imm field + * might not be large enough to hold the offset of the start + * address of the callee's JITed image from __bpf_call_base + * + * in such cases, we can lookup the start address of a callee + * by using its subprog id, available from the off field of + * the call instruction, as an index for this list + */ + func[i]->aux->func = func; + func[i]->aux->func_cnt = env->subprog_cnt; } for (i = 0; i < env->subprog_cnt; i++) { old_bpf_func = func[i]->bpf_func;

From 13ca4b7b43e56c26ac9d63c1d1a56b2407f0f752 Mon Sep 17 00:00:00 2001
From: Sandipan Das
Date: Thu, 24 May 2018 12:26:48 +0530
Subject: [PATCH 0446/1640] UPSTREAM: bpf: get kernel symbol addresses via syscall

This adds two new fields to struct bpf_prog_info. For multi-function programs, these fields can be used to pass a list of kernel symbol addresses for all functions in a given program to userspace using the bpf system call with the BPF_OBJ_GET_INFO_BY_FD command.

When bpf_jit_kallsyms is enabled, we can get the address of the corresponding kernel symbol for a callee function and resolve the symbol's name. The address is determined by adding the value of the call instruction's imm field to __bpf_call_base. This offset gets assigned to the imm field by the verifier.

For some architectures, such as powerpc64, the imm field is not large enough to hold this offset. We resolve this by:
[1] Assigning the subprog id to the imm field of a call instruction in the verifier instead of the offset of the callee's symbol's address from __bpf_call_base.
[2] Determining the address of a callee's corresponding symbol by using the imm field as an index for the list of kernel symbol addresses now available from the program info.
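A hedged user space sketch of reading the new fields (it assumes a loaded multi-function program behind prog_fd, sufficient privileges and bpf_jit_kallsyms enabled; error handling is trimmed):

    #include <linux/bpf.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static void dump_ksyms(int prog_fd)
    {
    	__u64 ksyms[64];
    	struct bpf_prog_info info;
    	union bpf_attr attr;
    	__u32 i;

    	memset(&info, 0, sizeof(info));
    	info.nr_jited_ksyms = 64;
    	info.jited_ksyms = (__u64)(unsigned long)ksyms;

    	memset(&attr, 0, sizeof(attr));
    	attr.info.bpf_fd = prog_fd;
    	attr.info.info_len = sizeof(info);
    	attr.info.info = (__u64)(unsigned long)&info;

    	if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
    		return;
    	for (i = 0; i < info.nr_jited_ksyms && i < 64; i++)
    		printf("subprog %u: 0x%llx\n", i,
    		       (unsigned long long)ksyms[i]);
    }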
Suggested-by: Daniel Borkmann Signed-off-by: Sandipan Das Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 25 +++++++++++++++++++++++++ kernel/bpf/verifier.c | 7 +------ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 31367bec76ca..b6f7d2d8071f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2258,6 +2258,8 @@ struct bpf_prog_info { __u32 gpl_compatible:1; __u64 netns_dev; __u64 netns_ino; + __u32 nr_jited_ksyms; + __aligned_u64 jited_ksyms; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7a00dceb1ac6..ff3bcec3c62c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1957,6 +1957,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; + info.nr_jited_ksyms = 0; goto done; } @@ -2005,6 +2006,30 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } } + ulen = info.nr_jited_ksyms; + info.nr_jited_ksyms = prog->aux->func_cnt; + if (info.nr_jited_ksyms && ulen) { + if (bpf_dump_raw_ok()) { + u64 __user *user_ksyms; + ulong ksym_addr; + u32 i; + + /* copy the address of the kernel symbol + * corresponding to each function + */ + ulen = min_t(u32, info.nr_jited_ksyms, ulen); + user_ksyms = u64_to_user_ptr(info.jited_ksyms); + for (i = 0; i < ulen; i++) { + ksym_addr = (ulong) prog->aux->func[i]->bpf_func; + ksym_addr &= PAGE_MASK; + if (put_user((u64) ksym_addr, &user_ksyms[i])) + return -EFAULT; + } + } else { + info.jited_ksyms = 0; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f8c79232f266..dfc33128b7ac 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5767,17 +5767,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - unsigned long addr; - if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); - addr = (unsigned long)func[subprog]->bpf_func; - addr &= PAGE_MASK; - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) - addr - __bpf_call_base; + insn->imm = subprog; } prog->jited = 1; From e830446c868d018c4e9e9023ac6be444565db64b Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 24 May 2018 12:26:51 +0530 Subject: [PATCH 0447/1640] UPSTREAM: bpf: fix multi-function JITed dump obtained via syscall Currently, for multi-function programs, we cannot get the JITed instructions using the bpf system call's BPF_OBJ_GET_INFO_BY_FD command. Because of this, userspace tools such as bpftool fail to identify a multi-function program as being JITed or not. With the JIT enabled and the test program running, this can be verified as follows: # cat /proc/sys/net/core/bpf_jit_enable 1 Before applying this patch: # bpftool prog list 1: kprobe name foo tag b811aab41a39ad3d gpl loaded_at 2018-05-16T11:43:38+0530 uid 0 xlated 216B not jited memlock 65536B ... # bpftool prog dump jited id 1 no instructions returned After applying this patch: # bpftool prog list 1: kprobe name foo tag b811aab41a39ad3d gpl loaded_at 2018-05-16T12:13:01+0530 uid 0 xlated 216B jited 308B memlock 65536B ... 
# bpftool prog dump jited id 1
 0: nop 4: nop 8: mflr r0 c: std r0,16(r1) 10: stdu r1,-112(r1) 14: std r31,104(r1) 18: addi r31,r1,48 1c: li r3,10 ...

Signed-off-by: Sandipan Das
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/syscall.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ff3bcec3c62c..55e98ff6e6a6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1994,13 +1994,44 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, * for offload. */ ulen = info.jited_prog_len; - info.jited_prog_len = prog->jited_len; + if (prog->aux->func_cnt) { + u32 i; + + info.jited_prog_len = 0; + for (i = 0; i < prog->aux->func_cnt; i++) + info.jited_prog_len += prog->aux->func[i]->jited_len; + } else { + info.jited_prog_len = prog->jited_len; + } + if (info.jited_prog_len && ulen) { if (bpf_dump_raw_ok()) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); + + /* for multi-function programs, copy the JITed + * instructions for all the functions + */ + if (prog->aux->func_cnt) { + u32 len, free, i; + u8 *img; + + free = ulen; + for (i = 0; i < prog->aux->func_cnt; i++) { + len = prog->aux->func[i]->jited_len; + len = min_t(u32, len, free); + img = (u8 *) prog->aux->func[i]->bpf_func; + if (copy_to_user(uinsns, img, len)) + return -EFAULT; + uinsns += len; + free -= len; + if (!free) + break; + } + } else { + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } } else { info.jited_prog_insns = 0; }

From da9bc901397a8f016f8c6562f77478b8d2209a7a Mon Sep 17 00:00:00 2001
From: Sandipan Das
Date: Thu, 24 May 2018 12:26:52 +0530
Subject: [PATCH 0448/1640] UPSTREAM: bpf: get JITed image lengths of functions via syscall

This adds two new fields to struct bpf_prog_info. For multi-function programs, these fields can be used to pass a list of the JITed image lengths of each function for a given program to userspace using the bpf system call with the BPF_OBJ_GET_INFO_BY_FD command.

This can be used by userspace applications like bpftool to split up the contiguous JITed dump, also obtained via the system call, into more relatable chunks corresponding to each function.
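For illustration, splitting the contiguous dump with the new per-function lengths is then a simple walk (both arrays are assumed to have been filled in by BPF_OBJ_GET_INFO_BY_FD as described above):

    #include <linux/types.h>

    static void for_each_func(const __u8 *img, const __u32 *lens, __u32 n)
    {
    	__u32 i, off = 0;

    	for (i = 0; i < n; i++) {
    		/* img + off .. img + off + lens[i] is function i;
    		 * e.g. hand this range to a disassembler here
    		 */
    		off += lens[i];
    	}
    }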
Signed-off-by: Sandipan Das
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b6f7d2d8071f..c037c59b572a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2259,7 +2259,9 @@ struct bpf_prog_info { __u64 netns_dev; __u64 netns_ino; __u32 nr_jited_ksyms; + __u32 nr_jited_func_lens; __aligned_u64 jited_ksyms; + __aligned_u64 jited_func_lens; } __attribute__((aligned(8))); struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 55e98ff6e6a6..b7137ddc3768 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2061,6 +2061,26 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } } + ulen = info.nr_jited_func_lens; + info.nr_jited_func_lens = prog->aux->func_cnt; + if (info.nr_jited_func_lens && ulen) { + if (bpf_dump_raw_ok()) { + u32 __user *user_lens; + u32 func_len, i; + + /* copy the JITed image lengths for each function */ + ulen = min_t(u32, info.nr_jited_func_lens, ulen); + user_lens = u64_to_user_ptr(info.jited_func_lens); + for (i = 0; i < ulen; i++) { + func_len = prog->aux->func[i]->jited_len; + if (put_user(func_len, &user_lens[i])) + return -EFAULT; + } + } else { + info.jited_func_lens = 0; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len))

From 5f142156e7b7a99f078bf65f25538b678c61de77 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux
Date: Sun, 20 May 2018 14:58:13 +0100
Subject: [PATCH 0449/1640] UPSTREAM: ipv6: sr: export function lookup_nexthop

The function lookup_nexthop is essential to implement most of the seg6local actions. As we want to provide a BPF helper allowing some of these actions to be applied to the packet being processed, the helper should be able to call this function, hence the need to make it public.

Moreover, if one argument is incorrect or if the next hop cannot be found, an error should be returned by the BPF helper so the BPF program can adapt its processing of the packet (return an error, properly force the drop, ...). This patch hence makes this function return dst->error to indicate a possible error.
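A hedged sketch of a future in-kernel caller (names illustrative, not from this patch): the return value can now be surfaced instead of being silently swallowed:

    #include <linux/skbuff.h>
    #include <net/dst.h>
    #include <net/seg6.h>

    static int forward_or_fail(struct sk_buff *skb, struct in6_addr *nhaddr,
    			   u32 tbl_id)
    {
    	int err = seg6_lookup_nexthop(skb, nhaddr, tbl_id);

    	if (err)
    		return err; /* e.g. let a BPF helper report it */
    	return dst_input(skb);
    }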
Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- include/net/seg6.h | 3 ++- include/net/seg6_local.h | 24 ++++++++++++++++++++++++ net/ipv6/seg6_local.c | 20 +++++++++++--------- 3 files changed, 37 insertions(+), 10 deletions(-) create mode 100644 include/net/seg6_local.h diff --git a/include/net/seg6.h b/include/net/seg6.h index 099bad59dc90..f450bc37d196 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -63,5 +63,6 @@ extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len); extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto); extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); - +extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id); #endif diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h new file mode 100644 index 000000000000..57498b23085d --- /dev/null +++ b/include/net/seg6_local.h @@ -0,0 +1,24 @@ +/* + * SR-IPv6 implementation + * + * Authors: + * David Lebrun + * eBPF support: Mathieu Xhonneux + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _NET_SEG6_LOCAL_H +#define _NET_SEG6_LOCAL_H + +#include +#include + +extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id); + +#endif diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index f702c39728ab..2768c4279a4d 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -31,6 +31,7 @@ #ifdef CONFIG_IPV6_SEG6_HMAC #include #endif +#include #include struct seg6_local_lwt; @@ -142,8 +143,8 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr) *daddr = *addr; } -static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, - u32 tbl_id) +int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id) { struct net *net = dev_net(skb->dev); struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -189,6 +190,7 @@ out: skb_dst_drop(skb); skb_dst_set(skb, dst); + return dst->error; } /* regular endpoint function */ @@ -202,7 +204,7 @@ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - lookup_nexthop(skb, NULL, 0); + seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); @@ -222,7 +224,7 @@ static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt) advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - lookup_nexthop(skb, &slwt->nh6, 0); + seg6_lookup_nexthop(skb, &slwt->nh6, 0); return dst_input(skb); @@ -241,7 +243,7 @@ static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt) advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - lookup_nexthop(skb, NULL, slwt->table); + seg6_lookup_nexthop(skb, NULL, slwt->table); return dst_input(skb); @@ -333,7 +335,7 @@ static int input_action_end_dx6(struct sk_buff *skb, if (!ipv6_addr_any(&slwt->nh6)) nhaddr = &slwt->nh6; - lookup_nexthop(skb, nhaddr, 0); + seg6_lookup_nexthop(skb, nhaddr, 0); return dst_input(skb); drop: @@ -382,7 +384,7 @@ static int input_action_end_dt6(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - lookup_nexthop(skb, NULL, slwt->table); + seg6_lookup_nexthop(skb, NULL, slwt->table); return dst_input(skb); @@ -407,7 +409,7 @@ static int 
input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt) skb_set_transport_header(skb, sizeof(struct ipv6hdr)); - lookup_nexthop(skb, NULL, 0); + seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb);
@@ -438,7 +440,7 @@ static int input_action_end_b6_encap(struct sk_buff *skb, skb_set_transport_header(skb, sizeof(struct ipv6hdr)); - lookup_nexthop(skb, NULL, 0); + seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb);

From 6f0fa377a28add25a2401dadfed8f7c217f1a4f5 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux
Date: Sun, 20 May 2018 14:58:14 +0100
Subject: [PATCH 0450/1640] BACKPORT: bpf: Add IPv6 Segment Routing helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The BPF seg6local hook should be powerful enough to enable users to implement most of the use-cases one could think of. After some thinking, we figured out that the following actions should be possible on an SRv6 packet, requiring three specific helpers:
- bpf_lwt_seg6_store_bytes: Modify non-sensitive fields of the SRH
- bpf_lwt_seg6_adjust_srh: Allow growing or shrinking an SRH (to add/delete TLVs)
- bpf_lwt_seg6_action: Apply some SRv6 network programming actions (specifically End.X, End.T, End.B6 and End.B6.Encap)

The specifications of these helpers are provided in the patch (see include/uapi/linux/bpf.h).

The non-sensitive fields of the SRH are the following: flags, tag and TLVs. The other fields cannot be modified, to maintain the SRH integrity. Flags, tag and TLVs can easily be modified as their validity can be checked afterwards via seg6_validate_srh. It is not allowed to modify the segments directly. If one wants to add segments on the path, one should stack a new SRH using the End.B6 action via bpf_lwt_seg6_action.

Growing, shrinking or editing TLVs via the helpers will flag the SRH as invalid, and it will have to be re-validated before re-entering the IPv6 layer. This flag is stored in a per-CPU buffer, along with the current header length in bytes.

Storing the SRH len in bytes in the control block is mandatory when using bpf_lwt_seg6_adjust_srh. The Header Ext. Length field contains the SRH len rounded to 8 bytes (a padding TLV can be inserted to ensure the 8-byte boundary). When adding/deleting TLVs within the BPF program, the SRH may temporarily be in an invalid state where its length cannot be rounded to 8 bytes without remainder, hence the need to store the length in bytes separately. The caller of the BPF program can then ensure that the SRH's final length is valid using this value. Again, a final SRH modified by a BPF program which doesn't respect the 8-byte boundary will be discarded as it will be considered invalid.

Finally, a fourth helper is provided, bpf_lwt_push_encap, which is available from the LWT BPF IN hook, but not from the seg6local BPF one. This helper allows encapsulating a Segment Routing Header (either with a new outer IPv6 header, or by inlining it directly in the existing IPv6 header) into a non-SRv6 packet. This helper is required if we want to offer the possibility to dynamically encapsulate an SRH for non-SRv6 packets, as the BPF seg6local hook only works on traffic already containing an SRH. This is the BPF equivalent of the seg6 LWT infrastructure, which achieves the same purpose but with a static SRH per route.

These helpers require CONFIG_IPV6=y (and not =m).
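As a hedged sketch of the intended use, a seg6local BPF program could rewrite the SRH tag field with the new helper roughly like this (the helper declaration follows the bpf_helpers.h style used by the kernel samples, the section name follows the iproute2 convention, byte-order handling is elided, and the SRH is assumed to sit right behind the IPv6 header):

    #include <stddef.h>
    #include <linux/bpf.h>
    #include <linux/ipv6.h>
    #include <linux/seg6.h>

    static int (*bpf_lwt_seg6_store_bytes)(void *skb, __u32 off,
    				       const void *from, __u32 len) =
    	(void *) BPF_FUNC_lwt_seg6_store_bytes;

    __attribute__((section("lwt_seg6local"), used))
    int set_tag(struct __sk_buff *skb)
    {
    	__u16 tag = 0x1234;
    	__u32 off = sizeof(struct ipv6hdr) +
    		    offsetof(struct ipv6_sr_hdr, tag);

    	if (bpf_lwt_seg6_store_bytes(skb, off, &tag, sizeof(tag)) < 0)
    		return BPF_DROP;
    	return BPF_OK; /* hand the packet back to the infrastructure */
    }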
Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- include/net/seg6_local.h | 8 ++ include/uapi/linux/bpf.h | 90 +++++++++++++ net/core/filter.c | 285 +++++++++++++++++++++++++++++++++++---- net/ipv6/Kconfig | 5 + net/ipv6/seg6_local.c | 2 + 5 files changed, 367 insertions(+), 23 deletions(-) diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h index 57498b23085d..661fd5b4d3e0 100644 --- a/include/net/seg6_local.h +++ b/include/net/seg6_local.h @@ -15,10 +15,18 @@ #ifndef _NET_SEG6_LOCAL_H #define _NET_SEG6_LOCAL_H +#include #include #include extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id); +struct seg6_bpf_srh_state { + bool valid; + u16 hdrlen; +}; + +DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); + #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c037c59b572a..6e85616db850 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1902,6 +1902,90 @@ union bpf_attr { * egress otherwise). This is the only flag supported for now. * Return * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * Description + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * Description + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be as well + * positive (growing) as negative (shrinking). + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. 
Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * Description + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of param: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of param: **struct ipv6_sr_hdr**. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2096,6 +2180,12 @@ enum bpf_hdr_start_off { BPF_HDR_START_NET, }; +/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ +enum bpf_lwt_encap_mode { + BPF_LWT_ENCAP_SEG6, + BPF_LWT_ENCAP_SEG6_INLINE +}; + /* user accessible mirror of in-kernel sk_buff. 
* new fields can only be added to the end of this structure */ diff --git a/net/core/filter.c b/net/core/filter.c index e9cacc225437..9baa58978203 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -64,6 +64,10 @@ #include #include #include +#include +#include +#include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -3372,28 +3376,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_pkt_data(void *func) -{ - if (func == bpf_skb_vlan_push || - func == bpf_skb_vlan_pop || - func == bpf_skb_store_bytes || - func == bpf_skb_change_proto || - func == bpf_skb_change_head || - func == bpf_skb_change_tail || - func == bpf_skb_adjust_room || - func == bpf_skb_pull_data || - func == bpf_clone_redirect || - func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data || - func == bpf_xdp_adjust_tail) - return true; - - return false; -} - static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, unsigned long off, unsigned long len) { @@ -4374,6 +4356,264 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .arg4_type = ARG_ANYTHING, }; +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) +{ + int err; + struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; + + if (!seg6_validate_srh(srh, len)) + return -EINVAL; + + switch (type) { + case BPF_LWT_ENCAP_SEG6_INLINE: + if (skb->protocol != htons(ETH_P_IPV6)) + return -EBADMSG; + + err = seg6_do_srh_inline(skb, srh); + break; + case BPF_LWT_ENCAP_SEG6: + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); + break; + default: + return -EINVAL; + } + + bpf_compute_data_pointers(skb); + if (err) + return err; + + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + return seg6_lookup_nexthop(skb, NULL, 0); +} +#endif /* CONFIG_IPV6_SEG6_BPF */ + +BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, + u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + case BPF_LWT_ENCAP_SEG6: + case BPF_LWT_ENCAP_SEG6_INLINE: + return bpf_push_seg6_encap(skb, type, hdr, len); +#endif + default: + return -EINVAL; + } +} + +static const struct bpf_func_proto bpf_lwt_push_encap_proto = { + .func = bpf_lwt_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + void *srh_tlvs, *srh_end, *ptr; + struct ipv6_sr_hdr *srh; + int srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); + + ptr = skb->data + offset; + if (ptr >= srh_tlvs && ptr + len <= srh_end) + srh_state->valid = 0; + else if (ptr < (void *)&srh->flags || + ptr + len > (void *)&srh->segments) + return -EFAULT; + + if (unlikely(bpf_try_make_writable(skb, offset + 
len))) + return -EFAULT; + + memcpy(skb->data + offset, from, len); + return 0; +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { + .func = bpf_lwt_seg6_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, + u32, action, void *, param, u32, param_len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh; + int srhoff = 0; + int err; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + if (!srh_state->valid) { + if (unlikely((srh_state->hdrlen & 7) != 0)) + return -EBADMSG; + + srh->hdrlen = (u8)(srh_state->hdrlen >> 3); + if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + return -EBADMSG; + + srh_state->valid = 1; + } + + switch (action) { + case SEG6_LOCAL_ACTION_END_X: + if (param_len != sizeof(struct in6_addr)) + return -EINVAL; + return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); + case SEG6_LOCAL_ACTION_END_T: + if (param_len != sizeof(int)) + return -EINVAL; + return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_B6: + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, + param, param_len); + if (!err) + srh_state->hdrlen = + ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + return err; + case SEG6_LOCAL_ACTION_END_B6_ENCAP: + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, + param, param_len); + if (!err) + srh_state->hdrlen = + ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + return err; + default: + return -EINVAL; + } +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { + .func = bpf_lwt_seg6_action, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, + s32, len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + void *srh_end, *srh_tlvs, *ptr; + struct ipv6_sr_hdr *srh; + struct ipv6hdr *hdr; + int srhoff = 0; + int ret; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + + srh_state->hdrlen); + ptr = skb->data + offset; + + if (unlikely(ptr < srh_tlvs || ptr > srh_end)) + return -EFAULT; + if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) + return -EFAULT; + + if (len > 0) { + ret = skb_cow_head(skb, len); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, offset, len); + } else { + ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); + } + + bpf_compute_data_pointers(skb); + if (unlikely(ret < 0)) + return ret; + + hdr = (struct ipv6hdr *)skb->data; + hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + srh_state->hdrlen += len; + srh_state->valid = 0; + return 0; +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + 
+static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { + .func = bpf_lwt_seg6_adjust_srh, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) +{ + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_head || + func == bpf_skb_change_tail || + func == bpf_skb_adjust_room || + func == bpf_skb_pull_data || + func == bpf_clone_redirect || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta || + func == bpf_msg_pull_data || + func == bpf_xdp_adjust_tail || + func == bpf_lwt_push_encap || + func == bpf_lwt_seg6_store_bytes || + func == bpf_lwt_seg6_adjust_srh || + func == bpf_lwt_seg6_action + ) + return true; + + return false; +} + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) {
@@ -4792,7 +5032,6 @@ static bool lwt_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } - /* Attach type specific accesses */ static bool __sock_filter_check_attach_type(int off, enum bpf_access_type access_type,
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index a941f09a3fce..897893c0d6b8 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -331,4 +331,9 @@ config IPV6_SEG6_HMAC If unsure, say N. +config IPV6_SEG6_BPF + def_bool y + depends on IPV6_SEG6_LWTUNNEL + depends on IPV6 = y + endif # IPV6
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 2768c4279a4d..e015c5f1ea44 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -449,6 +449,8 @@ drop: return err; } +DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); + static struct seg6_action_desc seg6_action_table[] = { { .action = SEG6_LOCAL_ACTION_END,

From e2c4a5ee4a4122bfc613393ac94eeefb273c8a64 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux
Date: Sun, 20 May 2018 14:58:15 +0100
Subject: [PATCH 0451/1640] UPSTREAM: bpf: Split lwt inout verifier structures

The new bpf_lwt_push_encap helper should only be accessible within the LWT BPF IN hook, and not the OUT one, as using it there may leave the skb in a state that leads to a kernel panic. At the moment, both LWT BPF IN and OUT share the same list of helpers, whose calls are authorized by the verifier. This patch separates the verifier ops for the IN and OUT hooks, and allows the IN hook to call the bpf_lwt_push_encap helper.

This patch is also the occasion to put all lwt_*_func_proto functions together for clarity. At the moment, sock_ops_func_proto is in the middle of lwt_inout_func_proto and lwt_xmit_func_proto.
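The split can be illustrated from user space (hedged sketch, loader boilerplate only): the same bytecode that calls bpf_lwt_push_encap is expected to pass the verifier when loaded as BPF_PROG_TYPE_LWT_IN but to be rejected as BPF_PROG_TYPE_LWT_OUT, since the helper is unknown in the OUT table:

    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int try_load(enum bpf_prog_type type,
    		    const struct bpf_insn *insns, __u32 cnt)
    {
    	union bpf_attr attr;

    	memset(&attr, 0, sizeof(attr));
    	attr.prog_type = type;
    	attr.insns = (__u64)(unsigned long)insns;
    	attr.insn_cnt = cnt;
    	attr.license = (__u64)(unsigned long)"GPL";
    	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
    }

    /* try_load(BPF_PROG_TYPE_LWT_IN, insns, cnt)  -> prog fd
     * try_load(BPF_PROG_TYPE_LWT_OUT, insns, cnt) -> -1 (helper refused)
     */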
Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 4 +- net/core/filter.c | 83 ++++++++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 33 deletions(-) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b67f8793de0d..aa5c8b878474 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -9,8 +9,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) diff --git a/net/core/filter.c b/net/core/filter.c index 9baa58978203..a61c8f6943c8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4801,33 +4801,6 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } -static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_skb_load_bytes: - return &bpf_skb_load_bytes_proto; - case BPF_FUNC_skb_pull_data: - return &bpf_skb_pull_data_proto; - case BPF_FUNC_csum_diff: - return &bpf_csum_diff_proto; - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_proto; - case BPF_FUNC_get_route_realm: - return &bpf_get_route_realm_proto; - case BPF_FUNC_get_hash_recalc: - return &bpf_get_hash_recalc_proto; - case BPF_FUNC_perf_event_output: - return &bpf_skb_event_output_proto; - case BPF_FUNC_get_smp_processor_id: - return &bpf_get_smp_processor_id_proto; - case BPF_FUNC_skb_under_cgroup: - return &bpf_skb_under_cgroup_proto; - default: - return bpf_base_func_proto(func_id); - } -} - static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -4893,6 +4866,44 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_push_encap_proto; + default: + return lwt_out_func_proto(func_id, prog); + } +} + static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -4924,7 +4935,7 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case 
BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; default: - return lwt_inout_func_proto(func_id, prog); + return lwt_out_func_proto(func_id, prog); } } @@ -6605,13 +6616,23 @@ const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_inout_verifier_ops = { - .get_func_proto = lwt_inout_func_proto, +const struct bpf_verifier_ops lwt_in_verifier_ops = { + .get_func_proto = lwt_in_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -const struct bpf_prog_ops lwt_inout_prog_ops = { +const struct bpf_prog_ops lwt_in_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_out_verifier_ops = { + .get_func_proto = lwt_out_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_out_prog_ops = { .test_run = bpf_prog_test_run_skb, }; From 1e3cad23536a30916dab359938f00c283519a108 Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Sun, 20 May 2018 14:58:16 +0100 Subject: [PATCH 0452/1640] BACKPORT: ipv6: sr: Add seg6local action End.BPF This patch adds the End.BPF action to the LWT seg6local infrastructure. This action works like any other seg6local End action, meaning that an IPv6 header with SRH is needed, whose DA has to be equal to the SID of the action. It will also advance the SRH to the next segment; the BPF program does not have to take care of this. Since the BPF program must not be a source of instability in the kernel, it is important to ensure that the integrity of the packet is maintained before yielding it back to the IPv6 layer. The hook hence keeps track of whether the SRH has been altered through the helpers, and re-validates its content if needed with seg6_validate_srh. The state kept for validation is stored in a per-CPU buffer. The BPF program is not allowed to directly write into the packet, and only some fields of the SRH can be altered through the helper bpf_lwt_seg6_store_bytes. Performance profiling has shown that the SRH re-validation does not induce a significant overhead. If the altered SRH is deemed invalid, the packet is dropped. This validation is also done before executing any action through bpf_lwt_seg6_action, and will not be performed again if the SRH is not modified after calling the action. The BPF program may return three return codes: - BPF_OK: the End.BPF action will look up the next destination through seg6_lookup_nexthop. - BPF_REDIRECT: if an action has been executed through the bpf_lwt_seg6_action helper, the BPF program should return this value, as the skb's destination is already set and the default lookup should not be performed. - BPF_DROP: the packet will be dropped. 
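To make these return codes concrete, a minimal End.BPF program could look like the sketch below (hypothetical, not part of this patch; the "bpf_helpers.h" header is assumed). It rewrites the SRH tag field through the helper, so the hook re-validates the header before BPF_OK triggers the default seg6_lookup_nexthop():

#include <stddef.h>
#include <linux/bpf.h>
#include <linux/seg6.h>
#include "bpf_helpers.h" /* assumed helper declarations, as in samples/bpf */

SEC("lwt_seg6local")
int end_bpf(struct __sk_buff *skb)
{
	__u16 tag = 0x2a2a;	/* example value; both bytes equal, so byte order is moot */

	/* only selected SRH fields (flags, tag, TLVs) may be written */
	if (bpf_lwt_seg6_store_bytes(skb, offsetof(struct ipv6_sr_hdr, tag),
				     &tag, sizeof(tag)) < 0)
		return BPF_DROP;

	return BPF_OK;	/* kernel performs seg6_lookup_nexthop() */
}

char _license[] SEC("license") = "GPL";

With a sufficiently recent iproute2 such a program would be attached roughly as "ip -6 route add <SID> encap seg6local action End.BPF endpoint obj prog.o sec lwt_seg6local dev eth0"; the exact syntax depends on the iproute2 version.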
Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + include/uapi/linux/seg6_local.h | 12 +++ kernel/bpf/verifier.c | 1 + net/core/filter.c | 25 +++++ net/ipv6/seg6_local.c | 168 +++++++++++++++++++++++++++++++- 6 files changed, 206 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index aa5c8b878474..b161e506dcfc 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -12,6 +12,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6e85616db850..f7c769994320 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -141,6 +141,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_PROG_TYPE_LWT_SEG6LOCAL, }; enum bpf_attach_type { diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h index ef2d8c3e76c1..edc138bdc56d 100644 --- a/include/uapi/linux/seg6_local.h +++ b/include/uapi/linux/seg6_local.h @@ -25,6 +25,7 @@ enum { SEG6_LOCAL_NH6, SEG6_LOCAL_IIF, SEG6_LOCAL_OIF, + SEG6_LOCAL_BPF, __SEG6_LOCAL_MAX, }; #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) @@ -59,10 +60,21 @@ enum { SEG6_LOCAL_ACTION_END_AS = 13, /* forward to SR-unaware VNF with masquerading */ SEG6_LOCAL_ACTION_END_AM = 14, + /* custom BPF action */ + SEG6_LOCAL_ACTION_END_BPF = 15, __SEG6_LOCAL_ACTION_MAX, }; #define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1) +enum { + SEG6_LOCAL_BPF_PROG_UNSPEC, + SEG6_LOCAL_BPF_PROG, + SEG6_LOCAL_BPF_PROG_NAME, + __SEG6_LOCAL_BPF_PROG_MAX, +}; + +#define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1) + #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dfc33128b7ac..f81d035f2172 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1317,6 +1317,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, switch (env->prog->type) { case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; diff --git a/net/core/filter.c b/net/core/filter.c index a61c8f6943c8..9929ab0d2546 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4939,6 +4939,21 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_seg6_store_bytes: + return &bpf_lwt_seg6_store_bytes_proto; + case BPF_FUNC_lwt_seg6_action: + return &bpf_lwt_seg6_action_proto; + case BPF_FUNC_lwt_seg6_adjust_srh: + return &bpf_lwt_seg6_adjust_srh_proto; + default: + return lwt_out_func_proto(func_id, prog); + } +} + static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -6647,6 +6662,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; +const struct bpf_verifier_ops 
lwt_seg6local_verifier_ops = { + .get_func_proto = lwt_seg6local_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_seg6local_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index e015c5f1ea44..0800ac74cedc 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1,8 +1,9 @@ /* * SR-IPv6 implementation * - * Author: + * Authors: * David Lebrun + * eBPF support: Mathieu Xhonneux * * * This program is free software; you can redistribute it and/or @@ -33,6 +34,7 @@ #endif #include #include +#include struct seg6_local_lwt; @@ -43,6 +45,11 @@ struct seg6_action_desc { int static_headroom; }; +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + struct seg6_local_lwt { int action; struct ipv6_sr_hdr *srh; @@ -51,6 +58,7 @@ struct seg6_local_lwt { struct in6_addr nh6; int iif; int oif; + struct bpf_lwt_prog bpf; int headroom; struct seg6_action_desc *desc; @@ -451,6 +459,69 @@ drop: DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); +static int input_action_end_bpf(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct seg6_bpf_srh_state local_srh_state; + struct ipv6_sr_hdr *srh; + int srhoff = 0; + int ret; + + srh = get_and_validate_srh(skb); + if (!srh) + goto drop; + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + /* preempt_disable is needed to protect the per-CPU buffer srh_state, + * which is also accessed by the bpf_lwt_seg6_* helpers + */ + preempt_disable(); + srh_state->hdrlen = srh->hdrlen << 3; + srh_state->valid = 1; + + rcu_read_lock(); + bpf_compute_data_pointers(skb); + ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb); + rcu_read_unlock(); + + local_srh_state = *srh_state; + preempt_enable(); + + switch (ret) { + case BPF_OK: + case BPF_REDIRECT: + break; + case BPF_DROP: + goto drop; + default: + pr_warn_once("bpf-seg6local: Illegal return value %u\n", ret); + goto drop; + } + + if (unlikely((local_srh_state.hdrlen & 7) != 0)) + goto drop; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + goto drop; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3); + + if (!local_srh_state.valid && + unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + goto drop; + + if (ret != BPF_REDIRECT) + seg6_lookup_nexthop(skb, NULL, 0); + + return dst_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + static struct seg6_action_desc seg6_action_table[] = { { .action = SEG6_LOCAL_ACTION_END, @@ -497,7 +568,13 @@ static struct seg6_action_desc seg6_action_table[] = { .attrs = (1 << SEG6_LOCAL_SRH), .input = input_action_end_b6_encap, .static_headroom = sizeof(struct ipv6hdr), - } + }, + { + .action = SEG6_LOCAL_ACTION_END_BPF, + .attrs = (1 << SEG6_LOCAL_BPF), + .input = input_action_end_bpf, + }, + }; static struct seg6_action_desc *__get_action_desc(int action) @@ -542,6 +619,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { .len = sizeof(struct in6_addr) }, [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, + [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, }; static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) @@ 
-719,6 +797,75 @@ static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b) return 0; } +#define MAX_PROG_NAME 256 +static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = { + [SEG6_LOCAL_BPF_PROG] = { .type = NLA_U32, }, + [SEG6_LOCAL_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested(tb, SEG6_LOCAL_BPF_PROG_MAX, + attrs[SEG6_LOCAL_BPF], bpf_prog_policy, NULL); + if (ret < 0) + return ret; + + if (!tb[SEG6_LOCAL_BPF_PROG] || !tb[SEG6_LOCAL_BPF_PROG_NAME]) + return -EINVAL; + + slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_KERNEL); + if (!slwt->bpf.name) + return -ENOMEM; + + fd = nla_get_u32(tb[SEG6_LOCAL_BPF_PROG]); + p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL); + if (IS_ERR(p)) { + kfree(slwt->bpf.name); + return PTR_ERR(p); + } + + slwt->bpf.prog = p; + return 0; +} + +static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct nlattr *nest; + + if (!slwt->bpf.prog) + return 0; + + nest = nla_nest_start(skb, SEG6_LOCAL_BPF); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, SEG6_LOCAL_BPF_PROG, slwt->bpf.prog->aux->id)) + return -EMSGSIZE; + + if (slwt->bpf.name && + nla_put_string(skb, SEG6_LOCAL_BPF_PROG_NAME, slwt->bpf.name)) + return -EMSGSIZE; + + return nla_nest_end(skb, nest); +} + +static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + if (!a->bpf.name && !b->bpf.name) + return 0; + + if (!a->bpf.name || !b->bpf.name) + return 1; + + return strcmp(a->bpf.name, b->bpf.name); +} + struct seg6_action_param { int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt); @@ -749,6 +896,11 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_OIF] = { .parse = parse_nla_oif, .put = put_nla_oif, .cmp = cmp_nla_oif }, + + [SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf, + .put = put_nla_bpf, + .cmp = cmp_nla_bpf }, + }; static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) @@ -834,6 +986,13 @@ static void seg6_local_destroy_state(struct lwtunnel_state *lwt) struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); kfree(slwt->srh); + + if (slwt->desc->attrs & (1 << SEG6_LOCAL_BPF)) { + kfree(slwt->bpf.name); + bpf_prog_put(slwt->bpf.prog); + } + + return; } static int seg6_local_fill_encap(struct sk_buff *skb, @@ -886,6 +1045,11 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt) if (attrs & (1 << SEG6_LOCAL_OIF)) nlsize += nla_total_size(4); + if (attrs & (1 << SEG6_LOCAL_BPF)) + nlsize += nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + + nla_total_size(4); + return nlsize; } From c45296aed7bd4e9c9ef9b79ca9b82df46c21277f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 24 May 2018 02:32:53 +0200 Subject: [PATCH 0453/1640] BACKPORT: bpf: properly enforce index mask to prevent out-of-bounds speculation While reviewing the verifier code, I recently noticed that the following two program variants in relation to tail calls can be loaded. 
Variant 1: # bpftool p d x i 15 0: (15) if r1 == 0x0 goto pc+3 1: (18) r2 = map[id:5] 3: (05) goto pc+2 4: (18) r2 = map[id:6] 6: (b7) r3 = 7 7: (35) if r3 >= 0xa0 goto pc+2 8: (54) (u32) r3 &= (u32) 255 9: (85) call bpf_tail_call#12 10: (b7) r0 = 1 11: (95) exit # bpftool m s i 5 5: prog_array flags 0x0 key 4B value 4B max_entries 4 memlock 4096B # bpftool m s i 6 6: prog_array flags 0x0 key 4B value 4B max_entries 160 memlock 4096B Variant 2: # bpftool p d x i 20 0: (15) if r1 == 0x0 goto pc+3 1: (18) r2 = map[id:8] 3: (05) goto pc+2 4: (18) r2 = map[id:7] 6: (b7) r3 = 7 7: (35) if r3 >= 0x4 goto pc+2 8: (54) (u32) r3 &= (u32) 3 9: (85) call bpf_tail_call#12 10: (b7) r0 = 1 11: (95) exit # bpftool m s i 8 8: prog_array flags 0x0 key 4B value 4B max_entries 160 memlock 4096B # bpftool m s i 7 7: prog_array flags 0x0 key 4B value 4B max_entries 4 memlock 4096B In both cases the index masking inserted by the verifier in order to control out of bounds speculation from a CPU via b2157399cc98 ("bpf: prevent out-of-bounds speculation") seems to be incorrect in what it is enforcing. In the 1st variant, the mask is applied from the map with the significantly larger number of entries where we would allow to a certain degree out of bounds speculation for the smaller map, and in the 2nd variant where the mask is applied from the map with the smaller number of entries, we get buggy behavior since we truncate the index of the larger map. The original intent from commit b2157399cc98 is to reject such occasions where two or more different tail call maps are used in the same tail call helper invocation. However, the check on the BPF_MAP_PTR_POISON is never hit since we never poisoned the saved pointer in the first place! We do this explicitly for map lookups but in case of tail calls we basically used the tail call map in insn_aux_data that was processed in the most recent path which the verifier walked. Thus any prior path that stored a pointer in insn_aux_data at the helper location was always overridden. Fix it by moving the map pointer poison logic into a small helper that covers both BPF helpers with the same logic. After that in fixup_bpf_calls() the poison check is then hit for tail calls and the program rejected. Latter only happens in unprivileged case since this is the *only* occasion where a rewrite needs to happen, and where such rewrite is specific to the map (max_entries, index_mask). In the privileged case the rewrite is generic for the insn->imm / insn->code update so multiple maps from different paths can be handled just fine since all the remaining logic happens in the instruction processing itself. This is similar to the case of map lookups: in case there is a collision of maps in fixup_bpf_calls() we must skip the inlined rewrite since this will turn the generic instruction sequence into a non- generic one. Thus the patch_call_imm will simply update the insn->imm location where the bpf_map_lookup_elem() will later take care of the dispatch. Given we need this 'poison' state as a check, the information of whether a map is an unpriv_array gets lost, so enforcing it prior to that needs an additional state. In general this check is needed since there are some complex and tail call intensive BPF programs out there where LLVM tends to generate such code occasionally. We therefore convert the map_ptr rather into map_state to store all this w/o extra memory overhead, and the bit whether one of the maps involved in the collision was from an unpriv_array thus needs to be retained as well there. 
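The map_state encoding used below is plain low-bit pointer tagging: struct bpf_map pointers are at least word-aligned, so bit 0 is free to carry the unpriv flag, and the poison value is an even sentinel that can never equal a real map pointer. A standalone userspace sketch of the idea (names hypothetical, mirroring but not identical to the kernel code):

#include <stdbool.h>
#include <stdint.h>

#define MAP_PTR_UNPRIV	1UL
#define MAP_PTR_POISON	((void *)(0xeB9FUL << 1))	/* even sentinel: bit 0 stays free */

static void map_state_store(unsigned long *state, const void *map, bool unpriv)
{
	/* keep the unpriv bit sticky across all verifier paths */
	unpriv |= *state & MAP_PTR_UNPRIV;
	*state = (unsigned long)map | (unpriv ? MAP_PTR_UNPRIV : 0UL);
}

static void *map_state_ptr(unsigned long state)
{
	return (void *)(state & ~MAP_PTR_UNPRIV);
}

static bool map_state_poisoned(unsigned long state)
{
	return map_state_ptr(state) == MAP_PTR_POISON;
}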
Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 87 ++++++++++++++++++++++++++---------- 2 files changed, 65 insertions(+), 24 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index a8583be65b86..1c8517320ea6 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -155,7 +155,7 @@ struct bpf_verifier_state_list { struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ - struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ + unsigned long map_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f81d035f2172..260cb7ff9356 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -157,7 +157,29 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 -#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) +#define BPF_MAP_PTR_UNPRIV 1UL +#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ + POISON_POINTER_DELTA)) +#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) + +static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) +{ + return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON; +} + +static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) +{ + return aux->map_state & BPF_MAP_PTR_UNPRIV; +} + +static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, + const struct bpf_map *map, bool unpriv) +{ + BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); + unpriv |= bpf_map_ptr_unpriv(aux); + aux->map_state = (unsigned long)map | + (unpriv ? 
BPF_MAP_PTR_UNPRIV : 0UL); +} struct bpf_call_arg_meta { struct bpf_map *map_ptr; @@ -2373,6 +2395,29 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return 0; } +static int +record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, + int func_id, int insn_idx) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + + if (func_id != BPF_FUNC_tail_call && + func_id != BPF_FUNC_map_lookup_elem) + return 0; + if (meta->map_ptr == NULL) { + verbose(env, "kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + + if (!BPF_MAP_PTR(aux->map_state)) + bpf_map_ptr_store(aux, meta->map_ptr, + meta->map_ptr->unpriv_array); + else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr) + bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, + meta->map_ptr->unpriv_array); + return 0; +} + static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; @@ -2431,13 +2476,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; - if (func_id == BPF_FUNC_tail_call) { - if (meta.map_ptr == NULL) { - verbose(env, "verifier bug\n"); - return -EINVAL; - } - env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; - } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -2448,6 +2486,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; + err = record_func_map(env, &meta, func_id, insn_idx); + if (err) + return err; + /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. */ @@ -2472,8 +2514,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { - struct bpf_insn_aux_data *insn_aux; - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); @@ -2489,11 +2529,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; - insn_aux = &env->insn_aux_data[insn_idx]; - if (!insn_aux->map_ptr) - insn_aux->map_ptr = meta.map_ptr; - else if (insn_aux->map_ptr != meta.map_ptr) - insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -5835,11 +5870,11 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; + struct bpf_insn_aux_data *aux; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; struct bpf_map *map_ptr; int i, cnt, delta = 0; - struct bpf_insn_aux_data *aux; for (i = 0; i < insn_cnt; i++, insn++) { if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || @@ -5983,19 +6018,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + aux = &env->insn_aux_data[i + delta]; + if (!bpf_map_ptr_unpriv(aux)) + continue; + /* instead of changing every JIT dealing with tail_call * emit two extra insns: * if (index >= max_entries) goto out; * index &= array->index_mask; * to avoid out-of-bounds cpu speculation */ - map_ptr = env->insn_aux_data[i + delta].map_ptr; - 
if (map_ptr == BPF_MAP_PTR_POISON) { + if (bpf_map_ptr_poisoned(aux)) { verbose(env, "tail_call abusing map_ptr\n"); return -EINVAL; } - if (!map_ptr->unpriv_array) - continue; + + map_ptr = BPF_MAP_PTR(aux->map_state); insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -6019,9 +6057,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_map_lookup_elem) { - map_ptr = env->insn_aux_data[i + delta].map_ptr; - if (map_ptr == BPF_MAP_PTR_POISON || - !map_ptr->ops->map_gen_lookup) + aux = &env->insn_aux_data[i + delta]; + if (bpf_map_ptr_poisoned(aux)) + goto patch_call_imm; + + map_ptr = BPF_MAP_PTR(aux->map_state); + if (!map_ptr->ops->map_gen_lookup) goto patch_call_imm; cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); From 9dc134bf3145b118fce532338ddef16418069bb3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Jan 2018 17:07:59 +0100 Subject: [PATCH 0454/1640] UPSTREAM: perf/core: Fix another perf,trace,cpuhp lock inversion Lockdep noticed the following 3-way lockup race: perf_trace_init() #0 mutex_lock(&event_mutex) perf_trace_event_init() perf_trace_event_reg() tp_event->class->reg() := tracepoint_probe_register #1 mutex_lock(&tracepoints_mutex) trace_point_add_func() #2 static_key_enable() #2 do_cpu_up() perf_event_init_cpu() #3 mutex_lock(&pmus_lock) #4 mutex_lock(&ctx->mutex) perf_ioctl() #4 ctx = perf_event_ctx_lock() _perf_iotcl() ftrace_profile_set_filter() #0 mutex_lock(&event_mutex) Fudge it for now by noting that the tracepoint state does not depend on the event <-> context relation. Ugly though :/ Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 9a0af98caead..513d6add509d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9003,6 +9003,29 @@ fail_clear_files: return ret; } +static int +perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) +{ + struct perf_event_context *ctx = event->ctx; + int ret; + + /* + * Beware, here be dragons!! + * + * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint + * stuff does not actually need it. So temporarily drop ctx->mutex. As per + * perf_event_ctx_lock() we already have a reference on ctx. + * + * This can result in event getting moved to a different ctx, but that + * does not affect the tracepoint state. 
+ */ + mutex_unlock(&ctx->mutex); + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + mutex_lock(&ctx->mutex); + + return ret; +} + static int perf_event_set_filter(struct perf_event *event, void __user *arg) { char *filter_str; @@ -9019,8 +9042,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg) if (IS_ENABLED(CONFIG_EVENT_TRACING) && event->attr.type == PERF_TYPE_TRACEPOINT) - ret = ftrace_profile_set_filter(event, event->attr.config, - filter_str); + ret = perf_tracepoint_set_filter(event, filter_str); else if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); From 630cb19bf2a7cdc79625acc97d8741e8cefc2b81 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2017 14:45:13 -0800 Subject: [PATCH 0455/1640] UPSTREAM: perf/core: Prepare perf_event.h for new types: 'perf_kprobe' and 'perf_uprobe' Two new perf types, perf_kprobe and perf_uprobe, will be added to allow creating [k,u]probes with perf_event_open. These [k,u]probes are associated with the file descriptor created by perf_event_open(), and are thus easy to clean up when the file descriptor is destroyed. kprobe_func and uprobe_path are added to union config1 for pointers to function name for kprobe or binary path for uprobe. kprobe_addr and probe_offset are added to union config2 for kernel address (when kprobe_func is NULL), or [k,u]probe offset. Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yonghong Song Reviewed-by: Josef Bacik Acked-by: Alexei Starovoitov Cc: Cc: Cc: Cc: Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171206224518.3598254-4-songliubraving@fb.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 60460cc6d78f..11403fb6dd64 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -380,10 +380,14 @@ struct perf_event_attr { __u32 bp_type; union { __u64 bp_addr; + __u64 kprobe_func; /* for perf_kprobe */ + __u64 uprobe_path; /* for perf_uprobe */ __u64 config1; /* extension of config */ }; union { __u64 bp_len; + __u64 kprobe_addr; /* when kprobe_func == NULL */ + __u64 probe_offset; /* for perf_[k,u]probe */ __u64 config2; /* extension of config1 */ }; __u64 branch_sample_type; /* enum perf_branch_sample_type */ From 30febf09c86e93b61530f366630edfbb208e1269 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2017 14:45:15 -0800 Subject: [PATCH 0456/1640] UPSTREAM: perf/core: Implement the 'perf_kprobe' PMU A new PMU type, perf_kprobe, is added. Based on attr from perf_event_open(), perf_kprobe creates a kprobe (or kretprobe) for the perf_event. This kprobe is private to this perf_event, and thus not added to global lists, and not available in tracefs. Two functions, create_local_trace_kprobe() and destroy_local_trace_kprobe() are added to create and destroy these local trace_kprobes. 
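From user space the new PMU is driven through the generic dynamic-PMU interface. A hedged sketch (error paths trimmed, function name invented here) that opens a k(ret)probe on a kernel symbol; the attr fields are the ones added to perf_event.h two patches earlier, so a sufficiently new uapi header is assumed:

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_kprobe(const char *func, int retprobe)
{
	struct perf_event_attr attr;
	unsigned int type;
	FILE *f;

	/* dynamic PMUs export their type id via sysfs */
	f = fopen("/sys/bus/event_source/devices/kprobe/type", "r");
	if (!f)
		return -1;
	if (fscanf(f, "%u", &type) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.config = retprobe ? 1 : 0;	/* PERF_PROBE_CONFIG_IS_RETPROBE, format "config:0" */
	attr.kprobe_func = (uint64_t)(unsigned long)func;
	attr.probe_offset = 0;		/* or set kprobe_addr and leave kprobe_func NULL */

	/* all tasks on CPU 0 here; real users open one event per CPU */
	return syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
}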
Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yonghong Song Reviewed-by: Josef Bacik Cc: Cc: Cc: Cc: Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171206224518.3598254-6-songliubraving@fb.com Signed-off-by: Ingo Molnar --- include/linux/trace_events.h | 4 + kernel/events/core.c | 142 ++++++++++++++++++++++++-------- kernel/trace/trace_event_perf.c | 49 +++++++++++ kernel/trace/trace_kprobe.c | 91 ++++++++++++++++++-- kernel/trace/trace_probe.h | 7 ++ 5 files changed, 250 insertions(+), 43 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 7e185267c993..819006af415e 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -552,6 +552,10 @@ extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); extern int perf_trace_add(struct perf_event *event, int flags); extern void perf_trace_del(struct perf_event *event, int flags); +#ifdef CONFIG_KPROBE_EVENTS +extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); +extern void perf_kprobe_destroy(struct perf_event *event); +#endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); diff --git a/kernel/events/core.c b/kernel/events/core.c index 513d6add509d..cf5b648c3e42 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8454,9 +8454,77 @@ static struct pmu perf_tracepoint = { .events_across_hotplug = 1, }; +#ifdef CONFIG_KPROBE_EVENTS +/* + * Flags in config, used by dynamic PMU kprobe and uprobe + * The flags should match following PMU_FORMAT_ATTR(). 
+ * + * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe + * if not set, create kprobe/uprobe + */ +enum perf_probe_config { + PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ +}; + +PMU_FORMAT_ATTR(retprobe, "config:0"); + +static struct attribute *probe_attrs[] = { + &format_attr_retprobe.attr, + NULL, +}; + +static struct attribute_group probe_format_group = { + .name = "format", + .attrs = probe_attrs, +}; + +static const struct attribute_group *probe_attr_groups[] = { + &probe_format_group, + NULL, +}; + +static int perf_kprobe_event_init(struct perf_event *event); +static struct pmu perf_kprobe = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_kprobe_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, + .attr_groups = probe_attr_groups, +}; + +static int perf_kprobe_event_init(struct perf_event *event) +{ + int err; + bool is_retprobe; + + if (event->attr.type != perf_kprobe.type) + return -ENOENT; + /* + * no branch sampling for probe events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + err = perf_kprobe_init(event, is_retprobe); + if (err) + return err; + + event->destroy = perf_kprobe_destroy; + + return 0; +} +#endif /* CONFIG_KPROBE_EVENTS */ + static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); +#ifdef CONFIG_KPROBE_EVENTS + perf_pmu_register(&perf_kprobe, "kprobe", -1); +#endif } static void perf_event_free_filter(struct perf_event *event) @@ -8533,13 +8601,28 @@ static void perf_event_free_bpf_handler(struct perf_event *event) } #endif +/* + * returns true if the event is a tracepoint, or a kprobe/upprobe created + * with perf_event_open() + */ +static inline bool perf_event_is_tracing(struct perf_event *event) +{ + if (event->pmu == &perf_tracepoint) + return true; +#ifdef CONFIG_KPROBE_EVENTS + if (event->pmu == &perf_kprobe) + return true; +#endif + return false; +} + static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; int ret; - if (event->attr.type != PERF_TYPE_TRACEPOINT) + if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog_fd); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; @@ -8585,7 +8668,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) static void perf_event_free_bpf_prog(struct perf_event *event) { - if (event->attr.type != PERF_TYPE_TRACEPOINT) { + if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; } @@ -9003,47 +9086,36 @@ fail_clear_files: return ret; } -static int -perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) -{ - struct perf_event_context *ctx = event->ctx; - int ret; - - /* - * Beware, here be dragons!! - * - * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint - * stuff does not actually need it. So temporarily drop ctx->mutex. As per - * perf_event_ctx_lock() we already have a reference on ctx. - * - * This can result in event getting moved to a different ctx, but that - * does not affect the tracepoint state. 
- */ - mutex_unlock(&ctx->mutex); - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - mutex_lock(&ctx->mutex); - - return ret; -} - static int perf_event_set_filter(struct perf_event *event, void __user *arg) { - char *filter_str; int ret = -EINVAL; - - if ((event->attr.type != PERF_TYPE_TRACEPOINT || - !IS_ENABLED(CONFIG_EVENT_TRACING)) && - !has_addr_filter(event)) - return -EINVAL; + char *filter_str; filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str); - if (IS_ENABLED(CONFIG_EVENT_TRACING) && - event->attr.type == PERF_TYPE_TRACEPOINT) - ret = perf_tracepoint_set_filter(event, filter_str); - else if (has_addr_filter(event)) +#ifdef CONFIG_EVENT_TRACING + if (perf_event_is_tracing(event)) { + struct perf_event_context *ctx = event->ctx; + + /* + * Beware, here be dragons!! + * + * the tracepoint muck will deadlock against ctx->mutex, but + * the tracepoint stuff does not actually need it. So + * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we + * already have a reference on ctx. + * + * This can result in event getting moved to a different ctx, + * but that does not affect the tracepoint state. + */ + mutex_unlock(&ctx->mutex); + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + mutex_lock(&ctx->mutex); + } else +#endif + if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); kfree(filter_str); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index e4dc2fae33d6..236879b17e1b 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,6 +9,7 @@ #include #include #include "trace.h" +#include "trace_probe.h" static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; @@ -242,6 +243,54 @@ void perf_trace_destroy(struct perf_event *p_event) mutex_unlock(&event_mutex); } +#ifdef CONFIG_KPROBE_EVENTS +int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) +{ + int ret; + char *func = NULL; + struct trace_event_call *tp_event; + + if (p_event->attr.kprobe_func) { + func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!func) + return -ENOMEM; + ret = strncpy_from_user( + func, u64_to_user_ptr(p_event->attr.kprobe_func), + KSYM_NAME_LEN); + if (ret < 0) + goto out; + + if (func[0] == '\0') { + kfree(func); + func = NULL; + } + } + + tp_event = create_local_trace_kprobe( + func, (void *)(unsigned long)(p_event->attr.kprobe_addr), + p_event->attr.probe_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_kprobe(tp_event); +out: + kfree(func); + return ret; +} + +void perf_kprobe_destroy(struct perf_event *p_event) +{ + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + + destroy_local_trace_kprobe(p_event->tp_event); +} +#endif /* CONFIG_KPROBE_EVENTS */ + int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index e7e625cc49f8..5c5354b9c0eb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -472,6 +472,14 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) disable_kprobe(&tk->rp.kp); wait = 1; } + + /* + * if tk is not added to any list, it must be a local trace_kprobe + * created with perf_event_open. 
We don't need to wait for these + * trace_kprobes + */ + if (list_empty(&tk->list)) + wait = 0; out: if (wait) { /* @@ -1368,12 +1376,9 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; -static int register_kprobe_event(struct trace_kprobe *tk) +static inline void init_trace_event_call(struct trace_kprobe *tk, + struct trace_event_call *call) { - struct trace_event_call *call = &tk->tp.call; - int ret; - - /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; @@ -1382,6 +1387,19 @@ static int register_kprobe_event(struct trace_kprobe *tk) call->event.funcs = &kprobe_funcs; call->class->define_fields = kprobe_event_define_fields; } + + call->flags = TRACE_EVENT_FL_KPROBE; + call->class->reg = kprobe_register; + call->data = tk; +} + +static int register_kprobe_event(struct trace_kprobe *tk) +{ + struct trace_event_call *call = &tk->tp.call; + int ret = 0; + + init_trace_event_call(tk, call); + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); @@ -1389,9 +1407,6 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - call->flags = TRACE_EVENT_FL_KPROBE; - call->class->reg = kprobe_register; - call->data = tk; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", @@ -1413,6 +1428,66 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) return ret; } +#ifdef CONFIG_PERF_EVENTS +/* create a trace_kprobe, but don't add it to global lists */ +struct trace_event_call * +create_local_trace_kprobe(char *func, void *addr, unsigned long offs, + bool is_return) +{ + struct trace_kprobe *tk; + int ret; + char *event; + + /* + * local trace_kprobes are not added to probe_list, so they are never + * searched in find_trace_kprobe(). Therefore, there is no concern of + * duplicated name here. + */ + event = func ? 
func : "DUMMY_EVENT"; + + tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, (void *)addr, func, + offs, 0 /* maxactive */, 0 /* nargs */, + is_return); + + if (IS_ERR(tk)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tk)); + return ERR_CAST(tk); + } + + init_trace_event_call(tk, &tk->tp.call); + + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + ret = -ENOMEM; + goto error; + } + + ret = __register_trace_kprobe(tk); + if (ret < 0) + goto error; + + return &tk->tp.call; +error: + free_trace_kprobe(tk); + return ERR_PTR(ret); +} + +void destroy_local_trace_kprobe(struct trace_event_call *event_call) +{ + struct trace_kprobe *tk; + + tk = container_of(event_call, struct trace_kprobe, tp.call); + + if (trace_probe_is_enabled(&tk->tp)) { + WARN_ON(1); + return; + } + + __unregister_trace_kprobe(tk); + free_trace_kprobe(tk); +} +#endif /* CONFIG_PERF_EVENTS */ + /* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index abfeb69a800b..d0ff5a1b8381 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -423,3 +423,10 @@ store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, } extern int set_print_fmt(struct trace_probe *tp, bool is_return); + +#ifdef CONFIG_PERF_EVENTS +extern struct trace_event_call * +create_local_trace_kprobe(char *func, void *addr, unsigned long offs, + bool is_return); +extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); +#endif From fc5298310c7fa73cd96a2d15cb0560f54797b5a1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2017 14:45:16 -0800 Subject: [PATCH 0457/1640] UPSTREAM: perf/core: Implement the 'perf_uprobe' PMU This patch adds perf_uprobe support with similar pattern as previous patch (for kprobe). Two functions, create_local_trace_uprobe() and destroy_local_trace_uprobe(), are created so a uprobe can be created and attached to the file descriptor created by perf_event_open(). 
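The user-space side mirrors the kprobe case; a hedged sketch (function name invented here) using the uprobe_path/probe_offset attr fields, with the PMU type id again read from /sys/bus/event_source/devices/uprobe/type:

#include <linux/perf_event.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_uprobe(unsigned int pmu_type, const char *binary,
		       uint64_t offset, int retprobe)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = pmu_type;		/* dynamic id of the "uprobe" PMU */
	attr.config = retprobe ? 1 : 0;	/* same "config:0" retprobe bit as for kprobes */
	attr.uprobe_path = (uint64_t)(unsigned long)binary;	/* e.g. "/bin/bash" */
	attr.probe_offset = offset;	/* file offset of the probed instruction */

	return syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
}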
Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yonghong Song Reviewed-by: Josef Bacik Cc: Cc: Cc: Cc: Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171206224518.3598254-7-songliubraving@fb.com Signed-off-by: Ingo Molnar --- include/linux/trace_events.h | 4 ++ kernel/events/core.c | 48 +++++++++++++++++- kernel/trace/trace_event_perf.c | 53 ++++++++++++++++++++ kernel/trace/trace_probe.h | 4 ++ kernel/trace/trace_uprobe.c | 86 ++++++++++++++++++++++++++++++--- 5 files changed, 186 insertions(+), 9 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 819006af415e..05e3eec8dc42 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -556,6 +556,10 @@ extern void perf_trace_del(struct perf_event *event, int flags); extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_kprobe_destroy(struct perf_event *event); #endif +#ifdef CONFIG_UPROBE_EVENTS +extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe); +extern void perf_uprobe_destroy(struct perf_event *event); +#endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); diff --git a/kernel/events/core.c b/kernel/events/core.c index cf5b648c3e42..2d01731fb788 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8454,7 +8454,7 @@ static struct pmu perf_tracepoint = { .events_across_hotplug = 1, }; -#ifdef CONFIG_KPROBE_EVENTS +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe * The flags should match following PMU_FORMAT_ATTR(). 
@@ -8482,7 +8482,9 @@ static const struct attribute_group *probe_attr_groups[] = { &probe_format_group, NULL, }; +#endif +#ifdef CONFIG_KPROBE_EVENTS static int perf_kprobe_event_init(struct perf_event *event); static struct pmu perf_kprobe = { .task_ctx_nr = perf_sw_context, @@ -8519,12 +8521,52 @@ static int perf_kprobe_event_init(struct perf_event *event) } #endif /* CONFIG_KPROBE_EVENTS */ +#ifdef CONFIG_UPROBE_EVENTS +static int perf_uprobe_event_init(struct perf_event *event); +static struct pmu perf_uprobe = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_uprobe_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, + .attr_groups = probe_attr_groups, +}; + +static int perf_uprobe_event_init(struct perf_event *event) +{ + int err; + bool is_retprobe; + + if (event->attr.type != perf_uprobe.type) + return -ENOENT; + /* + * no branch sampling for probe events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + err = perf_uprobe_init(event, is_retprobe); + if (err) + return err; + + event->destroy = perf_uprobe_destroy; + + return 0; +} +#endif /* CONFIG_UPROBE_EVENTS */ + static inline void perf_tp_register(void) { perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); #ifdef CONFIG_KPROBE_EVENTS perf_pmu_register(&perf_kprobe, "kprobe", -1); #endif +#ifdef CONFIG_UPROBE_EVENTS + perf_pmu_register(&perf_uprobe, "uprobe", -1); +#endif } static void perf_event_free_filter(struct perf_event *event) @@ -8612,6 +8654,10 @@ static inline bool perf_event_is_tracing(struct perf_event *event) #ifdef CONFIG_KPROBE_EVENTS if (event->pmu == &perf_kprobe) return true; +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (event->pmu == &perf_uprobe) + return true; #endif return false; } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 236879b17e1b..f08ef5a309af 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -291,6 +291,59 @@ void perf_kprobe_destroy(struct perf_event *p_event) } #endif /* CONFIG_KPROBE_EVENTS */ +#ifdef CONFIG_UPROBE_EVENTS +int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) +{ + int ret; + char *path = NULL; + struct trace_event_call *tp_event; + + if (!p_event->attr.uprobe_path) + return -EINVAL; + path = kzalloc(PATH_MAX, GFP_KERNEL); + if (!path) + return -ENOMEM; + ret = strncpy_from_user( + path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); + if (ret < 0) + goto out; + if (path[0] == '\0') { + ret = -EINVAL; + goto out; + } + + tp_event = create_local_trace_uprobe( + path, p_event->attr.probe_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + /* + * local trace_uprobe need to hold event_mutex to call + * uprobe_buffer_enable() and uprobe_buffer_disable(). + * event_mutex is not required for local trace_kprobes. 
+ */ + mutex_lock(&event_mutex); + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_uprobe(tp_event); + mutex_unlock(&event_mutex); +out: + kfree(path); + return ret; +} + +void perf_uprobe_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); + destroy_local_trace_uprobe(p_event->tp_event); +} +#endif /* CONFIG_UPROBE_EVENTS */ + int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index d0ff5a1b8381..e990405a1c9d 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -429,4 +429,8 @@ extern struct trace_event_call * create_local_trace_kprobe(char *func, void *addr, unsigned long offs, bool is_return); extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); + +extern struct trace_event_call * +create_local_trace_uprobe(char *name, unsigned long offs, bool is_return); +extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b71d116db937..c9a5da2d19d7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1297,16 +1297,25 @@ static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; -static int register_uprobe_event(struct trace_uprobe *tu) +static inline void init_trace_event_call(struct trace_uprobe *tu, + struct trace_event_call *call) { - struct trace_event_call *call = &tu->tp.call; - int ret; - - /* Initialize trace_event_call */ INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; + call->flags = TRACE_EVENT_FL_UPROBE; + call->class->reg = trace_uprobe_register; + call->data = tu; +} + +static int register_uprobe_event(struct trace_uprobe *tu) +{ + struct trace_event_call *call = &tu->tp.call; + int ret = 0; + + init_trace_event_call(tu, call); + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; @@ -1316,9 +1325,6 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } - call->flags = TRACE_EVENT_FL_UPROBE; - call->class->reg = trace_uprobe_register; - call->data = tu; ret = trace_add_event_call(call); if (ret) { @@ -1344,6 +1350,70 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) return 0; } +#ifdef CONFIG_PERF_EVENTS +struct trace_event_call * +create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) +{ + struct trace_uprobe *tu; + struct inode *inode; + struct path path; + int ret; + + ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + + inode = igrab(d_inode(path.dentry)); + path_put(&path); + + if (!inode || !S_ISREG(inode->i_mode)) { + iput(inode); + return ERR_PTR(-EINVAL); + } + + /* + * local trace_kprobes are not added to probe_list, so they are never + * searched in find_trace_kprobe(). Therefore, there is no concern of + * duplicated name "DUMMY_EVENT" here. 
+ */ + tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0, + is_return); + + if (IS_ERR(tu)) { + pr_info("Failed to allocate trace_uprobe.(%d)\n", + (int)PTR_ERR(tu)); + return ERR_CAST(tu); + } + + tu->offset = offs; + tu->inode = inode; + tu->filename = kstrdup(name, GFP_KERNEL); + init_trace_event_call(tu, &tu->tp.call); + + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + ret = -ENOMEM; + goto error; + } + + return &tu->tp.call; +error: + free_trace_uprobe(tu); + return ERR_PTR(ret); +} + +void destroy_local_trace_uprobe(struct trace_event_call *event_call) +{ + struct trace_uprobe *tu; + + tu = container_of(event_call, struct trace_uprobe, tp.call); + + kfree(tu->tp.call.print_fmt); + tu->tp.call.print_fmt = NULL; + + free_trace_uprobe(tu); +} +#endif /* CONFIG_PERF_EVENTS */ + /* Make a trace interface for controling probe points */ static __init int init_uprobe_trace(void) { From 32cd60596aeb20032e51d90c6cfd9a19c026bc69 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 24 May 2018 11:21:08 -0700 Subject: [PATCH 0458/1640] UPSTREAM: perf/core: add perf_get_event() to return perf_event given a struct file A new extern function, perf_get_event(), is added to return a perf event given a struct file. This function will be used in later patches. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index be4bdfea9c05..58a57df528c6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -901,6 +901,7 @@ extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); +extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); @@ -1364,6 +1365,10 @@ static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } +static inline const struct perf_event *perf_get_event(struct file *file) +{ + return ERR_PTR(-EINVAL); +} static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d01731fb788..0ff0ce06df10 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11516,6 +11516,14 @@ struct file *perf_event_get(unsigned int fd) return file; } +const struct perf_event *perf_get_event(struct file *file) +{ + if (file->f_op != &perf_fops) + return ERR_PTR(-EINVAL); + + return file->private_data; +} + const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { if (!event) From 790e430552deb38713b9a08a1231d41c992a08ac Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 24 May 2018 11:21:09 -0700 Subject: [PATCH 0459/1640] UPSTREAM: bpf: introduce bpf subcommand BPF_TASK_FD_QUERY Currently, suppose a userspace application has loaded a bpf program and attached it to a tracepoint/kprobe/uprobe, and a bpf introspection tool, e.g., bpftool, wants to show which bpf 
program is attached to which tracepoint/kprobe/uprobe. Such attachment information will be really useful to understand the overall bpf deployment in the system. There is a name field (16 bytes) for each program, which could be used to encode the attachment point. There are some drawbacks for this approaches. First, bpftool user (e.g., an admin) may not really understand the association between the name and the attachment point. Second, if one program is attached to multiple places, encoding a proper name which can imply all these attachments becomes difficult. This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY. Given a pid and fd, if the is associated with a tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return . prog_id . tracepoint name, or . k[ret]probe funcname + offset or kernel addr, or . u[ret]probe filename + offset to the userspace. The user can use "bpftool prog" to find more information about bpf program itself with prog_id. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/trace_events.h | 17 +++++ include/uapi/linux/bpf.h | 26 +++++++ kernel/bpf/syscall.c | 131 +++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 48 +++++++++++++ kernel/trace/trace_kprobe.c | 29 ++++++++ kernel/trace/trace_uprobe.c | 22 ++++++ 6 files changed, 273 insertions(+) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 05e3eec8dc42..9fa0bfab007e 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -468,6 +468,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, + u32 *fd_type, const char **buf, + u64 *probe_offset, u64 *probe_addr); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -499,6 +502,13 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name { return NULL; } +static inline int bpf_get_perf_event_info(const struct perf_event *event, + u32 *prog_id, u32 *fd_type, + const char **buf, u64 *probe_offset, + u64 *probe_addr) +{ + return -EOPNOTSUPP; +} #endif enum { @@ -555,10 +565,17 @@ extern void perf_trace_del(struct perf_event *event, int flags); #ifdef CONFIG_KPROBE_EVENTS extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_kprobe_destroy(struct perf_event *event); +extern int bpf_get_kprobe_info(const struct perf_event *event, + u32 *fd_type, const char **symbol, + u64 *probe_offset, u64 *probe_addr, + bool perf_type_tracepoint); #endif #ifdef CONFIG_UPROBE_EVENTS extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_uprobe_destroy(struct perf_event *event); +extern int bpf_get_uprobe_info(const struct perf_event *event, + u32 *fd_type, const char **filename, + u64 *probe_offset, bool perf_type_tracepoint); #endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f7c769994320..de65e64424eb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -97,6 +97,7 @@ enum bpf_cmd { BPF_RAW_TRACEPOINT_OPEN, BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, + 
BPF_TASK_FD_QUERY, }; enum bpf_map_type { @@ -380,6 +381,22 @@ union bpf_attr { __u32 btf_log_size; __u32 btf_log_level; }; + + struct { + __u32 pid; /* input: pid */ + __u32 fd; /* input: fd */ + __u32 flags; /* input: flags */ + __u32 buf_len; /* input/output: buf len */ + __aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __u32 prog_id; /* output: prod_id */ + __u32 fd_type; /* output: BPF_FD_TYPE_* */ + __u64 probe_offset; /* output: probe_offset */ + __u64 probe_addr; /* output: probe_addr */ + } task_fd_query; } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF @@ -2606,4 +2623,13 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +enum bpf_task_fd_type { + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_UPROBE, /* filename + offset */ + BPF_FD_TYPE_URETPROBE, /* filename + offset */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b7137ddc3768..8898835d6736 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,7 +18,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -2203,6 +2205,132 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) return btf_get_fd_by_id(attr->btf_id); } +static int bpf_task_fd_query_copy(const union bpf_attr *attr, + union bpf_attr __user *uattr, + u32 prog_id, u32 fd_type, + const char *buf, u64 probe_offset, + u64 probe_addr) +{ + char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); + u32 len = buf ? strlen(buf) : 0, input_len; + int err = 0; + + if (put_user(len, &uattr->task_fd_query.buf_len)) + return -EFAULT; + input_len = attr->task_fd_query.buf_len; + if (input_len && ubuf) { + if (!len) { + /* nothing to copy, just make ubuf NULL terminated */ + char zero = '\0'; + + if (put_user(zero, ubuf)) + return -EFAULT; + } else if (input_len >= len + 1) { + /* ubuf can hold the string with NULL terminator */ + if (copy_to_user(ubuf, buf, len + 1)) + return -EFAULT; + } else { + /* ubuf cannot hold the string with NULL terminator, + * do a partial copy with NULL terminator. 
+ */ + char zero = '\0'; + + err = -ENOSPC; + if (copy_to_user(ubuf, buf, input_len - 1)) + return -EFAULT; + if (put_user(zero, ubuf + input_len - 1)) + return -EFAULT; + } + } + + if (put_user(prog_id, &uattr->task_fd_query.prog_id) || + put_user(fd_type, &uattr->task_fd_query.fd_type) || + put_user(probe_offset, &uattr->task_fd_query.probe_offset) || + put_user(probe_addr, &uattr->task_fd_query.probe_addr)) + return -EFAULT; + + return err; +} + +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr + +static int bpf_task_fd_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + pid_t pid = attr->task_fd_query.pid; + u32 fd = attr->task_fd_query.fd; + const struct perf_event *event; + struct files_struct *files; + struct task_struct *task; + struct file *file; + int err; + + if (CHECK_ATTR(BPF_TASK_FD_QUERY)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->task_fd_query.flags != 0) + return -EINVAL; + + task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + if (!task) + return -ENOENT; + + files = get_files_struct(task); + put_task_struct(task); + if (!files) + return -ENOENT; + + err = 0; + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (!file) + err = -EBADF; + else + get_file(file); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (err) + goto out; + + if (file->f_op == &bpf_raw_tp_fops) { + struct bpf_raw_tracepoint *raw_tp = file->private_data; + struct bpf_raw_event_map *btp = raw_tp->btp; + + err = bpf_task_fd_query_copy(attr, uattr, + raw_tp->prog->aux->id, + BPF_FD_TYPE_RAW_TRACEPOINT, + btp->tp->name, 0, 0); + goto put_file; + } + + event = perf_get_event(file); + if (!IS_ERR(event)) { + u64 probe_offset, probe_addr; + u32 prog_id, fd_type; + const char *buf; + + err = bpf_get_perf_event_info(event, &prog_id, &fd_type, + &buf, &probe_offset, + &probe_addr); + if (!err) + err = bpf_task_fd_query_copy(attr, uattr, prog_id, + fd_type, buf, + probe_offset, + probe_addr); + goto put_file; + } + + err = -ENOTSUPP; +put_file: + fput(file); +out: + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -2290,6 +2418,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); break; + case BPF_TASK_FD_QUERY: + err = bpf_task_fd_query(&attr, uattr); + break; default: err = -EINVAL; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e68cd0fe6c02..121a7ac2e68f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "trace_probe.h" @@ -1161,3 +1162,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) mutex_unlock(&bpf_event_mutex); return err; } + +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, + u32 *fd_type, const char **buf, + u64 *probe_offset, u64 *probe_addr) +{ + bool is_tracepoint, is_syscall_tp; + struct bpf_prog *prog; + int flags, err = 0; + + prog = event->prog; + if (!prog) + return -ENOENT; + + /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) + return -EOPNOTSUPP; + + *prog_id = prog->aux->id; + flags = event->tp_event->flags; + is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT; + is_syscall_tp = is_syscall_trace_event(event->tp_event); + + if (is_tracepoint || is_syscall_tp) { + *buf = is_tracepoint ? 
event->tp_event->tp->name + : event->tp_event->name; + *fd_type = BPF_FD_TYPE_TRACEPOINT; + *probe_offset = 0x0; + *probe_addr = 0x0; + } else { + /* kprobe/uprobe */ + err = -EOPNOTSUPP; +#ifdef CONFIG_KPROBE_EVENTS + if (flags & TRACE_EVENT_FL_KPROBE) + err = bpf_get_kprobe_info(event, fd_type, buf, + probe_offset, probe_addr, + event->attr.type == PERF_TYPE_TRACEPOINT); +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (flags & TRACE_EVENT_FL_UPROBE) + err = bpf_get_uprobe_info(event, fd_type, buf, + probe_offset, + event->attr.type == PERF_TYPE_TRACEPOINT); +#endif + } + + return err; +} diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5c5354b9c0eb..ff938f5b4b59 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1299,6 +1299,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, head, NULL, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); + +int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, + const char **symbol, u64 *probe_offset, + u64 *probe_addr, bool perf_type_tracepoint) +{ + const char *pevent = trace_event_name(event->tp_event); + const char *group = event->tp_event->class->system; + struct trace_kprobe *tk; + + if (perf_type_tracepoint) + tk = find_trace_kprobe(pevent, group); + else + tk = event->tp_event->data; + if (!tk) + return -EINVAL; + + *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE + : BPF_FD_TYPE_KPROBE; + if (tk->symbol) { + *symbol = tk->symbol; + *probe_offset = tk->rp.kp.offset; + *probe_addr = 0; + } else { + *symbol = NULL; + *probe_offset = 0; + *probe_addr = (unsigned long)tk->rp.kp.addr; + } + return 0; +} #endif /* CONFIG_PERF_EVENTS */ /* diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c9a5da2d19d7..03e51bd9a07a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1183,6 +1183,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, { __uprobe_perf_func(tu, func, regs, ucb, dsize); } + +int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, + const char **filename, u64 *probe_offset, + bool perf_type_tracepoint) +{ + const char *pevent = trace_event_name(event->tp_event); + const char *group = event->tp_event->class->system; + struct trace_uprobe *tu; + + if (perf_type_tracepoint) + tu = find_probe_event(pevent, group); + else + tu = event->tp_event->data; + if (!tu) + return -EINVAL; + + *fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE + : BPF_FD_TYPE_UPROBE; + *filename = tu->filename; + *probe_offset = tu->offset; + return 0; +} #endif /* CONFIG_PERF_EVENTS */ static int From b9536153763040b4d5cf7faca48518d901d73933 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:46 +0200 Subject: [PATCH 0460/1640] BACKPORT: bpf: devmap introduce dev_map_enqueue Functionality is the same, but the ndo_xdp_xmit call is now simply invoked from inside the devmap.c code. 
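For orientation, here is a minimal sketch of the BPF-side code that exercises this path — an XDP program redirecting into a DEVMAP, which after this change ends up in dev_map_enqueue(). This is illustrative only and not part of the patch; the map and section names are made up:

	#include <uapi/linux/bpf.h>
	#include "bpf_helpers.h"

	/* Illustrative only: a DEVMAP holding egress ifindexes. */
	struct bpf_map_def SEC("maps") tx_port = {
		.type		= BPF_MAP_TYPE_DEVMAP,
		.key_size	= sizeof(int),
		.value_size	= sizeof(int),
		.max_entries	= 64,
	};

	SEC("xdp_redirect_map")
	int xdp_redirect_map_prog(struct xdp_md *ctx)
	{
		int index = 0;	/* egress slot; selection policy is up to the program */

		/* bpf_redirect_map() is what ultimately drives
		 * dev_map_enqueue() on the kernel side.
		 */
		return bpf_redirect_map(&tx_port, index, 0);
	}
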
V2: Fix compile issue reported by kbuild test robot V5: Cleanups requested by Daniel - Newlines before func definition - Use BUILD_BUG_ON checks - Remove unnecessary use return value store in dev_map_enqueue Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 15 ++++++++++++--- include/trace/events/xdp.h | 9 ++++++++- kernel/bpf/devmap.c | 34 ++++++++++++++++++++++++++++------ net/core/filter.c | 15 ++------------- 4 files changed, 50 insertions(+), 23 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 42b29e54b8f5..0daebef27904 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -492,14 +492,16 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); /* Map specifics */ -struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +struct xdp_buff; + +struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); void __cpu_map_flush(struct bpf_map *map); -struct xdp_buff; int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -583,6 +585,14 @@ static inline void __dev_map_flush(struct bpf_map *map) { } +struct xdp_buff; +struct bpf_dtab_netdev; + +static inline +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +{ + return 0; +} static inline struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) @@ -598,7 +608,6 @@ static inline void __cpu_map_flush(struct bpf_map *map) { } -struct xdp_buff; static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 8989a92c571a..96104610d40e 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -138,11 +138,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, __entry->map_id, __entry->map_index) ); +#ifndef __DEVMAP_OBJ_TYPE +#define __DEVMAP_OBJ_TYPE +struct _bpf_dtab_netdev { + struct net_device *dev; +}; +#endif /* __DEVMAP_OBJ_TYPE */ + #define devmap_ifindex(fwd, map) \ (!fwd ? 0 : \ (!map ? 0 : \ ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ - ((struct net_device *)fwd)->ifindex : 0))) + ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0d056dfe35cd..20b6db7794ea 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,13 +48,15 @@ * calls will fail at this point. */ #include +#include #include +#include #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) struct bpf_dtab_netdev { - struct net_device *dev; + struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; unsigned int bit; struct rcu_head rcu; @@ -243,21 +245,38 @@ void __dev_map_flush(struct bpf_map *map) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. 
*/ -struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_dtab_netdev *dev; + struct bpf_dtab_netdev *obj; if (key >= map->max_entries) return NULL; - dev = READ_ONCE(dtab->netdev_map[key]); - return dev ? dev->dev : NULL; + obj = READ_ONCE(dtab->netdev_map[key]); + return obj; +} + +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +{ + struct net_device *dev = dst->dev; + struct xdp_frame *xdpf; + + if (!dev->netdev_ops->ndo_xdp_xmit) + return -EOPNOTSUPP; + + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + /* TODO: implement a bulking/enqueue step later */ + return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { - struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key); + struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); + struct net_device *dev = obj ? obj->dev : NULL; return dev ? &dev->ifindex : NULL; } @@ -407,6 +426,9 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ + BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != + offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); return 0; } diff --git a/net/core/filter.c b/net/core/filter.c index 9929ab0d2546..d2ad9fadb287 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3074,20 +3074,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: { - struct net_device *dev = fwd; - struct xdp_frame *xdpf; + struct bpf_dtab_netdev *dst = fwd; - if (!dev->netdev_ops->ndo_xdp_xmit) - return -EOPNOTSUPP; - - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - - /* TODO: move to inside map code instead, for bulk support - * err = dev_map_enqueue(dev, xdp); - */ - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); + err = dev_map_enqueue(dst, xdp); if (err) return err; __dev_map_insert_ctx(map, index); From eacad3dd24b237cbc94419b0b906ee8f13bea43b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:51 +0200 Subject: [PATCH 0461/1640] UPSTREAM: bpf: devmap prepare xdp frames for bulking Like cpumap, create a queue for xdp frames that will be bulked. For now, this patch simply invokes ndo_xdp_xmit for each frame. This happens either when the map flush operation is invoked, or when the limit DEV_MAP_BULK_SIZE is reached.
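To restate the scheme the diff below implements (a simplified sketch of the patch's own bq_enqueue(), not a verbatim excerpt), there are exactly two flush triggers:

	/* Sketch: frames accumulate in a per-CPU bulk queue and are
	 * pushed to the driver when the queue fills (trigger 1)...
	 */
	static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
	{
		struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);

		if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
			bq_xmit_all(obj, bq);	/* trigger 1: queue full */

		bq->q[bq->count++] = xdpf;
		return 0;
	}
	/* ...or when __dev_map_flush() drains the queue at the end of
	 * the driver's NAPI poll cycle (trigger 2).
	 */
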
V5: Avoid memleak on error path in dev_map_update_elem() Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 74 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 20b6db7794ea..20c5c00833ee 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -55,10 +55,17 @@ #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define DEV_MAP_BULK_SIZE 16 +struct xdp_bulk_queue { + struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + unsigned int count; +}; + struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; unsigned int bit; + struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; }; @@ -211,6 +218,34 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) __set_bit(bit, bitmap); } +static int bq_xmit_all(struct bpf_dtab_netdev *obj, + struct xdp_bulk_queue *bq) +{ + struct net_device *dev = obj->dev; + int i; + + if (unlikely(!bq->count)) + return 0; + + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + prefetch(xdpf); + } + + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + int err; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); + if (err) + xdp_return_frame(xdpf); + } + bq->count = 0; + + return 0; +} + /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled @@ -226,6 +261,7 @@ void __dev_map_flush(struct bpf_map *map) for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); + struct xdp_bulk_queue *bq; struct net_device *netdev; /* This is possible if the dev entry is removed by user space @@ -235,6 +271,9 @@ void __dev_map_flush(struct bpf_map *map) continue; __clear_bit(bit, bitmap); + + bq = this_cpu_ptr(dev->bulkq); + bq_xmit_all(dev, bq); netdev = dev->dev; if (likely(netdev->netdev_ops->ndo_xdp_flush)) netdev->netdev_ops->ndo_xdp_flush(netdev); @@ -257,6 +296,20 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) return obj; } +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. 
+ */ +static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) +{ + struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); + + if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) + bq_xmit_all(obj, bq); + + bq->q[bq->count++] = xdpf; + return 0; +} + int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) { struct net_device *dev = dst->dev; @@ -269,8 +322,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) if (unlikely(!xdpf)) return -EOVERFLOW; - /* TODO: implement a bulking/enqueue step later */ - return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); + return bq_enqueue(dst, xdpf); } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) @@ -285,13 +337,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_flush) { struct net_device *fl = dev->dev; + struct xdp_bulk_queue *bq; unsigned long *bitmap; + int cpu; for_each_online_cpu(cpu) { bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); __clear_bit(dev->bit, bitmap); + bq = per_cpu_ptr(dev->bulkq, cpu); + bq_xmit_all(dev, bq); + fl->netdev_ops->ndo_xdp_flush(dev->dev); } } @@ -303,6 +360,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu) dev = container_of(rcu, struct bpf_dtab_netdev, rcu); dev_map_flush_old(dev); + free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -335,6 +393,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct net *net = current->nsproxy->net_ns; + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; @@ -349,13 +408,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, - map->numa_node); + dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); if (!dev) return -ENOMEM; + dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), + sizeof(void *), gfp); + if (!dev->bulkq) { + kfree(dev); + return -ENOMEM; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { + free_percpu(dev->bulkq); kfree(dev); return -EINVAL; } From 3b8bc503136681f9c54bd4c7f7756c8fe345c11d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:57 +0200 Subject: [PATCH 0462/1640] UPSTREAM: xdp: add tracepoint for devmap like cpumap have Notice how this allows us to get XDP statistics without affecting XDP performance, as the tracepoint is no longer activated on a per-packet basis. V5: Spotted by John Fastabend. In this patch 'sent' also counted 'drops'; a later patch corrected this, but it was a mistake in this intermediate step.
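With the tracepoint enabled via tracefs, each bulk flush emits a single record; given the TP_printk() format added below, a record would look roughly like this (field values illustrative):

	ndo_xdp_xmit map_id=5 map_index=0 action=REDIRECT sent=16 drops=0 from_ifindex=2 to_ifindex=3
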
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++-- include/trace/events/xdp.h | 39 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/devmap.c | 27 ++++++++++++++++++++++---- net/core/filter.c | 2 +- 4 files changed, 67 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0daebef27904..59bc670e7c65 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -497,7 +497,8 @@ struct xdp_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp); +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); @@ -589,7 +590,8 @@ struct xdp_buff; struct bpf_dtab_netdev; static inline -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) { return 0; } diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 96104610d40e..2e9ef0650144 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -229,6 +229,45 @@ TRACE_EVENT(xdp_cpumap_enqueue, __entry->to_cpu) ); +TRACE_EVENT(xdp_devmap_xmit, + + TP_PROTO(const struct bpf_map *map, u32 map_index, + int sent, int drops, + const struct net_device *from_dev, + const struct net_device *to_dev), + + TP_ARGS(map, map_index, sent, drops, from_dev, to_dev), + + TP_STRUCT__entry( + __field(int, map_id) + __field(u32, act) + __field(u32, map_index) + __field(int, drops) + __field(int, sent) + __field(int, from_ifindex) + __field(int, to_ifindex) + ), + + TP_fast_assign( + __entry->map_id = map->id; + __entry->act = XDP_REDIRECT; + __entry->map_index = map_index; + __entry->drops = drops; + __entry->sent = sent; + __entry->from_ifindex = from_dev->ifindex; + __entry->to_ifindex = to_dev->ifindex; + ), + + TP_printk("ndo_xdp_xmit" + " map_id=%d map_index=%d action=%s" + " sent=%d drops=%d" + " from_ifindex=%d to_ifindex=%d", + __entry->map_id, __entry->map_index, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->sent, __entry->drops, + __entry->from_ifindex, __entry->to_ifindex) +); + #endif /* _TRACE_XDP_H */ #include diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 20c5c00833ee..45d9b9e6f691 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -58,6 +58,7 @@ #define DEV_MAP_BULK_SIZE 16 struct xdp_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct net_device *dev_rx; unsigned int count; }; @@ -222,6 +223,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, struct xdp_bulk_queue *bq) { struct net_device *dev = obj->dev; + int sent = 0, drops = 0; int i; if (unlikely(!bq->count)) @@ -238,11 +240,18 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, int err; err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); - if (err) + if (err) { + drops++; xdp_return_frame(xdpf); + } else { + sent++; + } } bq->count = 0; + trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, + sent, drops, bq->dev_rx, dev); + bq->dev_rx = NULL; return 0; } @@ -299,18 +308,28 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs under RCU-read-side, plus in softirq under NAPI protection. 
* Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) +static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, + struct net_device *dev_rx) + { struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(obj, bq); + /* Ingress dev_rx will be the same for all xdp_frame's in + * bulk_queue, because bq is stored per-CPU and must be flushed + * from the net_device driver's NAPI func end. + */ + if (!bq->dev_rx) + bq->dev_rx = dev_rx; + bq->q[bq->count++] = xdpf; return 0; } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) { struct net_device *dev = dst->dev; struct xdp_frame *xdpf; @@ -322,7 +341,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) if (unlikely(!xdpf)) return -EOVERFLOW; - return bq_enqueue(dst, xdpf); + return bq_enqueue(dst, xdpf, dev_rx); } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) diff --git a/net/core/filter.c b/net/core/filter.c index d2ad9fadb287..7e2e802a38d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3076,7 +3076,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, case BPF_MAP_TYPE_DEVMAP: { struct bpf_dtab_netdev *dst = fwd; - err = dev_map_enqueue(dst, xdp); + err = dev_map_enqueue(dst, xdp, dev_rx); if (err) return err; __dev_map_insert_ctx(map, index); From 3f760176ffbee28e79927f6ccc358539877bb346 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:07 +0200 Subject: [PATCH 0463/1640] UPSTREAM: xdp: introduce xdp_return_frame_rx_napi When sending an xdp_frame through an xdp_do_redirect call, error cases can happen where the xdp_frame needs to be dropped, and returning an -errno code isn't sufficient/possible any longer (e.g. for the cpumap case). This is already fully supported by simply calling xdp_return_frame. This patch is an optimization, which provides xdp_return_frame_rx_napi, which is a faster variant for these error cases. It takes advantage of the protection provided by XDP RX running under NAPI protection. This change is mostly relevant for drivers using the page_pool allocator as it can take advantage of this. (Tested with mlx5). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/net/page_pool.h | 5 +++-- include/net/xdp.h | 1 + kernel/bpf/cpumap.c | 2 +- kernel/bpf/devmap.c | 2 +- net/core/xdp.c | 20 ++++++++++++++++---- 5 files changed, 22 insertions(+), 8 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index c79087153148..694d055e01ef 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -115,13 +115,14 @@ void page_pool_destroy(struct page_pool *pool); void __page_pool_put_page(struct page_pool *pool, struct page *page, bool allow_direct); -static inline void page_pool_put_page(struct page_pool *pool, struct page *page) +static inline void page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
*/ #ifdef CONFIG_PAGE_POOL - __page_pool_put_page(pool, page, false); + __page_pool_put_page(pool, page, allow_direct); #endif } /* Very limited use-cases allow recycle direct */ diff --git a/include/net/xdp.h b/include/net/xdp.h index 0b689cf561c7..7ad779237ae8 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) } void xdp_return_frame(struct xdp_frame *xdpf); +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index c95b04ec103e..e0918d180f08 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } processed++; } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 45d9b9e6f691..80d835cd080d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -242,7 +242,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); if (err) { drops++; - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } else { sent++; } diff --git a/net/core/xdp.c b/net/core/xdp.c index bf6758f74339..cb8c4e061a5a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -308,7 +308,13 @@ err: } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -static void xdp_return(void *data, struct xdp_mem_info *mem) +/* XDP RX runs under NAPI protection, and in different delivery error + * scenarios (e.g. queue full), it is possible to return the xdp_frame + * while still leveraging this protection. The @napi_direct boolean + * is used for those call sites. Thus, allowing for faster recycling + * of xdp_frames/pages in those cases. + */ +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) { struct xdp_mem_allocator *xa; struct page *page; @@ -320,7 +326,7 @@ static void xdp_return(void *data, struct xdp_mem_info *mem) xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); if (xa) - page_pool_put_page(xa->page_pool, page); + page_pool_put_page(xa->page_pool, page, napi_direct); else put_page(page); rcu_read_unlock(); @@ -340,12 +346,18 @@ static void xdp_return(void *data, struct xdp_mem_info *mem) void xdp_return_frame(struct xdp_frame *xdpf) { - xdp_return(xdpf->data, &xdpf->mem); + __xdp_return(xdpf->data, &xdpf->mem, false); } EXPORT_SYMBOL_GPL(xdp_return_frame); +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) +{ + __xdp_return(xdpf->data, &xdpf->mem, true); +} +EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); + void xdp_return_buff(struct xdp_buff *xdp) { - xdp_return(xdp->data, &xdp->rxq->mem); + __xdp_return(xdp->data, &xdp->rxq->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_buff); From e1aa0ffa4b88dbc198a6414722b9a5f3411a022b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:12 +0200 Subject: [PATCH 0464/1640] BACKPORT: xdp: change ndo_xdp_xmit API to support bulking This patch changes the API for ndo_xdp_xmit to support bulking xdp_frames. When the kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown. Most of the slowdown is caused by DMA API indirect function calls, but also the net_device->ndo_xdp_xmit() call.
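As a sketch of the contract the new signature asks of drivers (hypothetical driver code, not taken from any real driver): return the number of frames actually sent, and free any dropped frames locally via xdp_return_frame().

	/* Hypothetical bulked ndo_xdp_xmit implementation. */
	static int sketch_xdp_xmit(struct net_device *dev, int n,
				   struct xdp_frame **frames)
	{
		int i, sent = 0;

		for (i = 0; i < n; i++) {
			/* sketch_tx_one() stands in for the driver's TX logic */
			if (sketch_tx_one(dev, frames[i]) == 0)
				sent++;
			else
				xdp_return_frame(frames[i]);	/* dropped: recycle here */
		}
		return sent;	/* < n means (n - sent) frames were dropped and freed */
	}
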
Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed performance improved: for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps With frames avail as a bulk inside the driver ndo_xdp_xmit call, further optimizations are possible, like bulk DMA-mapping for TX. Testing without CONFIG_RETPOLINE show the same performance for physical NIC drivers. The virtual NIC driver tun sees a huge performance boost, as it can avoid doing per frame producer locking, but instead amortize the locking cost over the bulk. V2: Fix compile errors reported by kbuild test robot V4: Isolated ndo, driver changes and callers. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 14 +++++++++----- kernel/bpf/devmap.c | 29 ++++++++++++++++++----------- net/core/filter.c | 8 ++++---- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6c973b71a5c4..a04da0b9b684 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1188,9 +1188,13 @@ struct macsec_ops { * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp); - * This function is used to submit a XDP packet for transmit on a - * netdevice. + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp); + * This function is used to submit @n XDP packets for transmit on a + * netdevice. Returns number of frames successfully transmitted, frames + * that got dropped are freed/returned via xdp_return_frame(). + * Returns negative number, means general error invoking ndo, meaning + * no frames were xmit'ed and core-caller will free all frames. + * TODO: Consider add flag to allow sending flush operation. * void (*ndo_xdp_flush)(struct net_device *dev); * This function is used to inform the driver to flush a particular * xdp tx queue. Must be called on same CPU as xdp_xmit. @@ -1377,8 +1381,8 @@ struct net_device_ops { int needed_headroom); int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); - int (*ndo_xdp_xmit)(struct net_device *dev, - struct xdp_frame *xdp); + int (*ndo_xdp_xmit)(struct net_device *dev, int n, + struct xdp_frame **xdp); void (*ndo_xdp_flush)(struct net_device *dev); }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 80d835cd080d..d810c4d40931 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -235,24 +235,31 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, prefetch(xdpf); } - for (i = 0; i < bq->count; i++) { - struct xdp_frame *xdpf = bq->q[i]; - int err; - - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); - if (err) { - drops++; - xdp_return_frame_rx_napi(xdpf); - } else { - sent++; - } + sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q); + if (sent < 0) { + sent = 0; + goto error; } + drops = bq->count - sent; +out: bq->count = 0; trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, sent, drops, bq->dev_rx, dev); bq->dev_rx = NULL; return 0; +error: + /* If ndo_xdp_xmit fails with an errno, no frames have been + * xmit'ed and it's our responsibility to them free all. 
+ */ + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + /* RX path under NAPI protection, can return frames faster */ + xdp_return_frame_rx_napi(xdpf); + drops++; + } + goto out; } /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled diff --git a/net/core/filter.c b/net/core/filter.c index 7e2e802a38d1..ba58e1d37de4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3048,7 +3048,7 @@ static int __bpf_tx_xdp(struct net_device *dev, u32 index) { struct xdp_frame *xdpf; - int err; + int sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; @@ -3058,9 +3058,9 @@ static int __bpf_tx_xdp(struct net_device *dev, if (unlikely(!xdpf)) return -EOVERFLOW; - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); - if (err) - return err; + sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf); + if (sent <= 0) + return sent; dev->netdev_ops->ndo_xdp_flush(dev); return 0; } From 775c180022b86ff30ad62632b15522bda1165f9a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:17 +0200 Subject: [PATCH 0465/1640] UPSTREAM: xdp/trace: extend tracepoint in devmap with an err Extending the tracepoint xdp:xdp_devmap_xmit in devmap with an err code allows people to more easily identify why the ndo_xdp_xmit call to a given driver is failing. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/trace/events/xdp.h | 10 ++++++---- kernel/bpf/devmap.c | 5 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 2e9ef0650144..1ecf4c67fcf7 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -234,9 +234,9 @@ TRACE_EVENT(xdp_devmap_xmit, TP_PROTO(const struct bpf_map *map, u32 map_index, int sent, int drops, const struct net_device *from_dev, - const struct net_device *to_dev), + const struct net_device *to_dev, int err), - TP_ARGS(map, map_index, sent, drops, from_dev, to_dev), + TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err), TP_STRUCT__entry( __field(int, map_id) @@ -246,6 +246,7 @@ TRACE_EVENT(xdp_devmap_xmit, __field(int, sent) __field(int, from_ifindex) __field(int, to_ifindex) + __field(int, err) ), TP_fast_assign( @@ -256,16 +257,17 @@ TRACE_EVENT(xdp_devmap_xmit, __entry->sent = sent; __entry->from_ifindex = from_dev->ifindex; __entry->to_ifindex = to_dev->ifindex; + __entry->err = err; ), TP_printk("ndo_xdp_xmit" " map_id=%d map_index=%d action=%s" " sent=%d drops=%d" - " from_ifindex=%d to_ifindex=%d", + " from_ifindex=%d to_ifindex=%d err=%d", __entry->map_id, __entry->map_index, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, - __entry->from_ifindex, __entry->to_ifindex) + __entry->from_ifindex, __entry->to_ifindex, __entry->err) ); #endif /* _TRACE_XDP_H */ #include diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d810c4d40931..6721f3404bbf 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -223,7 +223,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, struct xdp_bulk_queue *bq) { struct net_device *dev = obj->dev; - int sent = 0, drops = 0; + int sent = 0, drops = 0, err = 0; int i; if (unlikely(!bq->count)) @@ -237,6 +237,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q); if (sent < 0) { + err = sent; sent = 0; goto error; } @@ -245,7 +246,7 @@ out: bq->count = 0; trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, - sent, drops,
bq->dev_rx, dev, err); bq->dev_rx = NULL; return 0; error: From 614d15fcb7541db895cd235f5d4ac418f7e47b56 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 May 2018 23:33:19 +0200 Subject: [PATCH 0466/1640] UPSTREAM: bpf: btf: avoid -Wreturn-type warning gcc warns about a noreturn function possibly returning in some configurations: kernel/bpf/btf.c: In function 'env_type_is_resolve_sink': kernel/bpf/btf.c:729:1: error: control reaches end of non-void function [-Werror=return-type] Using BUG() instead of BUG_ON() avoids that warning and otherwise does the exact same thing. Fixes: eb3f595dab40 ("bpf: btf: Validate type reference") Signed-off-by: Arnd Bergmann Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7e90fd13b5b5..3d20aa1f4b54 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -749,7 +749,7 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, !btf_type_is_array(next_type) && !btf_type_is_struct(next_type); default: - BUG_ON(1); + BUG(); } } From 8ddcd6ae9bcb1a52e434f44104bae19cc505b79f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 May 2018 23:33:20 +0200 Subject: [PATCH 0467/1640] UPSTREAM: bpf: avoid -Wmaybe-uninitialized warning The stack_map_get_build_id_offset() function is too long for gcc to track whether 'work' may or may not be initialized at the end of it, leading to a false-positive warning: kernel/bpf/stackmap.c: In function 'stack_map_get_build_id_offset': kernel/bpf/stackmap.c:334:13: error: 'work' may be used uninitialized in this function [-Werror=maybe-uninitialized] This removes the 'in_nmi_ctx' flag and uses the state of that variable itself to see if it got initialized. Fixes: bae77c5eb5b2 ("bpf: enable stackmap with build_id in nmi context") Signed-off-by: Arnd Bergmann Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/stackmap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 35ce9851c1cc..491abd1556df 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -291,11 +291,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, { int i; struct vm_area_struct *vma; - bool in_nmi_ctx = in_nmi(); bool irq_work_busy = false; - struct stack_map_irq_work *work; + struct stack_map_irq_work *work = NULL; - if (in_nmi_ctx) { + if (in_nmi()) { work = this_cpu_ptr(&up_read_work); if (work->irq_work.flags & IRQ_WORK_BUSY) /* cannot queue more up_read, fallback */ @@ -334,7 +333,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, id_offs[i].status = BPF_STACK_BUILD_ID_VALID; } - if (!in_nmi_ctx) { + if (!work) { up_read(¤t->mm->mmap_sem); } else { work->sem = ¤t->mm->mmap_sem; From 276d6d5e61c8b6ce960aca2f0cfcfbdfc306da5c Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 25 May 2018 08:55:22 -0700 Subject: [PATCH 0468/1640] UPSTREAM: bpf: Define cgroup_bpf_enabled for CONFIG_CGROUP_BPF=n Static key is used to enable/disable cgroup-bpf related code paths at run time. Though it's not defined when cgroup-bpf is disabled at compile time, i.e. CONFIG_CGROUP_BPF=n, and if some code wants to use it, it has to do this: #ifdef CONFIG_CGROUP_BPF if (cgroup_bpf_enabled) { /* ... some work ... */ } #endif This code can be simplified by setting cgroup_bpf_enabled to 0 for CONFIG_CGROUP_BPF=n case: if (cgroup_bpf_enabled) { /* ... some work ... 
*/ } And it aligns well with existing BPF_CGROUP_RUN_PROG_* macros that are defined for both states of CONFIG_CGROUP_BPF. Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 30d15e64b993..de8e89a3758b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -185,6 +185,7 @@ struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +#define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) From 2358c47707b8fed8cd4f9d7061d4a4f8f25284e8 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 25 May 2018 08:55:23 -0700 Subject: [PATCH 0469/1640] UPSTREAM: bpf: Hooks for sys_sendmsg In addition to the already existing BPF hooks for sys_bind and sys_connect, the patch provides new hooks for sys_sendmsg. It leverages the existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that provides access to the socket itself (properties like family, type, protocol) and the user-passed `struct sockaddr *` so that a BPF program can override destination IP and port for system calls such as sendto(2) or sendmsg(2) and/or assign source IP to the socket. The hooks are implemented as two new attach types: `BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and UDPv6 correspondingly. UDPv4 and UDPv6 use separate attach types for the same reason as the sys_bind and sys_connect hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields when the user passes sockaddr_in, since it'd be out-of-bounds. The difference from the already existing hooks is that the sys_sendmsg hooks are implemented only for unconnected UDP. For TCP it doesn't make sense to change the user-provided `struct sockaddr *` at sendto(2)/sendmsg(2) time since the socket either was already connected and has source/destination set, or wasn't connected and a call to sendto(2)/sendmsg(2) would lead to ENOTCONN anyway. Connected UDP is already handled by the sys_connect hooks that can override source/destination at connect time and use the fast-path later, i.e. these hooks don't affect the UDP fast-path. Rewriting the source IP is implemented differently than in the sys_connect hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to just bind the socket to a desired local IP address, since the source IP can be set on a per-packet basis by using ancillary data (cmsg(3)). So no matter if the socket is bound or not, the source IP has to be rewritten on every call to sys_sendmsg. To do so two new fields are added to UAPI `struct bpf_sock_addr`: * `msg_src_ip4` to set source IPv4 for UDPv4; * `msg_src_ip6` to set source IPv6 for UDPv6.
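To make the new attach types concrete, here is a sketch of a program for BPF_CGROUP_UDP4_SENDMSG (illustrative, not part of the patch; the section name follows the selftests convention and all constants are made up):

	#include <uapi/linux/bpf.h>
	#include "bpf_helpers.h"
	#include "bpf_endian.h"

	SEC("cgroup/sendmsg4")
	int sendmsg_v4_prog(struct bpf_sock_addr *ctx)
	{
		/* Redirect unconnected UDP sends aimed at port 5353 to a
		 * local resolver, and pin the source IP via the new
		 * msg_src_ip4 field.
		 */
		if (ctx->user_port == bpf_htons(5353)) {
			ctx->user_ip4	 = bpf_htonl(0x7f000001);	/* 127.0.0.1 */
			ctx->user_port	 = bpf_htons(5300);
			ctx->msg_src_ip4 = bpf_htonl(0x7f000001);
		}
		return 1;	/* 1 = permit the syscall */
	}
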
Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 23 ++++++++++++++++------ include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/cgroup.c | 11 ++++++++++- kernel/bpf/syscall.c | 8 ++++++++ net/core/filter.c | 39 ++++++++++++++++++++++++++++++++++++++ net/ipv4/udp.c | 20 +++++++++++++++++-- net/ipv6/udp.c | 24 +++++++++++++++++++++++ 8 files changed, 125 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index de8e89a3758b..975fb4cf1bb7 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -66,7 +66,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type); + enum bpf_attach_type type, + void *t_ctx); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, @@ -120,16 +121,18 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + NULL); \ __ret; \ }) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type) \ +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + t_ctx); \ release_sock(sk); \ } \ __ret; \ @@ -151,10 +154,16 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) + +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) + +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ @@ -198,6 +207,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/linux/filter.h b/include/linux/filter.h index 8f1ca945a8f9..b9f9efaf86f8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1091,6 +1091,7 @@ struct bpf_sock_addr_kern { * only two (src and dst) are available at convert_ctx_access time */ u64 tmp_reg; + void *t_ctx; /* Attach type specific context. 
*/ }; struct bpf_sock_ops_kern { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index de65e64424eb..cdc1b97c9f00 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -160,6 +160,8 @@ enum bpf_attach_type { BPF_CGROUP_INET6_CONNECT, BPF_CGROUP_INET4_POST_BIND, BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_UDP4_SENDMSG, + BPF_CGROUP_UDP6_SENDMSG, __MAX_BPF_ATTACH_TYPE }; @@ -2412,6 +2414,12 @@ struct bpf_sock_addr { __u32 family; /* Allows 4-byte read, but no write */ __u32 type; /* Allows 4-byte read, but no write */ __u32 protocol; /* Allows 4-byte read, but no write */ + __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write. + * Stored in network byte order. + */ + __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. + * Stored in network byte order. + */ }; /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 1203b064198f..cfb36a755854 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -499,6 +499,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user * @type: The type of program to be exectuted + * @t_ctx: Pointer to attach type specific context * * socket is expected to be of type INET or INET6. * @@ -507,12 +508,15 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type) + enum bpf_attach_type type, + void *t_ctx) { struct bpf_sock_addr_kern ctx = { .sk = sk, .uaddr = uaddr, + .t_ctx = t_ctx, }; + struct sockaddr_storage unspec; struct cgroup *cgrp; int ret; @@ -522,6 +526,11 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; + if (!ctx.uaddr) { + memset(&unspec, 0, sizeof(unspec)); + ctx.uaddr = (struct sockaddr *)&unspec; + } + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8898835d6736..119b414f2e7e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1266,6 +1266,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: return 0; default: return -EINVAL; @@ -1588,6 +1590,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1658,6 +1662,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1715,6 +1721,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; diff --git a/net/core/filter.c b/net/core/filter.c index ba58e1d37de4..896b33c4f150 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5310,6 
+5310,7 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: break; default: return false; @@ -5319,6 +5320,24 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP6_SENDMSG: + break; + default: + return false; + } + break; + case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): + switch (prog->expected_attach_type) { + case BPF_CGROUP_UDP4_SENDMSG: + break; + default: + return false; + } + break; + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], + msg_src_ip6[3]): + switch (prog->expected_attach_type) { + case BPF_CGROUP_UDP6_SENDMSG: break; default: return false; @@ -5329,6 +5348,9 @@ static bool sock_addr_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], + msg_src_ip6[3]): /* Only narrow read access allowed for now. */ if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); @@ -6083,6 +6105,23 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; + + case offsetof(struct bpf_sock_addr, msg_src_ip4): + /* Treat t_ctx as struct in_addr for msg_src_ip4. */ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct in_addr, t_ctx, + s_addr, BPF_SIZE(si->code), 0, tmp_reg); + break; + + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], + msg_src_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); + /* Treat t_ctx as struct in6_addr for msg_src_ip6. */ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct in6_addr, t_ctx, + s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); + break; } return insn - insn_buf; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cc5201ce82e3..d9f79c422c42 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -932,6 +932,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); struct flowi4 fl4_stack; struct flowi4 *fl4; int ulen = len; @@ -986,8 +987,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* * Get and verify the address. */ - if (msg->msg_name) { - DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); + if (usin) { if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { @@ -1041,6 +1041,22 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } + if (cgroup_bpf_enabled && !connected) { + err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, + (struct sockaddr *)usin, &ipc.addr); + if (err) + goto out_free; + if (usin) { + if (usin->sin_port == 0) { + /* BPF program set invalid port. Reject it. 
*/ + err = -EINVAL; + goto out_free; + } + daddr = usin->sin_addr.s_addr; + dport = usin->sin_port; + } + } + saddr = ipc.addr; ipc.addr = faddr = daddr; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c519adff6244..4836492bac01 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1386,6 +1386,29 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; + if (cgroup_bpf_enabled && !connected) { + err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, + (struct sockaddr *)sin6, &fl6.saddr); + if (err) + goto out_no_dst; + if (sin6) { + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { + /* BPF program rewrote IPv6-only by IPv4-mapped + * IPv6. It's currently unsupported. + */ + err = -ENOTSUPP; + goto out_no_dst; + } + if (sin6->sin6_port == 0) { + /* BPF program set invalid port. Reject it. */ + err = -EINVAL; + goto out_no_dst; + } + fl6.fl6_dport = sin6->sin6_port; + fl6.daddr = sin6->sin6_addr; + } + } + final_p = fl6_update_dst(&fl6, opt, &final); if (final_p) connected = 0; @@ -1482,6 +1505,7 @@ release_dst: out: dst_release(dst); +out_no_dst: fl6_sock_release(flowlabel); txopt_put(opt_to_free); if (!err) From 0e04dcd482849b655cee9bb006e40351dd99ed89 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 29 May 2018 12:27:44 +0100 Subject: [PATCH 0470/1640] BACKPORT: bpf: clean up eBPF helpers documentation These are minor edits for the eBPF helpers documentation in include/uapi/linux/bpf.h. The main fix consists in removing "BPF_FIB_LOOKUP_", because it ends with a non-escaped underscore that gets interpreted by rst2man and produces the following message in the resulting manual page: DOCUTILS SYSTEM MESSAGES System Message: ERROR/3 (/tmp/bpf-helpers.rst:, line 1514) Unknown target name: "bpf_fib_lookup". Other edits consist in: - Improving formatting for flag values for "bpf_fib_lookup()" helper. - Emphasising a parameter name in description of the return value for "bpf_get_stack()" helper. - Removing unnecessary blank lines between "Description" and "Return" sections for the few helpers that would use it, for consistency. Signed-off-by: Quentin Monnet Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cdc1b97c9f00..8613c7825cf5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1010,7 +1010,6 @@ union bpf_attr { * :: * * # sysctl kernel.perf_event_max_stack= - * * Return * The positive or null stack id on success, or a negative error * in case of failure. @@ -1821,10 +1820,9 @@ union bpf_attr { * :: * * # sysctl kernel.perf_event_max_stack= - * * Return - * a non-negative value equal to or less than size on success, or - * a negative error in case of failure. + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. * * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description @@ -1845,7 +1843,6 @@ union bpf_attr { * in socket filters where *skb*\ **->data** does not always point * to the start of the mac header and where "direct packet access" * is not available. - * * Return * 0 on success, or a negative error in case of failure. * @@ -1861,16 +1858,18 @@ union bpf_attr { * rt_metric is set to metric from route. * * *plen* argument is the size of the passed in struct. 
- * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags: + * *flags* argument can be a combination of one or more of the + * following values: * - * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs - * full lookup using FIB rules - * **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress - * perspective (default is ingress) + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. - * * Return * Egress device index on success, 0 if packet needs to continue * up the stack for further processing or a negative error in case From d6c9e9ffd705a2b66049fad46507fe74eaa90c9e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 29 May 2018 10:40:18 +0800 Subject: [PATCH 0471/1640] UPSTREAM: bpf: hide the unused 'off' variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The local variable is only used while CONFIG_IPV6 enabled net/core/filter.c: In function ‘sk_msg_convert_ctx_access’: net/core/filter.c:6489:6: warning: unused variable ‘off’ [-Wunused-variable] int off; ^ This puts it into #ifdef. Fixes: 303def35f64e ("bpf: allow sk_msg programs to read sock fields") Signed-off-by: YueHaibing Acked-by: Arnd Bergmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 896b33c4f150..3ef3b8be116f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6497,7 +6497,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; +#if IS_ENABLED(CONFIG_IPV6) int off; +#endif switch (si->off) { case offsetof(struct sk_msg_md, data): From 4bc9eab780312e46da816710c55eaa450396337c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 29 May 2018 10:58:07 -0700 Subject: [PATCH 0472/1640] UPSTREAM: bpf: Drop mpls from bpf_fib_lookup MPLS support will not be submitted this dev cycle, but in working on it I do see a few changes are needed to the API. For now, drop mpls from the API. Since the fields in question are unions, the mpls fields can be added back later without affecting the uapi. Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8613c7825cf5..0d3fcdc0454e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1852,10 +1852,10 @@ union bpf_attr { * If lookup is successful and result shows packet is to be * forwarded, the neighbor tables are searched for the nexthop. * If successful (ie., FIB lookup shows forwarding and nexthop - * is resolved), the nexthop address is returned in ipv4_dst, - * ipv6_dst or mpls_out based on family, smac is set to mac - * address of egress device, dmac is set to nexthop mac address, - * rt_metric is set to metric from route. + * is resolved), the nexthop address is returned in ipv4_dst + * or ipv6_dst based on family, smac is set to mac address of + * egress device, dmac is set to nexthop mac address, rt_metric + * is set to metric from route (IPv4/IPv6 only). * * *plen* argument is the size of the passed in struct. 
* *flags* argument can be a combination of one or more of the @@ -2586,8 +2586,10 @@ struct bpf_raw_tracepoint_args { #define BPF_FIB_LOOKUP_OUTPUT BIT(1) struct bpf_fib_lookup { - /* input */ - __u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS */ + /* input: network family for lookup (AF_INET, AF_INET6) + * output: network family of egress nexthop + */ + __u8 family; /* set if lookup is to consider L4 data - e.g., FIB rules */ __u8 l4_protocol; @@ -2603,22 +2605,20 @@ struct bpf_fib_lookup { __u8 tos; /* AF_INET */ __be32 flowlabel; /* AF_INET6 */ - /* output: metric of fib result */ - __u32 rt_metric; + /* output: metric of fib result (IPv4/IPv6 only) */ + __u32 rt_metric; }; union { - __be32 mpls_in; __be32 ipv4_src; __u32 ipv6_src[4]; /* in6_addr; network order */ }; - /* input to bpf_fib_lookup, *dst is destination address. - * output: bpf_fib_lookup sets to gateway address + /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in + * network header. output: bpf_fib_lookup sets to gateway address + * if FIB lookup returns gateway route */ union { - /* return for MPLS lookups */ - __be32 mpls_out[4]; /* support up to 4 labels */ __be32 ipv4_dst; __u32 ipv6_dst[4]; /* in6_addr; network order */ };
From 0bdb116bd2bc96693aa6e8fe8ce08032bdaecece Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 29 May 2018 11:59:13 -0700 Subject: [PATCH 0473/1640] UPSTREAM: bpf: Verify flags in bpf_fib_lookup Verify flags argument contains only known flags. Allows programs to probe for support as more are added. Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- net/core/filter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 3ef3b8be116f..58f420df1d70 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4277,6 +4277,9 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, if (plen < sizeof(*params)) return -EINVAL; + if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) + return -EINVAL; + switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: @@ -4311,6 +4314,9 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, if (plen < sizeof(*params)) return -EINVAL; + if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) + return -EINVAL; + switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET:
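With this check in place, a program can probe at load time whether the running kernel understands a given flag, since an unknown flag now fails fast with -EINVAL instead of being silently ignored. A minimal sketch of such a probe from an XDP program (illustrative only, not part of the patch):

        struct bpf_fib_lookup params = {};
        int rc;

        params.family  = AF_INET;
        params.ifindex = ctx->ingress_ifindex;

        /* Pass only the candidate flag: kernels with this patch return
         * -EINVAL for flags they do not know about.
         */
        rc = bpf_fib_lookup(ctx, &params, sizeof(params),
                            BPF_FIB_LOOKUP_DIRECT);
        if (rc == -EINVAL) {
                /* flag unsupported; fall back to flags == 0 */
        }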
From a24867fc2db47e0e48c1b965a5ab71dffc88606c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 30 Oct 2017 13:50:22 -0700 Subject: [PATCH 0474/1640] BACKPORT: bpf: avoid rcu_dereference inside bpf_event_mutex lock region During perf event attaching/detaching bpf programs, the tp_event->prog_array change is protected by the bpf_event_mutex lock in both the attaching and detaching functions. Although tp_event->prog_array is an RCU pointer, rcu_dereference is not needed to access it since the mutex lock will guarantee ordering. Verified through "make C=2" that the sparse locking check is still happy with the new change. Also change the label name in perf_event_{attach,detach}_bpf_prog from "out" to "unlock" to reflect the code action after the label. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 121a7ac2e68f..19f031501bae 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -965,26 +965,26 @@ int perf_event_attach_bpf_prog(struct perf_event *event, mutex_lock(&bpf_event_mutex); if (event->prog) - goto out; + goto unlock; + + old_array = event->tp_event->prog_array; - old_array = rcu_dereference_protected(event->tp_event->prog_array, - lockdep_is_held(&bpf_event_mutex)); if (old_array && bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { ret = -E2BIG; - goto out; + goto unlock; } ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); if (ret < 0) - goto out; + goto unlock; /* set the new array to event->tp_event and set event->prog */ event->prog = prog; rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free(old_array); -out: +unlock: mutex_unlock(&bpf_event_mutex); return ret; } @@ -998,11 +998,9 @@ void perf_event_detach_bpf_prog(struct perf_event *event) mutex_lock(&bpf_event_mutex); if (!event->prog) - goto out; - - old_array = rcu_dereference_protected(event->tp_event->prog_array, - lockdep_is_held(&bpf_event_mutex)); + goto unlock; + old_array = event->tp_event->prog_array; ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); @@ -1014,7 +1012,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_put(event->prog); event->prog = NULL; -out: +unlock: mutex_unlock(&bpf_event_mutex); }
From 3df59db19762c3d9e59ff616210c1271f555a3e5 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 27 May 2018 12:24:08 +0100 Subject: [PATCH 0475/1640] UPSTREAM: bpf: bpf_prog_array_copy() should return -ENOENT if exclude_prog not found This makes it possible for bpf prog detach to return -ENOENT. Acked-by: Yonghong Song Signed-off-by: Sean Young Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 11 +++++++++-- kernel/trace/bpf_trace.c | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5e3e6ac4d71e..63b2f4d8f52b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1706,6 +1706,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, int new_prog_cnt, carry_prog_cnt = 0; struct bpf_prog **existing_prog; struct bpf_prog_array *array; + bool found_exclude = false; int new_prog_idx = 0; /* Figure out how many existing progs we need to carry over to @@ -1714,14 +1715,20 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, if (old_array) { existing_prog = old_array->progs; for (; *existing_prog; existing_prog++) { - if (*existing_prog != exclude_prog && - *existing_prog != &dummy_bpf_prog.prog) + if (*existing_prog == exclude_prog) { + found_exclude = true; + continue; + } + if (*existing_prog != &dummy_bpf_prog.prog) carry_prog_cnt++; if (*existing_prog == include_prog) return -EEXIST; } } + if (exclude_prog && !found_exclude) + return -ENOENT; + /* How many progs (not NULL) will be in the new array?
*/ new_prog_cnt = carry_prog_cnt; if (include_prog) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 19f031501bae..f35de073d764 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1002,6 +1002,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event) old_array = event->tp_event->prog_array; ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + if (ret == -ENOENT) + goto unlock; if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); } else { From 22d8026ce5a1d67fa916a15fd1bf6b4b60a39169 Mon Sep 17 00:00:00 2001 From: Tim Zimmermann Date: Thu, 7 Aug 2025 07:21:02 +0000 Subject: [PATCH 0476/1640] Squashed revert of android-4.14-stable media/rc backports Revert "media: sharp: fix sharp encoding" This reverts commit 455c8a996a832e19a4cced4575eaf751ed00cdfe. Revert "media: rc: Fix use-after-free bugs caused by ene_tx_irqsim()" This reverts commit 0987f836bc1a258cb8fb51669a5afb67bb01c31b. Revert "media: imon: fix a race condition in send_packet()" This reverts commit 35455fb5c395c6a4a58e5dc11b72c70533e6eb90. Revert "media: igorplugusb: receiver overflow should be reported" This reverts commit 04f7a23c5ac9b74d54ba056773c63f0aa7b40ba5. Revert "media: redrat3: fix control-message timeouts" This reverts commit ac04370aee646e4f4b4d5fcfe53b88e4015210e7. Revert "media: mceusb: fix control-message timeouts" This reverts commit c69238c8a198421e565fac2519561bccad9c885f. Revert "media: mceusb: return without resubmitting URB in case of -EPROTO error." This reverts commit fd97b0283b8842720961a2c29fe8bbc9d3610e42. Revert "media: ite-cir: IR receiver stop working after receive overflow" This reverts commit 404792ddb4690422fada06f9996b2b3802ced14a. Revert "media: rc-loopback: return number of emitters rather than error" This reverts commit e076b6ab1cad9ce50753f31312a279e18ceab184. Revert "media: ite-cir: check for receive overflow" This reverts commit 3f76279ad0079322456e4b456bd86127eeed25d4. Revert "media: mceusb: sanity check for prescaler value" This reverts commit 9a1e55f23ed1a85e189b70011ebc9330735f3e7d. Revert "media: sunxi-cir: ensure IR is handled when it is continuous" This reverts commit b02b60dabdeb601f8e037b034db490e15fff82f3. Revert "media: ati_remote: sanity check for both endpoints" This reverts commit 8134f6b6318c00c6729083970059c16b6f7bb21f. Revert "media: gpio-ir-tx: improve precision of transmitted signal due to scheduling" This reverts commit b3650117d1d2ca8861fa81fa14de158ac79007bc. Revert "media: iguanair: fix endpoint sanity check" This reverts commit 9a4da2a7546525a8a7d869447c9ea9ca874112a5. Revert "media: imon: invalid dereference in imon_touch_event" This reverts commit 2d4c3e5c180f0f91cdc41cf65675bd768401b584. Revert "media: rc: ir-rc6-decoder: enable toggle bit for Kathrein RCU-676 remote" This reverts commit 86f049f29a2d8d7daa573c51a93d73d7c0f5f4aa. Revert "media: rc: imon: Allow iMON RC protocol for ffdc 7e device" This reverts commit 66b330f8729281f03924f1ebaaba42612a545d3d. Revert "media: iguanair: add sanity checks" This reverts commit 88c430f6bf73475d85c94f660c7211ca1a267502. Revert "media: mtk-cir: lower de-glitch counter for rc-mm protocol" This reverts commit 24b7def892db0ba4aec20a90d7a0ce459cdaa7a4. Revert "media: spi: IR LED: add missing of table registration" This reverts commit a523267d4c55b59e5ee2db9587de04d13407c361. Revert "media: serial_ir: Fix use-after-free in serial_ir_init_module" This reverts commit f1c9f1f3665635d86e4d507312f4d114d0d738e3. 
Revert "media: rc: oops in ir_timer_keyup after device unplug" This reverts commit a406abeb7416501355668222164919836da67034. Revert "media: rc: mce_kbd decoder: fix stuck keys" This reverts commit 99ebaf4f43dd25e18baf02c994c7939f1257f8a3. Revert "media: staging: lirc_zilog: incorrect reference counting" This reverts commit 071ff203d962478368bbc84c3c8f9ba40e9fe2b6. Reapply "media: lirc_zilog: driver only sends LIRCCODE" This reverts commit e7a08ffb2d897f4cfbd7b2931339c4643ad04f64. Revert "media: rc: partial revert of "media: rc: per-protocol repeat period"" This reverts commit f889ad87b2147dbde1d81715175cf44bdee91fab. Revert "media: rc: sir_ir: detect presence of port" This reverts commit 2f2241083a773b52e3319103b0ba2d1331b1f6c0. Revert "media: rc: nec decoder should not send both repeat and keycode" This reverts commit cfec97f26d299f64cb5dd94f5e391d74c9977486. Revert "media: rc: check for integer overflow" This reverts commit e82273a266d03aeb00f0d27762ca306abbf14e01. Revert "media: imon: Fix null-ptr-deref in imon_probe" This reverts commit 16edf1a6a8a52040841e3e875c76bad0aa28f85f. --- drivers/media/rc/ati_remote.c | 4 --- drivers/media/rc/ene_ir.c | 3 +- drivers/media/rc/gpio-ir-tx.c | 7 ++++- drivers/media/rc/igorplugusb.c | 4 +-- drivers/media/rc/iguanair.c | 15 +++++----- drivers/media/rc/imon.c | 21 ++++--------- drivers/media/rc/ir-lirc-codec.c | 9 ++---- drivers/media/rc/ir-mce_kbd-decoder.c | 2 -- drivers/media/rc/ir-nec-decoder.c | 29 ++++++++---------- drivers/media/rc/ir-rc6-decoder.c | 9 ++---- drivers/media/rc/ir-sharp-decoder.c | 8 ++--- drivers/media/rc/ir-spi.c | 1 - drivers/media/rc/ite-cir.c | 6 ---- drivers/media/rc/mceusb.c | 18 ++++------- drivers/media/rc/mtk-cir.c | 8 ----- drivers/media/rc/rc-loopback.c | 2 +- drivers/media/rc/rc-main.c | 36 +++++++++++----------- drivers/media/rc/redrat3.c | 22 +++++++------- drivers/media/rc/serial_ir.c | 9 +++++- drivers/media/rc/sir_ir.c | 40 +++---------------------- drivers/media/rc/sunxi-cir.c | 2 -- drivers/staging/media/lirc/lirc_zilog.c | 9 +++--- 22 files changed, 93 insertions(+), 171 deletions(-) diff --git a/drivers/media/rc/ati_remote.c b/drivers/media/rc/ati_remote.c index 8e3af398a6c4..d0871d60a723 100644 --- a/drivers/media/rc/ati_remote.c +++ b/drivers/media/rc/ati_remote.c @@ -845,10 +845,6 @@ static int ati_remote_probe(struct usb_interface *interface, err("%s: endpoint_in message size==0? 
\n", __func__); return -ENODEV; } - if (!usb_endpoint_is_int_out(endpoint_out)) { - err("%s: Unexpected endpoint_out\n", __func__); - return -ENODEV; - } ati_remote = kzalloc(sizeof (struct ati_remote), GFP_KERNEL); rc_dev = rc_allocate_device(RC_DRIVER_SCANCODE); diff --git a/drivers/media/rc/ene_ir.c b/drivers/media/rc/ene_ir.c index 4761b2a72d8e..af7ba23e16e1 100644 --- a/drivers/media/rc/ene_ir.c +++ b/drivers/media/rc/ene_ir.c @@ -1117,8 +1117,6 @@ static void ene_remove(struct pnp_dev *pnp_dev) struct ene_device *dev = pnp_get_drvdata(pnp_dev); unsigned long flags; - rc_unregister_device(dev->rdev); - del_timer_sync(&dev->tx_sim_timer); spin_lock_irqsave(&dev->hw_lock, flags); ene_rx_disable(dev); ene_rx_restore_hw_buffer(dev); @@ -1126,6 +1124,7 @@ static void ene_remove(struct pnp_dev *pnp_dev) free_irq(dev->irq, dev); release_region(dev->hw_io, ENE_IO_SIZE); + rc_unregister_device(dev->rdev); kfree(dev); } diff --git a/drivers/media/rc/gpio-ir-tx.c b/drivers/media/rc/gpio-ir-tx.c index 4e70b67ccd18..cd476cab9782 100644 --- a/drivers/media/rc/gpio-ir-tx.c +++ b/drivers/media/rc/gpio-ir-tx.c @@ -87,8 +87,13 @@ static int gpio_ir_tx(struct rc_dev *dev, unsigned int *txbuf, // space edge = ktime_add_us(edge, txbuf[i]); delta = ktime_us_delta(edge, ktime_get()); - if (delta > 0) + if (delta > 10) { + spin_unlock_irqrestore(&gpio_ir->lock, flags); + usleep_range(delta, delta + 10); + spin_lock_irqsave(&gpio_ir->lock, flags); + } else if (delta > 0) { udelay(delta); + } } else { // pulse ktime_t last = ktime_add_us(edge, txbuf[i]); diff --git a/drivers/media/rc/igorplugusb.c b/drivers/media/rc/igorplugusb.c index 2a0325d1f9de..a5ea86be8f44 100644 --- a/drivers/media/rc/igorplugusb.c +++ b/drivers/media/rc/igorplugusb.c @@ -73,11 +73,9 @@ static void igorplugusb_irdata(struct igorplugusb *ir, unsigned len) if (start >= len) { dev_err(ir->dev, "receive overflow invalid: %u", overflow); } else { - if (overflow > 0) { + if (overflow > 0) dev_warn(ir->dev, "receive overflow, at least %u lost", overflow); - ir_raw_event_reset(ir->rc); - } do { rawir.duration = ir->buf_in[i] * 85333; diff --git a/drivers/media/rc/iguanair.c b/drivers/media/rc/iguanair.c index 03dbbfba71fc..30e24da67226 100644 --- a/drivers/media/rc/iguanair.c +++ b/drivers/media/rc/iguanair.c @@ -427,10 +427,6 @@ static int iguanair_probe(struct usb_interface *intf, int ret, pipein, pipeout; struct usb_host_interface *idesc; - idesc = intf->cur_altsetting; - if (idesc->desc.bNumEndpoints < 2) - return -ENODEV; - ir = kzalloc(sizeof(*ir), GFP_KERNEL); rc = rc_allocate_device(RC_DRIVER_IR_RAW); if (!ir || !rc) { @@ -445,13 +441,18 @@ static int iguanair_probe(struct usb_interface *intf, ir->urb_in = usb_alloc_urb(0, GFP_KERNEL); ir->urb_out = usb_alloc_urb(0, GFP_KERNEL); - if (!ir->buf_in || !ir->packet || !ir->urb_in || !ir->urb_out || - !usb_endpoint_is_int_in(&idesc->endpoint[0].desc) || - !usb_endpoint_is_int_out(&idesc->endpoint[1].desc)) { + if (!ir->buf_in || !ir->packet || !ir->urb_in || !ir->urb_out) { ret = -ENOMEM; goto out; } + idesc = intf->altsetting; + + if (idesc->desc.bNumEndpoints < 2) { + ret = -ENODEV; + goto out; + } + ir->rc = rc; ir->dev = &intf->dev; ir->udev = udev; diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index 875e00e4a8a0..f6e3ebf71d47 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -637,14 +637,15 @@ static int send_packet(struct imon_context *ictx) pr_err_ratelimited("error submitting urb(%d)\n", retval); } else { /* Wait for transmission to complete (or 
abort) */ + mutex_unlock(&ictx->lock); retval = wait_for_completion_interruptible( &ictx->tx.finished); if (retval) { usb_kill_urb(ictx->tx_urb); pr_err_ratelimited("task interrupted\n"); } + mutex_lock(&ictx->lock); - ictx->tx.busy = false; retval = ictx->tx.status; if (retval) pr_err_ratelimited("packet tx failed (%d)\n", retval); @@ -951,8 +952,7 @@ static ssize_t vfd_write(struct file *file, const char __user *buf, return -ENODEV; } - if (mutex_lock_interruptible(&ictx->lock)) - return -ERESTARTSYS; + mutex_lock(&ictx->lock); if (!ictx->dev_present_intf0) { pr_err_ratelimited("no iMON device present\n"); @@ -1734,7 +1734,8 @@ static void imon_incoming_scancode(struct imon_context *ictx, spin_unlock_irqrestore(&ictx->kc_lock, flags); /* send touchscreen events through input subsystem if touchpad data */ - if (ictx->touch && len == 8 && buf[7] == 0x86) { + if (ictx->display_type == IMON_DISPLAY_TYPE_VGA && len == 8 && + buf[7] == 0x86) { imon_touch_event(ictx, buf); return; @@ -1959,17 +1960,12 @@ static void imon_get_ffdc_type(struct imon_context *ictx) break; /* iMON VFD, MCE IR */ case 0x46: + case 0x7e: case 0x9e: dev_info(ictx->dev, "0xffdc iMON VFD, MCE IR"); detected_display_type = IMON_DISPLAY_TYPE_VFD; allowed_protos = RC_PROTO_BIT_RC6_MCE; break; - /* iMON VFD, iMON or MCE IR */ - case 0x7e: - dev_info(ictx->dev, "0xffdc iMON VFD, iMON or MCE IR"); - detected_display_type = IMON_DISPLAY_TYPE_VFD; - allowed_protos |= RC_PROTO_BIT_RC6_MCE; - break; /* iMON LCD, MCE IR */ case 0x9f: dev_info(ictx->dev, "0xffdc iMON LCD, MCE IR"); @@ -2518,11 +2514,6 @@ static int imon_probe(struct usb_interface *interface, mutex_lock(&driver_lock); first_if = usb_ifnum_to_if(usbdev, 0); - if (!first_if) { - ret = -ENODEV; - goto fail; - } - first_if_ctx = usb_get_intfdata(first_if); if (ifnum == 0) { diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 4c8f456238bc..d2223c04e9ad 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -298,14 +298,11 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, if (!dev->max_timeout) return -ENOTTY; - /* Check for multiply overflow */ - if (val > U32_MAX / 1000) - return -EINVAL; - tmp = val * 1000; - if (tmp < dev->min_timeout || tmp > dev->max_timeout) - return -EINVAL; + if (tmp < dev->min_timeout || + tmp > dev->max_timeout) + return -EINVAL; if (dev->s_timeout) ret = dev->s_timeout(dev, tmp); diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 2a1728edb3c6..7c572a643656 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -130,8 +130,6 @@ static void mce_kbd_rx_timeout(unsigned long data) for (i = 0; i < MCIR2_MASK_KEYS_START; i++) input_report_key(mce_kbd->idev, kbd_keycodes[i], 0); - - input_sync(mce_kbd->idev); } static enum mce_kbd_mode mce_kbd_mode(struct mce_kbd_dec *data) diff --git a/drivers/media/rc/ir-nec-decoder.c b/drivers/media/rc/ir-nec-decoder.c index a95d09acc22a..817c18f2ddd1 100644 --- a/drivers/media/rc/ir-nec-decoder.c +++ b/drivers/media/rc/ir-nec-decoder.c @@ -87,6 +87,8 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) data->state = STATE_BIT_PULSE; return 0; } else if (eq_margin(ev.duration, NEC_REPEAT_SPACE, NEC_UNIT / 2)) { + rc_repeat(dev); + IR_dprintk(1, "Repeat last key\n"); data->state = STATE_TRAILER_PULSE; return 0; } @@ -149,26 +151,19 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) if 
(!geq_margin(ev.duration, NEC_TRAILER_SPACE, NEC_UNIT / 2)) break; - if (data->count == NEC_NBITS) { - address = bitrev8((data->bits >> 24) & 0xff); - not_address = bitrev8((data->bits >> 16) & 0xff); - command = bitrev8((data->bits >> 8) & 0xff); - not_command = bitrev8((data->bits >> 0) & 0xff); + address = bitrev8((data->bits >> 24) & 0xff); + not_address = bitrev8((data->bits >> 16) & 0xff); + command = bitrev8((data->bits >> 8) & 0xff); + not_command = bitrev8((data->bits >> 0) & 0xff); - scancode = ir_nec_bytes_to_scancode(address, - not_address, - command, - not_command, - &rc_proto); + scancode = ir_nec_bytes_to_scancode(address, not_address, + command, not_command, + &rc_proto); - if (data->is_nec_x) - data->necx_repeat = true; - - rc_keydown(dev, rc_proto, scancode, 0); - } else { - rc_repeat(dev); - } + if (data->is_nec_x) + data->necx_repeat = true; + rc_keydown(dev, rc_proto, scancode, 0); data->state = STATE_INACTIVE; return 0; } diff --git a/drivers/media/rc/ir-rc6-decoder.c b/drivers/media/rc/ir-rc6-decoder.c index 90f7930444a1..5d0d2fe3b7a7 100644 --- a/drivers/media/rc/ir-rc6-decoder.c +++ b/drivers/media/rc/ir-rc6-decoder.c @@ -40,7 +40,6 @@ #define RC6_6A_MCE_TOGGLE_MASK 0x8000 /* for the body bits */ #define RC6_6A_LCC_MASK 0xffff0000 /* RC6-6A-32 long customer code mask */ #define RC6_6A_MCE_CC 0x800f0000 /* MCE customer code */ -#define RC6_6A_KATHREIN_CC 0x80460000 /* Kathrein RCU-676 customer code */ #ifndef CHAR_BIT #define CHAR_BIT 8 /* Normally in */ #endif @@ -253,17 +252,13 @@ again: toggle = 0; break; case 32: - switch (scancode & RC6_6A_LCC_MASK) { - case RC6_6A_MCE_CC: - case RC6_6A_KATHREIN_CC: + if ((scancode & RC6_6A_LCC_MASK) == RC6_6A_MCE_CC) { protocol = RC_PROTO_RC6_MCE; toggle = !!(scancode & RC6_6A_MCE_TOGGLE_MASK); scancode &= ~RC6_6A_MCE_TOGGLE_MASK; - break; - default: + } else { protocol = RC_PROTO_RC6_6A_32; toggle = 0; - break; } break; default: diff --git a/drivers/media/rc/ir-sharp-decoder.c b/drivers/media/rc/ir-sharp-decoder.c index 61d313be5bbf..129b558acc92 100644 --- a/drivers/media/rc/ir-sharp-decoder.c +++ b/drivers/media/rc/ir-sharp-decoder.c @@ -23,9 +23,7 @@ #define SHARP_UNIT 40000 /* ns */ #define SHARP_BIT_PULSE (8 * SHARP_UNIT) /* 320us */ #define SHARP_BIT_0_PERIOD (25 * SHARP_UNIT) /* 1ms (680us space) */ -#define SHARP_BIT_1_PERIOD (50 * SHARP_UNIT) /* 2ms (1680us space) */ -#define SHARP_BIT_0_SPACE (17 * SHARP_UNIT) /* 680us space */ -#define SHARP_BIT_1_SPACE (42 * SHARP_UNIT) /* 1680us space */ +#define SHARP_BIT_1_PERIOD (50 * SHARP_UNIT) /* 2ms (1680ms space) */ #define SHARP_ECHO_SPACE (1000 * SHARP_UNIT) /* 40 ms */ #define SHARP_TRAILER_SPACE (125 * SHARP_UNIT) /* 5 ms (even longer) */ @@ -179,8 +177,8 @@ static const struct ir_raw_timings_pd ir_sharp_timings = { .header_pulse = 0, .header_space = 0, .bit_pulse = SHARP_BIT_PULSE, - .bit_space[0] = SHARP_BIT_0_SPACE, - .bit_space[1] = SHARP_BIT_1_SPACE, + .bit_space[0] = SHARP_BIT_0_PERIOD, + .bit_space[1] = SHARP_BIT_1_PERIOD, .trailer_pulse = SHARP_BIT_PULSE, .trailer_space = SHARP_ECHO_SPACE, .msb_first = 1, diff --git a/drivers/media/rc/ir-spi.c b/drivers/media/rc/ir-spi.c index cbe585f95715..29ed0638cb74 100644 --- a/drivers/media/rc/ir-spi.c +++ b/drivers/media/rc/ir-spi.c @@ -186,7 +186,6 @@ static const struct of_device_id ir_spi_of_match[] = { { .compatible = "ir-spi-led" }, {}, }; -MODULE_DEVICE_TABLE(of, ir_spi_of_match); static struct spi_driver ir_spi_driver = { .probe = ir_spi_probe, diff --git a/drivers/media/rc/ite-cir.c b/drivers/media/rc/ite-cir.c 
index 7bc38e805acb..65e104c7ddfc 100644 --- a/drivers/media/rc/ite-cir.c +++ b/drivers/media/rc/ite-cir.c @@ -285,12 +285,6 @@ static irqreturn_t ite_cir_isr(int irq, void *data) /* read the interrupt flags */ iflags = dev->params.get_irq_causes(dev); - /* Check for RX overflow */ - if (iflags & ITE_IRQ_RX_FIFO_OVERRUN) { - dev_warn(&dev->rdev->dev, "receive overflow\n"); - ir_raw_event_reset(dev->rdev); - } - /* check for the receive interrupt */ if (iflags & (ITE_IRQ_RX_FIFO | ITE_IRQ_RX_FIFO_OVERRUN)) { /* read the FIFO bytes */ diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 53e68d02763c..bf7aaff3aa37 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -630,18 +630,11 @@ static void mceusb_dev_printdata(struct mceusb_dev *ir, u8 *buf, int buf_len, data[0], data[1]); break; case MCE_RSP_EQIRCFS: - if (!data[0] && !data[1]) { - dev_dbg(dev, "%s: no carrier", inout); - break; - } - // prescaler should make sense - if (data[0] > 8) - break; period = DIV_ROUND_CLOSEST((1U << data[0] * 2) * (data[1] + 1), 10); if (!period) break; - carrier = USEC_PER_SEC / period; + carrier = (1000 * 1000) / period; dev_dbg(dev, "%s carrier of %u Hz (period %uus)", inout, carrier, period); break; @@ -1080,7 +1073,6 @@ static void mceusb_dev_recv(struct urb *urb) case -ECONNRESET: case -ENOENT: case -EILSEQ: - case -EPROTO: case -ESHUTDOWN: usb_unlink_urb(urb); return; @@ -1124,7 +1116,7 @@ static void mceusb_gen1_init(struct mceusb_dev *ir) */ ret = usb_control_msg(ir->usbdev, usb_rcvctrlpipe(ir->usbdev, 0), USB_REQ_SET_ADDRESS, USB_TYPE_VENDOR, 0, 0, - data, USB_CTRL_MSG_SZ, 3000); + data, USB_CTRL_MSG_SZ, HZ * 3); dev_dbg(dev, "set address - ret = %d", ret); dev_dbg(dev, "set address - data[0] = %d, data[1] = %d", data[0], data[1]); @@ -1132,20 +1124,20 @@ static void mceusb_gen1_init(struct mceusb_dev *ir) /* set feature: bit rate 38400 bps */ ret = usb_control_msg(ir->usbdev, usb_sndctrlpipe(ir->usbdev, 0), USB_REQ_SET_FEATURE, USB_TYPE_VENDOR, - 0xc04e, 0x0000, NULL, 0, 3000); + 0xc04e, 0x0000, NULL, 0, HZ * 3); dev_dbg(dev, "set feature - ret = %d", ret); /* bRequest 4: set char length to 8 bits */ ret = usb_control_msg(ir->usbdev, usb_sndctrlpipe(ir->usbdev, 0), 4, USB_TYPE_VENDOR, - 0x0808, 0x0000, NULL, 0, 3000); + 0x0808, 0x0000, NULL, 0, HZ * 3); dev_dbg(dev, "set char length - retB = %d", ret); /* bRequest 2: set handshaking to use DTR/DSR */ ret = usb_control_msg(ir->usbdev, usb_sndctrlpipe(ir->usbdev, 0), 2, USB_TYPE_VENDOR, - 0x0000, 0x0100, NULL, 0, 3000); + 0x0000, 0x0100, NULL, 0, HZ * 3); dev_dbg(dev, "set handshake - retC = %d", ret); /* device resume */ diff --git a/drivers/media/rc/mtk-cir.c b/drivers/media/rc/mtk-cir.c index 00a4a0dfcab8..e88eb64e8e69 100644 --- a/drivers/media/rc/mtk-cir.c +++ b/drivers/media/rc/mtk-cir.c @@ -44,11 +44,6 @@ /* Fields containing pulse width data */ #define MTK_WIDTH_MASK (GENMASK(7, 0)) -/* IR threshold */ -#define MTK_IRTHD 0x14 -#define MTK_DG_CNT_MASK (GENMASK(12, 8)) -#define MTK_DG_CNT(x) ((x) << 8) - /* Bit to enable interrupt */ #define MTK_IRINT_EN BIT(0) @@ -416,9 +411,6 @@ static int mtk_ir_probe(struct platform_device *pdev) mtk_w32_mask(ir, val, ir->data->fields[MTK_HW_PERIOD].mask, ir->data->fields[MTK_HW_PERIOD].reg); - /* Set de-glitch counter */ - mtk_w32_mask(ir, MTK_DG_CNT(1), MTK_DG_CNT_MASK, MTK_IRTHD); - /* Enable IR and PWM */ val = mtk_r32(ir, MTK_CONFIG_HIGH_REG); val |= MTK_OK_COUNT(ir->data->ok_count) | MTK_PWM_EN | MTK_IR_EN; diff --git a/drivers/media/rc/rc-loopback.c 
b/drivers/media/rc/rc-loopback.c index 5abbde7e5d5b..3822d9ebcb46 100644 --- a/drivers/media/rc/rc-loopback.c +++ b/drivers/media/rc/rc-loopback.c @@ -52,7 +52,7 @@ static int loop_set_tx_mask(struct rc_dev *dev, u32 mask) if ((mask & (RXMASK_REGULAR | RXMASK_LEARNING)) != mask) { dprintk("invalid tx mask: %u\n", mask); - return 2; + return -EINVAL; } dprintk("setting tx mask: %u\n", mask); diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index a22828713c1c..981cccd6b988 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -38,41 +38,41 @@ static const struct { [RC_PROTO_UNKNOWN] = { .name = "unknown", .repeat_period = 250 }, [RC_PROTO_OTHER] = { .name = "other", .repeat_period = 250 }, [RC_PROTO_RC5] = { .name = "rc-5", - .scancode_bits = 0x1f7f, .repeat_period = 250 }, + .scancode_bits = 0x1f7f, .repeat_period = 164 }, [RC_PROTO_RC5X_20] = { .name = "rc-5x-20", - .scancode_bits = 0x1f7f3f, .repeat_period = 250 }, + .scancode_bits = 0x1f7f3f, .repeat_period = 164 }, [RC_PROTO_RC5_SZ] = { .name = "rc-5-sz", - .scancode_bits = 0x2fff, .repeat_period = 250 }, + .scancode_bits = 0x2fff, .repeat_period = 164 }, [RC_PROTO_JVC] = { .name = "jvc", .scancode_bits = 0xffff, .repeat_period = 250 }, [RC_PROTO_SONY12] = { .name = "sony-12", - .scancode_bits = 0x1f007f, .repeat_period = 250 }, + .scancode_bits = 0x1f007f, .repeat_period = 100 }, [RC_PROTO_SONY15] = { .name = "sony-15", - .scancode_bits = 0xff007f, .repeat_period = 250 }, + .scancode_bits = 0xff007f, .repeat_period = 100 }, [RC_PROTO_SONY20] = { .name = "sony-20", - .scancode_bits = 0x1fff7f, .repeat_period = 250 }, + .scancode_bits = 0x1fff7f, .repeat_period = 100 }, [RC_PROTO_NEC] = { .name = "nec", - .scancode_bits = 0xffff, .repeat_period = 250 }, + .scancode_bits = 0xffff, .repeat_period = 160 }, [RC_PROTO_NECX] = { .name = "nec-x", - .scancode_bits = 0xffffff, .repeat_period = 250 }, + .scancode_bits = 0xffffff, .repeat_period = 160 }, [RC_PROTO_NEC32] = { .name = "nec-32", - .scancode_bits = 0xffffffff, .repeat_period = 250 }, + .scancode_bits = 0xffffffff, .repeat_period = 160 }, [RC_PROTO_SANYO] = { .name = "sanyo", .scancode_bits = 0x1fffff, .repeat_period = 250 }, [RC_PROTO_MCIR2_KBD] = { .name = "mcir2-kbd", - .scancode_bits = 0xffff, .repeat_period = 250 }, + .scancode_bits = 0xffff, .repeat_period = 150 }, [RC_PROTO_MCIR2_MSE] = { .name = "mcir2-mse", - .scancode_bits = 0x1fffff, .repeat_period = 250 }, + .scancode_bits = 0x1fffff, .repeat_period = 150 }, [RC_PROTO_RC6_0] = { .name = "rc-6-0", - .scancode_bits = 0xffff, .repeat_period = 250 }, + .scancode_bits = 0xffff, .repeat_period = 164 }, [RC_PROTO_RC6_6A_20] = { .name = "rc-6-6a-20", - .scancode_bits = 0xfffff, .repeat_period = 250 }, + .scancode_bits = 0xfffff, .repeat_period = 164 }, [RC_PROTO_RC6_6A_24] = { .name = "rc-6-6a-24", - .scancode_bits = 0xffffff, .repeat_period = 250 }, + .scancode_bits = 0xffffff, .repeat_period = 164 }, [RC_PROTO_RC6_6A_32] = { .name = "rc-6-6a-32", - .scancode_bits = 0xffffffff, .repeat_period = 250 }, + .scancode_bits = 0xffffffff, .repeat_period = 164 }, [RC_PROTO_RC6_MCE] = { .name = "rc-6-mce", - .scancode_bits = 0xffff7fff, .repeat_period = 250 }, + .scancode_bits = 0xffff7fff, .repeat_period = 164 }, [RC_PROTO_SHARP] = { .name = "sharp", .scancode_bits = 0x1fff, .repeat_period = 250 }, [RC_PROTO_XMP] = { .name = "xmp", .repeat_period = 250 }, @@ -1824,11 +1824,11 @@ void rc_unregister_device(struct rc_dev *dev) if (!dev) return; + del_timer_sync(&dev->timer_keyup); + if 
(dev->driver_type == RC_DRIVER_IR_RAW) ir_raw_event_unregister(dev); - del_timer_sync(&dev->timer_keyup); - rc_free_rx_device(dev); device_del(&dev->dev); diff --git a/drivers/media/rc/redrat3.c b/drivers/media/rc/redrat3.c index d6fd924fea53..6784cb9fc4e7 100644 --- a/drivers/media/rc/redrat3.c +++ b/drivers/media/rc/redrat3.c @@ -415,7 +415,7 @@ static int redrat3_send_cmd(int cmd, struct redrat3_dev *rr3) udev = rr3->udev; res = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), cmd, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, - 0x0000, 0x0000, data, sizeof(u8), 10000); + 0x0000, 0x0000, data, sizeof(u8), HZ * 10); if (res < 0) { dev_err(rr3->dev, "%s: Error sending rr3 cmd res %d, data %d", @@ -491,7 +491,7 @@ static u32 redrat3_get_timeout(struct redrat3_dev *rr3) pipe = usb_rcvctrlpipe(rr3->udev, 0); ret = usb_control_msg(rr3->udev, pipe, RR3_GET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, - RR3_IR_IO_SIG_TIMEOUT, 0, tmp, len, 5000); + RR3_IR_IO_SIG_TIMEOUT, 0, tmp, len, HZ * 5); if (ret != len) dev_warn(rr3->dev, "Failed to read timeout from hardware\n"); else { @@ -521,7 +521,7 @@ static int redrat3_set_timeout(struct rc_dev *rc_dev, unsigned int timeoutns) ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, RR3_IR_IO_SIG_TIMEOUT, 0, timeout, sizeof(*timeout), - 25000); + HZ * 25); dev_dbg(dev, "set ir parm timeout %d ret 0x%02x\n", be32_to_cpu(*timeout), ret); @@ -553,32 +553,32 @@ static void redrat3_reset(struct redrat3_dev *rr3) *val = 0x01; rc = usb_control_msg(udev, rxpipe, RR3_RESET, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, - RR3_CPUCS_REG_ADDR, 0, val, len, 25000); + RR3_CPUCS_REG_ADDR, 0, val, len, HZ * 25); dev_dbg(dev, "reset returned 0x%02x\n", rc); *val = length_fuzz; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, - RR3_IR_IO_LENGTH_FUZZ, 0, val, len, 25000); + RR3_IR_IO_LENGTH_FUZZ, 0, val, len, HZ * 25); dev_dbg(dev, "set ir parm len fuzz %d rc 0x%02x\n", *val, rc); *val = (65536 - (minimum_pause * 2000)) / 256; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, - RR3_IR_IO_MIN_PAUSE, 0, val, len, 25000); + RR3_IR_IO_MIN_PAUSE, 0, val, len, HZ * 25); dev_dbg(dev, "set ir parm min pause %d rc 0x%02x\n", *val, rc); *val = periods_measure_carrier; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, - RR3_IR_IO_PERIODS_MF, 0, val, len, 25000); + RR3_IR_IO_PERIODS_MF, 0, val, len, HZ * 25); dev_dbg(dev, "set ir parm periods measure carrier %d rc 0x%02x", *val, rc); *val = RR3_DRIVER_MAXLENS; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, - RR3_IR_IO_MAX_LENGTHS, 0, val, len, 25000); + RR3_IR_IO_MAX_LENGTHS, 0, val, len, HZ * 25); dev_dbg(dev, "set ir parm max lens %d rc 0x%02x\n", *val, rc); kfree(val); @@ -596,7 +596,7 @@ static void redrat3_get_firmware_rev(struct redrat3_dev *rr3) rc = usb_control_msg(rr3->udev, usb_rcvctrlpipe(rr3->udev, 0), RR3_FW_VERSION, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, - 0, 0, buffer, RR3_FW_VERSION_LEN, 5000); + 0, 0, buffer, RR3_FW_VERSION_LEN, HZ * 5); if (rc >= 0) dev_info(rr3->dev, "Firmware rev: %s", buffer); @@ -836,14 +836,14 @@ static int redrat3_transmit_ir(struct rc_dev *rcdev, unsigned *txbuf, pipe = usb_sndbulkpipe(rr3->udev, rr3->ep_out->bEndpointAddress); ret = usb_bulk_msg(rr3->udev, pipe, irdata, - sendbuf_len, 
&ret_len, 10000); + sendbuf_len, &ret_len, 10 * HZ); dev_dbg(dev, "sent %d bytes, (ret %d)\n", ret_len, ret); /* now tell the hardware to transmit what we sent it */ pipe = usb_rcvctrlpipe(rr3->udev, 0); ret = usb_control_msg(rr3->udev, pipe, RR3_TX_SEND_SIGNAL, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, - 0, 0, irdata, 2, 10000); + 0, 0, irdata, 2, HZ * 10); if (ret < 0) dev_err(dev, "Error: control msg send failed, rc %d\n", ret); diff --git a/drivers/media/rc/serial_ir.c b/drivers/media/rc/serial_ir.c index 842c121dca2d..8b66926bc16a 100644 --- a/drivers/media/rc/serial_ir.c +++ b/drivers/media/rc/serial_ir.c @@ -774,6 +774,8 @@ static void serial_ir_exit(void) static int __init serial_ir_init_module(void) { + int result; + switch (type) { case IR_HOMEBREW: case IR_IRDEO: @@ -801,7 +803,12 @@ static int __init serial_ir_init_module(void) if (sense != -1) sense = !!sense; - return serial_ir_init(); + result = serial_ir_init(); + if (!result) + return 0; + + serial_ir_exit(); + return result; } static void __exit serial_ir_exit_module(void) diff --git a/drivers/media/rc/sir_ir.c b/drivers/media/rc/sir_ir.c index d59918878eb2..bc906fb128d5 100644 --- a/drivers/media/rc/sir_ir.c +++ b/drivers/media/rc/sir_ir.c @@ -57,7 +57,7 @@ static void add_read_queue(int flag, unsigned long val); static irqreturn_t sir_interrupt(int irq, void *dev_id); static void send_space(unsigned long len); static void send_pulse(unsigned long len); -static int init_hardware(void); +static void init_hardware(void); static void drop_hardware(void); /* Initialisation */ @@ -263,36 +263,11 @@ static void send_pulse(unsigned long len) } } -static int init_hardware(void) +static void init_hardware(void) { - u8 scratch, scratch2, scratch3; unsigned long flags; spin_lock_irqsave(&hardware_lock, flags); - - /* - * This is a simple port existence test, borrowed from the autoconfig - * function in drivers/tty/serial/8250/8250_port.c - */ - scratch = sinp(UART_IER); - soutp(UART_IER, 0); -#ifdef __i386__ - outb(0xff, 0x080); -#endif - scratch2 = sinp(UART_IER) & 0x0f; - soutp(UART_IER, 0x0f); -#ifdef __i386__ - outb(0x00, 0x080); -#endif - scratch3 = sinp(UART_IER) & 0x0f; - soutp(UART_IER, scratch); - if (scratch2 != 0 || scratch3 != 0x0f) { - /* we fail, there's nothing here */ - spin_unlock_irqrestore(&hardware_lock, flags); - pr_err("port existence test failed, cannot continue\n"); - return -ENODEV; - } - /* reset UART */ outb(0, io + UART_MCR); outb(0, io + UART_IER); @@ -310,8 +285,6 @@ static int init_hardware(void) /* turn on UART */ outb(UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2, io + UART_MCR); spin_unlock_irqrestore(&hardware_lock, flags); - - return 0; } static void drop_hardware(void) @@ -361,19 +334,14 @@ static int sir_ir_probe(struct platform_device *dev) pr_err("IRQ %d already in use.\n", irq); return retval; } - - retval = init_hardware(); - if (retval) { - del_timer_sync(&timerlist); - return retval; - } - pr_info("I/O port 0x%.4x, IRQ %d.\n", io, irq); retval = devm_rc_register_device(&sir_ir_dev->dev, rcdev); if (retval < 0) return retval; + init_hardware(); + return 0; } diff --git a/drivers/media/rc/sunxi-cir.c b/drivers/media/rc/sunxi-cir.c index bc026a7116ec..97f367b446c4 100644 --- a/drivers/media/rc/sunxi-cir.c +++ b/drivers/media/rc/sunxi-cir.c @@ -132,8 +132,6 @@ static irqreturn_t sunxi_ir_irq(int irqno, void *dev_id) } else if (status & REG_RXINT_RPEI_EN) { ir_raw_event_set_idle(ir->rc, true); ir_raw_event_handle(ir->rc); - } else { - ir_raw_event_handle(ir->rc); } 
spin_unlock(&ir->ir_lock); diff --git a/drivers/staging/media/lirc/lirc_zilog.c b/drivers/staging/media/lirc/lirc_zilog.c index e35e1b2160e3..71af13bd0ebd 100644 --- a/drivers/staging/media/lirc/lirc_zilog.c +++ b/drivers/staging/media/lirc/lirc_zilog.c @@ -288,7 +288,7 @@ static void release_ir_tx(struct kref *ref) struct IR_tx *tx = container_of(ref, struct IR_tx, ref); struct IR *ir = tx->ir; - ir->l.features &= ~LIRC_CAN_SEND_PULSE; + ir->l.features &= ~LIRC_CAN_SEND_LIRCCODE; /* Don't put_ir_device(tx->ir) here, so our lock doesn't get freed */ ir->tx = NULL; kfree(tx); @@ -1228,7 +1228,6 @@ static unsigned int poll(struct file *filep, poll_table *wait) dev_dbg(ir->l.dev, "%s result = %s\n", __func__, ret ? "POLLIN|POLLRDNORM" : "none"); - put_ir_rx(rx, false); return ret; } @@ -1268,14 +1267,14 @@ static long ioctl(struct file *filep, unsigned int cmd, unsigned long arg) if (!(features & LIRC_CAN_SEND_MASK)) return -ENOTTY; - result = put_user(LIRC_MODE_PULSE, uptr); + result = put_user(LIRC_MODE_LIRCCODE, uptr); break; case LIRC_SET_SEND_MODE: if (!(features & LIRC_CAN_SEND_MASK)) return -ENOTTY; result = get_user(mode, uptr); - if (!result && mode != LIRC_MODE_PULSE) + if (!result && mode != LIRC_MODE_LIRCCODE) return -EINVAL; break; default: @@ -1513,7 +1512,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) kref_init(&tx->ref); ir->tx = tx; - ir->l.features |= LIRC_CAN_SEND_PULSE; + ir->l.features |= LIRC_CAN_SEND_LIRCCODE; mutex_init(&tx->client_lock); tx->c = client; tx->need_boot = 1;
From 815e699d44a5969abeea19b7de4f1ed8af23e0c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Sun, 25 Jun 2017 09:31:19 -0300 Subject: [PATCH 0477/1640] UPSTREAM: [media] media: lirc_dev: clarify error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If an error is generated, it is more logical to error out ASAP. Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 9080e39ea391..5e3c4779d866 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -286,7 +286,7 @@ EXPORT_SYMBOL(lirc_unregister_driver); int lirc_dev_fop_open(struct inode *inode, struct file *file) { struct irctl *ir; - int retval = 0; + int retval; if (iminor(inode) >= MAX_IRCTL_DEVICES) { pr_err("open result for %d is -ENODEV\n", iminor(inode)); @@ -327,9 +327,11 @@ int lirc_dev_fop_open(struct inode *inode, struct file *file) ir->open++; -error: nonseekable_open(inode, file); + return 0; + +error: return retval; } EXPORT_SYMBOL(lirc_dev_fop_open);
From fc26c28b048ec43b7ad0d55eabb3ad2a9fb5c021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Sun, 25 Jun 2017 09:31:24 -0300 Subject: [PATCH 0478/1640] UPSTREAM: [media] media: lirc_dev: remove support for manually specifying minor number MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All users of lirc_register_driver() use dynamic minor allocation, therefore we can remove the ability to explicitly request a given number. This changes the function prototype of lirc_unregister_driver() to take a struct lirc_driver pointer as its sole argument.
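For a driver the conversion is mechanical; roughly (a sketch of the call sites before and after, error paths elided):

        /* before: registration handed back the allocated minor */
        d->minor = lirc_register_driver(d);
        if (d->minor < 0)
                goto err;
        ...
        lirc_unregister_driver(d->minor);

        /* after: registration returns 0 or -errno; teardown takes
         * the driver struct itself
         */
        err = lirc_register_driver(d);
        if (err < 0)
                goto err;
        ...
        lirc_unregister_driver(d);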
Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-lirc-codec.c | 9 ++-- drivers/media/rc/lirc_dev.c | 68 +++++++------------------ drivers/staging/media/lirc/lirc_zilog.c | 14 ++--- include/media/lirc_dev.h | 20 ++++---- 4 files changed, 34 insertions(+), 77 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index d2223c04e9ad..58bff7a75d5b 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -382,7 +382,6 @@ static int ir_lirc_register(struct rc_dev *dev) snprintf(drv->name, sizeof(drv->name), "ir-lirc-codec (%s)", dev->driver_name); - drv->minor = -1; drv->features = features; drv->data = &dev->raw->lirc; drv->rbuf = NULL; @@ -394,11 +393,9 @@ static int ir_lirc_register(struct rc_dev *dev) drv->rdev = dev; drv->owner = THIS_MODULE; - drv->minor = lirc_register_driver(drv); - if (drv->minor < 0) { - rc = -ENODEV; + rc = lirc_register_driver(drv); + if (rc < 0) goto out; - } dev->raw->lirc.drv = drv; dev->raw->lirc.dev = dev; @@ -413,7 +410,7 @@ static int ir_lirc_unregister(struct rc_dev *dev) { struct lirc_codec *lirc = &dev->raw->lirc; - lirc_unregister_driver(lirc->drv->minor); + lirc_unregister_driver(lirc->drv); kfree(lirc->drv); lirc->drv = NULL; diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 5e3c4779d866..f1d8c1ef072e 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -29,7 +29,6 @@ #include #include -#define NOPLUG -1 #define LOGHEAD "lirc_dev (%s[%d]): " static dev_t lirc_base_dev; @@ -114,7 +113,7 @@ out: int lirc_register_driver(struct lirc_driver *d) { struct irctl *ir; - int minor; + unsigned int minor; int err; if (!d) { @@ -132,12 +131,6 @@ int lirc_register_driver(struct lirc_driver *d) return -EINVAL; } - if (d->minor >= MAX_IRCTL_DEVICES) { - dev_err(d->dev, "minor must be between 0 and %d!\n", - MAX_IRCTL_DEVICES - 1); - return -EBADRQC; - } - if (d->code_length < 1 || d->code_length > (BUFLEN * 8)) { dev_err(d->dev, "code length must be less than %d bits\n", BUFLEN * 8); @@ -152,21 +145,14 @@ int lirc_register_driver(struct lirc_driver *d) mutex_lock(&lirc_dev_lock); - minor = d->minor; + /* find first free slot for driver */ + for (minor = 0; minor < MAX_IRCTL_DEVICES; minor++) + if (!irctls[minor]) + break; - if (minor < 0) { - /* find first free slot for driver */ - for (minor = 0; minor < MAX_IRCTL_DEVICES; minor++) - if (!irctls[minor]) - break; - if (minor == MAX_IRCTL_DEVICES) { - dev_err(d->dev, "no free slots for drivers!\n"); - err = -ENOMEM; - goto out_lock; - } - } else if (irctls[minor]) { - dev_err(d->dev, "minor (%d) just registered!\n", minor); - err = -EBUSY; + if (minor == MAX_IRCTL_DEVICES) { + dev_err(d->dev, "no free slots for drivers!\n"); + err = -ENOMEM; goto out_lock; } @@ -178,6 +164,7 @@ int lirc_register_driver(struct lirc_driver *d) mutex_init(&ir->irctl_lock); irctls[minor] = ir; + d->irctl = ir; d->minor = minor; /* some safety check 8-) */ @@ -225,7 +212,7 @@ int lirc_register_driver(struct lirc_driver *d) dev_info(ir->d.dev, "lirc_dev: driver %s registered at minor = %d\n", ir->d.name, ir->d.minor); - return minor; + return 0; out_cdev: cdev_del(&ir->cdev); @@ -238,38 +225,24 @@ out_lock: } EXPORT_SYMBOL(lirc_register_driver); -int lirc_unregister_driver(int minor) +void lirc_unregister_driver(struct lirc_driver *d) { struct irctl *ir; - if (minor < 0 || minor >= MAX_IRCTL_DEVICES) { - pr_err("minor (%d) must be between 0 and %d!\n", - 
minor, MAX_IRCTL_DEVICES - 1); - return -EBADRQC; - } + if (!d || !d->irctl) + return; - ir = irctls[minor]; - if (!ir) { - pr_err("failed to get irctl\n"); - return -ENOENT; - } + ir = d->irctl; mutex_lock(&lirc_dev_lock); - if (ir->d.minor != minor) { - dev_err(ir->d.dev, "lirc_dev: minor %d device not registered\n", - minor); - mutex_unlock(&lirc_dev_lock); - return -ENOENT; - } - dev_dbg(ir->d.dev, "lirc_dev: driver %s unregistered from minor = %d\n", - ir->d.name, ir->d.minor); + d->name, d->minor); ir->attached = 0; if (ir->open) { dev_dbg(ir->d.dev, LOGHEAD "releasing opened driver\n", - ir->d.name, ir->d.minor); + d->name, d->minor); wake_up_interruptible(&ir->buf->wait_poll); } @@ -278,8 +251,6 @@ int lirc_unregister_driver(int minor) device_del(&ir->dev); cdev_del(&ir->cdev); put_device(&ir->dev); - - return 0; } EXPORT_SYMBOL(lirc_unregister_driver); @@ -306,11 +277,6 @@ int lirc_dev_fop_open(struct inode *inode, struct file *file) dev_dbg(ir->d.dev, LOGHEAD "open called\n", ir->d.name, ir->d.minor); - if (ir->d.minor == NOPLUG) { - retval = -ENODEV; - goto error; - } - if (ir->open) { retval = -EBUSY; goto error; @@ -403,7 +369,7 @@ long lirc_dev_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) dev_dbg(ir->d.dev, LOGHEAD "ioctl called (0x%x)\n", ir->d.name, ir->d.minor, cmd); - if (ir->d.minor == NOPLUG || !ir->attached) { + if (!ir->attached) { dev_err(ir->d.dev, LOGHEAD "ioctl result = -ENODEV\n", ir->d.name, ir->d.minor); return -ENODEV; diff --git a/drivers/staging/media/lirc/lirc_zilog.c b/drivers/staging/media/lirc/lirc_zilog.c index 71af13bd0ebd..efcbfef1980e 100644 --- a/drivers/staging/media/lirc/lirc_zilog.c +++ b/drivers/staging/media/lirc/lirc_zilog.c @@ -183,10 +183,7 @@ static void release_ir_device(struct kref *ref) * ir->open_count == 0 - happens on final close() * ir_lock, tx_ref_lock, rx_ref_lock, all released */ - if (ir->l.minor >= 0) { - lirc_unregister_driver(ir->l.minor); - ir->l.minor = -1; - } + lirc_unregister_driver(&ir->l); if (kfifo_initialized(&ir->rbuf.fifo)) lirc_buffer_free(&ir->rbuf); @@ -1385,7 +1382,6 @@ static const struct file_operations lirc_fops = { static struct lirc_driver lirc_template = { .name = "lirc_zilog", - .minor = -1, .code_length = 13, .buffer_size = BUFLEN / 2, .chunk_size = 2, @@ -1599,14 +1595,14 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) } /* register with lirc */ - ir->l.minor = lirc_register_driver(&ir->l); - if (ir->l.minor < 0) { + ret = lirc_register_driver(&ir->l); + if (ret < 0) { dev_err(tx->ir->l.dev, "%s: lirc_register_driver() failed: %i\n", - __func__, ir->l.minor); - ret = -EBADRQC; + __func__, ret); goto out_put_xx; } + dev_info(ir->l.dev, "IR unit on %s (i2c-%d) registered as lirc%d and ready\n", adap->name, adap->nr, ir->l.minor); diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index 86d15a9b6c01..1bb9890744fa 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -116,10 +116,8 @@ static inline unsigned int lirc_buffer_write(struct lirc_buffer *buf, * * @name: this string will be used for logs * - * @minor: indicates minor device (/dev/lirc) number for - * registered driver if caller fills it with negative - * value, then the first free minor number will be used - * (if available). + * @minor: the minor device (/dev/lircX) number for a registered + * driver. * * @code_length: length of the remote control key code expressed in bits. 
* @@ -157,10 +155,12 @@ static inline unsigned int lirc_buffer_write(struct lirc_buffer *buf, * device. * * @owner: the module owning this struct + * + * @irctl: the struct irctl for this LIRC device. */ struct lirc_driver { char name[40]; - int minor; + unsigned int minor; __u32 code_length; unsigned int buffer_size; /* in chunks holding one code each */ __u32 features; @@ -175,19 +175,17 @@ struct lirc_driver { const struct file_operations *fops; struct device *dev; struct module *owner; + struct irctl *irctl; }; /* following functions can be called ONLY from user context * - * returns negative value on error or minor number - * of the registered device if success + * returns negative value on error or zero * contents of the structure pointed by p is copied */ -extern int lirc_register_driver(struct lirc_driver *d); +int lirc_register_driver(struct lirc_driver *d); -/* returns negative value on error or 0 if success -*/ -extern int lirc_unregister_driver(int minor); +void lirc_unregister_driver(struct lirc_driver *d); /* Returns the private data stored in the lirc_driver * associated with the given device file pointer. From 849f8351a346ec1aa0b5e74c10876379c2b590ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Sun, 25 Jun 2017 09:31:35 -0300 Subject: [PATCH 0479/1640] UPSTREAM: [media] media: lirc_dev: use cdev_device_add() helper function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace calls to cdev_add() and device_add() with the cdev_device_add() helper function. Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index f1d8c1ef072e..057983b8ec53 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -193,17 +193,11 @@ int lirc_register_driver(struct lirc_driver *d) cdev_init(&ir->cdev, d->fops); ir->cdev.owner = ir->d.owner; - ir->cdev.kobj.parent = &ir->dev.kobj; - - err = cdev_add(&ir->cdev, ir->dev.devt, 1); - if (err) - goto out_free_dev; - ir->attached = 1; - err = device_add(&ir->dev); + err = cdev_device_add(&ir->cdev, &ir->dev); if (err) - goto out_cdev; + goto out_dev; mutex_unlock(&lirc_dev_lock); @@ -214,9 +208,7 @@ int lirc_register_driver(struct lirc_driver *d) return 0; -out_cdev: - cdev_del(&ir->cdev); -out_free_dev: +out_dev: put_device(&ir->dev); out_lock: mutex_unlock(&lirc_dev_lock); @@ -248,8 +240,7 @@ void lirc_unregister_driver(struct lirc_driver *d) mutex_unlock(&lirc_dev_lock); - device_del(&ir->dev); - cdev_del(&ir->cdev); + cdev_device_del(&ir->cdev, &ir->dev); put_device(&ir->dev); } EXPORT_SYMBOL(lirc_unregister_driver); From 65259a75bf4d61ffdfcd1216fb8239dab3bf6007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Sun, 25 Jun 2017 09:31:40 -0300 Subject: [PATCH 0480/1640] UPSTREAM: [media] media: lirc_dev: make better use of file->private_data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By making better use of file->private_data in lirc_dev we can avoid digging around in the irctls[] array, thereby simplifying the code. External drivers need to use lirc_get_pdata() instead of mucking around in file->private_data. The newly introduced lirc_init_pdata() function isn't very elegant, but it's a stopgap measure which can be removed once lirc_zilog is converted to rc-core. 
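For an external driver the intended calling sequence is then roughly as follows (a sketch; my_dev and my_open are illustrative names, and the driver is assumed to have stored its private data in lirc_driver->data before calling lirc_register_driver()):

        static int my_open(struct inode *inode, struct file *file)
        {
                struct my_dev *dev;

                lirc_init_pdata(inode, file);   /* wires up file->private_data */
                dev = lirc_get_pdata(file);     /* returns lirc_driver->data */

                /* ... device specific open handling ... */
                return nonseekable_open(inode, file);
        }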
Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 70 +++++++------------------ drivers/staging/media/lirc/lirc_zilog.c | 53 ++++--------------- include/media/lirc_dev.h | 3 ++ 3 files changed, 33 insertions(+), 93 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 057983b8ec53..ffa203eb2045 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -247,36 +247,18 @@ EXPORT_SYMBOL(lirc_unregister_driver); int lirc_dev_fop_open(struct inode *inode, struct file *file) { - struct irctl *ir; + struct irctl *ir = container_of(inode->i_cdev, struct irctl, cdev); int retval; - if (iminor(inode) >= MAX_IRCTL_DEVICES) { - pr_err("open result for %d is -ENODEV\n", iminor(inode)); - return -ENODEV; - } - - if (mutex_lock_interruptible(&lirc_dev_lock)) - return -ERESTARTSYS; - - ir = irctls[iminor(inode)]; - mutex_unlock(&lirc_dev_lock); - - if (!ir) { - retval = -ENODEV; - goto error; - } - dev_dbg(ir->d.dev, LOGHEAD "open called\n", ir->d.name, ir->d.minor); - if (ir->open) { - retval = -EBUSY; - goto error; - } + if (ir->open) + return -EBUSY; if (ir->d.rdev) { retval = rc_open(ir->d.rdev); if (retval) - goto error; + return retval; } if (ir->buf) @@ -284,25 +266,18 @@ int lirc_dev_fop_open(struct inode *inode, struct file *file) ir->open++; + lirc_init_pdata(inode, file); nonseekable_open(inode, file); return 0; - -error: - return retval; } EXPORT_SYMBOL(lirc_dev_fop_open); int lirc_dev_fop_close(struct inode *inode, struct file *file) { - struct irctl *ir = irctls[iminor(inode)]; + struct irctl *ir = file->private_data; int ret; - if (!ir) { - pr_err("called with invalid irctl\n"); - return -EINVAL; - } - ret = mutex_lock_killable(&lirc_dev_lock); WARN_ON(ret); @@ -318,14 +293,9 @@ EXPORT_SYMBOL(lirc_dev_fop_close); unsigned int lirc_dev_fop_poll(struct file *file, poll_table *wait) { - struct irctl *ir = irctls[iminor(file_inode(file))]; + struct irctl *ir = file->private_data; unsigned int ret; - if (!ir) { - pr_err("called with invalid irctl\n"); - return POLLERR; - } - if (!ir->attached) return POLLHUP | POLLERR; @@ -348,14 +318,9 @@ EXPORT_SYMBOL(lirc_dev_fop_poll); long lirc_dev_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + struct irctl *ir = file->private_data; __u32 mode; int result = 0; - struct irctl *ir = irctls[iminor(file_inode(file))]; - - if (!ir) { - pr_err("no irctl found!\n"); - return -ENODEV; - } dev_dbg(ir->d.dev, LOGHEAD "ioctl called (0x%x)\n", ir->d.name, ir->d.minor, cmd); @@ -432,16 +397,11 @@ ssize_t lirc_dev_fop_read(struct file *file, size_t length, loff_t *ppos) { - struct irctl *ir = irctls[iminor(file_inode(file))]; + struct irctl *ir = file->private_data; unsigned char *buf; int ret = 0, written = 0; DECLARE_WAITQUEUE(wait, current); - if (!ir) { - pr_err("called with invalid irctl\n"); - return -ENODEV; - } - if (!LIRC_CAN_REC(ir->d.features)) return -EINVAL; @@ -532,9 +492,19 @@ out_unlocked: } EXPORT_SYMBOL(lirc_dev_fop_read); +void lirc_init_pdata(struct inode *inode, struct file *file) +{ + struct irctl *ir = container_of(inode->i_cdev, struct irctl, cdev); + + file->private_data = ir; +} +EXPORT_SYMBOL(lirc_init_pdata); + void *lirc_get_pdata(struct file *file) { - return irctls[iminor(file_inode(file))]->d.data; + struct irctl *ir = file->private_data; + + return ir->d.data; } EXPORT_SYMBOL(lirc_get_pdata); diff --git a/drivers/staging/media/lirc/lirc_zilog.c b/drivers/staging/media/lirc/lirc_zilog.c 
index efcbfef1980e..c4a4c2f93ae8 100644 --- a/drivers/staging/media/lirc/lirc_zilog.c +++ b/drivers/staging/media/lirc/lirc_zilog.c @@ -879,7 +879,7 @@ out: static ssize_t read(struct file *filep, char __user *outbuf, size_t n, loff_t *ppos) { - struct IR *ir = filep->private_data; + struct IR *ir = lirc_get_pdata(filep); struct IR_rx *rx; struct lirc_buffer *rbuf = ir->l.rbuf; int ret = 0, written = 0, retries = 0; @@ -1089,7 +1089,7 @@ static int send_code(struct IR_tx *tx, unsigned int code, unsigned int key) static ssize_t write(struct file *filep, const char __user *buf, size_t n, loff_t *ppos) { - struct IR *ir = filep->private_data; + struct IR *ir = lirc_get_pdata(filep); struct IR_tx *tx; size_t i; int failures = 0; @@ -1197,7 +1197,7 @@ static ssize_t write(struct file *filep, const char __user *buf, size_t n, /* copied from lirc_dev */ static unsigned int poll(struct file *filep, poll_table *wait) { - struct IR *ir = filep->private_data; + struct IR *ir = lirc_get_pdata(filep); struct IR_rx *rx; struct lirc_buffer *rbuf = ir->l.rbuf; unsigned int ret; @@ -1230,7 +1230,7 @@ static unsigned int poll(struct file *filep, poll_table *wait) static long ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { - struct IR *ir = filep->private_data; + struct IR *ir = lirc_get_pdata(filep); unsigned long __user *uptr = (unsigned long __user *)arg; int result; unsigned long mode, features; @@ -1280,46 +1280,18 @@ static long ioctl(struct file *filep, unsigned int cmd, unsigned long arg) return result; } -static struct IR *get_ir_device_by_minor(unsigned int minor) -{ - struct IR *ir; - struct IR *ret = NULL; - - mutex_lock(&ir_devices_lock); - - if (!list_empty(&ir_devices_list)) { - list_for_each_entry(ir, &ir_devices_list, list) { - if (ir->l.minor == minor) { - ret = get_ir_device(ir, true); - break; - } - } - } - - mutex_unlock(&ir_devices_lock); - return ret; -} - /* - * Open the IR device. Get hold of our IR structure and - * stash it in private_data for the file + * Open the IR device. 
*/ static int open(struct inode *node, struct file *filep) { struct IR *ir; - unsigned int minor = MINOR(node->i_rdev); - /* find our IR struct */ - ir = get_ir_device_by_minor(minor); - - if (!ir) - return -ENODEV; + lirc_init_pdata(node, filep); + ir = lirc_get_pdata(filep); atomic_inc(&ir->open_count); - /* stash our IR struct */ - filep->private_data = ir; - nonseekable_open(node, filep); return 0; } @@ -1327,14 +1299,7 @@ static int open(struct inode *node, struct file *filep) /* Close the IR device */ static int close(struct inode *node, struct file *filep) { - /* find our IR struct */ - struct IR *ir = filep->private_data; - - if (!ir) { - pr_err("ir: %s: no private_data attached to the file!\n", - __func__); - return -ENODEV; - } + struct IR *ir = lirc_get_pdata(filep); atomic_dec(&ir->open_count); @@ -1489,6 +1454,8 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) */ ir->l.rbuf = &ir->rbuf; ir->l.dev = &adap->dev; + /* This will be returned by lirc_get_pdata() */ + ir->l.data = ir; ret = lirc_buffer_init(ir->l.rbuf, ir->l.chunk_size, ir->l.buffer_size); if (ret) diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index 1bb9890744fa..d07a53232ffc 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -187,6 +187,9 @@ int lirc_register_driver(struct lirc_driver *d); void lirc_unregister_driver(struct lirc_driver *d); +/* Must be called in the open fop before lirc_get_pdata() can be used */ +void lirc_init_pdata(struct inode *inode, struct file *file); + /* Returns the private data stored in the lirc_driver * associated with the given device file pointer. */ From 090ce7ac4e7c10296097d0e6db813d9b7668d430 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Sun, 25 Jun 2017 09:31:45 -0300 Subject: [PATCH 0481/1640] UPSTREAM: [media] media: lirc_dev: make chunk_size and buffer_size mandatory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make setting chunk_size and buffer_size mandatory for drivers which expect lirc_dev to allocate the lirc_buffer (i.e. ir-lirc-codec) and don't set them in lirc-zilog (which creates its own buffer). Also remove an unnecessary copy of chunk_size in struct irctl (the same information is already available from struct lirc_buffer). Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 26 ++++++++++++------------- drivers/staging/media/lirc/lirc_zilog.c | 5 +---- include/media/lirc_dev.h | 9 +++++---- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index ffa203eb2045..1915ffc52955 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -41,7 +41,6 @@ struct irctl { struct mutex irctl_lock; struct lirc_buffer *buf; bool buf_internal; - unsigned int chunk_size; struct device dev; struct cdev cdev; @@ -74,16 +73,8 @@ static void lirc_release(struct device *ld) static int lirc_allocate_buffer(struct irctl *ir) { int err = 0; - int bytes_in_key; - unsigned int chunk_size; - unsigned int buffer_size; struct lirc_driver *d = &ir->d; - bytes_in_key = BITS_TO_LONGS(d->code_length) + - (d->code_length % 8 ? 1 : 0); - buffer_size = d->buffer_size ? d->buffer_size : BUFLEN / bytes_in_key; - chunk_size = d->chunk_size ? 
d->chunk_size : bytes_in_key; - if (d->rbuf) { ir->buf = d->rbuf; ir->buf_internal = false; @@ -94,7 +85,7 @@ static int lirc_allocate_buffer(struct irctl *ir) goto out; } - err = lirc_buffer_init(ir->buf, chunk_size, buffer_size); + err = lirc_buffer_init(ir->buf, d->chunk_size, d->buffer_size); if (err) { kfree(ir->buf); ir->buf = NULL; @@ -104,7 +95,6 @@ static int lirc_allocate_buffer(struct irctl *ir) ir->buf_internal = true; d->rbuf = ir->buf; } - ir->chunk_size = ir->buf->chunk_size; out: return err; @@ -131,6 +121,16 @@ int lirc_register_driver(struct lirc_driver *d) return -EINVAL; } + if (!d->rbuf && d->chunk_size < 1) { + pr_err("chunk_size must be set!\n"); + return -EINVAL; + } + + if (!d->rbuf && d->buffer_size < 1) { + pr_err("buffer_size must be set!\n"); + return -EINVAL; + } + if (d->code_length < 1 || d->code_length > (BUFLEN * 8)) { dev_err(d->dev, "code length must be less than %d bits\n", BUFLEN * 8); @@ -407,7 +407,7 @@ ssize_t lirc_dev_fop_read(struct file *file, dev_dbg(ir->d.dev, LOGHEAD "read called\n", ir->d.name, ir->d.minor); - buf = kzalloc(ir->chunk_size, GFP_KERNEL); + buf = kzalloc(ir->buf->chunk_size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -420,7 +420,7 @@ ssize_t lirc_dev_fop_read(struct file *file, goto out_locked; } - if (length % ir->chunk_size) { + if (length % ir->buf->chunk_size) { ret = -EINVAL; goto out_locked; } diff --git a/drivers/staging/media/lirc/lirc_zilog.c b/drivers/staging/media/lirc/lirc_zilog.c index c4a4c2f93ae8..780b2d9f2f4b 100644 --- a/drivers/staging/media/lirc/lirc_zilog.c +++ b/drivers/staging/media/lirc/lirc_zilog.c @@ -1348,8 +1348,6 @@ static const struct file_operations lirc_fops = { static struct lirc_driver lirc_template = { .name = "lirc_zilog", .code_length = 13, - .buffer_size = BUFLEN / 2, - .chunk_size = 2, .fops = &lirc_fops, .owner = THIS_MODULE, }; @@ -1456,8 +1454,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) ir->l.dev = &adap->dev; /* This will be returned by lirc_get_pdata() */ ir->l.data = ir; - ret = lirc_buffer_init(ir->l.rbuf, - ir->l.chunk_size, ir->l.buffer_size); + ret = lirc_buffer_init(ir->l.rbuf, 2, BUFLEN / 2); if (ret) goto out_put_ir; } diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index d07a53232ffc..8e3894e2d2c8 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -121,13 +121,14 @@ static inline unsigned int lirc_buffer_write(struct lirc_buffer *buf, * * @code_length: length of the remote control key code expressed in bits. * - * @buffer_size: Number of FIFO buffers with @chunk_size size. If zero, - * creates a buffer with BUFLEN size (16 bytes). - * * @features: lirc compatible hardware features, like LIRC_MODE_RAW, * LIRC_CAN\_\*, as defined at include/media/lirc.h. * + * @buffer_size: Number of FIFO buffers with @chunk_size size. + * Only used if @rbuf is NULL. + * * @chunk_size: Size of each FIFO buffer. + * Only used if @rbuf is NULL. * * @data: it may point to any driver data and this pointer will * be passed to all callback functions. 
@@ -162,9 +163,9 @@ struct lirc_driver { char name[40]; unsigned int minor; __u32 code_length; - unsigned int buffer_size; /* in chunks holding one code each */ __u32 features; + unsigned int buffer_size; /* in chunks holding one code each */ unsigned int chunk_size; void *data; From 0ab0b1bfc66dadeb923632705f92b92538da0beb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Sun, 25 Jun 2017 09:31:55 -0300 Subject: [PATCH 0482/1640] UPSTREAM: [media] media: lirc_dev: change irctl->attached to be a boolean MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "attached" member of struct irctl is a boolean value, so let the code reflect that. Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 1915ffc52955..b07d0ab37d6b 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -35,7 +35,7 @@ static dev_t lirc_base_dev; struct irctl { struct lirc_driver d; - int attached; + bool attached; int open; struct mutex irctl_lock; @@ -193,7 +193,7 @@ int lirc_register_driver(struct lirc_driver *d) cdev_init(&ir->cdev, d->fops); ir->cdev.owner = ir->d.owner; - ir->attached = 1; + ir->attached = true; err = cdev_device_add(&ir->cdev, &ir->dev); if (err) @@ -231,7 +231,7 @@ void lirc_unregister_driver(struct lirc_driver *d) dev_dbg(ir->d.dev, "lirc_dev: driver %s unregistered from minor = %d\n", d->name, d->minor); - ir->attached = 0; + ir->attached = false; if (ir->open) { dev_dbg(ir->d.dev, LOGHEAD "releasing opened driver\n", d->name, d->minor); From 5469dd7f967dda35a7297858b584a353a6a26617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Fri, 30 Jun 2017 05:41:57 -0300 Subject: [PATCH 0483/1640] UPSTREAM: [media] media: lirc_dev: sanitize locking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the irctl mutex for all device operations and only use lirc_dev_lock to protect the irctls array. Also, make sure that the device is alive early in each fops function before doing anything else. Since this patch touches nearly every line where the irctl mutex is taken/released, it also renames the mutex at the same time (the name irctl_lock will be misleading once struct irctl goes away in later patches). 
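Condensed, the pattern this patch applies to each fops entry point looks roughly
like the sketch below (illustrative only — "example_fop" is a placeholder name,
not a function from the diff; the real signatures vary per fop):

	static long example_fop(struct file *file, unsigned int cmd,
				unsigned long arg)
	{
		struct irctl *ir = file->private_data;
		int ret;

		/* serialize against other fops and against unregistration */
		ret = mutex_lock_interruptible(&ir->mutex);
		if (ret)
			return ret;

		/* fail early if the driver was unregistered meanwhile */
		if (!ir->attached) {
			ret = -ENODEV;
			goto out;
		}

		/* ... the device operation proper ... */

	out:
		mutex_unlock(&ir->mutex);
		return ret;
	}
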
[mchehab@s-opensource.com: fix a merge conflict] Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 167 ++++++++++++++++++++---------------- 1 file changed, 94 insertions(+), 73 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index b07d0ab37d6b..c83fffec0681 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -38,7 +38,7 @@ struct irctl { bool attached; int open; - struct mutex irctl_lock; + struct mutex mutex; /* protect from simultaneous accesses */ struct lirc_buffer *buf; bool buf_internal; @@ -46,6 +46,7 @@ struct irctl { struct cdev cdev; }; +/* This mutex protects the irctls array */ static DEFINE_MUTEX(lirc_dev_lock); static struct irctl *irctls[MAX_IRCTL_DEVICES]; @@ -53,20 +54,25 @@ static struct irctl *irctls[MAX_IRCTL_DEVICES]; /* Only used for sysfs but defined to void otherwise */ static struct class *lirc_class; -static void lirc_release(struct device *ld) +static void lirc_free_buffer(struct irctl *ir) { - struct irctl *ir = container_of(ld, struct irctl, dev); - put_device(ir->dev.parent); if (ir->buf_internal) { lirc_buffer_free(ir->buf); kfree(ir->buf); + ir->buf = NULL; } +} + +static void lirc_release(struct device *ld) +{ + struct irctl *ir = container_of(ld, struct irctl, dev); mutex_lock(&lirc_dev_lock); irctls[ir->d.minor] = NULL; mutex_unlock(&lirc_dev_lock); + lirc_free_buffer(ir); kfree(ir); } @@ -143,6 +149,28 @@ int lirc_register_driver(struct lirc_driver *d) return -EBADRQC; } + /* some safety check 8-) */ + d->name[sizeof(d->name) - 1] = '\0'; + + if (d->features == 0) + d->features = LIRC_CAN_REC_LIRCCODE; + + ir = kzalloc(sizeof(*ir), GFP_KERNEL); + if (!ir) + return -ENOMEM; + + mutex_init(&ir->mutex); + ir->d = *d; + + if (LIRC_CAN_REC(d->features)) { + err = lirc_allocate_buffer(ir); + if (err) { + kfree(ir); + return err; + } + d->rbuf = ir->buf; + } + mutex_lock(&lirc_dev_lock); /* find first free slot for driver */ @@ -152,37 +180,18 @@ int lirc_register_driver(struct lirc_driver *d) if (minor == MAX_IRCTL_DEVICES) { dev_err(d->dev, "no free slots for drivers!\n"); - err = -ENOMEM; - goto out_lock; + mutex_unlock(&lirc_dev_lock); + lirc_free_buffer(ir); + kfree(ir); + return -ENOMEM; } - ir = kzalloc(sizeof(struct irctl), GFP_KERNEL); - if (!ir) { - err = -ENOMEM; - goto out_lock; - } - - mutex_init(&ir->irctl_lock); irctls[minor] = ir; d->irctl = ir; d->minor = minor; + ir->d.minor = minor; - /* some safety check 8-) */ - d->name[sizeof(d->name)-1] = '\0'; - - if (d->features == 0) - d->features = LIRC_CAN_REC_LIRCCODE; - - ir->d = *d; - - if (LIRC_CAN_REC(d->features)) { - err = lirc_allocate_buffer(irctls[minor]); - if (err) { - kfree(ir); - goto out_lock; - } - d->rbuf = ir->buf; - } + mutex_unlock(&lirc_dev_lock); device_initialize(&ir->dev); ir->dev.devt = MKDEV(MAJOR(lirc_base_dev), ir->d.minor); @@ -196,10 +205,10 @@ int lirc_register_driver(struct lirc_driver *d) ir->attached = true; err = cdev_device_add(&ir->cdev, &ir->dev); - if (err) - goto out_dev; - - mutex_unlock(&lirc_dev_lock); + if (err) { + put_device(&ir->dev); + return err; + } get_device(ir->dev.parent); @@ -207,13 +216,6 @@ int lirc_register_driver(struct lirc_driver *d) ir->d.name, ir->d.minor); return 0; - -out_dev: - put_device(&ir->dev); -out_lock: - mutex_unlock(&lirc_dev_lock); - - return err; } EXPORT_SYMBOL(lirc_register_driver); @@ -226,11 +228,13 @@ void lirc_unregister_driver(struct lirc_driver *d) ir = d->irctl; - 
mutex_lock(&lirc_dev_lock); - dev_dbg(ir->d.dev, "lirc_dev: driver %s unregistered from minor = %d\n", d->name, d->minor); + cdev_device_del(&ir->cdev, &ir->dev); + + mutex_lock(&ir->mutex); + ir->attached = false; if (ir->open) { dev_dbg(ir->d.dev, LOGHEAD "releasing opened driver\n", @@ -238,9 +242,8 @@ void lirc_unregister_driver(struct lirc_driver *d) wake_up_interruptible(&ir->buf->wait_poll); } - mutex_unlock(&lirc_dev_lock); + mutex_unlock(&ir->mutex); - cdev_device_del(&ir->cdev, &ir->dev); put_device(&ir->dev); } EXPORT_SYMBOL(lirc_unregister_driver); @@ -252,13 +255,24 @@ int lirc_dev_fop_open(struct inode *inode, struct file *file) dev_dbg(ir->d.dev, LOGHEAD "open called\n", ir->d.name, ir->d.minor); - if (ir->open) - return -EBUSY; + retval = mutex_lock_interruptible(&ir->mutex); + if (retval) + return retval; + + if (!ir->attached) { + retval = -ENODEV; + goto out; + } + + if (ir->open) { + retval = -EBUSY; + goto out; + } if (ir->d.rdev) { retval = rc_open(ir->d.rdev); if (retval) - return retval; + goto out; } if (ir->buf) @@ -268,24 +282,26 @@ int lirc_dev_fop_open(struct inode *inode, struct file *file) lirc_init_pdata(inode, file); nonseekable_open(inode, file); + mutex_unlock(&ir->mutex); return 0; + +out: + mutex_unlock(&ir->mutex); + return retval; } EXPORT_SYMBOL(lirc_dev_fop_open); int lirc_dev_fop_close(struct inode *inode, struct file *file) { struct irctl *ir = file->private_data; - int ret; - ret = mutex_lock_killable(&lirc_dev_lock); - WARN_ON(ret); + mutex_lock(&ir->mutex); rc_close(ir->d.rdev); - ir->open--; - if (!ret) - mutex_unlock(&lirc_dev_lock); + + mutex_unlock(&ir->mutex); return 0; } @@ -320,18 +336,19 @@ long lirc_dev_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct irctl *ir = file->private_data; __u32 mode; - int result = 0; + int result; dev_dbg(ir->d.dev, LOGHEAD "ioctl called (0x%x)\n", ir->d.name, ir->d.minor, cmd); - if (!ir->attached) { - dev_err(ir->d.dev, LOGHEAD "ioctl result = -ENODEV\n", - ir->d.name, ir->d.minor); - return -ENODEV; - } + result = mutex_lock_interruptible(&ir->mutex); + if (result) + return result; - mutex_lock(&ir->irctl_lock); + if (!ir->attached) { + result = -ENODEV; + goto out; + } switch (cmd) { case LIRC_GET_FEATURES: @@ -386,8 +403,8 @@ long lirc_dev_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) result = -ENOTTY; } - mutex_unlock(&ir->irctl_lock); - +out: + mutex_unlock(&ir->mutex); return result; } EXPORT_SYMBOL(lirc_dev_fop_ioctl); @@ -399,27 +416,31 @@ ssize_t lirc_dev_fop_read(struct file *file, { struct irctl *ir = file->private_data; unsigned char *buf; - int ret = 0, written = 0; + int ret, written = 0; DECLARE_WAITQUEUE(wait, current); - if (!LIRC_CAN_REC(ir->d.features)) - return -EINVAL; - dev_dbg(ir->d.dev, LOGHEAD "read called\n", ir->d.name, ir->d.minor); buf = kzalloc(ir->buf->chunk_size, GFP_KERNEL); if (!buf) return -ENOMEM; - if (mutex_lock_interruptible(&ir->irctl_lock)) { - ret = -ERESTARTSYS; - goto out_unlocked; + ret = mutex_lock_interruptible(&ir->mutex); + if (ret) { + kfree(buf); + return ret; } + if (!ir->attached) { ret = -ENODEV; goto out_locked; } + if (!LIRC_CAN_REC(ir->d.features)) { + ret = -EINVAL; + goto out_locked; + } + if (length % ir->buf->chunk_size) { ret = -EINVAL; goto out_locked; @@ -454,13 +475,13 @@ ssize_t lirc_dev_fop_read(struct file *file, break; } - mutex_unlock(&ir->irctl_lock); + mutex_unlock(&ir->mutex); set_current_state(TASK_INTERRUPTIBLE); schedule(); set_current_state(TASK_RUNNING); - if 
(mutex_lock_interruptible(&ir->irctl_lock)) { - ret = -ERESTARTSYS; + ret = mutex_lock_interruptible(&ir->mutex); + if (ret) { remove_wait_queue(&ir->buf->wait_poll, &wait); goto out_unlocked; } @@ -483,7 +504,7 @@ ssize_t lirc_dev_fop_read(struct file *file, remove_wait_queue(&ir->buf->wait_poll, &wait); out_locked: - mutex_unlock(&ir->irctl_lock); + mutex_unlock(&ir->mutex); out_unlocked: kfree(buf); From 0cbcfb44448ea14ae03ba5a6637d2e35c05d2a9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Sun, 25 Jun 2017 09:32:05 -0300 Subject: [PATCH 0484/1640] UPSTREAM: [media] media: lirc_dev: use an IDA instead of an array to keep track of registered devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the kernel-provided IDA simplifies the code and makes it possible to remove the lirc_dev_lock mutex. Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 36 ++++++++++++------------------------ include/media/lirc_dev.h | 1 - 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index c83fffec0681..a2c5ed0181c1 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -46,10 +47,9 @@ struct irctl { struct cdev cdev; }; -/* This mutex protects the irctls array */ -static DEFINE_MUTEX(lirc_dev_lock); - -static struct irctl *irctls[MAX_IRCTL_DEVICES]; +/* Used to keep track of allocated lirc devices */ +#define LIRC_MAX_DEVICES 256 +static DEFINE_IDA(lirc_ida); /* Only used for sysfs but defined to void otherwise */ static struct class *lirc_class; @@ -69,9 +69,6 @@ static void lirc_release(struct device *ld) { struct irctl *ir = container_of(ld, struct irctl, dev); - mutex_lock(&lirc_dev_lock); - irctls[ir->d.minor] = NULL; - mutex_unlock(&lirc_dev_lock); lirc_free_buffer(ir); kfree(ir); } @@ -109,7 +106,7 @@ out: int lirc_register_driver(struct lirc_driver *d) { struct irctl *ir; - unsigned int minor; + int minor; int err; if (!d) { @@ -171,28 +168,17 @@ int lirc_register_driver(struct lirc_driver *d) d->rbuf = ir->buf; } - mutex_lock(&lirc_dev_lock); - - /* find first free slot for driver */ - for (minor = 0; minor < MAX_IRCTL_DEVICES; minor++) - if (!irctls[minor]) - break; - - if (minor == MAX_IRCTL_DEVICES) { - dev_err(d->dev, "no free slots for drivers!\n"); - mutex_unlock(&lirc_dev_lock); + minor = ida_simple_get(&lirc_ida, 0, LIRC_MAX_DEVICES, GFP_KERNEL); + if (minor < 0) { lirc_free_buffer(ir); kfree(ir); - return -ENOMEM; + return minor; } - irctls[minor] = ir; d->irctl = ir; d->minor = minor; ir->d.minor = minor; - mutex_unlock(&lirc_dev_lock); - device_initialize(&ir->dev); ir->dev.devt = MKDEV(MAJOR(lirc_base_dev), ir->d.minor); ir->dev.class = lirc_class; @@ -206,6 +192,7 @@ int lirc_register_driver(struct lirc_driver *d) err = cdev_device_add(&ir->cdev, &ir->dev); if (err) { + ida_simple_remove(&lirc_ida, minor); put_device(&ir->dev); return err; } @@ -244,6 +231,7 @@ void lirc_unregister_driver(struct lirc_driver *d) mutex_unlock(&ir->mutex); + ida_simple_remove(&lirc_ida, d->minor); put_device(&ir->dev); } EXPORT_SYMBOL(lirc_unregister_driver); @@ -540,7 +528,7 @@ static int __init lirc_dev_init(void) return PTR_ERR(lirc_class); } - retval = alloc_chrdev_region(&lirc_base_dev, 0, MAX_IRCTL_DEVICES, + retval = alloc_chrdev_region(&lirc_base_dev, 0, LIRC_MAX_DEVICES, 
"BaseRemoteCtl"); if (retval) { class_destroy(lirc_class); @@ -557,7 +545,7 @@ static int __init lirc_dev_init(void) static void __exit lirc_dev_exit(void) { class_destroy(lirc_class); - unregister_chrdev_region(lirc_base_dev, MAX_IRCTL_DEVICES); + unregister_chrdev_region(lirc_base_dev, LIRC_MAX_DEVICES); pr_info("module unloaded\n"); } diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index 8e3894e2d2c8..51c15c050e85 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -9,7 +9,6 @@ #ifndef _LINUX_LIRC_DEV_H #define _LINUX_LIRC_DEV_H -#define MAX_IRCTL_DEVICES 8 #define BUFLEN 16 #include From bfc965134e3cd9dd3104eaed64612a9d6c72e6e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Thu, 21 Sep 2017 16:13:34 -0300 Subject: [PATCH 0485/1640] UPSTREAM: [media] media: rename struct lirc_driver to struct lirc_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is in preparation for the later patches which do away with struct irctl entirely. Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-lirc-codec.c | 50 ++++++++++++------------- drivers/media/rc/lirc_dev.c | 12 +++--- drivers/media/rc/rc-core-priv.h | 2 +- drivers/staging/media/lirc/lirc_zilog.c | 12 +++--- include/media/lirc_dev.h | 48 +++++++----------------- 5 files changed, 51 insertions(+), 73 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 58bff7a75d5b..2d591168c991 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -35,7 +35,7 @@ static int ir_lirc_decode(struct rc_dev *dev, struct ir_raw_event ev) struct lirc_codec *lirc = &dev->raw->lirc; int sample; - if (!dev->raw->lirc.drv || !dev->raw->lirc.drv->rbuf) + if (!dev->raw->lirc.ldev || !dev->raw->lirc.ldev->rbuf) return -EINVAL; /* Packet start */ @@ -84,8 +84,8 @@ static int ir_lirc_decode(struct rc_dev *dev, struct ir_raw_event ev) (u64)LIRC_VALUE_MASK); gap_sample = LIRC_SPACE(lirc->gap_duration); - lirc_buffer_write(dev->raw->lirc.drv->rbuf, - (unsigned char *) &gap_sample); + lirc_buffer_write(dev->raw->lirc.ldev->rbuf, + (unsigned char *)&gap_sample); lirc->gap = false; } @@ -95,9 +95,9 @@ static int ir_lirc_decode(struct rc_dev *dev, struct ir_raw_event ev) TO_US(ev.duration), TO_STR(ev.pulse)); } - lirc_buffer_write(dev->raw->lirc.drv->rbuf, + lirc_buffer_write(dev->raw->lirc.ldev->rbuf, (unsigned char *) &sample); - wake_up(&dev->raw->lirc.drv->rbuf->wait_poll); + wake_up(&dev->raw->lirc.ldev->rbuf->wait_poll); return 0; } @@ -343,12 +343,12 @@ static const struct file_operations lirc_fops = { static int ir_lirc_register(struct rc_dev *dev) { - struct lirc_driver *drv; + struct lirc_dev *ldev; int rc = -ENOMEM; unsigned long features = 0; - drv = kzalloc(sizeof(struct lirc_driver), GFP_KERNEL); - if (!drv) + ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); + if (!ldev) return rc; if (dev->driver_type != RC_DRIVER_IR_RAW_TX) { @@ -380,29 +380,29 @@ static int ir_lirc_register(struct rc_dev *dev) if (dev->max_timeout) features |= LIRC_CAN_SET_REC_TIMEOUT; - snprintf(drv->name, sizeof(drv->name), "ir-lirc-codec (%s)", + snprintf(ldev->name, sizeof(ldev->name), "ir-lirc-codec (%s)", dev->driver_name); - drv->features = features; - drv->data = &dev->raw->lirc; - drv->rbuf = NULL; - drv->code_length = sizeof(struct ir_raw_event) * 8; - drv->chunk_size = sizeof(int); - drv->buffer_size = LIRCBUF_SIZE; - drv->fops = &lirc_fops; - 
drv->dev = &dev->dev; - drv->rdev = dev; - drv->owner = THIS_MODULE; + ldev->features = features; + ldev->data = &dev->raw->lirc; + ldev->rbuf = NULL; + ldev->code_length = sizeof(struct ir_raw_event) * 8; + ldev->chunk_size = sizeof(int); + ldev->buffer_size = LIRCBUF_SIZE; + ldev->fops = &lirc_fops; + ldev->dev = &dev->dev; + ldev->rdev = dev; + ldev->owner = THIS_MODULE; - rc = lirc_register_driver(drv); + rc = lirc_register_device(ldev); if (rc < 0) goto out; - dev->raw->lirc.drv = drv; + dev->raw->lirc.ldev = ldev; dev->raw->lirc.dev = dev; return 0; out: - kfree(drv); + kfree(ldev); return rc; } @@ -410,9 +410,9 @@ static int ir_lirc_unregister(struct rc_dev *dev) { struct lirc_codec *lirc = &dev->raw->lirc; - lirc_unregister_driver(lirc->drv); - kfree(lirc->drv); - lirc->drv = NULL; + lirc_unregister_device(lirc->ldev); + kfree(lirc->ldev); + lirc->ldev = NULL; return 0; } diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index a2c5ed0181c1..e381a1c04bea 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -35,7 +35,7 @@ static dev_t lirc_base_dev; struct irctl { - struct lirc_driver d; + struct lirc_dev d; bool attached; int open; @@ -76,7 +76,7 @@ static void lirc_release(struct device *ld) static int lirc_allocate_buffer(struct irctl *ir) { int err = 0; - struct lirc_driver *d = &ir->d; + struct lirc_dev *d = &ir->d; if (d->rbuf) { ir->buf = d->rbuf; @@ -103,7 +103,7 @@ out: return err; } -int lirc_register_driver(struct lirc_driver *d) +int lirc_register_device(struct lirc_dev *d) { struct irctl *ir; int minor; @@ -204,9 +204,9 @@ int lirc_register_driver(struct lirc_driver *d) return 0; } -EXPORT_SYMBOL(lirc_register_driver); +EXPORT_SYMBOL(lirc_register_device); -void lirc_unregister_driver(struct lirc_driver *d) +void lirc_unregister_device(struct lirc_dev *d) { struct irctl *ir; @@ -234,7 +234,7 @@ void lirc_unregister_driver(struct lirc_driver *d) ida_simple_remove(&lirc_ida, d->minor); put_device(&ir->dev); } -EXPORT_SYMBOL(lirc_unregister_driver); +EXPORT_SYMBOL(lirc_unregister_device); int lirc_dev_fop_open(struct inode *inode, struct file *file) { diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index 7da9c96cb058..ae4dd0c27731 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -106,7 +106,7 @@ struct ir_raw_event_ctrl { } mce_kbd; struct lirc_codec { struct rc_dev *dev; - struct lirc_driver *drv; + struct lirc_dev *ldev; int carrier_low; ktime_t gap_start; diff --git a/drivers/staging/media/lirc/lirc_zilog.c b/drivers/staging/media/lirc/lirc_zilog.c index 780b2d9f2f4b..0766e5029bd7 100644 --- a/drivers/staging/media/lirc/lirc_zilog.c +++ b/drivers/staging/media/lirc/lirc_zilog.c @@ -100,7 +100,7 @@ struct IR { struct list_head list; /* FIXME spinlock access to l.features */ - struct lirc_driver l; + struct lirc_dev l; struct lirc_buffer rbuf; struct mutex ir_lock; @@ -183,7 +183,7 @@ static void release_ir_device(struct kref *ref) * ir->open_count == 0 - happens on final close() * ir_lock, tx_ref_lock, rx_ref_lock, all released */ - lirc_unregister_driver(&ir->l); + lirc_unregister_device(&ir->l); if (kfifo_initialized(&ir->rbuf.fifo)) lirc_buffer_free(&ir->rbuf); @@ -1345,7 +1345,7 @@ static const struct file_operations lirc_fops = { .release = close }; -static struct lirc_driver lirc_template = { +static struct lirc_dev lirc_template = { .name = "lirc_zilog", .code_length = 13, .fops = &lirc_fops, @@ -1441,7 +1441,7 @@ static int ir_probe(struct i2c_client 
*client, const struct i2c_device_id *id) spin_lock_init(&ir->rx_ref_lock); /* set lirc_dev stuff */ - memcpy(&ir->l, &lirc_template, sizeof(struct lirc_driver)); + memcpy(&ir->l, &lirc_template, sizeof(struct lirc_dev)); /* * FIXME this is a pointer reference to us, but no refcount. * @@ -1559,10 +1559,10 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) } /* register with lirc */ - ret = lirc_register_driver(&ir->l); + ret = lirc_register_device(&ir->l); if (ret < 0) { dev_err(tx->ir->l.dev, - "%s: lirc_register_driver() failed: %i\n", + "%s: lirc_register_device() failed: %i\n", __func__, ret); goto out_put_xx; } diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index 51c15c050e85..d16d6e0ef8da 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -111,54 +111,32 @@ static inline unsigned int lirc_buffer_write(struct lirc_buffer *buf, } /** - * struct lirc_driver - Defines the parameters on a LIRC driver - * - * @name: this string will be used for logs - * - * @minor: the minor device (/dev/lircX) number for a registered - * driver. - * - * @code_length: length of the remote control key code expressed in bits. + * struct lirc_dev - represents a LIRC device * + * @name: used for logging + * @minor: the minor device (/dev/lircX) number for the device + * @code_length: length of a remote control key code expressed in bits * @features: lirc compatible hardware features, like LIRC_MODE_RAW, * LIRC_CAN\_\*, as defined at include/media/lirc.h. - * * @buffer_size: Number of FIFO buffers with @chunk_size size. * Only used if @rbuf is NULL. - * * @chunk_size: Size of each FIFO buffer. * Only used if @rbuf is NULL. - * - * @data: it may point to any driver data and this pointer will - * be passed to all callback functions. - * + * @data: private per-driver data * @min_timeout: Minimum timeout for record. Valid only if * LIRC_CAN_SET_REC_TIMEOUT is defined. - * * @max_timeout: Maximum timeout for record. Valid only if * LIRC_CAN_SET_REC_TIMEOUT is defined. - * * @rbuf: if not NULL, it will be used as a read buffer, you will * have to write to the buffer by other means, like irq's * (see also lirc_serial.c). - * - * @rdev: Pointed to struct rc_dev associated with the LIRC - * device. - * - * @fops: file_operations for drivers which don't fit the current - * driver model. - * Some ioctl's can be directly handled by lirc_dev if the - * driver's ioctl function is NULL or if it returns - * -ENOIOCTLCMD (see also lirc_serial.c). - * - * @dev: pointer to the struct device associated with the LIRC - * device. - * + * @rdev: &struct rc_dev associated with the device + * @fops: &struct file_operations for the device + * @dev: &struct device assigned to the device * @owner: the module owning this struct - * - * @irctl: the struct irctl for this LIRC device. 
+ * @irctl: &struct irctl assigned to the device */ -struct lirc_driver { +struct lirc_dev { char name[40]; unsigned int minor; __u32 code_length; @@ -183,14 +161,14 @@ struct lirc_driver { * returns negative value on error or zero * contents of the structure pointed by p is copied */ -int lirc_register_driver(struct lirc_driver *d); +int lirc_register_device(struct lirc_dev *d); -void lirc_unregister_driver(struct lirc_driver *d); +void lirc_unregister_device(struct lirc_dev *d); /* Must be called in the open fop before lirc_get_pdata() can be used */ void lirc_init_pdata(struct inode *inode, struct file *file); -/* Returns the private data stored in the lirc_driver +/* Returns the private data stored in the lirc_dev * associated with the given device file pointer. */ void *lirc_get_pdata(struct file *file); From 1eb4410e7ef5c1735ab20f02af337a5a07582dec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Sun, 25 Jun 2017 09:32:15 -0300 Subject: [PATCH 0486/1640] UPSTREAM: [media] media: lirc_dev: introduce lirc_allocate_device and lirc_free_device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce two new functions so that the API for lirc_dev matches that of the rc-core and input subsystems. This means that lirc_dev structs are managed using the usual four functions: lirc_allocate_device lirc_free_device lirc_register_device lirc_unregister_device The functions are pretty simplistic at this point, later patches will put more flesh on the bones of both. Signed-off-by: David Härdeman Signed-off-by: Sean Young --- drivers/media/rc/ir-lirc-codec.c | 2 +- drivers/media/rc/lirc_dev.c | 13 +++++++++++++ include/media/lirc_dev.h | 9 ++++----- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 2d591168c991..d5c155a5a547 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -347,7 +347,7 @@ static int ir_lirc_register(struct rc_dev *dev) int rc = -ENOMEM; unsigned long features = 0; - ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); + ldev = lirc_allocate_device(); if (!ldev) return rc; diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index e381a1c04bea..a6005f70de5a 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -103,6 +103,19 @@ out: return err; } +struct lirc_dev * +lirc_allocate_device(void) +{ + return kzalloc(sizeof(struct lirc_dev), GFP_KERNEL); +} +EXPORT_SYMBOL(lirc_allocate_device); + +void lirc_free_device(struct lirc_dev *d) +{ + kfree(d); +} +EXPORT_SYMBOL(lirc_free_device); + int lirc_register_device(struct lirc_dev *d) { struct irctl *ir; diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index d16d6e0ef8da..4b0dc640e142 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -156,11 +156,10 @@ struct lirc_dev { struct irctl *irctl; }; -/* following functions can be called ONLY from user context - * - * returns negative value on error or zero - * contents of the structure pointed by p is copied - */ +struct lirc_dev *lirc_allocate_device(void); + +void lirc_free_device(struct lirc_dev *d); + int lirc_register_device(struct lirc_dev *d); void lirc_unregister_device(struct lirc_dev *d); From b81974278cf11ea87ae73165ae96338f7d08d953 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Sun, 25 Jun 2017 09:32:25 -0300 Subject: [PATCH 0487/1640] UPSTREAM: [media] media: lirc_zilog: add a pointer to the parent 
device to struct IR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lirc_zilog stashes a pointer to the parent device in struct lirc_dev and uses it for logging. It makes more sense to let lirc_zilog keep track of that pointer in its own struct (this is in preparation for subsequent patches which will remodel struct lirc_dev). Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/lirc/lirc_zilog.c | 98 +++++++++++++------------ 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/drivers/staging/media/lirc/lirc_zilog.c b/drivers/staging/media/lirc/lirc_zilog.c index 0766e5029bd7..cd2eeb365cd7 100644 --- a/drivers/staging/media/lirc/lirc_zilog.c +++ b/drivers/staging/media/lirc/lirc_zilog.c @@ -106,6 +106,7 @@ struct IR { struct mutex ir_lock; atomic_t open_count; + struct device *dev; struct i2c_adapter *adapter; spinlock_t rx_ref_lock; /* struct IR_rx kref get()/put() */ @@ -319,7 +320,7 @@ static int add_to_buf(struct IR *ir) struct IR_tx *tx; if (lirc_buffer_full(rbuf)) { - dev_dbg(ir->l.dev, "buffer overflow\n"); + dev_dbg(ir->dev, "buffer overflow\n"); return -EOVERFLOW; } @@ -365,17 +366,17 @@ static int add_to_buf(struct IR *ir) */ ret = i2c_master_send(rx->c, sendbuf, 1); if (ret != 1) { - dev_err(ir->l.dev, "i2c_master_send failed with %d\n", + dev_err(ir->dev, "i2c_master_send failed with %d\n", ret); if (failures >= 3) { mutex_unlock(&ir->ir_lock); - dev_err(ir->l.dev, + dev_err(ir->dev, "unable to read from the IR chip after 3 resets, giving up\n"); break; } /* Looks like the chip crashed, reset it */ - dev_err(ir->l.dev, + dev_err(ir->dev, "polling the IR receiver chip failed, trying reset\n"); set_current_state(TASK_UNINTERRUPTIBLE); @@ -402,14 +403,14 @@ static int add_to_buf(struct IR *ir) ret = i2c_master_recv(rx->c, keybuf, sizeof(keybuf)); mutex_unlock(&ir->ir_lock); if (ret != sizeof(keybuf)) { - dev_err(ir->l.dev, + dev_err(ir->dev, "i2c_master_recv failed with %d -- keeping last read buffer\n", ret); } else { rx->b[0] = keybuf[3]; rx->b[1] = keybuf[4]; rx->b[2] = keybuf[5]; - dev_dbg(ir->l.dev, + dev_dbg(ir->dev, "key (0x%02x/0x%02x)\n", rx->b[0], rx->b[1]); } @@ -462,7 +463,7 @@ static int lirc_thread(void *arg) struct IR *ir = arg; struct lirc_buffer *rbuf = ir->l.rbuf; - dev_dbg(ir->l.dev, "poll thread started\n"); + dev_dbg(ir->dev, "poll thread started\n"); while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); @@ -490,7 +491,7 @@ static int lirc_thread(void *arg) wake_up_interruptible(&rbuf->wait_poll); } - dev_dbg(ir->l.dev, "poll thread ended\n"); + dev_dbg(ir->dev, "poll thread ended\n"); return 0; } @@ -643,10 +644,10 @@ static int send_data_block(struct IR_tx *tx, unsigned char *data_block) buf[0] = (unsigned char)(i + 1); for (j = 0; j < tosend; ++j) buf[1 + j] = data_block[i + j]; - dev_dbg(tx->ir->l.dev, "%*ph", 5, buf); + dev_dbg(tx->ir->dev, "%*ph", 5, buf); ret = i2c_master_send(tx->c, buf, tosend + 1); if (ret != tosend + 1) { - dev_err(tx->ir->l.dev, + dev_err(tx->ir->dev, "i2c_master_send failed with %d\n", ret); return ret < 0 ? ret : -EFAULT; } @@ -671,7 +672,7 @@ static int send_boot_data(struct IR_tx *tx) buf[1] = 0x20; ret = i2c_master_send(tx->c, buf, 2); if (ret != 2) { - dev_err(tx->ir->l.dev, "i2c_master_send failed with %d\n", ret); + dev_err(tx->ir->dev, "i2c_master_send failed with %d\n", ret); return ret < 0 ? 
ret : -EFAULT; } @@ -688,22 +689,22 @@ static int send_boot_data(struct IR_tx *tx) } if (ret != 1) { - dev_err(tx->ir->l.dev, "i2c_master_send failed with %d\n", ret); + dev_err(tx->ir->dev, "i2c_master_send failed with %d\n", ret); return ret < 0 ? ret : -EFAULT; } /* Here comes the firmware version... (hopefully) */ ret = i2c_master_recv(tx->c, buf, 4); if (ret != 4) { - dev_err(tx->ir->l.dev, "i2c_master_recv failed with %d\n", ret); + dev_err(tx->ir->dev, "i2c_master_recv failed with %d\n", ret); return 0; } if ((buf[0] != 0x80) && (buf[0] != 0xa0)) { - dev_err(tx->ir->l.dev, "unexpected IR TX init response: %02x\n", + dev_err(tx->ir->dev, "unexpected IR TX init response: %02x\n", buf[0]); return 0; } - dev_notice(tx->ir->l.dev, + dev_notice(tx->ir->dev, "Zilog/Hauppauge IR blaster firmware version %d.%d.%d loaded\n", buf[1], buf[2], buf[3]); @@ -748,15 +749,15 @@ static int fw_load(struct IR_tx *tx) } /* Request codeset data file */ - ret = request_firmware(&fw_entry, "haup-ir-blaster.bin", tx->ir->l.dev); + ret = request_firmware(&fw_entry, "haup-ir-blaster.bin", tx->ir->dev); if (ret != 0) { - dev_err(tx->ir->l.dev, + dev_err(tx->ir->dev, "firmware haup-ir-blaster.bin not available (%d)\n", ret); ret = ret < 0 ? ret : -EFAULT; goto out; } - dev_dbg(tx->ir->l.dev, "firmware of size %zu loaded\n", fw_entry->size); + dev_dbg(tx->ir->dev, "firmware of size %zu loaded\n", fw_entry->size); /* Parse the file */ tx_data = vmalloc(sizeof(*tx_data)); @@ -784,7 +785,7 @@ static int fw_load(struct IR_tx *tx) if (!read_uint8(&data, tx_data->endp, &version)) goto corrupt; if (version != 1) { - dev_err(tx->ir->l.dev, + dev_err(tx->ir->dev, "unsupported code set file version (%u, expected 1) -- please upgrade to a newer driver\n", version); fw_unload_locked(); @@ -801,7 +802,7 @@ static int fw_load(struct IR_tx *tx) &tx_data->num_code_sets)) goto corrupt; - dev_dbg(tx->ir->l.dev, "%u IR blaster codesets loaded\n", + dev_dbg(tx->ir->dev, "%u IR blaster codesets loaded\n", tx_data->num_code_sets); tx_data->code_sets = vmalloc( @@ -866,7 +867,7 @@ static int fw_load(struct IR_tx *tx) goto out; corrupt: - dev_err(tx->ir->l.dev, "firmware is corrupt\n"); + dev_err(tx->ir->dev, "firmware is corrupt\n"); fw_unload_locked(); ret = -EFAULT; @@ -886,9 +887,9 @@ static ssize_t read(struct file *filep, char __user *outbuf, size_t n, unsigned int m; DECLARE_WAITQUEUE(wait, current); - dev_dbg(ir->l.dev, "read called\n"); + dev_dbg(ir->dev, "read called\n"); if (n % rbuf->chunk_size) { - dev_dbg(ir->l.dev, "read result = -EINVAL\n"); + dev_dbg(ir->dev, "read result = -EINVAL\n"); return -EINVAL; } @@ -932,7 +933,7 @@ static ssize_t read(struct file *filep, char __user *outbuf, size_t n, unsigned char buf[MAX_XFER_SIZE]; if (rbuf->chunk_size > sizeof(buf)) { - dev_err(ir->l.dev, + dev_err(ir->dev, "chunk_size is too big (%d)!\n", rbuf->chunk_size); ret = -EINVAL; @@ -947,7 +948,7 @@ static ssize_t read(struct file *filep, char __user *outbuf, size_t n, retries++; } if (retries >= 5) { - dev_err(ir->l.dev, "Buffer read failed!\n"); + dev_err(ir->dev, "Buffer read failed!\n"); ret = -EIO; } } @@ -957,7 +958,7 @@ static ssize_t read(struct file *filep, char __user *outbuf, size_t n, put_ir_rx(rx, false); set_current_state(TASK_RUNNING); - dev_dbg(ir->l.dev, "read result = %d (%s)\n", ret, + dev_dbg(ir->dev, "read result = %d (%s)\n", ret, ret ? "Error" : "OK"); return ret ? 
ret : written; @@ -974,7 +975,7 @@ static int send_code(struct IR_tx *tx, unsigned int code, unsigned int key) ret = get_key_data(data_block, code, key); if (ret == -EPROTO) { - dev_err(tx->ir->l.dev, + dev_err(tx->ir->dev, "failed to get data for code %u, key %u -- check lircd.conf entries\n", code, key); return ret; @@ -992,7 +993,7 @@ static int send_code(struct IR_tx *tx, unsigned int code, unsigned int key) buf[1] = 0x40; ret = i2c_master_send(tx->c, buf, 2); if (ret != 2) { - dev_err(tx->ir->l.dev, "i2c_master_send failed with %d\n", ret); + dev_err(tx->ir->dev, "i2c_master_send failed with %d\n", ret); return ret < 0 ? ret : -EFAULT; } @@ -1005,18 +1006,18 @@ static int send_code(struct IR_tx *tx, unsigned int code, unsigned int key) } if (ret != 1) { - dev_err(tx->ir->l.dev, "i2c_master_send failed with %d\n", ret); + dev_err(tx->ir->dev, "i2c_master_send failed with %d\n", ret); return ret < 0 ? ret : -EFAULT; } /* Send finished download? */ ret = i2c_master_recv(tx->c, buf, 1); if (ret != 1) { - dev_err(tx->ir->l.dev, "i2c_master_recv failed with %d\n", ret); + dev_err(tx->ir->dev, "i2c_master_recv failed with %d\n", ret); return ret < 0 ? ret : -EFAULT; } if (buf[0] != 0xA0) { - dev_err(tx->ir->l.dev, "unexpected IR TX response #1: %02x\n", + dev_err(tx->ir->dev, "unexpected IR TX response #1: %02x\n", buf[0]); return -EFAULT; } @@ -1026,7 +1027,7 @@ static int send_code(struct IR_tx *tx, unsigned int code, unsigned int key) buf[1] = 0x80; ret = i2c_master_send(tx->c, buf, 2); if (ret != 2) { - dev_err(tx->ir->l.dev, "i2c_master_send failed with %d\n", ret); + dev_err(tx->ir->dev, "i2c_master_send failed with %d\n", ret); return ret < 0 ? ret : -EFAULT; } @@ -1036,7 +1037,7 @@ static int send_code(struct IR_tx *tx, unsigned int code, unsigned int key) * going to skip this whole mess and say we're done on the HD PVR */ if (!tx->post_tx_ready_poll) { - dev_dbg(tx->ir->l.dev, "sent code %u, key %u\n", code, key); + dev_dbg(tx->ir->dev, "sent code %u, key %u\n", code, key); return 0; } @@ -1052,12 +1053,12 @@ static int send_code(struct IR_tx *tx, unsigned int code, unsigned int key) ret = i2c_master_send(tx->c, buf, 1); if (ret == 1) break; - dev_dbg(tx->ir->l.dev, + dev_dbg(tx->ir->dev, "NAK expected: i2c_master_send failed with %d (try %d)\n", ret, i + 1); } if (ret != 1) { - dev_err(tx->ir->l.dev, + dev_err(tx->ir->dev, "IR TX chip never got ready: last i2c_master_send failed with %d\n", ret); return ret < 0 ? 
ret : -EFAULT; @@ -1066,17 +1067,17 @@ static int send_code(struct IR_tx *tx, unsigned int code, unsigned int key) /* Seems to be an 'ok' response */ i = i2c_master_recv(tx->c, buf, 1); if (i != 1) { - dev_err(tx->ir->l.dev, "i2c_master_recv failed with %d\n", ret); + dev_err(tx->ir->dev, "i2c_master_recv failed with %d\n", ret); return -EFAULT; } if (buf[0] != 0x80) { - dev_err(tx->ir->l.dev, "unexpected IR TX response #2: %02x\n", + dev_err(tx->ir->dev, "unexpected IR TX response #2: %02x\n", buf[0]); return -EFAULT; } /* Oh good, it worked */ - dev_dbg(tx->ir->l.dev, "sent code %u, key %u\n", code, key); + dev_dbg(tx->ir->dev, "sent code %u, key %u\n", code, key); return 0; } @@ -1162,11 +1163,11 @@ static ssize_t write(struct file *filep, const char __user *buf, size_t n, */ if (ret != 0) { /* Looks like the chip crashed, reset it */ - dev_err(tx->ir->l.dev, + dev_err(tx->ir->dev, "sending to the IR transmitter chip failed, trying reset\n"); if (failures >= 3) { - dev_err(tx->ir->l.dev, + dev_err(tx->ir->dev, "unable to send to the IR chip after 3 resets, giving up\n"); mutex_unlock(&ir->ir_lock); mutex_unlock(&tx->client_lock); @@ -1202,7 +1203,7 @@ static unsigned int poll(struct file *filep, poll_table *wait) struct lirc_buffer *rbuf = ir->l.rbuf; unsigned int ret; - dev_dbg(ir->l.dev, "%s called\n", __func__); + dev_dbg(ir->dev, "%s called\n", __func__); rx = get_ir_rx(ir); if (!rx) { @@ -1210,7 +1211,7 @@ static unsigned int poll(struct file *filep, poll_table *wait) * Revisit this, if our poll function ever reports writeable * status for Tx */ - dev_dbg(ir->l.dev, "%s result = POLLERR\n", __func__); + dev_dbg(ir->dev, "%s result = POLLERR\n", __func__); return POLLERR; } @@ -1223,7 +1224,7 @@ static unsigned int poll(struct file *filep, poll_table *wait) /* Indicate what ops could happen immediately without blocking */ ret = lirc_buffer_empty(rbuf) ? 0 : (POLLIN | POLLRDNORM); - dev_dbg(ir->l.dev, "%s result = %s\n", __func__, + dev_dbg(ir->dev, "%s result = %s\n", __func__, ret ? "POLLIN|POLLRDNORM" : "none"); return ret; } @@ -1435,6 +1436,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) list_add_tail(&ir->list, &ir_devices_list); ir->adapter = adap; + ir->dev = &adap->dev; mutex_init(&ir->ir_lock); atomic_set(&ir->open_count, 0); spin_lock_init(&ir->tx_ref_lock); @@ -1498,7 +1500,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) /* Proceed only if the Rx client is also ready or not needed */ if (!rx && !tx_only) { - dev_info(tx->ir->l.dev, + dev_info(tx->ir->dev, "probe of IR Tx on %s (i2c-%d) done. 
Waiting on IR Rx.\n", adap->name, adap->nr); goto out_ok; @@ -1538,7 +1540,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) "zilog-rx-i2c-%d", adap->nr); if (IS_ERR(rx->task)) { ret = PTR_ERR(rx->task); - dev_err(tx->ir->l.dev, + dev_err(tx->ir->dev, "%s: could not start IR Rx polling thread\n", __func__); /* Failed kthread, so put back the ir ref */ @@ -1561,13 +1563,13 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) /* register with lirc */ ret = lirc_register_device(&ir->l); if (ret < 0) { - dev_err(tx->ir->l.dev, + dev_err(tx->ir->dev, "%s: lirc_register_device() failed: %i\n", __func__, ret); goto out_put_xx; } - dev_info(ir->l.dev, + dev_info(ir->dev, "IR unit on %s (i2c-%d) registered as lirc%d and ready\n", adap->name, adap->nr, ir->l.minor); @@ -1577,7 +1579,7 @@ out_ok: if (tx) put_ir_tx(tx, true); put_ir_device(ir, true); - dev_info(ir->l.dev, + dev_info(ir->dev, "probe of IR %s on %s (i2c-%d) done\n", tx_probe ? "Tx" : "Rx", adap->name, adap->nr); mutex_unlock(&ir_devices_lock); From 9428e3fa10a89780a6a2883dd7b95fb8d1b0a826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Sun, 25 Jun 2017 09:32:31 -0300 Subject: [PATCH 0488/1640] UPSTREAM: [media] media: lirc_zilog: use a dynamically allocated lirc_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lirc_zilog currently embeds a struct lirc_dev in its own struct IR, but subsequent patches will make the lifetime of struct lirc_dev dynamic (i.e. it will be free():d once lirc_dev is sure there are no users of the struct). Therefore, change lirc_zilog to use a pointer to a dynamically allocated struct lirc_dev. Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/lirc/lirc_zilog.c | 69 ++++++++++++++----------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/drivers/staging/media/lirc/lirc_zilog.c b/drivers/staging/media/lirc/lirc_zilog.c index cd2eeb365cd7..00e8c8f224b7 100644 --- a/drivers/staging/media/lirc/lirc_zilog.c +++ b/drivers/staging/media/lirc/lirc_zilog.c @@ -99,8 +99,8 @@ struct IR { struct kref ref; struct list_head list; - /* FIXME spinlock access to l.features */ - struct lirc_dev l; + /* FIXME spinlock access to l->features */ + struct lirc_dev *l; struct lirc_buffer rbuf; struct mutex ir_lock; @@ -184,7 +184,10 @@ static void release_ir_device(struct kref *ref) * ir->open_count == 0 - happens on final close() * ir_lock, tx_ref_lock, rx_ref_lock, all released */ - lirc_unregister_device(&ir->l); + if (ir->l) { + lirc_unregister_device(ir->l); + lirc_free_device(ir->l); + } if (kfifo_initialized(&ir->rbuf.fifo)) lirc_buffer_free(&ir->rbuf); @@ -241,7 +244,7 @@ static void release_ir_rx(struct kref *ref) * and releasing the ir reference can cause a sleep. 
That work is * performed by put_ir_rx() */ - ir->l.features &= ~LIRC_CAN_REC_LIRCCODE; + ir->l->features &= ~LIRC_CAN_REC_LIRCCODE; /* Don't put_ir_device(rx->ir) here; lock can't be freed yet */ ir->rx = NULL; /* Don't do the kfree(rx) here; we still need to kill the poll thread */ @@ -286,7 +289,7 @@ static void release_ir_tx(struct kref *ref) struct IR_tx *tx = container_of(ref, struct IR_tx, ref); struct IR *ir = tx->ir; - ir->l.features &= ~LIRC_CAN_SEND_LIRCCODE; + ir->l->features &= ~LIRC_CAN_SEND_LIRCCODE; /* Don't put_ir_device(tx->ir) here, so our lock doesn't get freed */ ir->tx = NULL; kfree(tx); @@ -315,7 +318,7 @@ static int add_to_buf(struct IR *ir) int ret; int failures = 0; unsigned char sendbuf[1] = { 0 }; - struct lirc_buffer *rbuf = ir->l.rbuf; + struct lirc_buffer *rbuf = ir->l->rbuf; struct IR_rx *rx; struct IR_tx *tx; @@ -461,7 +464,7 @@ static int add_to_buf(struct IR *ir) static int lirc_thread(void *arg) { struct IR *ir = arg; - struct lirc_buffer *rbuf = ir->l.rbuf; + struct lirc_buffer *rbuf = ir->l->rbuf; dev_dbg(ir->dev, "poll thread started\n"); @@ -882,7 +885,7 @@ static ssize_t read(struct file *filep, char __user *outbuf, size_t n, { struct IR *ir = lirc_get_pdata(filep); struct IR_rx *rx; - struct lirc_buffer *rbuf = ir->l.rbuf; + struct lirc_buffer *rbuf = ir->l->rbuf; int ret = 0, written = 0, retries = 0; unsigned int m; DECLARE_WAITQUEUE(wait, current); @@ -1200,7 +1203,7 @@ static unsigned int poll(struct file *filep, poll_table *wait) { struct IR *ir = lirc_get_pdata(filep); struct IR_rx *rx; - struct lirc_buffer *rbuf = ir->l.rbuf; + struct lirc_buffer *rbuf = ir->l->rbuf; unsigned int ret; dev_dbg(ir->dev, "%s called\n", __func__); @@ -1236,7 +1239,7 @@ static long ioctl(struct file *filep, unsigned int cmd, unsigned long arg) int result; unsigned long mode, features; - features = ir->l.features; + features = ir->l->features; switch (cmd) { case LIRC_GET_LENGTH: @@ -1346,13 +1349,6 @@ static const struct file_operations lirc_fops = { .release = close }; -static struct lirc_dev lirc_template = { - .name = "lirc_zilog", - .code_length = 13, - .fops = &lirc_fops, - .owner = THIS_MODULE, -}; - static int ir_remove(struct i2c_client *client) { if (strncmp("ir_tx_z8", client->name, 8) == 0) { @@ -1443,22 +1439,35 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) spin_lock_init(&ir->rx_ref_lock); /* set lirc_dev stuff */ - memcpy(&ir->l, &lirc_template, sizeof(struct lirc_dev)); + ir->l = lirc_allocate_device(); + if (!ir->l) { + ret = -ENOMEM; + goto out_put_ir; + } + + snprintf(ir->l->name, sizeof(ir->l->name), "lirc_zilog"); + ir->l->code_length = 13; + ir->l->fops = &lirc_fops; + ir->l->owner = THIS_MODULE; + /* * FIXME this is a pointer reference to us, but no refcount. * * This OK for now, since lirc_dev currently won't touch this * buffer as we provide our own lirc_fops. 
* - * Currently our own lirc_fops rely on this ir->l.rbuf pointer + * Currently our own lirc_fops rely on this ir->l->rbuf pointer */ - ir->l.rbuf = &ir->rbuf; - ir->l.dev = &adap->dev; + ir->l->rbuf = &ir->rbuf; + ir->l->dev = &adap->dev; /* This will be returned by lirc_get_pdata() */ - ir->l.data = ir; - ret = lirc_buffer_init(ir->l.rbuf, 2, BUFLEN / 2); - if (ret) + ir->l->data = ir; + ret = lirc_buffer_init(ir->l->rbuf, 2, BUFLEN / 2); + if (ret) { + lirc_free_device(ir->l); + ir->l = NULL; goto out_put_ir; + } } if (tx_probe) { @@ -1474,7 +1483,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) kref_init(&tx->ref); ir->tx = tx; - ir->l.features |= LIRC_CAN_SEND_LIRCCODE; + ir->l->features |= LIRC_CAN_SEND_LIRCCODE; mutex_init(&tx->client_lock); tx->c = client; tx->need_boot = 1; @@ -1518,7 +1527,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) kref_init(&rx->ref); ir->rx = rx; - ir->l.features |= LIRC_CAN_REC_LIRCCODE; + ir->l->features |= LIRC_CAN_REC_LIRCCODE; mutex_init(&rx->client_lock); rx->c = client; rx->hdpvr_data_fmt = @@ -1548,7 +1557,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) /* Failure exit, so put back rx ref from i2c_client */ i2c_set_clientdata(client, NULL); put_ir_rx(rx, true); - ir->l.features &= ~LIRC_CAN_REC_LIRCCODE; + ir->l->features &= ~LIRC_CAN_REC_LIRCCODE; goto out_put_tx; } @@ -1561,17 +1570,19 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) } /* register with lirc */ - ret = lirc_register_device(&ir->l); + ret = lirc_register_device(ir->l); if (ret < 0) { dev_err(tx->ir->dev, "%s: lirc_register_device() failed: %i\n", __func__, ret); + lirc_free_device(ir->l); + ir->l = NULL; goto out_put_xx; } dev_info(ir->dev, "IR unit on %s (i2c-%d) registered as lirc%d and ready\n", - adap->name, adap->nr, ir->l.minor); + adap->name, adap->nr, ir->l->minor); out_ok: if (rx) From 90d78e75bf1041c0fdfd2237aa3c588adb4ebc77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Sun, 25 Jun 2017 09:32:36 -0300 Subject: [PATCH 0489/1640] UPSTREAM: [media] media: lirc_dev: merge struct irctl into struct lirc_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of two separate structs (lirc_dev aka lirc_driver and irctl) makes it much harder to follow the proper lifetime of the various structs and necessitates hacks such as keeping a copy of struct lirc_dev inside struct irctl. Merging the two structs means that lirc_dev can properly manage the lifetime of the resulting struct and simplifies the code at the same time. 
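With the merge in place, a driver's view of the lifetime handling reduces to
the usual allocate/register/unregister/free sequence. A minimal sketch, with
error handling trimmed and field setup elided (the comments reflect what the
helpers do after this patch):

	struct lirc_dev *ldev;
	int ret;

	ldev = lirc_allocate_device();	/* kzalloc() + device_initialize() */
	if (!ldev)
		return -ENOMEM;

	/* fill in name, fops, code_length, dev.parent, ... */

	ret = lirc_register_device(ldev);
	if (ret) {
		lirc_free_device(ldev);	/* put_device(); release frees ldev */
		return ret;
	}

	/* ... device in use ... */

	lirc_unregister_device(ldev);	/* final put_device() frees ldev */
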
[mchehab@s-opensource.com: fix merge conflict] Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-lirc-codec.c | 15 +- drivers/media/rc/lirc_dev.c | 318 +++++++++++------------- drivers/staging/media/lirc/lirc_zilog.c | 20 +- include/media/lirc_dev.h | 26 +- 4 files changed, 177 insertions(+), 202 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index d5c155a5a547..bd046c41a53a 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -35,7 +35,7 @@ static int ir_lirc_decode(struct rc_dev *dev, struct ir_raw_event ev) struct lirc_codec *lirc = &dev->raw->lirc; int sample; - if (!dev->raw->lirc.ldev || !dev->raw->lirc.ldev->rbuf) + if (!dev->raw->lirc.ldev || !dev->raw->lirc.ldev->buf) return -EINVAL; /* Packet start */ @@ -84,7 +84,7 @@ static int ir_lirc_decode(struct rc_dev *dev, struct ir_raw_event ev) (u64)LIRC_VALUE_MASK); gap_sample = LIRC_SPACE(lirc->gap_duration); - lirc_buffer_write(dev->raw->lirc.ldev->rbuf, + lirc_buffer_write(dev->raw->lirc.ldev->buf, (unsigned char *)&gap_sample); lirc->gap = false; } @@ -95,9 +95,9 @@ static int ir_lirc_decode(struct rc_dev *dev, struct ir_raw_event ev) TO_US(ev.duration), TO_STR(ev.pulse)); } - lirc_buffer_write(dev->raw->lirc.ldev->rbuf, + lirc_buffer_write(dev->raw->lirc.ldev->buf, (unsigned char *) &sample); - wake_up(&dev->raw->lirc.ldev->rbuf->wait_poll); + wake_up(&dev->raw->lirc.ldev->buf->wait_poll); return 0; } @@ -384,12 +384,12 @@ static int ir_lirc_register(struct rc_dev *dev) dev->driver_name); ldev->features = features; ldev->data = &dev->raw->lirc; - ldev->rbuf = NULL; + ldev->buf = NULL; ldev->code_length = sizeof(struct ir_raw_event) * 8; ldev->chunk_size = sizeof(int); ldev->buffer_size = LIRCBUF_SIZE; ldev->fops = &lirc_fops; - ldev->dev = &dev->dev; + ldev->dev.parent = &dev->dev; ldev->rdev = dev; ldev->owner = THIS_MODULE; @@ -402,7 +402,7 @@ static int ir_lirc_register(struct rc_dev *dev) return 0; out: - kfree(ldev); + lirc_free_device(ldev); return rc; } @@ -411,7 +411,6 @@ static int ir_lirc_unregister(struct rc_dev *dev) struct lirc_codec *lirc = &dev->raw->lirc; lirc_unregister_device(lirc->ldev); - kfree(lirc->ldev); lirc->ldev = NULL; return 0; diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index a6005f70de5a..e9dae8621670 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -34,19 +34,6 @@ static dev_t lirc_base_dev; -struct irctl { - struct lirc_dev d; - bool attached; - int open; - - struct mutex mutex; /* protect from simultaneous accesses */ - struct lirc_buffer *buf; - bool buf_internal; - - struct device dev; - struct cdev cdev; -}; - /* Used to keep track of allocated lirc devices */ #define LIRC_MAX_DEVICES 256 static DEFINE_IDA(lirc_ida); @@ -54,71 +41,74 @@ static DEFINE_IDA(lirc_ida); /* Only used for sysfs but defined to void otherwise */ static struct class *lirc_class; -static void lirc_free_buffer(struct irctl *ir) +static void lirc_release_device(struct device *ld) { - put_device(ir->dev.parent); + struct lirc_dev *d = container_of(ld, struct lirc_dev, dev); - if (ir->buf_internal) { - lirc_buffer_free(ir->buf); - kfree(ir->buf); - ir->buf = NULL; + put_device(d->dev.parent); + + if (d->buf_internal) { + lirc_buffer_free(d->buf); + kfree(d->buf); + d->buf = NULL; } + kfree(d); + module_put(THIS_MODULE); } -static void lirc_release(struct device *ld) +static int lirc_allocate_buffer(struct lirc_dev *d) { - 
struct irctl *ir = container_of(ld, struct irctl, dev); + int err; - lirc_free_buffer(ir); - kfree(ir); -} - -static int lirc_allocate_buffer(struct irctl *ir) -{ - int err = 0; - struct lirc_dev *d = &ir->d; - - if (d->rbuf) { - ir->buf = d->rbuf; - ir->buf_internal = false; - } else { - ir->buf = kmalloc(sizeof(struct lirc_buffer), GFP_KERNEL); - if (!ir->buf) { - err = -ENOMEM; - goto out; - } - - err = lirc_buffer_init(ir->buf, d->chunk_size, d->buffer_size); - if (err) { - kfree(ir->buf); - ir->buf = NULL; - goto out; - } - - ir->buf_internal = true; - d->rbuf = ir->buf; + if (d->buf) { + d->buf_internal = false; + return 0; } -out: - return err; + d->buf = kmalloc(sizeof(*d->buf), GFP_KERNEL); + if (!d->buf) + return -ENOMEM; + + err = lirc_buffer_init(d->buf, d->chunk_size, d->buffer_size); + if (err) { + kfree(d->buf); + d->buf = NULL; + return err; + } + + d->buf_internal = true; + return 0; } struct lirc_dev * lirc_allocate_device(void) { - return kzalloc(sizeof(struct lirc_dev), GFP_KERNEL); + struct lirc_dev *d; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (d) { + mutex_init(&d->mutex); + device_initialize(&d->dev); + d->dev.class = lirc_class; + d->dev.release = lirc_release_device; + __module_get(THIS_MODULE); + } + + return d; } EXPORT_SYMBOL(lirc_allocate_device); void lirc_free_device(struct lirc_dev *d) { - kfree(d); + if (!d) + return; + + put_device(&d->dev); } EXPORT_SYMBOL(lirc_free_device); int lirc_register_device(struct lirc_dev *d) { - struct irctl *ir; int minor; int err; @@ -127,8 +117,8 @@ int lirc_register_device(struct lirc_dev *d) return -EBADRQC; } - if (!d->dev) { - pr_err("dev pointer not filled in!\n"); + if (!d->dev.parent) { + pr_err("dev parent pointer not filled in!\n"); return -EINVAL; } @@ -137,25 +127,25 @@ int lirc_register_device(struct lirc_dev *d) return -EINVAL; } - if (!d->rbuf && d->chunk_size < 1) { + if (!d->buf && d->chunk_size < 1) { pr_err("chunk_size must be set!\n"); return -EINVAL; } - if (!d->rbuf && d->buffer_size < 1) { + if (!d->buf && d->buffer_size < 1) { pr_err("buffer_size must be set!\n"); return -EINVAL; } if (d->code_length < 1 || d->code_length > (BUFLEN * 8)) { - dev_err(d->dev, "code length must be less than %d bits\n", - BUFLEN * 8); + dev_err(&d->dev, "code length must be less than %d bits\n", + BUFLEN * 8); return -EBADRQC; } - if (!d->rbuf && !(d->fops && d->fops->read && - d->fops->poll && d->fops->unlocked_ioctl)) { - dev_err(d->dev, "undefined read, poll, ioctl\n"); + if (!d->buf && !(d->fops && d->fops->read && + d->fops->poll && d->fops->unlocked_ioctl)) { + dev_err(&d->dev, "undefined read, poll, ioctl\n"); return -EBADRQC; } @@ -165,55 +155,34 @@ int lirc_register_device(struct lirc_dev *d) if (d->features == 0) d->features = LIRC_CAN_REC_LIRCCODE; - ir = kzalloc(sizeof(*ir), GFP_KERNEL); - if (!ir) - return -ENOMEM; - - mutex_init(&ir->mutex); - ir->d = *d; - if (LIRC_CAN_REC(d->features)) { - err = lirc_allocate_buffer(ir); - if (err) { - kfree(ir); + err = lirc_allocate_buffer(d); + if (err) return err; - } - d->rbuf = ir->buf; } minor = ida_simple_get(&lirc_ida, 0, LIRC_MAX_DEVICES, GFP_KERNEL); - if (minor < 0) { - lirc_free_buffer(ir); - kfree(ir); + if (minor < 0) return minor; - } - d->irctl = ir; d->minor = minor; - ir->d.minor = minor; + d->dev.devt = MKDEV(MAJOR(lirc_base_dev), d->minor); + dev_set_name(&d->dev, "lirc%d", d->minor); - device_initialize(&ir->dev); - ir->dev.devt = MKDEV(MAJOR(lirc_base_dev), ir->d.minor); - ir->dev.class = lirc_class; - ir->dev.parent = d->dev; - ir->dev.release = 
lirc_release; - dev_set_name(&ir->dev, "lirc%d", ir->d.minor); + cdev_init(&d->cdev, d->fops); + d->cdev.owner = d->owner; + d->attached = true; - cdev_init(&ir->cdev, d->fops); - ir->cdev.owner = ir->d.owner; - ir->attached = true; - - err = cdev_device_add(&ir->cdev, &ir->dev); + err = cdev_device_add(&d->cdev, &d->dev); if (err) { ida_simple_remove(&lirc_ida, minor); - put_device(&ir->dev); return err; } - get_device(ir->dev.parent); + get_device(d->dev.parent); - dev_info(ir->d.dev, "lirc_dev: driver %s registered at minor = %d\n", - ir->d.name, ir->d.minor); + dev_info(&d->dev, "lirc_dev: driver %s registered at minor = %d\n", + d->name, d->minor); return 0; } @@ -221,88 +190,83 @@ EXPORT_SYMBOL(lirc_register_device); void lirc_unregister_device(struct lirc_dev *d) { - struct irctl *ir; - - if (!d || !d->irctl) + if (!d) return; - ir = d->irctl; - - dev_dbg(ir->d.dev, "lirc_dev: driver %s unregistered from minor = %d\n", + dev_dbg(&d->dev, "lirc_dev: driver %s unregistered from minor = %d\n", d->name, d->minor); - cdev_device_del(&ir->cdev, &ir->dev); + mutex_lock(&d->mutex); - mutex_lock(&ir->mutex); - - ir->attached = false; - if (ir->open) { - dev_dbg(ir->d.dev, LOGHEAD "releasing opened driver\n", + d->attached = false; + if (d->open) { + dev_dbg(&d->dev, LOGHEAD "releasing opened driver\n", d->name, d->minor); - wake_up_interruptible(&ir->buf->wait_poll); + wake_up_interruptible(&d->buf->wait_poll); } - mutex_unlock(&ir->mutex); + mutex_unlock(&d->mutex); + cdev_device_del(&d->cdev, &d->dev); ida_simple_remove(&lirc_ida, d->minor); - put_device(&ir->dev); + put_device(&d->dev); } EXPORT_SYMBOL(lirc_unregister_device); int lirc_dev_fop_open(struct inode *inode, struct file *file) { - struct irctl *ir = container_of(inode->i_cdev, struct irctl, cdev); + struct lirc_dev *d = container_of(inode->i_cdev, struct lirc_dev, cdev); int retval; - dev_dbg(ir->d.dev, LOGHEAD "open called\n", ir->d.name, ir->d.minor); + dev_dbg(&d->dev, LOGHEAD "open called\n", d->name, d->minor); - retval = mutex_lock_interruptible(&ir->mutex); + retval = mutex_lock_interruptible(&d->mutex); if (retval) return retval; - if (!ir->attached) { + if (!d->attached) { retval = -ENODEV; goto out; } - if (ir->open) { + if (d->open) { retval = -EBUSY; goto out; } - if (ir->d.rdev) { - retval = rc_open(ir->d.rdev); + if (d->rdev) { + retval = rc_open(d->rdev); if (retval) goto out; } - if (ir->buf) - lirc_buffer_clear(ir->buf); + if (d->buf) + lirc_buffer_clear(d->buf); - ir->open++; + d->open++; lirc_init_pdata(inode, file); nonseekable_open(inode, file); - mutex_unlock(&ir->mutex); + mutex_unlock(&d->mutex); return 0; out: - mutex_unlock(&ir->mutex); + mutex_unlock(&d->mutex); return retval; } EXPORT_SYMBOL(lirc_dev_fop_open); int lirc_dev_fop_close(struct inode *inode, struct file *file) { - struct irctl *ir = file->private_data; + struct lirc_dev *d = file->private_data; - mutex_lock(&ir->mutex); + mutex_lock(&d->mutex); - rc_close(ir->d.rdev); - ir->open--; + rc_close(d->rdev); + d->open--; - mutex_unlock(&ir->mutex); + mutex_unlock(&d->mutex); return 0; } @@ -310,24 +274,24 @@ EXPORT_SYMBOL(lirc_dev_fop_close); unsigned int lirc_dev_fop_poll(struct file *file, poll_table *wait) { - struct irctl *ir = file->private_data; + struct lirc_dev *d = file->private_data; unsigned int ret; - if (!ir->attached) + if (!d->attached) return POLLHUP | POLLERR; - if (ir->buf) { - poll_wait(file, &ir->buf->wait_poll, wait); + if (d->buf) { + poll_wait(file, &d->buf->wait_poll, wait); - if (lirc_buffer_empty(ir->buf)) + if 
(lirc_buffer_empty(d->buf)) ret = 0; else ret = POLLIN | POLLRDNORM; - } else + } else { ret = POLLERR; + } - dev_dbg(ir->d.dev, LOGHEAD "poll result = %d\n", - ir->d.name, ir->d.minor, ret); + dev_dbg(&d->dev, LOGHEAD "poll result = %d\n", d->name, d->minor, ret); return ret; } @@ -335,44 +299,44 @@ EXPORT_SYMBOL(lirc_dev_fop_poll); long lirc_dev_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct irctl *ir = file->private_data; + struct lirc_dev *d = file->private_data; __u32 mode; int result; - dev_dbg(ir->d.dev, LOGHEAD "ioctl called (0x%x)\n", - ir->d.name, ir->d.minor, cmd); + dev_dbg(&d->dev, LOGHEAD "ioctl called (0x%x)\n", + d->name, d->minor, cmd); - result = mutex_lock_interruptible(&ir->mutex); + result = mutex_lock_interruptible(&d->mutex); if (result) return result; - if (!ir->attached) { + if (!d->attached) { result = -ENODEV; goto out; } switch (cmd) { case LIRC_GET_FEATURES: - result = put_user(ir->d.features, (__u32 __user *)arg); + result = put_user(d->features, (__u32 __user *)arg); break; case LIRC_GET_REC_MODE: - if (!LIRC_CAN_REC(ir->d.features)) { + if (!LIRC_CAN_REC(d->features)) { result = -ENOTTY; break; } result = put_user(LIRC_REC2MODE - (ir->d.features & LIRC_CAN_REC_MASK), + (d->features & LIRC_CAN_REC_MASK), (__u32 __user *)arg); break; case LIRC_SET_REC_MODE: - if (!LIRC_CAN_REC(ir->d.features)) { + if (!LIRC_CAN_REC(d->features)) { result = -ENOTTY; break; } result = get_user(mode, (__u32 __user *)arg); - if (!result && !(LIRC_MODE2REC(mode) & ir->d.features)) + if (!result && !(LIRC_MODE2REC(mode) & d->features)) result = -EINVAL; /* * FIXME: We should actually set the mode somehow but @@ -380,32 +344,32 @@ long lirc_dev_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) */ break; case LIRC_GET_LENGTH: - result = put_user(ir->d.code_length, (__u32 __user *)arg); + result = put_user(d->code_length, (__u32 __user *)arg); break; case LIRC_GET_MIN_TIMEOUT: - if (!(ir->d.features & LIRC_CAN_SET_REC_TIMEOUT) || - ir->d.min_timeout == 0) { + if (!(d->features & LIRC_CAN_SET_REC_TIMEOUT) || + d->min_timeout == 0) { result = -ENOTTY; break; } - result = put_user(ir->d.min_timeout, (__u32 __user *)arg); + result = put_user(d->min_timeout, (__u32 __user *)arg); break; case LIRC_GET_MAX_TIMEOUT: - if (!(ir->d.features & LIRC_CAN_SET_REC_TIMEOUT) || - ir->d.max_timeout == 0) { + if (!(d->features & LIRC_CAN_SET_REC_TIMEOUT) || + d->max_timeout == 0) { result = -ENOTTY; break; } - result = put_user(ir->d.max_timeout, (__u32 __user *)arg); + result = put_user(d->max_timeout, (__u32 __user *)arg); break; default: result = -ENOTTY; } out: - mutex_unlock(&ir->mutex); + mutex_unlock(&d->mutex); return result; } EXPORT_SYMBOL(lirc_dev_fop_ioctl); @@ -415,34 +379,34 @@ ssize_t lirc_dev_fop_read(struct file *file, size_t length, loff_t *ppos) { - struct irctl *ir = file->private_data; + struct lirc_dev *d = file->private_data; unsigned char *buf; int ret, written = 0; DECLARE_WAITQUEUE(wait, current); - dev_dbg(ir->d.dev, LOGHEAD "read called\n", ir->d.name, ir->d.minor); - - buf = kzalloc(ir->buf->chunk_size, GFP_KERNEL); + buf = kzalloc(d->buf->chunk_size, GFP_KERNEL); if (!buf) return -ENOMEM; - ret = mutex_lock_interruptible(&ir->mutex); + dev_dbg(&d->dev, LOGHEAD "read called\n", d->name, d->minor); + + ret = mutex_lock_interruptible(&d->mutex); if (ret) { kfree(buf); return ret; } - if (!ir->attached) { + if (!d->attached) { ret = -ENODEV; goto out_locked; } - if (!LIRC_CAN_REC(ir->d.features)) { + if (!LIRC_CAN_REC(d->features)) { 
ret = -EINVAL; goto out_locked; } - if (length % ir->buf->chunk_size) { + if (length % d->buf->chunk_size) { ret = -EINVAL; goto out_locked; } @@ -452,14 +416,14 @@ ssize_t lirc_dev_fop_read(struct file *file, * to avoid losing scan code (in case when queue is awaken somewhere * between while condition checking and scheduling) */ - add_wait_queue(&ir->buf->wait_poll, &wait); + add_wait_queue(&d->buf->wait_poll, &wait); /* * while we didn't provide 'length' bytes, device is opened in blocking * mode and 'copy_to_user' is happy, wait for data. */ while (written < length && ret == 0) { - if (lirc_buffer_empty(ir->buf)) { + if (lirc_buffer_empty(d->buf)) { /* According to the read(2) man page, 'written' can be * returned as less than 'length', instead of blocking * again, returning -EWOULDBLOCK, or returning @@ -476,36 +440,36 @@ ssize_t lirc_dev_fop_read(struct file *file, break; } - mutex_unlock(&ir->mutex); + mutex_unlock(&d->mutex); set_current_state(TASK_INTERRUPTIBLE); schedule(); set_current_state(TASK_RUNNING); - ret = mutex_lock_interruptible(&ir->mutex); + ret = mutex_lock_interruptible(&d->mutex); if (ret) { - remove_wait_queue(&ir->buf->wait_poll, &wait); + remove_wait_queue(&d->buf->wait_poll, &wait); goto out_unlocked; } - if (!ir->attached) { + if (!d->attached) { ret = -ENODEV; goto out_locked; } } else { - lirc_buffer_read(ir->buf, buf); + lirc_buffer_read(d->buf, buf); ret = copy_to_user((void __user *)buffer+written, buf, - ir->buf->chunk_size); + d->buf->chunk_size); if (!ret) - written += ir->buf->chunk_size; + written += d->buf->chunk_size; else ret = -EFAULT; } } - remove_wait_queue(&ir->buf->wait_poll, &wait); + remove_wait_queue(&d->buf->wait_poll, &wait); out_locked: - mutex_unlock(&ir->mutex); + mutex_unlock(&d->mutex); out_unlocked: kfree(buf); @@ -516,17 +480,17 @@ EXPORT_SYMBOL(lirc_dev_fop_read); void lirc_init_pdata(struct inode *inode, struct file *file) { - struct irctl *ir = container_of(inode->i_cdev, struct irctl, cdev); + struct lirc_dev *d = container_of(inode->i_cdev, struct lirc_dev, cdev); - file->private_data = ir; + file->private_data = d; } EXPORT_SYMBOL(lirc_init_pdata); void *lirc_get_pdata(struct file *file) { - struct irctl *ir = file->private_data; + struct lirc_dev *d = file->private_data; - return ir->d.data; + return d->data; } EXPORT_SYMBOL(lirc_get_pdata); diff --git a/drivers/staging/media/lirc/lirc_zilog.c b/drivers/staging/media/lirc/lirc_zilog.c index 00e8c8f224b7..6bd0717bf76e 100644 --- a/drivers/staging/media/lirc/lirc_zilog.c +++ b/drivers/staging/media/lirc/lirc_zilog.c @@ -184,10 +184,8 @@ static void release_ir_device(struct kref *ref) * ir->open_count == 0 - happens on final close() * ir_lock, tx_ref_lock, rx_ref_lock, all released */ - if (ir->l) { + if (ir->l) lirc_unregister_device(ir->l); - lirc_free_device(ir->l); - } if (kfifo_initialized(&ir->rbuf.fifo)) lirc_buffer_free(&ir->rbuf); @@ -318,7 +316,7 @@ static int add_to_buf(struct IR *ir) int ret; int failures = 0; unsigned char sendbuf[1] = { 0 }; - struct lirc_buffer *rbuf = ir->l->rbuf; + struct lirc_buffer *rbuf = ir->l->buf; struct IR_rx *rx; struct IR_tx *tx; @@ -464,7 +462,7 @@ static int add_to_buf(struct IR *ir) static int lirc_thread(void *arg) { struct IR *ir = arg; - struct lirc_buffer *rbuf = ir->l->rbuf; + struct lirc_buffer *rbuf = ir->l->buf; dev_dbg(ir->dev, "poll thread started\n"); @@ -885,7 +883,7 @@ static ssize_t read(struct file *filep, char __user *outbuf, size_t n, { struct IR *ir = lirc_get_pdata(filep); struct IR_rx *rx; - struct lirc_buffer 
*rbuf = ir->l->rbuf; + struct lirc_buffer *rbuf = ir->l->buf; int ret = 0, written = 0, retries = 0; unsigned int m; DECLARE_WAITQUEUE(wait, current); @@ -1203,7 +1201,7 @@ static unsigned int poll(struct file *filep, poll_table *wait) { struct IR *ir = lirc_get_pdata(filep); struct IR_rx *rx; - struct lirc_buffer *rbuf = ir->l->rbuf; + struct lirc_buffer *rbuf = ir->l->buf; unsigned int ret; dev_dbg(ir->dev, "%s called\n", __func__); @@ -1449,6 +1447,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) ir->l->code_length = 13; ir->l->fops = &lirc_fops; ir->l->owner = THIS_MODULE; + ir->l->dev.parent = &adap->dev; /* * FIXME this is a pointer reference to us, but no refcount. @@ -1456,13 +1455,12 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) * This OK for now, since lirc_dev currently won't touch this * buffer as we provide our own lirc_fops. * - * Currently our own lirc_fops rely on this ir->l->rbuf pointer + * Currently our own lirc_fops rely on this ir->l->buf pointer */ - ir->l->rbuf = &ir->rbuf; - ir->l->dev = &adap->dev; + ir->l->buf = &ir->rbuf; /* This will be returned by lirc_get_pdata() */ ir->l->data = ir; - ret = lirc_buffer_init(ir->l->rbuf, 2, BUFLEN / 2); + ret = lirc_buffer_init(ir->l->buf, 2, BUFLEN / 2); if (ret) { lirc_free_device(ir->l); ir->l = NULL; diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index 4b0dc640e142..981dcabd5fd5 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -17,6 +17,8 @@ #include #include #include +#include +#include struct lirc_buffer { wait_queue_head_t wait_poll; @@ -127,14 +129,19 @@ static inline unsigned int lirc_buffer_write(struct lirc_buffer *buf, * LIRC_CAN_SET_REC_TIMEOUT is defined. * @max_timeout: Maximum timeout for record. Valid only if * LIRC_CAN_SET_REC_TIMEOUT is defined. - * @rbuf: if not NULL, it will be used as a read buffer, you will + * @buf: if %NULL, lirc_dev will allocate and manage the buffer, + * otherwise allocated by the caller which will * have to write to the buffer by other means, like irq's * (see also lirc_serial.c). + * @buf_internal: whether lirc_dev has allocated the read buffer or not * @rdev: &struct rc_dev associated with the device * @fops: &struct file_operations for the device - * @dev: &struct device assigned to the device * @owner: the module owning this struct - * @irctl: &struct irctl assigned to the device + * @attached: if the device is still live + * @open: open count for the device's chardev + * @mutex: serialises file_operations calls + * @dev: &struct device assigned to the device + * @cdev: &struct cdev assigned to the device */ struct lirc_dev { char name[40]; @@ -144,16 +151,23 @@ struct lirc_dev { unsigned int buffer_size; /* in chunks holding one code each */ unsigned int chunk_size; + struct lirc_buffer *buf; + bool buf_internal; void *data; int min_timeout; int max_timeout; - struct lirc_buffer *rbuf; struct rc_dev *rdev; const struct file_operations *fops; - struct device *dev; struct module *owner; - struct irctl *irctl; + + bool attached; + int open; + + struct mutex mutex; /* protect from simultaneous accesses */ + + struct device dev; + struct cdev cdev; }; struct lirc_dev *lirc_allocate_device(void); From 184982e1b489a8cad159d0ce600b283dfdb996fb Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Sun, 13 Aug 2017 05:54:44 -0300 Subject: [PATCH 0490/1640] UPSTREAM: [media] media: rc: constify usb_device_id usb_device_id are not supposed to change at runtime. 
All functions working with usb_device_id provided by <linux/usb.h> work
with const usb_device_id. So mark the non-const structs as const.

Signed-off-by: Arvind Yadav
Signed-off-by: Sean Young
Signed-off-by: Mauro Carvalho Chehab
---
 drivers/media/rc/ati_remote.c  | 2 +-
 drivers/media/rc/igorplugusb.c | 2 +-
 drivers/media/rc/imon.c        | 2 +-
 drivers/media/rc/mceusb.c      | 2 +-
 drivers/media/rc/redrat3.c     | 2 +-
 drivers/media/rc/streamzap.c   | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/media/rc/ati_remote.c b/drivers/media/rc/ati_remote.c
index d0871d60a723..8e82610ffaad 100644
--- a/drivers/media/rc/ati_remote.c
+++ b/drivers/media/rc/ati_remote.c
@@ -198,7 +198,7 @@ static const struct ati_receiver_type type_firefly = {
 	.default_keymap = RC_MAP_SNAPSTREAM_FIREFLY
 };
 
-static struct usb_device_id ati_remote_table[] = {
+static const struct usb_device_id ati_remote_table[] = {
 	{
 		USB_DEVICE(ATI_REMOTE_VENDOR_ID, LOLA_REMOTE_PRODUCT_ID),
 		.driver_info = (unsigned long)&type_ati
diff --git a/drivers/media/rc/igorplugusb.c b/drivers/media/rc/igorplugusb.c
index a5ea86be8f44..4b715eb995f8 100644
--- a/drivers/media/rc/igorplugusb.c
+++ b/drivers/media/rc/igorplugusb.c
@@ -245,7 +245,7 @@ static void igorplugusb_disconnect(struct usb_interface *intf)
 	usb_free_urb(ir->urb);
 }
 
-static struct usb_device_id igorplugusb_table[] = {
+static const struct usb_device_id igorplugusb_table[] = {
 	/* Igor Plug USB (Atmel's Manufact. ID) */
 	{ USB_DEVICE(0x03eb, 0x0002) },
 	/* Fit PC2 Infrared Adapter */
diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c
index f6e3ebf71d47..4dc61264c19b 100644
--- a/drivers/media/rc/imon.c
+++ b/drivers/media/rc/imon.c
@@ -346,7 +346,7 @@ static const struct imon_usb_dev_descr imon_ir_raw = {
 * devices use the SoundGraph vendor ID (0x15c2). This driver only supports
 * the ffdc and later devices, which do onboard decoding.
*/ -static struct usb_device_id imon_usb_id_table[] = { +static const struct usb_device_id imon_usb_id_table[] = { /* * Several devices with this same device ID, all use iMON_PAD.inf * SoundGraph iMON PAD (IR & VFD) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index bf7aaff3aa37..67c1ff099eb4 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -249,7 +249,7 @@ static const struct mceusb_model mceusb_model[] = { }, }; -static struct usb_device_id mceusb_dev_table[] = { +static const struct usb_device_id mceusb_dev_table[] = { /* Original Microsoft MCE IR Transceiver (often HP-branded) */ { USB_DEVICE(VENDOR_MICROSOFT, 0x006d), .driver_info = MCE_GEN1 }, diff --git a/drivers/media/rc/redrat3.c b/drivers/media/rc/redrat3.c index 6784cb9fc4e7..6bfc24885b5c 100644 --- a/drivers/media/rc/redrat3.c +++ b/drivers/media/rc/redrat3.c @@ -186,7 +186,7 @@ struct redrat3_error { } __packed; /* table of devices that work with this driver */ -static struct usb_device_id redrat3_dev_table[] = { +static const struct usb_device_id redrat3_dev_table[] = { /* Original version of the RedRat3 */ {USB_DEVICE(USB_RR3USB_VENDOR_ID, USB_RR3USB_PRODUCT_ID)}, /* Second Version/release of the RedRat3 - RetRat3-II */ diff --git a/drivers/media/rc/streamzap.c b/drivers/media/rc/streamzap.c index f03a174ddf9d..4eebfcfc10f3 100644 --- a/drivers/media/rc/streamzap.c +++ b/drivers/media/rc/streamzap.c @@ -43,7 +43,7 @@ #define USB_STREAMZAP_PRODUCT_ID 0x0000 /* table of devices that work with this driver */ -static struct usb_device_id streamzap_table[] = { +static const struct usb_device_id streamzap_table[] = { /* Streamzap Remote Control */ { USB_DEVICE(USB_STREAMZAP_VENDOR_ID, USB_STREAMZAP_PRODUCT_ID) }, /* Terminating entry */ From d39695f399cd19071ef7078f328a58acefaba63a Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Sat, 19 Aug 2017 05:22:15 -0300 Subject: [PATCH 0491/1640] UPSTREAM: [media] media: rc: make device_type const Make this const as it is only stored in the type field of a device structure, which is const. Done using Coccinelle. Signed-off-by: Bhumika Goyal Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 981cccd6b988..33bddba6e541 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1529,7 +1529,7 @@ static const struct attribute_group rc_dev_wakeup_filter_attr_grp = { .attrs = rc_dev_wakeup_filter_attrs, }; -static struct device_type rc_dev_type = { +static const struct device_type rc_dev_type = { .release = rc_dev_release, .uevent = rc_dev_uevent, }; From b9dccd20c5eab27a87bdd5a2ff38a4d17b62a311 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 29 Aug 2017 07:40:07 -0300 Subject: [PATCH 0492/1640] UPSTREAM: [media] media: imon: delete an error message for a failed memory allocation Omit an extra message for a memory allocation failure in this function. This issue was detected by using the Coccinelle software. 
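For context on this class of cleanup: the slab and page allocators already
log every failed allocation (with a backtrace) unless __GFP_NOWARN is
passed, so a per-callsite error message duplicates information and only
grows the object code. A minimal sketch of the preferred shape, not code
taken from the tree:

	/* The bare check suffices; the allocator has already warned. */
	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;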
Signed-off-by: Markus Elfring
Signed-off-by: Sean Young
Signed-off-by: Mauro Carvalho Chehab
---
 drivers/media/rc/imon.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c
index 4dc61264c19b..f6fc57d19216 100644
--- a/drivers/media/rc/imon.c
+++ b/drivers/media/rc/imon.c
@@ -2308,10 +2308,9 @@ static struct imon_context *imon_init_intf0(struct usb_interface *intf,
 	int ret = -ENOMEM;
 
 	ictx = kzalloc(sizeof(struct imon_context), GFP_KERNEL);
-	if (!ictx) {
-		dev_err(dev, "%s: kzalloc failed for context", __func__);
+	if (!ictx)
 		goto exit;
-	}
+
 	rx_urb = usb_alloc_urb(0, GFP_KERNEL);
 	if (!rx_urb)
 		goto rx_urb_alloc_failed;

From c80c9055eb712c3879d5c14dc12cfb5c3f237ec2 Mon Sep 17 00:00:00 2001
From: Markus Elfring
Date: Tue, 29 Aug 2017 16:04:20 -0300
Subject: [PATCH 0493/1640] UPSTREAM: [media] media: img-ir: delete an error
 message for a failed memory allocation

Omit an extra message for a memory allocation failure in this function.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring
Signed-off-by: Sean Young
Signed-off-by: Mauro Carvalho Chehab
---
 drivers/media/rc/img-ir/img-ir-core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/media/rc/img-ir/img-ir-core.c b/drivers/media/rc/img-ir/img-ir-core.c
index 03fe080278df..bcbabeeab12a 100644
--- a/drivers/media/rc/img-ir/img-ir-core.c
+++ b/drivers/media/rc/img-ir/img-ir-core.c
@@ -92,10 +92,9 @@ static int img_ir_probe(struct platform_device *pdev)
 
 	/* Private driver data */
 	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
-	if (!priv) {
-		dev_err(&pdev->dev, "cannot allocate device data\n");
+	if (!priv)
 		return -ENOMEM;
-	}
+
 	platform_set_drvdata(pdev, priv);
 	priv->dev = &pdev->dev;
 	spin_lock_init(&priv->lock);

From 29f72b164a49fa81de1f569f9f0b277c9dd28d9f Mon Sep 17 00:00:00 2001
From: Sean Young
Date: Fri, 1 Sep 2017 08:34:50 -0300
Subject: [PATCH 0494/1640] UPSTREAM: [media] media: rc: avermedia keymap for
 a800

The keymap is missing one key, and another is incorrect.

Signed-off-by: Sean Young
Signed-off-by: Mauro Carvalho Chehab
---
 drivers/media/rc/keymaps/rc-avermedia-m135a.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/media/rc/keymaps/rc-avermedia-m135a.c b/drivers/media/rc/keymaps/rc-avermedia-m135a.c
index 9882e2cde975..6d5a73b7ccec 100644
--- a/drivers/media/rc/keymaps/rc-avermedia-m135a.c
+++ b/drivers/media/rc/keymaps/rc-avermedia-m135a.c
@@ -43,7 +43,8 @@ static struct rc_map_table avermedia_m135a[] = {
 	{ 0x0213, KEY_RIGHT },	/* -> or L */
 	{ 0x0212, KEY_LEFT },	/* <- or R */
 
-	{ 0x0217, KEY_SLEEP },	/* Capturar Imagem or Snapshot */
+	{ 0x0215, KEY_MENU },
+	{ 0x0217, KEY_CAMERA },	/* Capturar Imagem or Snapshot */
 	{ 0x0210, KEY_SHUFFLE },	/* Amostra or 16 chan prev */
 
 	{ 0x0303, KEY_CHANNELUP },

From dae5f1f2dd06d5c0dff4dc365ede7b1eeee02549 Mon Sep 17 00:00:00 2001
From: Sean Young
Date: Fri, 1 Sep 2017 09:55:59 -0300
Subject: [PATCH 0495/1640] UPSTREAM: [media] media: rc: ensure that protocols
 are enabled for scancode drivers

rc scancode drivers without change_protocol should have all protocols
enabled at all times. This was only true for cec and ir-kbd-i2c.
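In concrete terms, a scancode driver with no change_protocol hook can now
leave enabled_protocols alone; rc-core fills it in at registration time. A
hypothetical driver setup, sketched rather than taken from any driver:

	rc = rc_allocate_device(RC_DRIVER_SCANCODE);
	rc->allowed_protocols = RC_PROTO_BIT_NEC;
	/* No explicit rc->enabled_protocols assignment needed any more:
	 * rc_prepare_rx_device() copies allowed_protocols when the
	 * driver provides no change_protocol hook.
	 */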
Signed-off-by: Sean Young
Signed-off-by: Mauro Carvalho Chehab
---
 drivers/media/cec/cec-core.c   | 1 -
 drivers/media/i2c/ir-kbd-i2c.c | 1 -
 drivers/media/rc/rc-main.c     | 3 +++
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/media/cec/cec-core.c b/drivers/media/cec/cec-core.c
index 648136e552d5..dc7fd6f80bc0 100644
--- a/drivers/media/cec/cec-core.c
+++ b/drivers/media/cec/cec-core.c
@@ -277,7 +277,6 @@ struct cec_adapter *cec_allocate_adapter(const struct cec_adap_ops *ops,
 	adap->rc->input_id.version = 1;
 	adap->rc->driver_name = CEC_NAME;
 	adap->rc->allowed_protocols = RC_PROTO_BIT_CEC;
-	adap->rc->enabled_protocols = RC_PROTO_BIT_CEC;
 	adap->rc->priv = adap;
 	adap->rc->map_name = RC_MAP_CEC;
 	adap->rc->timeout = MS_TO_NS(100);
diff --git a/drivers/media/i2c/ir-kbd-i2c.c b/drivers/media/i2c/ir-kbd-i2c.c
index a374e2a0ac3d..8b5f7d0435e4 100644
--- a/drivers/media/i2c/ir-kbd-i2c.c
+++ b/drivers/media/i2c/ir-kbd-i2c.c
@@ -460,7 +460,6 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id)
 	 */
 	rc->map_name = ir->ir_codes;
 	rc->allowed_protocols = rc_proto;
-	rc->enabled_protocols = rc_proto;
 
 	if (!rc->driver_name)
 		rc->driver_name = MODULE_NAME;
diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c
index 33bddba6e541..127f3215cd84 100644
--- a/drivers/media/rc/rc-main.c
+++ b/drivers/media/rc/rc-main.c
@@ -1638,6 +1638,9 @@ static int rc_prepare_rx_device(struct rc_dev *dev)
 
 	rc_proto = BIT_ULL(rc_map->rc_proto);
 
+	if (dev->driver_type == RC_DRIVER_SCANCODE && !dev->change_protocol)
+		dev->enabled_protocols = dev->allowed_protocols;
+
 	if (dev->change_protocol) {
 		rc = dev->change_protocol(dev, &rc_proto);
 		if (rc < 0)

From d1e89a2a0c33d46f1b6fe07c54f552a2c144fa97 Mon Sep 17 00:00:00 2001
From: Sean Young
Date: Fri, 1 Sep 2017 11:30:50 -0300
Subject: [PATCH 0496/1640] UPSTREAM: [media] media: rc: if protocols can't be
 changed, don't be writable

If the protocols of an rc device cannot be changed, ensure the sysfs
file is not writable. This makes it possible to detect this from
userspace, so ir-keytable can deal with this case without giving an
error.
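From userspace, the mode bits are then enough to tell the two cases apart;
a minimal sketch (the sysfs path is only an example):

	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* A 0444 protocols attribute means the protocols are fixed. */
		if (access("/sys/class/rc/rc0/protocols", W_OK) != 0)
			puts("protocols cannot be changed on this receiver");
		return 0;
	}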
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 127f3215cd84..42ac3490b5f8 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1487,7 +1487,10 @@ static int rc_dev_uevent(struct device *device, struct kobj_uevent_env *env) /* * Static device attribute struct with the sysfs attributes for IR's */ -static DEVICE_ATTR(protocols, 0644, show_protocols, store_protocols); +static struct device_attribute dev_attr_ro_protocols = +__ATTR(protocols, 0444, show_protocols, NULL); +static struct device_attribute dev_attr_rw_protocols = +__ATTR(protocols, 0644, show_protocols, store_protocols); static DEVICE_ATTR(wakeup_protocols, 0644, show_wakeup_protocols, store_wakeup_protocols); static RC_FILTER_ATTR(filter, S_IRUGO|S_IWUSR, @@ -1499,13 +1502,22 @@ static RC_FILTER_ATTR(wakeup_filter, S_IRUGO|S_IWUSR, static RC_FILTER_ATTR(wakeup_filter_mask, S_IRUGO|S_IWUSR, show_filter, store_filter, RC_FILTER_WAKEUP, true); -static struct attribute *rc_dev_protocol_attrs[] = { - &dev_attr_protocols.attr, +static struct attribute *rc_dev_rw_protocol_attrs[] = { + &dev_attr_rw_protocols.attr, NULL, }; -static const struct attribute_group rc_dev_protocol_attr_grp = { - .attrs = rc_dev_protocol_attrs, +static const struct attribute_group rc_dev_rw_protocol_attr_grp = { + .attrs = rc_dev_rw_protocol_attrs, +}; + +static struct attribute *rc_dev_ro_protocol_attrs[] = { + &dev_attr_ro_protocols.attr, + NULL, +}; + +static const struct attribute_group rc_dev_ro_protocol_attr_grp = { + .attrs = rc_dev_ro_protocol_attrs, }; static struct attribute *rc_dev_filter_attrs[] = { @@ -1732,8 +1744,10 @@ int rc_register_device(struct rc_dev *dev) dev_set_drvdata(&dev->dev, dev); dev->dev.groups = dev->sysfs_groups; - if (dev->driver_type != RC_DRIVER_IR_RAW_TX) - dev->sysfs_groups[attr++] = &rc_dev_protocol_attr_grp; + if (dev->driver_type == RC_DRIVER_SCANCODE && !dev->change_protocol) + dev->sysfs_groups[attr++] = &rc_dev_ro_protocol_attr_grp; + else if (dev->driver_type != RC_DRIVER_IR_RAW_TX) + dev->sysfs_groups[attr++] = &rc_dev_rw_protocol_attr_grp; if (dev->s_filter) dev->sysfs_groups[attr++] = &rc_dev_filter_attr_grp; if (dev->s_wakeup_filter) From d85e01eb15593e3f91afd3548b03901328729c04 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 1 Sep 2017 11:34:23 -0300 Subject: [PATCH 0497/1640] UPSTREAM: [media] media: rc: include device name in rc udev event This name is also stored in the input's device name, but that is not available in TX only hardware (no input device). 
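The resulting uevent environment then identifies the device even when no
input node exists; the values below are hypothetical, for a GPIO receiver:

	ACTION=add
	SUBSYSTEM=rc
	NAME=rc-empty
	DRV_NAME=gpio_ir_recv
	DEV_NAME=GPIO IR receiver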
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 42ac3490b5f8..8c828fee4f5a 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1480,6 +1480,8 @@ static int rc_dev_uevent(struct device *device, struct kobj_uevent_env *env) ADD_HOTPLUG_VAR("NAME=%s", dev->rc_map.name); if (dev->driver_name) ADD_HOTPLUG_VAR("DRV_NAME=%s", dev->driver_name); + if (dev->device_name) + ADD_HOTPLUG_VAR("DEV_NAME=%s", dev->device_name); return 0; } From 2fc6bf1648af3414cf2d408b7539e391e7a06481 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 6 Sep 2017 08:19:06 -0300 Subject: [PATCH 0498/1640] UPSTREAM: [media] media: vp7045: port TwinhanDTV Alpha to rc-core Only the nec protocol is understood, but then it doesn't pass on the full scancode and it ignores the nec repeats its own remote sends, so holding buttons does not work. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/keymaps/rc-twinhan1027.c | 2 +- drivers/media/usb/dvb-usb/dvb-usb-remote.c | 1 + drivers/media/usb/dvb-usb/dvb-usb.h | 1 + drivers/media/usb/dvb-usb/vp7045.c | 88 ++++------------------ 4 files changed, 18 insertions(+), 74 deletions(-) diff --git a/drivers/media/rc/keymaps/rc-twinhan1027.c b/drivers/media/rc/keymaps/rc-twinhan1027.c index 2275b37c61d2..78bb3143a1a8 100644 --- a/drivers/media/rc/keymaps/rc-twinhan1027.c +++ b/drivers/media/rc/keymaps/rc-twinhan1027.c @@ -66,7 +66,7 @@ static struct rc_map_list twinhan_vp1027_map = { .map = { .scan = twinhan_vp1027, .size = ARRAY_SIZE(twinhan_vp1027), - .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .rc_proto = RC_PROTO_NEC, .name = RC_MAP_TWINHAN_VP1027_DVBS, } }; diff --git a/drivers/media/usb/dvb-usb/dvb-usb-remote.c b/drivers/media/usb/dvb-usb/dvb-usb-remote.c index 701c10835482..57ff5869144c 100644 --- a/drivers/media/usb/dvb-usb/dvb-usb-remote.c +++ b/drivers/media/usb/dvb-usb/dvb-usb-remote.c @@ -284,6 +284,7 @@ static int rc_core_dvb_usb_remote_init(struct dvb_usb_device *d) dev->input_phys = d->rc_phys; dev->dev.parent = &d->udev->dev; dev->priv = d; + dev->scancode_mask = d->props.rc.core.scancode_mask; err = rc_register_device(dev); if (err < 0) { diff --git a/drivers/media/usb/dvb-usb/dvb-usb.h b/drivers/media/usb/dvb-usb/dvb-usb.h index 21ab517417fc..f4269cebf2fe 100644 --- a/drivers/media/usb/dvb-usb/dvb-usb.h +++ b/drivers/media/usb/dvb-usb/dvb-usb.h @@ -208,6 +208,7 @@ struct dvb_rc { int (*rc_query) (struct dvb_usb_device *d); int rc_interval; bool bulk_mode; /* uses bulk mode */ + u32 scancode_mask; }; /** diff --git a/drivers/media/usb/dvb-usb/vp7045.c b/drivers/media/usb/dvb-usb/vp7045.c index 13340af0d39c..2527b88beb87 100644 --- a/drivers/media/usb/dvb-usb/vp7045.c +++ b/drivers/media/usb/dvb-usb/vp7045.c @@ -97,82 +97,22 @@ static int vp7045_power_ctrl(struct dvb_usb_device *d, int onoff) return vp7045_usb_op(d,SET_TUNER_POWER,&v,1,NULL,0,150); } -/* remote control stuff */ - -/* The keymapping struct. Somehow this should be loaded to the driver, but - * currently it is hardcoded. 
*/ -static struct rc_map_table rc_map_vp7045_table[] = { - { 0x0016, KEY_POWER }, - { 0x0010, KEY_MUTE }, - { 0x0003, KEY_1 }, - { 0x0001, KEY_2 }, - { 0x0006, KEY_3 }, - { 0x0009, KEY_4 }, - { 0x001d, KEY_5 }, - { 0x001f, KEY_6 }, - { 0x000d, KEY_7 }, - { 0x0019, KEY_8 }, - { 0x001b, KEY_9 }, - { 0x0015, KEY_0 }, - { 0x0005, KEY_CHANNELUP }, - { 0x0002, KEY_CHANNELDOWN }, - { 0x001e, KEY_VOLUMEUP }, - { 0x000a, KEY_VOLUMEDOWN }, - { 0x0011, KEY_RECORD }, - { 0x0017, KEY_FAVORITES }, /* Heart symbol - Channel list. */ - { 0x0014, KEY_PLAY }, - { 0x001a, KEY_STOP }, - { 0x0040, KEY_REWIND }, - { 0x0012, KEY_FASTFORWARD }, - { 0x000e, KEY_PREVIOUS }, /* Recall - Previous channel. */ - { 0x004c, KEY_PAUSE }, - { 0x004d, KEY_SCREEN }, /* Full screen mode. */ - { 0x0054, KEY_AUDIO }, /* MTS - Switch to secondary audio. */ - { 0x000c, KEY_CANCEL }, /* Cancel */ - { 0x001c, KEY_EPG }, /* EPG */ - { 0x0000, KEY_TAB }, /* Tab */ - { 0x0048, KEY_INFO }, /* Preview */ - { 0x0004, KEY_LIST }, /* RecordList */ - { 0x000f, KEY_TEXT }, /* Teletext */ - { 0x0041, KEY_PREVIOUSSONG }, - { 0x0042, KEY_NEXTSONG }, - { 0x004b, KEY_UP }, - { 0x0051, KEY_DOWN }, - { 0x004e, KEY_LEFT }, - { 0x0052, KEY_RIGHT }, - { 0x004f, KEY_ENTER }, - { 0x0013, KEY_CANCEL }, - { 0x004a, KEY_CLEAR }, - { 0x0054, KEY_PRINT }, /* Capture */ - { 0x0043, KEY_SUBTITLE }, /* Subtitle/CC */ - { 0x0008, KEY_VIDEO }, /* A/V */ - { 0x0007, KEY_SLEEP }, /* Hibernate */ - { 0x0045, KEY_ZOOM }, /* Zoom+ */ - { 0x0018, KEY_RED}, - { 0x0053, KEY_GREEN}, - { 0x005e, KEY_YELLOW}, - { 0x005f, KEY_BLUE} -}; - -static int vp7045_rc_query(struct dvb_usb_device *d, u32 *event, int *state) +static int vp7045_rc_query(struct dvb_usb_device *d) { u8 key; - int i; vp7045_usb_op(d,RC_VAL_READ,NULL,0,&key,1,20); deb_rc("remote query key: %x %d\n",key,key); - if (key == 0x44) { - *state = REMOTE_NO_KEY_PRESSED; - return 0; + if (key != 0x44) { + /* + * The 8 bit address isn't available, but since the remote uses + * address 0 we'll use that. nec repeats are ignored too, even + * though the remote sends them. + */ + rc_keydown(d->rc_dev, RC_PROTO_NEC, RC_SCANCODE_NEC(0, key), 0); } - for (i = 0; i < ARRAY_SIZE(rc_map_vp7045_table); i++) - if (rc5_data(&rc_map_vp7045_table[i]) == key) { - *state = REMOTE_KEY_PRESSED; - *event = rc_map_vp7045_table[i].keycode; - break; - } return 0; } @@ -265,11 +205,13 @@ static struct dvb_usb_device_properties vp7045_properties = { .power_ctrl = vp7045_power_ctrl, .read_mac_address = vp7045_read_mac_addr, - .rc.legacy = { - .rc_interval = 400, - .rc_map_table = rc_map_vp7045_table, - .rc_map_size = ARRAY_SIZE(rc_map_vp7045_table), - .rc_query = vp7045_rc_query, + .rc.core = { + .rc_interval = 400, + .rc_codes = RC_MAP_TWINHAN_VP1027_DVBS, + .module_name = KBUILD_MODNAME, + .rc_query = vp7045_rc_query, + .allowed_protos = RC_PROTO_BIT_NEC, + .scancode_mask = 0xff, }, .num_device_descs = 2, From 4171c085d81614eef4d1b993a914f268bd3945de Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 5 Sep 2017 09:07:50 -0300 Subject: [PATCH 0499/1640] UPSTREAM: [media] media: imon: make two const arrays static, reduces object code size Don't populate the const arrays vfd_packet6 and fp_packet on the stack, instead make them static. 
Makes the object code smaller by over 600 bytes: Before: text data bss dec hex filename 43794 17920 1024 62738 f512 drivers/media/rc/imon.o After: text data bss dec hex filename 42994 18080 1024 62098 f292 drivers/media/rc/imon.o Signed-off-by: Colin Ian King Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/imon.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index f6fc57d19216..beb9a82f6139 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -943,7 +943,7 @@ static ssize_t vfd_write(struct file *file, const char __user *buf, int seq; int retval = 0; struct imon_context *ictx; - const unsigned char vfd_packet6[] = { + static const unsigned char vfd_packet6[] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF }; ictx = file->private_data; @@ -2044,8 +2044,8 @@ static struct rc_dev *imon_init_rdev(struct imon_context *ictx) { struct rc_dev *rdev; int ret; - const unsigned char fp_packet[] = { 0x40, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x88 }; + static const unsigned char fp_packet[] = { + 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88 }; rdev = rc_allocate_device(ictx->dev_descr->flags & IMON_IR_RAW ? RC_DRIVER_IR_RAW : RC_DRIVER_SCANCODE); From 43e8ccd8053f9a5c293cff3b8e3463565c653ff9 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Thu, 7 Sep 2017 20:34:35 -0300 Subject: [PATCH 0500/1640] UPSTREAM: [media] media: rc: gpio-ir-recv: use helper variable to access device info Using explicit struct device variable makes code a bit more readable. Signed-off-by: Ladislav Michl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/gpio-ir-recv.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index 7248b3662285..741a68c192ce 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -95,18 +95,18 @@ err_get_value: static int gpio_ir_recv_probe(struct platform_device *pdev) { + struct device *dev = &pdev->dev; struct gpio_rc_dev *gpio_dev; struct rc_dev *rcdev; - const struct gpio_ir_recv_platform_data *pdata = - pdev->dev.platform_data; + const struct gpio_ir_recv_platform_data *pdata = dev->platform_data; int rc; if (pdev->dev.of_node) { struct gpio_ir_recv_platform_data *dtpdata = - devm_kzalloc(&pdev->dev, sizeof(*dtpdata), GFP_KERNEL); + devm_kzalloc(dev, sizeof(*dtpdata), GFP_KERNEL); if (!dtpdata) return -ENOMEM; - rc = gpio_ir_recv_get_devtree_pdata(&pdev->dev, dtpdata); + rc = gpio_ir_recv_get_devtree_pdata(dev, dtpdata); if (rc) return rc; pdata = dtpdata; @@ -135,7 +135,7 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) rcdev->input_id.vendor = 0x0001; rcdev->input_id.product = 0x0001; rcdev->input_id.version = 0x0100; - rcdev->dev.parent = &pdev->dev; + rcdev->dev.parent = dev; rcdev->driver_name = GPIO_IR_DRIVER_NAME; rcdev->min_timeout = 1; rcdev->timeout = IR_DEFAULT_TIMEOUT; @@ -159,7 +159,7 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) rc = rc_register_device(rcdev); if (rc < 0) { - dev_err(&pdev->dev, "failed to register rc device\n"); + dev_err(dev, "failed to register rc device (%d)\n", rc); goto err_register_rc_device; } From a470220f78c78d87f6124ad0c2cbe97f00754393 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Thu, 7 Sep 2017 20:35:22 -0300 Subject: [PATCH 0501/1640] UPSTREAM: [media] media: rc: gpio-ir-recv: use devm_kzalloc Use of devm_kzalloc simplifies error 
unwinding. Signed-off-by: Ladislav Michl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/gpio-ir-recv.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index 741a68c192ce..6fe7e7c14fe4 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -118,15 +118,13 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) if (pdata->gpio_nr < 0) return -EINVAL; - gpio_dev = kzalloc(sizeof(struct gpio_rc_dev), GFP_KERNEL); + gpio_dev = devm_kzalloc(dev, sizeof(*gpio_dev), GFP_KERNEL); if (!gpio_dev) return -ENOMEM; rcdev = rc_allocate_device(RC_DRIVER_IR_RAW); - if (!rcdev) { - rc = -ENOMEM; - goto err_allocate_device; - } + if (!rcdev) + return -ENOMEM; rcdev->priv = gpio_dev; rcdev->device_name = GPIO_IR_DEVICE_NAME; @@ -182,8 +180,6 @@ err_gpio_direction_input: gpio_free(pdata->gpio_nr); err_gpio_request: rc_free_device(rcdev); -err_allocate_device: - kfree(gpio_dev); return rc; } @@ -194,7 +190,6 @@ static int gpio_ir_recv_remove(struct platform_device *pdev) free_irq(gpio_to_irq(gpio_dev->gpio_nr), gpio_dev); rc_unregister_device(gpio_dev->rcdev); gpio_free(gpio_dev->gpio_nr); - kfree(gpio_dev); return 0; } From 9ea4a76824e9c51c6a1969b10292107f41031be7 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Thu, 7 Sep 2017 20:36:11 -0300 Subject: [PATCH 0502/1640] UPSTREAM: [media] media: rc: gpio-ir-recv: use devm_rc_allocate_device Use of devm_rc_allocate_device simplifies error unwinding. Signed-off-by: Ladislav Michl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/gpio-ir-recv.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index 6fe7e7c14fe4..98dcb8399506 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -122,7 +122,7 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) if (!gpio_dev) return -ENOMEM; - rcdev = rc_allocate_device(RC_DRIVER_IR_RAW); + rcdev = devm_rc_allocate_device(dev, RC_DRIVER_IR_RAW); if (!rcdev) return -ENOMEM; @@ -150,7 +150,7 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) rc = gpio_request(pdata->gpio_nr, "gpio-ir-recv"); if (rc < 0) - goto err_gpio_request; + return rc; rc = gpio_direction_input(pdata->gpio_nr); if (rc < 0) goto err_gpio_direction_input; @@ -178,8 +178,6 @@ err_request_irq: err_register_rc_device: err_gpio_direction_input: gpio_free(pdata->gpio_nr); -err_gpio_request: - rc_free_device(rcdev); return rc; } From 628f9067a067b8d7ddf278b5158f7fc2be2433d9 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Thu, 7 Sep 2017 20:36:39 -0300 Subject: [PATCH 0503/1640] UPSTREAM: [media] media: rc: gpio-ir-recv: use devm_gpio_request_one Use of devm_gpio_request_one simplifies error unwinding. 
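The devres idea, sketched generically (gpio here is a placeholder number):
the GPIO is bound to the device's lifetime, so every error path after the
call collapses to a plain return, and the remove path needs no gpio_free().

	/* Managed request: devres releases the GPIO on probe failure
	 * or driver unbind, so there is nothing to unwind by hand.
	 */
	rc = devm_gpio_request_one(dev, gpio, GPIOF_DIR_IN, "gpio-ir-recv");
	if (rc < 0)
		return rc;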
Signed-off-by: Ladislav Michl
Signed-off-by: Sean Young
Signed-off-by: Mauro Carvalho Chehab
---
 drivers/media/rc/gpio-ir-recv.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c
index 98dcb8399506..77044d664371 100644
--- a/drivers/media/rc/gpio-ir-recv.c
+++ b/drivers/media/rc/gpio-ir-recv.c
@@ -148,12 +148,10 @@ static int gpio_ir_recv_probe(struct platform_device *pdev)
 	gpio_dev->gpio_nr = pdata->gpio_nr;
 	gpio_dev->active_low = pdata->active_low;
 
-	rc = gpio_request(pdata->gpio_nr, "gpio-ir-recv");
+	rc = devm_gpio_request_one(dev, pdata->gpio_nr, GPIOF_DIR_IN,
+				   "gpio-ir-recv");
 	if (rc < 0)
 		return rc;
-	rc = gpio_direction_input(pdata->gpio_nr);
-	if (rc < 0)
-		goto err_gpio_direction_input;
 
 	rc = rc_register_device(rcdev);
 	if (rc < 0) {
@@ -176,8 +174,6 @@ err_request_irq:
 	rc_unregister_device(rcdev);
 	rcdev = NULL;
 err_register_rc_device:
-err_gpio_direction_input:
-	gpio_free(pdata->gpio_nr);
 	return rc;
 }
 
@@ -187,7 +183,6 @@ static int gpio_ir_recv_remove(struct platform_device *pdev)
 	struct gpio_rc_dev *gpio_dev = platform_get_drvdata(pdev);
 
 	free_irq(gpio_to_irq(gpio_dev->gpio_nr), gpio_dev);
 	rc_unregister_device(gpio_dev->rcdev);
-	gpio_free(gpio_dev->gpio_nr);
 	return 0;
 }

From c1b77f7c43b09cd34f33a498a9c211fc6726819f Mon Sep 17 00:00:00 2001
From: Ladislav Michl
Date: Thu, 7 Sep 2017 20:37:07 -0300
Subject: [PATCH 0504/1640] UPSTREAM: [media] media: rc: gpio-ir-recv: use
 devm_rc_register_device

Use of devm_rc_register_device simplifies error unwinding.

Signed-off-by: Ladislav Michl
Signed-off-by: Sean Young
Signed-off-by: Mauro Carvalho Chehab
---
 drivers/media/rc/gpio-ir-recv.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c
index 77044d664371..ae5f9099c8a6 100644
--- a/drivers/media/rc/gpio-ir-recv.c
+++ b/drivers/media/rc/gpio-ir-recv.c
@@ -153,10 +153,10 @@ static int gpio_ir_recv_probe(struct platform_device *pdev)
 	if (rc < 0)
 		return rc;
 
-	rc = rc_register_device(rcdev);
+	rc = devm_rc_register_device(dev, rcdev);
 	if (rc < 0) {
 		dev_err(dev, "failed to register rc device (%d)\n", rc);
-		goto err_register_rc_device;
+		return rc;
 	}
 
 	platform_set_drvdata(pdev, gpio_dev);
@@ -171,9 +171,6 @@ static int gpio_ir_recv_probe(struct platform_device *pdev)
 	return 0;
 
 err_request_irq:
-	rc_unregister_device(rcdev);
-	rcdev = NULL;
-err_register_rc_device:
 	return rc;
 }
 
@@ -182,7 +179,6 @@ static int gpio_ir_recv_remove(struct platform_device *pdev)
 	struct gpio_rc_dev *gpio_dev = platform_get_drvdata(pdev);
 
 	free_irq(gpio_to_irq(gpio_dev->gpio_nr), gpio_dev);
-	rc_unregister_device(gpio_dev->rcdev);
 	return 0;
 }

From 6097f64a26aace4822a4b232d36b1e6c8c0632c7 Mon Sep 17 00:00:00 2001
From: Ladislav Michl
Date: Thu, 7 Sep 2017 20:37:36 -0300
Subject: [PATCH 0505/1640] UPSTREAM: [media] media: rc: gpio-ir-recv: do not
 allow threaded interrupt handler

Requesting an any-context IRQ is not a great idea here, since a
threaded interrupt handler runs at an unpredictable time, which makes
the recorded timing information wrong. Fix this by requesting a
regular interrupt.
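The timing concern is concrete: raw IR decoders measure mark/space widths,
and ir_raw_event_store_edge() timestamps an edge when the handler actually
runs. A sketch of why hard-IRQ context matters here, mirroring the change:

	/* NEC, for instance, distinguishes bits by roughly 560us vs
	 * 1690us spaces; the millisecond-scale wakeup latency a threaded
	 * handler can see under load is enough to corrupt that
	 * measurement, so sample the edges in hard interrupt context.
	 */
	rc = request_irq(irq, gpio_ir_recv_irq,
			 IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING,
			 "gpio-ir-recv-irq", gpio_dev);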
Signed-off-by: Ladislav Michl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/gpio-ir-recv.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index ae5f9099c8a6..d82ddf906695 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -161,10 +161,9 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) platform_set_drvdata(pdev, gpio_dev); - rc = request_any_context_irq(gpio_to_irq(pdata->gpio_nr), - gpio_ir_recv_irq, - IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING, - "gpio-ir-recv-irq", gpio_dev); + rc = request_irq(gpio_to_irq(pdata->gpio_nr), gpio_ir_recv_irq, + IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING, + "gpio-ir-recv-irq", gpio_dev); if (rc < 0) goto err_request_irq; From fe5a9256ca6ba1d0b0e01f31f4d2ce1bb73f6e10 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Thu, 7 Sep 2017 20:38:20 -0300 Subject: [PATCH 0506/1640] UPSTREAM: [media] media: rc: gpio-ir-recv: use devm_request_irq Use of devm_request_irq simplifies error unwinding and as free_irq was the last user of driver remove function, remove it too. Signed-off-by: Ladislav Michl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/gpio-ir-recv.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index d82ddf906695..fe0dd2443af0 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -161,24 +161,10 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) platform_set_drvdata(pdev, gpio_dev); - rc = request_irq(gpio_to_irq(pdata->gpio_nr), gpio_ir_recv_irq, - IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING, - "gpio-ir-recv-irq", gpio_dev); - if (rc < 0) - goto err_request_irq; - - return 0; - -err_request_irq: - return rc; -} - -static int gpio_ir_recv_remove(struct platform_device *pdev) -{ - struct gpio_rc_dev *gpio_dev = platform_get_drvdata(pdev); - - free_irq(gpio_to_irq(gpio_dev->gpio_nr), gpio_dev); - return 0; + return devm_request_irq(dev, gpio_to_irq(pdata->gpio_nr), + gpio_ir_recv_irq, + IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING, + "gpio-ir-recv-irq", gpio_dev); } #ifdef CONFIG_PM @@ -216,7 +202,6 @@ static const struct dev_pm_ops gpio_ir_recv_pm_ops = { static struct platform_driver gpio_ir_recv_driver = { .probe = gpio_ir_recv_probe, - .remove = gpio_ir_recv_remove, .driver = { .name = GPIO_IR_DRIVER_NAME, .of_match_table = of_match_ptr(gpio_ir_recv_of_match), From 6865ed82e3308c661e1a372b2f7f8abc1a323459 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Thu, 7 Sep 2017 20:39:14 -0300 Subject: [PATCH 0507/1640] UPSTREAM: [media] media: rc: gpio-ir-recv: use KBUILD_MODNAME There already is standard macro providing driver name, use it. 
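KBUILD_MODNAME is generated by kbuild from the object name, with dashes
turned into underscores, so it stays in sync with the module name by
construction. A generic sketch (foo_driver is a placeholder):

	/* For gpio-ir-recv.o, KBUILD_MODNAME expands to "gpio_ir_recv". */
	static struct platform_driver foo_driver = {
		.driver = {
			.name = KBUILD_MODNAME,
		},
	};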
Signed-off-by: Ladislav Michl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/gpio-ir-recv.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index fe0dd2443af0..b78195f06354 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -23,7 +23,6 @@ #include #include -#define GPIO_IR_DRIVER_NAME "gpio-rc-recv" #define GPIO_IR_DEVICE_NAME "gpio_ir_recv" struct gpio_rc_dev { @@ -134,7 +133,7 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) rcdev->input_id.product = 0x0001; rcdev->input_id.version = 0x0100; rcdev->dev.parent = dev; - rcdev->driver_name = GPIO_IR_DRIVER_NAME; + rcdev->driver_name = KBUILD_MODNAME; rcdev->min_timeout = 1; rcdev->timeout = IR_DEFAULT_TIMEOUT; rcdev->max_timeout = 10 * IR_DEFAULT_TIMEOUT; @@ -203,7 +202,7 @@ static const struct dev_pm_ops gpio_ir_recv_pm_ops = { static struct platform_driver gpio_ir_recv_driver = { .probe = gpio_ir_recv_probe, .driver = { - .name = GPIO_IR_DRIVER_NAME, + .name = KBUILD_MODNAME, .of_match_table = of_match_ptr(gpio_ir_recv_of_match), #ifdef CONFIG_PM .pm = &gpio_ir_recv_pm_ops, From 5ba17002b10d4d6ae33dffbdc55ea495aef9193d Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Thu, 7 Sep 2017 20:39:45 -0300 Subject: [PATCH 0508/1640] UPSTREAM: [media] media: rc: gpio-ir-recv: remove gpio_ir_recv_platform_data gpio_ir_recv_platform_data are not used anywhere in kernel tree, so remove it. Signed-off-by: Ladislav Michl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/gpio-ir-recv.c | 92 ++++++------------- .../linux/platform_data/media/gpio-ir-recv.h | 23 ----- 2 files changed, 26 insertions(+), 89 deletions(-) delete mode 100644 include/linux/platform_data/media/gpio-ir-recv.h diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index b78195f06354..2634b81cbe7e 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -21,7 +21,6 @@ #include #include #include -#include #define GPIO_IR_DEVICE_NAME "gpio_ir_recv" @@ -31,45 +30,6 @@ struct gpio_rc_dev { bool active_low; }; -#ifdef CONFIG_OF -/* - * Translate OpenFirmware node properties into platform_data - */ -static int gpio_ir_recv_get_devtree_pdata(struct device *dev, - struct gpio_ir_recv_platform_data *pdata) -{ - struct device_node *np = dev->of_node; - enum of_gpio_flags flags; - int gpio; - - gpio = of_get_gpio_flags(np, 0, &flags); - if (gpio < 0) { - if (gpio != -EPROBE_DEFER) - dev_err(dev, "Failed to get gpio flags (%d)\n", gpio); - return gpio; - } - - pdata->gpio_nr = gpio; - pdata->active_low = (flags & OF_GPIO_ACTIVE_LOW); - /* probe() takes care of map_name == NULL or allowed_protos == 0 */ - pdata->map_name = of_get_property(np, "linux,rc-map-name", NULL); - pdata->allowed_protos = 0; - - return 0; -} - -static const struct of_device_id gpio_ir_recv_of_match[] = { - { .compatible = "gpio-ir-receiver", }, - { }, -}; -MODULE_DEVICE_TABLE(of, gpio_ir_recv_of_match); - -#else /* !CONFIG_OF */ - -#define gpio_ir_recv_get_devtree_pdata(dev, pdata) (-ENOSYS) - -#endif - static irqreturn_t gpio_ir_recv_irq(int irq, void *dev_id) { struct gpio_rc_dev *gpio_dev = dev_id; @@ -95,32 +55,29 @@ err_get_value: static int gpio_ir_recv_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; struct gpio_rc_dev *gpio_dev; + enum of_gpio_flags flags; struct rc_dev *rcdev; - const struct 
gpio_ir_recv_platform_data *pdata = dev->platform_data; int rc; - if (pdev->dev.of_node) { - struct gpio_ir_recv_platform_data *dtpdata = - devm_kzalloc(dev, sizeof(*dtpdata), GFP_KERNEL); - if (!dtpdata) - return -ENOMEM; - rc = gpio_ir_recv_get_devtree_pdata(dev, dtpdata); - if (rc) - return rc; - pdata = dtpdata; - } - - if (!pdata) - return -EINVAL; - - if (pdata->gpio_nr < 0) - return -EINVAL; + if (!np) + return -ENODEV; gpio_dev = devm_kzalloc(dev, sizeof(*gpio_dev), GFP_KERNEL); if (!gpio_dev) return -ENOMEM; + rc = of_get_gpio_flags(np, 0, &flags); + if (rc < 0) { + if (rc != -EPROBE_DEFER) + dev_err(dev, "Failed to get gpio flags (%d)\n", rc); + return rc; + } + + gpio_dev->gpio_nr = rc; + gpio_dev->active_low = (flags & OF_GPIO_ACTIVE_LOW); + rcdev = devm_rc_allocate_device(dev, RC_DRIVER_IR_RAW); if (!rcdev) return -ENOMEM; @@ -137,17 +94,14 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) rcdev->min_timeout = 1; rcdev->timeout = IR_DEFAULT_TIMEOUT; rcdev->max_timeout = 10 * IR_DEFAULT_TIMEOUT; - if (pdata->allowed_protos) - rcdev->allowed_protocols = pdata->allowed_protos; - else - rcdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; - rcdev->map_name = pdata->map_name ?: RC_MAP_EMPTY; + rcdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; + rcdev->map_name = of_get_property(np, "linux,rc-map-name", NULL); + if (!rcdev->map_name) + rcdev->map_name = RC_MAP_EMPTY; gpio_dev->rcdev = rcdev; - gpio_dev->gpio_nr = pdata->gpio_nr; - gpio_dev->active_low = pdata->active_low; - rc = devm_gpio_request_one(dev, pdata->gpio_nr, GPIOF_DIR_IN, + rc = devm_gpio_request_one(dev, gpio_dev->gpio_nr, GPIOF_DIR_IN, "gpio-ir-recv"); if (rc < 0) return rc; @@ -160,7 +114,7 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) platform_set_drvdata(pdev, gpio_dev); - return devm_request_irq(dev, gpio_to_irq(pdata->gpio_nr), + return devm_request_irq(dev, gpio_to_irq(gpio_dev->gpio_nr), gpio_ir_recv_irq, IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING, "gpio-ir-recv-irq", gpio_dev); @@ -199,6 +153,12 @@ static const struct dev_pm_ops gpio_ir_recv_pm_ops = { }; #endif +static const struct of_device_id gpio_ir_recv_of_match[] = { + { .compatible = "gpio-ir-receiver", }, + { }, +}; +MODULE_DEVICE_TABLE(of, gpio_ir_recv_of_match); + static struct platform_driver gpio_ir_recv_driver = { .probe = gpio_ir_recv_probe, .driver = { diff --git a/include/linux/platform_data/media/gpio-ir-recv.h b/include/linux/platform_data/media/gpio-ir-recv.h deleted file mode 100644 index 0c298f569d5a..000000000000 --- a/include/linux/platform_data/media/gpio-ir-recv.h +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2012, Code Aurora Forum. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 and - * only version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- */ - -#ifndef __GPIO_IR_RECV_H__ -#define __GPIO_IR_RECV_H__ - -struct gpio_ir_recv_platform_data { - int gpio_nr; - bool active_low; - u64 allowed_protos; - const char *map_name; -}; - -#endif /* __GPIO_IR_RECV_H__ */ From 9eefb2e878ebbbc9adf00640695502a9f95d7a88 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Thu, 7 Sep 2017 20:41:32 -0300 Subject: [PATCH 0509/1640] UPSTREAM: [media] media: rc: gpio-ir-recv: use gpiolib API Gpiolib API is preferred way to access gpios. Signed-off-by: Ladislav Michl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/gpio-ir-recv.c | 59 ++++++++++++--------------------- 1 file changed, 21 insertions(+), 38 deletions(-) diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index 2634b81cbe7e..24641a9079da 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -26,29 +26,19 @@ struct gpio_rc_dev { struct rc_dev *rcdev; - int gpio_nr; - bool active_low; + struct gpio_desc *gpiod; + int irq; }; static irqreturn_t gpio_ir_recv_irq(int irq, void *dev_id) { + int val; struct gpio_rc_dev *gpio_dev = dev_id; - int gval; - int rc = 0; - gval = gpio_get_value(gpio_dev->gpio_nr); + val = gpiod_get_value(gpio_dev->gpiod); + if (val >= 0) + ir_raw_event_store_edge(gpio_dev->rcdev, val == 1); - if (gval < 0) - goto err_get_value; - - if (gpio_dev->active_low) - gval = !gval; - - rc = ir_raw_event_store_edge(gpio_dev->rcdev, gval == 1); - if (rc < 0) - goto err_get_value; - -err_get_value: return IRQ_HANDLED; } @@ -57,7 +47,6 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; struct gpio_rc_dev *gpio_dev; - enum of_gpio_flags flags; struct rc_dev *rcdev; int rc; @@ -68,15 +57,17 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) if (!gpio_dev) return -ENOMEM; - rc = of_get_gpio_flags(np, 0, &flags); - if (rc < 0) { + gpio_dev->gpiod = devm_gpiod_get(dev, NULL, GPIOD_IN); + if (IS_ERR(gpio_dev->gpiod)) { + rc = PTR_ERR(gpio_dev->gpiod); + /* Just try again if this happens */ if (rc != -EPROBE_DEFER) - dev_err(dev, "Failed to get gpio flags (%d)\n", rc); + dev_err(dev, "error getting gpio (%d)\n", rc); return rc; } - - gpio_dev->gpio_nr = rc; - gpio_dev->active_low = (flags & OF_GPIO_ACTIVE_LOW); + gpio_dev->irq = gpiod_to_irq(gpio_dev->gpiod); + if (gpio_dev->irq < 0) + return gpio_dev->irq; rcdev = devm_rc_allocate_device(dev, RC_DRIVER_IR_RAW); if (!rcdev) @@ -101,11 +92,6 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) gpio_dev->rcdev = rcdev; - rc = devm_gpio_request_one(dev, gpio_dev->gpio_nr, GPIOF_DIR_IN, - "gpio-ir-recv"); - if (rc < 0) - return rc; - rc = devm_rc_register_device(dev, rcdev); if (rc < 0) { dev_err(dev, "failed to register rc device (%d)\n", rc); @@ -114,8 +100,7 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) platform_set_drvdata(pdev, gpio_dev); - return devm_request_irq(dev, gpio_to_irq(gpio_dev->gpio_nr), - gpio_ir_recv_irq, + return devm_request_irq(dev, gpio_dev->irq, gpio_ir_recv_irq, IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING, "gpio-ir-recv-irq", gpio_dev); } @@ -123,26 +108,24 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) #ifdef CONFIG_PM static int gpio_ir_recv_suspend(struct device *dev) { - struct platform_device *pdev = to_platform_device(dev); - struct gpio_rc_dev *gpio_dev = platform_get_drvdata(pdev); + struct gpio_rc_dev *gpio_dev = dev_get_drvdata(dev); if (device_may_wakeup(dev)) - 
enable_irq_wake(gpio_to_irq(gpio_dev->gpio_nr)); + enable_irq_wake(gpio_dev->irq); else - disable_irq(gpio_to_irq(gpio_dev->gpio_nr)); + disable_irq(gpio_dev->irq); return 0; } static int gpio_ir_recv_resume(struct device *dev) { - struct platform_device *pdev = to_platform_device(dev); - struct gpio_rc_dev *gpio_dev = platform_get_drvdata(pdev); + struct gpio_rc_dev *gpio_dev = dev_get_drvdata(dev); if (device_may_wakeup(dev)) - disable_irq_wake(gpio_to_irq(gpio_dev->gpio_nr)); + disable_irq_wake(gpio_dev->irq); else - enable_irq(gpio_to_irq(gpio_dev->gpio_nr)); + enable_irq(gpio_dev->irq); return 0; } From c598baf30a9362e9ce9baaafb403a4f515370ecc Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 13:33:36 -0300 Subject: [PATCH 0510/1640] UPSTREAM: [media] media: rc: Use bsearch library function Replace self coded binary search, by existing library version. Signed-off-by: Thomas Meyer Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 8c828fee4f5a..e0901a2a1ec9 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -15,6 +15,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -460,6 +461,18 @@ static int ir_setkeytable(struct rc_dev *dev, return rc; } +static int rc_map_cmp(const void *key, const void *elt) +{ + const unsigned int *scancode = key; + const struct rc_map_table *e = elt; + + if (*scancode < e->scancode) + return -1; + else if (*scancode > e->scancode) + return 1; + return 0; +} + /** * ir_lookup_by_scancode() - locate mapping by scancode * @rc_map: the struct rc_map to search @@ -472,21 +485,14 @@ static int ir_setkeytable(struct rc_dev *dev, static unsigned int ir_lookup_by_scancode(const struct rc_map *rc_map, unsigned int scancode) { - int start = 0; - int end = rc_map->len - 1; - int mid; + struct rc_map_table *res; - while (start <= end) { - mid = (start + end) / 2; - if (rc_map->scan[mid].scancode < scancode) - start = mid + 1; - else if (rc_map->scan[mid].scancode > scancode) - end = mid - 1; - else - return mid; - } - - return -1U; + res = bsearch(&scancode, rc_map->scan, rc_map->len, + sizeof(struct rc_map_table), rc_map_cmp); + if (!res) + return -1U; + else + return res - rc_map->scan; } /** From a57031ec0e1590345fc491663993384474ec9c7e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 8 Sep 2017 13:39:29 -0300 Subject: [PATCH 0511/1640] UPSTREAM: [media] media: default for RC_CORE should be n The Linus policy on Kconfig is that the default should be no for all new devices. I.e the user rebuild a new kernel from an old config should not by default get a larger kernel. Fixes: b4c184e506a4 ("[media] media: reorganize the main Kconfig items") Signed-off-by: Stephen Hemminger Acked-by: Geert Uytterhoeven Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 6af8fcae1600..72da40aa4905 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -2,7 +2,6 @@ menuconfig RC_CORE tristate "Remote Controller support" depends on INPUT - default y ---help--- Enable support for Remote Controllers on Linux. 
This is needed in order to support several video capture adapters, From fe658cf67b93a81ebc4fb8336a070a1a85cc4754 Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Mon, 18 Sep 2017 11:31:41 -0300 Subject: [PATCH 0512/1640] UPSTREAM: [media] media: rc: Delete duplicate debug message ir_setkeytable() and ir_create_table() print the same debug message. Delete the one in ir_setkeytable(). Signed-off-by: Marc Gonzalez Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index e0901a2a1ec9..cb78e5702bef 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -440,9 +440,6 @@ static int ir_setkeytable(struct rc_dev *dev, if (rc) return rc; - IR_dprintk(1, "Allocated space for %u keycode entries (%u bytes)\n", - rc_map->size, rc_map->alloc); - for (i = 0; i < from->size; i++) { index = ir_establish_scancode(dev, rc_map, from->scan[i].scancode, false); From 39e69ceb351730478c508f5e87418183fc6e2d96 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 29 Aug 2017 07:45:59 -0300 Subject: [PATCH 0513/1640] UPSTREAM: [media] imon: Improve a size determination in two functions Pass a pointer dereference, rather than the structure type, as the parameter of the "sizeof" operator, to make the corresponding size determination a bit safer, in line with the Linux coding-style convention. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/imon.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index beb9a82f6139..5f8c54d613de 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -602,8 +602,7 @@ static int send_packet(struct imon_context *ictx) ictx->tx_urb->actual_length = 0; } else { /* fill request into kmalloc'ed space: */ - control_req = kmalloc(sizeof(struct usb_ctrlrequest), - GFP_KERNEL); + control_req = kmalloc(sizeof(*control_req), GFP_KERNEL); if (control_req == NULL) return -ENOMEM; @@ -2307,7 +2306,7 @@ static struct imon_context *imon_init_intf0(struct usb_interface *intf, struct usb_host_interface *iface_desc; int ret = -ENOMEM; - ictx = kzalloc(sizeof(struct imon_context), GFP_KERNEL); + ictx = kzalloc(sizeof(*ictx), GFP_KERNEL); if (!ictx) goto exit; From be5e70b1a89373b76a09f40717e30db0f2c7799a Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Thu, 5 Oct 2017 09:11:06 -0300 Subject: [PATCH 0514/1640] UPSTREAM: [media] media: rc: fix gpio-ir-receiver build failure The 0-day robot reports: drivers/media/rc/gpio-ir-recv.c: In function 'gpio_ir_recv_irq': >> drivers/media/rc/gpio-ir-recv.c:38:8: error: implicit declaration of function 'gpiod_get_value' [-Werror=implicit-function-declaration] Fixes: eed008e605d1 ("[media] media: rc: gpio-ir-recv: use gpiolib API") For some reason, only a partial patch was applied. Also include gpio/consumer.h, otherwise the compile test fails.
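For reference, the consumer-API pattern the two gpio-ir-recv patches converge on looks roughly like this (a minimal sketch; the function name and error handling here are illustrative, not the driver's exact code):

#include <linux/gpio/consumer.h>

static int example_get_gpio(struct device *dev)
{
	struct gpio_desc *gpiod;
	int irq, val;

	/* Request the first GPIO bound to this device, as an input. */
	gpiod = devm_gpiod_get(dev, NULL, GPIOD_IN);
	if (IS_ERR(gpiod))
		return PTR_ERR(gpiod);

	/* Map the descriptor to an interrupt line. */
	irq = gpiod_to_irq(gpiod);
	if (irq < 0)
		return irq;

	/*
	 * gpiod_get_value() already honours the active-low flag from
	 * the device tree, so no manual inversion is needed.
	 */
	val = gpiod_get_value(gpiod);
	return val < 0 ? val : 0;
}
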
Reported-by: kbuild test robot Signed-off-by: Ladislav Michl Acked-by: Sean Young --- drivers/media/rc/Kconfig | 1 + drivers/media/rc/gpio-ir-recv.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 72da40aa4905..208484dc8085 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -392,6 +392,7 @@ config RC_LOOPBACK config IR_GPIO_CIR tristate "GPIO IR remote control" depends on RC_CORE + depends on (OF && GPIOLIB) || COMPILE_TEST ---help--- Say Y if you want to use GPIO based IR Receiver. diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index 24641a9079da..3d99b51384ac 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include From 74eaf28eee841d1b1287aa75810ba1f39645c708 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Sun, 25 Jun 2017 08:31:30 -0400 Subject: [PATCH 0515/1640] UPSTREAM: media: lirc_dev: remove min_timeout and max_timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users of this functionality (ir-lirc-codec.c has its own implementation and lirc_zilog.c doesn't use it) so remove it. This only affects users of the lirc kapi, not rc-core drivers. Signed-off-by: David Härdeman Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 18 ------------------ include/media/lirc_dev.h | 6 ------ 2 files changed, 24 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index e9dae8621670..e16d1138ca48 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -346,24 +346,6 @@ long lirc_dev_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case LIRC_GET_LENGTH: result = put_user(d->code_length, (__u32 __user *)arg); break; - case LIRC_GET_MIN_TIMEOUT: - if (!(d->features & LIRC_CAN_SET_REC_TIMEOUT) || - d->min_timeout == 0) { - result = -ENOTTY; - break; - } - - result = put_user(d->min_timeout, (__u32 __user *)arg); - break; - case LIRC_GET_MAX_TIMEOUT: - if (!(d->features & LIRC_CAN_SET_REC_TIMEOUT) || - d->max_timeout == 0) { - result = -ENOTTY; - break; - } - - result = put_user(d->max_timeout, (__u32 __user *)arg); - break; default: result = -ENOTTY; } diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index 981dcabd5fd5..857da67bd931 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -125,10 +125,6 @@ static inline unsigned int lirc_buffer_write(struct lirc_buffer *buf, * @chunk_size: Size of each FIFO buffer. * Only used if @rbuf is NULL. * @data: private per-driver data - * @min_timeout: Minimum timeout for record. Valid only if - * LIRC_CAN_SET_REC_TIMEOUT is defined. - * @max_timeout: Maximum timeout for record. Valid only if - * LIRC_CAN_SET_REC_TIMEOUT is defined. 
* @buf: if %NULL, lirc_dev will allocate and manage the buffer, * otherwise allocated by the caller which will * have to write to the buffer by other means, like irq's @@ -155,8 +151,6 @@ struct lirc_dev { bool buf_internal; void *data; - int min_timeout; - int max_timeout; struct rc_dev *rdev; const struct file_operations *fops; struct module *owner; From 9eaa7973ae88291d9dca44c1045a388918685f37 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 1 Oct 2017 16:38:29 -0400 Subject: [PATCH 0516/1640] UPSTREAM: media: rc: nec decoder should not send both repeat and keycode When receiving an nec repeat, rc_repeat() is called and then rc_keydown() with the last decoded scancode. That last call is redundant. Fixes: 265a2988d202 ("media: rc-core: consistent use of rc_repeat()") Cc: # v4.14 Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-nec-decoder.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/media/rc/ir-nec-decoder.c b/drivers/media/rc/ir-nec-decoder.c index 817c18f2ddd1..a95d09acc22a 100644 --- a/drivers/media/rc/ir-nec-decoder.c +++ b/drivers/media/rc/ir-nec-decoder.c @@ -87,8 +87,6 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) data->state = STATE_BIT_PULSE; return 0; } else if (eq_margin(ev.duration, NEC_REPEAT_SPACE, NEC_UNIT / 2)) { - rc_repeat(dev); - IR_dprintk(1, "Repeat last key\n"); data->state = STATE_TRAILER_PULSE; return 0; } @@ -151,19 +149,26 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) if (!geq_margin(ev.duration, NEC_TRAILER_SPACE, NEC_UNIT / 2)) break; - address = bitrev8((data->bits >> 24) & 0xff); - not_address = bitrev8((data->bits >> 16) & 0xff); - command = bitrev8((data->bits >> 8) & 0xff); - not_command = bitrev8((data->bits >> 0) & 0xff); + if (data->count == NEC_NBITS) { + address = bitrev8((data->bits >> 24) & 0xff); + not_address = bitrev8((data->bits >> 16) & 0xff); + command = bitrev8((data->bits >> 8) & 0xff); + not_command = bitrev8((data->bits >> 0) & 0xff); - scancode = ir_nec_bytes_to_scancode(address, not_address, - command, not_command, - &rc_proto); + scancode = ir_nec_bytes_to_scancode(address, + not_address, + command, + not_command, + &rc_proto); - if (data->is_nec_x) - data->necx_repeat = true; + if (data->is_nec_x) + data->necx_repeat = true; + + rc_keydown(dev, rc_proto, scancode, 0); + } else { + rc_repeat(dev); + } - rc_keydown(dev, rc_proto, scancode, 0); data->state = STATE_INACTIVE; return 0; } From cfab22e68225ddaa2b371dbd27bfc468cc080a7a Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 5 Oct 2017 17:30:57 -0400 Subject: [PATCH 0517/1640] UPSTREAM: media: rc: gpio-ir-tx does not work without devicetree or gpiolib If the kernel is built without device tree, this driver cannot be used and without gpiolib it cannot control any gpio pin. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 208484dc8085..097c9ea6ee9b 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -403,6 +403,7 @@ config IR_GPIO_TX tristate "GPIO IR Bit Banging Transmitter" depends on RC_CORE depends on LIRC + depends on (OF && GPIOLIB) || COMPILE_TEST ---help--- Say Y if you want to a GPIO based IR transmitter. This is a bit banging driver. 
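The patch above and the next several below all apply the same Kconfig idiom: state the frameworks the driver needs in order to be usable (device tree and, where relevant, gpiolib), while COMPILE_TEST keeps the driver build-testable on every configuration. Schematically, with a hypothetical symbol:

config IR_EXAMPLE
	tristate "Example IR driver"
	depends on RC_CORE
	# unusable without DT and gpiolib at runtime, yet still
	# coverable by allmodconfig-style build testing:
	depends on (OF && GPIOLIB) || COMPILE_TEST
	---help---
	  Say Y if you want to use this example receiver.
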
From 03497df062762d9c6ad3f8e65795bbe5399d368b Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 5 Oct 2017 17:30:58 -0400 Subject: [PATCH 0518/1640] UPSTREAM: media: rc: pwm-ir-tx needs OF Without device tree, there is no way to use this driver. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 097c9ea6ee9b..2739073f11a6 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -416,6 +416,7 @@ config IR_PWM_TX depends on RC_CORE depends on LIRC depends on PWM + depends on OF || COMPILE_TEST ---help--- Say Y if you want to use a PWM based IR transmitter. This is more power efficient than the bit banging gpio driver. From ec6035e9841af13cf5aca4d9c3a074c7999de02d Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 5 Oct 2017 17:30:59 -0400 Subject: [PATCH 0519/1640] UPSTREAM: media: rc: hix5hd2 drivers needs OF Without device tree, there is no way to use this driver. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 2739073f11a6..577414b60197 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -178,6 +178,7 @@ config IR_ENE config IR_HIX5HD2 tristate "Hisilicon hix5hd2 IR remote control" depends on RC_CORE + depends on OF || COMPILE_TEST help Say Y here if you want to use hisilicon hix5hd2 remote control. To compile this driver as a module, choose M here: the module will be From 9469a4919df9253190668698fb74c1b322a9beaa Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Fri, 6 Oct 2017 08:33:41 -0400 Subject: [PATCH 0520/1640] UPSTREAM: media: rc: Add tango keymap Add a keymap for the Sigma Designs Vantage (dev board) remote control. Signed-off-by: Marc Gonzalez Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/keymaps/Makefile | 1 + drivers/media/rc/keymaps/rc-tango.c | 92 +++++++++++++++++++++++++++++ include/media/rc-map.h | 1 + 3 files changed, 94 insertions(+) create mode 100644 drivers/media/rc/keymaps/rc-tango.c diff --git a/drivers/media/rc/keymaps/Makefile b/drivers/media/rc/keymaps/Makefile index 2d0b26bf2051..cad14162e8d4 100644 --- a/drivers/media/rc/keymaps/Makefile +++ b/drivers/media/rc/keymaps/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_RC_MAP) += rc-adstech-dvb-t-pci.o \ rc-reddo.o \ rc-snapstream-firefly.o \ rc-streamzap.o \ + rc-tango.o \ rc-tbs-nec.o \ rc-technisat-ts35.o \ rc-technisat-usb2.o \ diff --git a/drivers/media/rc/keymaps/rc-tango.c b/drivers/media/rc/keymaps/rc-tango.c new file mode 100644 index 000000000000..1c6e8875d46f --- /dev/null +++ b/drivers/media/rc/keymaps/rc-tango.c @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2017 Sigma Designs + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ */ + +#include +#include + +static struct rc_map_table tango_table[] = { + { 0x4cb4a, KEY_POWER }, + { 0x4cb48, KEY_FILE }, + { 0x4cb0f, KEY_SETUP }, + { 0x4cb4d, KEY_SUSPEND }, + { 0x4cb4e, KEY_VOLUMEUP }, + { 0x4cb44, KEY_EJECTCD }, + { 0x4cb13, KEY_TV }, + { 0x4cb51, KEY_MUTE }, + { 0x4cb52, KEY_VOLUMEDOWN }, + + { 0x4cb41, KEY_1 }, + { 0x4cb03, KEY_2 }, + { 0x4cb42, KEY_3 }, + { 0x4cb45, KEY_4 }, + { 0x4cb07, KEY_5 }, + { 0x4cb46, KEY_6 }, + { 0x4cb55, KEY_7 }, + { 0x4cb17, KEY_8 }, + { 0x4cb56, KEY_9 }, + { 0x4cb1b, KEY_0 }, + { 0x4cb59, KEY_DELETE }, + { 0x4cb5a, KEY_CAPSLOCK }, + + { 0x4cb47, KEY_BACK }, + { 0x4cb05, KEY_SWITCHVIDEOMODE }, + { 0x4cb06, KEY_UP }, + { 0x4cb43, KEY_LEFT }, + { 0x4cb01, KEY_RIGHT }, + { 0x4cb0a, KEY_DOWN }, + { 0x4cb02, KEY_ENTER }, + { 0x4cb4b, KEY_INFO }, + { 0x4cb09, KEY_HOME }, + + { 0x4cb53, KEY_MENU }, + { 0x4cb12, KEY_PREVIOUS }, + { 0x4cb50, KEY_PLAY }, + { 0x4cb11, KEY_NEXT }, + { 0x4cb4f, KEY_TITLE }, + { 0x4cb0e, KEY_REWIND }, + { 0x4cb4c, KEY_STOP }, + { 0x4cb0d, KEY_FORWARD }, + { 0x4cb57, KEY_MEDIA_REPEAT }, + { 0x4cb16, KEY_ANGLE }, + { 0x4cb54, KEY_PAUSE }, + { 0x4cb15, KEY_SLOW }, + { 0x4cb5b, KEY_TIME }, + { 0x4cb1a, KEY_AUDIO }, + { 0x4cb58, KEY_SUBTITLE }, + { 0x4cb19, KEY_ZOOM }, + + { 0x4cb5f, KEY_RED }, + { 0x4cb1e, KEY_GREEN }, + { 0x4cb5c, KEY_YELLOW }, + { 0x4cb1d, KEY_BLUE }, +}; + +static struct rc_map_list tango_map = { + .map = { + .scan = tango_table, + .size = ARRAY_SIZE(tango_table), + .rc_proto = RC_PROTO_NECX, + .name = RC_MAP_TANGO, + } +}; + +static int __init init_rc_map_tango(void) +{ + return rc_map_register(&tango_map); +} + +static void __exit exit_rc_map_tango(void) +{ + rc_map_unregister(&tango_map); +} + +module_init(init_rc_map_tango) +module_exit(exit_rc_map_tango) + +MODULE_AUTHOR("Sigma Designs"); +MODULE_LICENSE("GPL"); diff --git a/include/media/rc-map.h b/include/media/rc-map.h index 2a160e6e823c..b4ddcb62c993 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -300,6 +300,7 @@ struct rc_map *rc_map_get(const char *name); #define RC_MAP_REDDO "rc-reddo" #define RC_MAP_SNAPSTREAM_FIREFLY "rc-snapstream-firefly" #define RC_MAP_STREAMZAP "rc-streamzap" +#define RC_MAP_TANGO "rc-tango" #define RC_MAP_TBS_NEC "rc-tbs-nec" #define RC_MAP_TECHNISAT_TS35 "rc-technisat-ts35" #define RC_MAP_TECHNISAT_USB2 "rc-technisat-usb2" From 4e65484794e154b9347f227d6ae855ef8cdf5b83 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Fri, 6 Oct 2017 08:37:50 -0400 Subject: [PATCH 0521/1640] UPSTREAM: media: rc: Add driver for tango HW IR decoder The tango HW IR decoder supports NEC, RC-5, RC-6 protocols. Change-Id: Ib9785b96bb87803aa165d0e8aed0f2bfbb82b284 Signed-off-by: Mans Rullgard Signed-off-by: Marc Gonzalez Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 10 ++ drivers/media/rc/Makefile | 3 +- drivers/media/rc/tango-ir.c | 281 ++++++++++++++++++++++++++++++++++++ 3 files changed, 293 insertions(+), 1 deletion(-) create mode 100644 drivers/media/rc/tango-ir.c diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 577414b60197..5219c892a2a4 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -472,6 +472,16 @@ config IR_SIR To compile this driver as a module, choose M here: the module will be called sir-ir. 
+config IR_TANGO + tristate "Sigma Designs SMP86xx IR decoder" + depends on RC_CORE + depends on ARCH_TANGO || COMPILE_TEST + ---help--- + Adds support for the HW IR decoder embedded on Sigma Designs + Tango-based systems (SMP86xx, SMP87xx). + The HW decoder supports NEC, RC-5, RC-6 IR protocols. + When compiled as a module, look for tango-ir. + config IR_ZX tristate "ZTE ZX IR remote control" depends on RC_CORE diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile index 82ab74ed934c..d07367b1674b 100644 --- a/drivers/media/rc/Makefile +++ b/drivers/media/rc/Makefile @@ -45,4 +45,5 @@ obj-$(CONFIG_IR_SERIAL) += serial_ir.o obj-$(CONFIG_IR_SIR) += sir_ir.o obj-$(CONFIG_IR_MTK) += mtk-cir.o obj-$(CONFIG_IR_ZX) += zx-irdec.o -obj-$(CONFIG_IR_MSM_GENI) += msm-geni-ir.o \ No newline at end of file +obj-$(CONFIG_IR_MSM_GENI) += msm-geni-ir.o +obj-$(CONFIG_IR_TANGO) += tango-ir.o diff --git a/drivers/media/rc/tango-ir.c b/drivers/media/rc/tango-ir.c new file mode 100644 index 000000000000..9d4c17230c3a --- /dev/null +++ b/drivers/media/rc/tango-ir.c @@ -0,0 +1,281 @@ +/* + * Copyright (C) 2015 Mans Rullgard + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_NAME "tango-ir" + +#define IR_NEC_CTRL 0x00 +#define IR_NEC_DATA 0x04 +#define IR_CTRL 0x08 +#define IR_RC5_CLK_DIV 0x0c +#define IR_RC5_DATA 0x10 +#define IR_INT 0x14 + +#define NEC_TIME_BASE 560 +#define RC5_TIME_BASE 1778 + +#define RC6_CTRL 0x00 +#define RC6_CLKDIV 0x04 +#define RC6_DATA0 0x08 +#define RC6_DATA1 0x0c +#define RC6_DATA2 0x10 +#define RC6_DATA3 0x14 +#define RC6_DATA4 0x18 + +#define RC6_CARRIER 36000 +#define RC6_TIME_BASE 16 + +#define NEC_CAP(n) ((n) << 24) +#define GPIO_SEL(n) ((n) << 16) +#define DISABLE_NEC (BIT(4) | BIT(8)) +#define ENABLE_RC5 (BIT(0) | BIT(9)) +#define ENABLE_RC6 (BIT(0) | BIT(7)) +#define ACK_IR_INT (BIT(0) | BIT(1)) +#define ACK_RC6_INT (BIT(31)) + +#define NEC_ANY (RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32) + +struct tango_ir { + void __iomem *rc5_base; + void __iomem *rc6_base; + struct rc_dev *rc; + struct clk *clk; +}; + +static void tango_ir_handle_nec(struct tango_ir *ir) +{ + u32 v, code; + enum rc_proto proto; + + v = readl_relaxed(ir->rc5_base + IR_NEC_DATA); + if (!v) { + rc_repeat(ir->rc); + return; + } + + code = ir_nec_bytes_to_scancode(v, v >> 8, v >> 16, v >> 24, &proto); + rc_keydown(ir->rc, proto, code, 0); +} + +static void tango_ir_handle_rc5(struct tango_ir *ir) +{ + u32 data, field, toggle, addr, cmd, code; + + data = readl_relaxed(ir->rc5_base + IR_RC5_DATA); + if (data & BIT(31)) + return; + + field = data >> 12 & 1; + toggle = data >> 11 & 1; + addr = data >> 6 & 0x1f; + cmd = (data & 0x3f) | (field ^ 1) << 6; + + code = RC_SCANCODE_RC5(addr, cmd); + rc_keydown(ir->rc, RC_PROTO_RC5, code, toggle); +} + +static void tango_ir_handle_rc6(struct tango_ir *ir) +{ + u32 data0, data1, toggle, mode, addr, cmd, code; + + data0 = readl_relaxed(ir->rc6_base + RC6_DATA0); + data1 = readl_relaxed(ir->rc6_base + RC6_DATA1); + + mode = data0 >> 1 & 7; + if (mode != 0) + return; + + toggle = data0 & 1; + addr = data0 >> 16; + cmd = data1; + + code = RC_SCANCODE_RC6_0(addr, cmd); + rc_keydown(ir->rc, RC_PROTO_RC6_0, code, toggle); +} + +static irqreturn_t 
tango_ir_irq(int irq, void *dev_id) +{ + struct tango_ir *ir = dev_id; + unsigned int rc5_stat; + unsigned int rc6_stat; + + rc5_stat = readl_relaxed(ir->rc5_base + IR_INT); + writel_relaxed(rc5_stat, ir->rc5_base + IR_INT); + + rc6_stat = readl_relaxed(ir->rc6_base + RC6_CTRL); + writel_relaxed(rc6_stat, ir->rc6_base + RC6_CTRL); + + if (!(rc5_stat & 3) && !(rc6_stat & BIT(31))) + return IRQ_NONE; + + if (rc5_stat & BIT(0)) + tango_ir_handle_rc5(ir); + + if (rc5_stat & BIT(1)) + tango_ir_handle_nec(ir); + + if (rc6_stat & BIT(31)) + tango_ir_handle_rc6(ir); + + return IRQ_HANDLED; +} + +static int tango_change_protocol(struct rc_dev *dev, u64 *rc_type) +{ + struct tango_ir *ir = dev->priv; + u32 rc5_ctrl = DISABLE_NEC; + u32 rc6_ctrl = 0; + + if (*rc_type & NEC_ANY) + rc5_ctrl = 0; + + if (*rc_type & RC_PROTO_BIT_RC5) + rc5_ctrl |= ENABLE_RC5; + + if (*rc_type & RC_PROTO_BIT_RC6_0) + rc6_ctrl = ENABLE_RC6; + + writel_relaxed(rc5_ctrl, ir->rc5_base + IR_CTRL); + writel_relaxed(rc6_ctrl, ir->rc6_base + RC6_CTRL); + + return 0; +} + +static int tango_ir_probe(struct platform_device *pdev) +{ + const char *map_name = RC_MAP_TANGO; + struct device *dev = &pdev->dev; + struct rc_dev *rc; + struct tango_ir *ir; + struct resource *rc5_res; + struct resource *rc6_res; + u64 clkrate, clkdiv; + int irq, err; + u32 val; + + rc5_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!rc5_res) + return -EINVAL; + + rc6_res = platform_get_resource(pdev, IORESOURCE_MEM, 1); + if (!rc6_res) + return -EINVAL; + + irq = platform_get_irq(pdev, 0); + if (irq <= 0) + return -EINVAL; + + ir = devm_kzalloc(dev, sizeof(*ir), GFP_KERNEL); + if (!ir) + return -ENOMEM; + + ir->rc5_base = devm_ioremap_resource(dev, rc5_res); + if (IS_ERR(ir->rc5_base)) + return PTR_ERR(ir->rc5_base); + + ir->rc6_base = devm_ioremap_resource(dev, rc6_res); + if (IS_ERR(ir->rc6_base)) + return PTR_ERR(ir->rc6_base); + + ir->clk = devm_clk_get(dev, NULL); + if (IS_ERR(ir->clk)) + return PTR_ERR(ir->clk); + + rc = devm_rc_allocate_device(dev, RC_DRIVER_SCANCODE); + if (!rc) + return -ENOMEM; + + of_property_read_string(dev->of_node, "linux,rc-map-name", &map_name); + + rc->device_name = DRIVER_NAME; + rc->driver_name = DRIVER_NAME; + rc->input_phys = DRIVER_NAME "/input0"; + rc->map_name = map_name; + rc->allowed_protocols = NEC_ANY | RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC6_0; + rc->change_protocol = tango_change_protocol; + rc->priv = ir; + ir->rc = rc; + + err = clk_prepare_enable(ir->clk); + if (err) + return err; + + clkrate = clk_get_rate(ir->clk); + + clkdiv = clkrate * NEC_TIME_BASE; + do_div(clkdiv, 1000000); + + val = NEC_CAP(31) | GPIO_SEL(12) | clkdiv; + writel_relaxed(val, ir->rc5_base + IR_NEC_CTRL); + + clkdiv = clkrate * RC5_TIME_BASE; + do_div(clkdiv, 1000000); + + writel_relaxed(DISABLE_NEC, ir->rc5_base + IR_CTRL); + writel_relaxed(clkdiv, ir->rc5_base + IR_RC5_CLK_DIV); + writel_relaxed(ACK_IR_INT, ir->rc5_base + IR_INT); + + clkdiv = clkrate * RC6_TIME_BASE; + do_div(clkdiv, RC6_CARRIER); + + writel_relaxed(ACK_RC6_INT, ir->rc6_base + RC6_CTRL); + writel_relaxed((clkdiv >> 2) << 18 | clkdiv, ir->rc6_base + RC6_CLKDIV); + + err = devm_request_irq(dev, irq, tango_ir_irq, IRQF_SHARED, + dev_name(dev), ir); + if (err) + goto err_clk; + + err = devm_rc_register_device(dev, rc); + if (err) + goto err_clk; + + platform_set_drvdata(pdev, ir); + return 0; + +err_clk: + clk_disable_unprepare(ir->clk); + return err; +} + +static int tango_ir_remove(struct platform_device *pdev) +{ + struct tango_ir *ir = 
platform_get_drvdata(pdev); + + clk_disable_unprepare(ir->clk); + return 0; +} + +static const struct of_device_id tango_ir_dt_ids[] = { + { .compatible = "sigma,smp8642-ir" }, + { } +}; +MODULE_DEVICE_TABLE(of, tango_ir_dt_ids); + +static struct platform_driver tango_ir_driver = { + .probe = tango_ir_probe, + .remove = tango_ir_remove, + .driver = { + .name = DRIVER_NAME, + .of_match_table = tango_ir_dt_ids, + }, +}; +module_platform_driver(tango_ir_driver); + +MODULE_DESCRIPTION("SMP86xx IR decoder driver"); +MODULE_AUTHOR("Mans Rullgard "); +MODULE_LICENSE("GPL"); From fd813051d57853231e72fdd9a6e4bc58e955f729 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 8 Oct 2017 14:18:52 -0400 Subject: [PATCH 0522/1640] UPSTREAM: media: rc: check for integer overflow The ioctl LIRC_SET_REC_TIMEOUT would set a timeout of 704ns if called with a timeout of 4294968us. Cc: Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-lirc-codec.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index bd046c41a53a..8f2f37412fc5 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -298,11 +298,14 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, if (!dev->max_timeout) return -ENOTTY; + /* Check for multiply overflow */ + if (val > U32_MAX / 1000) + return -EINVAL; + tmp = val * 1000; - if (tmp < dev->min_timeout || - tmp > dev->max_timeout) - return -EINVAL; + if (tmp < dev->min_timeout || tmp > dev->max_timeout) + return -EINVAL; if (dev->s_timeout) ret = dev->s_timeout(dev, tmp); From 3e177e6ac60431becd33463fe10a60a6c10cb81a Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 9 Oct 2017 04:30:06 -0400 Subject: [PATCH 0523/1640] UPSTREAM: media: rc: ir-spi needs OF Without device tree, there is no way to use this driver. Signed-off-by: Sean Young Acked-by: Andi Shyti Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 5219c892a2a4..c0b37e09f50a 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -286,6 +286,7 @@ config IR_REDRAT3 config IR_SPI tristate "SPI connected IR LED" depends on SPI && LIRC + depends on OF || COMPILE_TEST ---help--- Say Y if you want to use an IR LED connected through SPI bus. From 7eada65a8f11e6d7e1938ac77b460b370f98419e Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Mon, 9 Oct 2017 20:14:48 +0200 Subject: [PATCH 0524/1640] UPSTREAM: media: imon: Fix null-ptr-deref in imon_probe It seems that the return value of usb_ifnum_to_if() can be NULL and needs to be checked. 
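The defensive pattern, in isolation (a sketch with an illustrative function name; the actual fix in the diff below unwinds through the driver's existing fail label instead of returning directly):

#include <linux/usb.h>

static int example_check_intf0(struct usb_device *usbdev)
{
	struct usb_interface *first_if;

	first_if = usb_ifnum_to_if(usbdev, 0);
	if (!first_if)	/* interface 0 may simply not exist */
		return -ENODEV;

	/* first_if is safe to dereference from here on */
	return 0;
}
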
Signed-off-by: Arvind Yadav Tested-by: Andrey Konovalov Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/imon.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index 5f8c54d613de..e8c0e07935c0 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -2512,6 +2512,11 @@ static int imon_probe(struct usb_interface *interface, mutex_lock(&driver_lock); first_if = usb_ifnum_to_if(usbdev, 0); + if (!first_if) { + ret = -ENODEV; + goto fail; + } + first_if_ctx = usb_get_intfdata(first_if); if (ifnum == 0) { From e186bd7d8105c3211f7a9ba5cb70d9db5234a4ca Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Oct 2017 01:10:36 +0200 Subject: [PATCH 0525/1640] UPSTREAM: media: serial_ir: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Mauro Carvalho Chehab Signed-off-by: Kees Cook Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/serial_ir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/serial_ir.c b/drivers/media/rc/serial_ir.c index 8b66926bc16a..8bf5637b3a69 100644 --- a/drivers/media/rc/serial_ir.c +++ b/drivers/media/rc/serial_ir.c @@ -470,7 +470,7 @@ static int hardware_init_port(void) return 0; } -static void serial_ir_timeout(unsigned long arg) +static void serial_ir_timeout(struct timer_list *unused) { DEFINE_IR_RAW_EVENT(ev); @@ -540,8 +540,7 @@ static int serial_ir_probe(struct platform_device *dev) serial_ir.rcdev = rcdev; - setup_timer(&serial_ir.timeout_timer, serial_ir_timeout, - (unsigned long)&serial_ir); + timer_setup(&serial_ir.timeout_timer, serial_ir_timeout, 0); result = devm_request_irq(&dev->dev, irq, serial_ir_irq_handler, share_irq ? IRQF_SHARED : 0, From 1101af1cbf5cee62519630fd838f0ddefa902a90 Mon Sep 17 00:00:00 2001 From: Younian Wang Date: Thu, 19 Oct 2017 21:43:29 +0200 Subject: [PATCH 0526/1640] UPSTREAM: media: rc/keymaps: add support for RC of hisilicon TV demo boards This is a NEC protocol type remote controller distributed with hisilicon TV demo boards. Signed-off-by: Younian Wang Signed-off-by: Jiancheng Xue Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/keymaps/Makefile | 1 + drivers/media/rc/keymaps/rc-hisi-tv-demo.c | 81 ++++++++++++++++++++++ include/media/rc-map.h | 1 + 3 files changed, 83 insertions(+) create mode 100644 drivers/media/rc/keymaps/rc-hisi-tv-demo.c diff --git a/drivers/media/rc/keymaps/Makefile b/drivers/media/rc/keymaps/Makefile index cad14162e8d4..34c0a45bd880 100644 --- a/drivers/media/rc/keymaps/Makefile +++ b/drivers/media/rc/keymaps/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_RC_MAP) += rc-adstech-dvb-t-pci.o \ rc-geekbox.o \ rc-genius-tvgo-a11mce.o \ rc-gotview7135.o \ + rc-hisi-tv-demo.o \ rc-imon-mce.o \ rc-imon-pad.o \ rc-iodata-bctv7e.o \ diff --git a/drivers/media/rc/keymaps/rc-hisi-tv-demo.c b/drivers/media/rc/keymaps/rc-hisi-tv-demo.c new file mode 100644 index 000000000000..4816e3a4a18d --- /dev/null +++ b/drivers/media/rc/keymaps/rc-hisi-tv-demo.c @@ -0,0 +1,81 @@ +/* + * Keytable for remote controller of HiSilicon tv demo board. + * + * Copyright (c) 2017 HiSilicon Technologies Co., Ltd. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include + +static struct rc_map_table hisi_tv_demo_keymap[] = { + { 0x00000092, KEY_1}, + { 0x00000093, KEY_2}, + { 0x000000cc, KEY_3}, + { 0x0000009f, KEY_4}, + { 0x0000008e, KEY_5}, + { 0x0000008f, KEY_6}, + { 0x000000c8, KEY_7}, + { 0x00000094, KEY_8}, + { 0x0000008a, KEY_9}, + { 0x0000008b, KEY_0}, + { 0x000000ce, KEY_ENTER}, + { 0x000000ca, KEY_UP}, + { 0x00000099, KEY_LEFT}, + { 0x00000084, KEY_PAGEUP}, + { 0x000000c1, KEY_RIGHT}, + { 0x000000d2, KEY_DOWN}, + { 0x00000089, KEY_PAGEDOWN}, + { 0x000000d1, KEY_MUTE}, + { 0x00000098, KEY_VOLUMEDOWN}, + { 0x00000090, KEY_VOLUMEUP}, + { 0x0000009c, KEY_POWER}, + { 0x000000d6, KEY_STOP}, + { 0x00000097, KEY_MENU}, + { 0x000000cb, KEY_BACK}, + { 0x000000da, KEY_PLAYPAUSE}, + { 0x00000080, KEY_INFO}, + { 0x000000c3, KEY_REWIND}, + { 0x00000087, KEY_HOMEPAGE}, + { 0x000000d0, KEY_FASTFORWARD}, + { 0x000000c4, KEY_SOUND}, + { 0x00000082, BTN_1}, + { 0x000000c7, BTN_2}, + { 0x00000086, KEY_PROGRAM}, + { 0x000000d9, KEY_SUBTITLE}, + { 0x00000085, KEY_ZOOM}, + { 0x0000009b, KEY_RED}, + { 0x0000009a, KEY_GREEN}, + { 0x000000c0, KEY_YELLOW}, + { 0x000000c2, KEY_BLUE}, + { 0x0000009d, KEY_CHANNELDOWN}, + { 0x000000cf, KEY_CHANNELUP}, +}; + +static struct rc_map_list hisi_tv_demo_map = { + .map = { + .scan = hisi_tv_demo_keymap, + .size = ARRAY_SIZE(hisi_tv_demo_keymap), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_HISI_TV_DEMO, + } +}; + +static int __init init_rc_map_hisi_tv_demo(void) +{ + return rc_map_register(&hisi_tv_demo_map); +} + +static void __exit exit_rc_map_hisi_tv_demo(void) +{ + rc_map_unregister(&hisi_tv_demo_map); +} + +module_init(init_rc_map_hisi_tv_demo) +module_exit(exit_rc_map_hisi_tv_demo) + +MODULE_LICENSE("GPL v2"); diff --git a/include/media/rc-map.h b/include/media/rc-map.h index b4ddcb62c993..6d2172b6a0ed 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -258,6 +258,7 @@ struct rc_map *rc_map_get(const char *name); #define RC_MAP_GENIUS_TVGO_A11MCE "rc-genius-tvgo-a11mce" #define RC_MAP_GOTVIEW7135 "rc-gotview7135" #define RC_MAP_HAUPPAUGE_NEW "rc-hauppauge" +#define RC_MAP_HISI_TV_DEMO "rc-hisi-tv-demo" #define RC_MAP_IMON_MCE "rc-imon-mce" #define RC_MAP_IMON_PAD "rc-imon-pad" #define RC_MAP_IODATA_BCTV7E "rc-iodata-bctv7e" From 6d360f6de4681ef29472b10cf1b7971558f51564 Mon Sep 17 00:00:00 2001 From: Younian Wang Date: Thu, 19 Oct 2017 21:43:30 +0200 Subject: [PATCH 0527/1640] UPSTREAM: media: rc/keymaps: add support for RC of hisilicon poplar board This is a NEC protocol type remote controller distributed with 96boards poplar@tocoding board. 
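A note on the scancode values used in this and the previous keymap: for rc-core's NEC support, the scancode packs the protocol's address and command bytes. A simplified sketch of the composition rule (the real logic, which also handles the 32-bit NEC variant, is ir_nec_bytes_to_scancode()):

static u32 nec_compose(u8 address, u8 not_address, u8 command)
{
	if ((address ^ not_address) != 0xff)
		/* extended NEC (NECX): 24-bit scancode */
		return address << 16 | not_address << 8 | command;
	/* plain NEC: 16-bit scancode, e.g. 0x0000b292 above */
	return address << 8 | command;
}
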
Signed-off-by: Younian Wang Signed-off-by: Jiancheng Xue Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/keymaps/Makefile | 1 + drivers/media/rc/keymaps/rc-hisi-poplar.c | 69 +++++++++++++++++++++++ include/media/rc-map.h | 1 + 3 files changed, 71 insertions(+) create mode 100644 drivers/media/rc/keymaps/rc-hisi-poplar.c diff --git a/drivers/media/rc/keymaps/Makefile b/drivers/media/rc/keymaps/Makefile index 34c0a45bd880..8eb58ef1bdce 100644 --- a/drivers/media/rc/keymaps/Makefile +++ b/drivers/media/rc/keymaps/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_RC_MAP) += rc-adstech-dvb-t-pci.o \ rc-geekbox.o \ rc-genius-tvgo-a11mce.o \ rc-gotview7135.o \ + rc-hisi-poplar.o \ rc-hisi-tv-demo.o \ rc-imon-mce.o \ rc-imon-pad.o \ diff --git a/drivers/media/rc/keymaps/rc-hisi-poplar.c b/drivers/media/rc/keymaps/rc-hisi-poplar.c new file mode 100644 index 000000000000..78728bc7f63a --- /dev/null +++ b/drivers/media/rc/keymaps/rc-hisi-poplar.c @@ -0,0 +1,69 @@ +/* + * Keytable for remote controller of HiSilicon poplar board. + * + * Copyright (c) 2017 HiSilicon Technologies Co., Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include + +static struct rc_map_table hisi_poplar_keymap[] = { + { 0x0000b292, KEY_1}, + { 0x0000b293, KEY_2}, + { 0x0000b2cc, KEY_3}, + { 0x0000b28e, KEY_4}, + { 0x0000b28f, KEY_5}, + { 0x0000b2c8, KEY_6}, + { 0x0000b28a, KEY_7}, + { 0x0000b28b, KEY_8}, + { 0x0000b2c4, KEY_9}, + { 0x0000b287, KEY_0}, + { 0x0000b282, KEY_HOMEPAGE}, + { 0x0000b2ca, KEY_UP}, + { 0x0000b299, KEY_LEFT}, + { 0x0000b2c1, KEY_RIGHT}, + { 0x0000b2d2, KEY_DOWN}, + { 0x0000b2c5, KEY_DELETE}, + { 0x0000b29c, KEY_MUTE}, + { 0x0000b281, KEY_VOLUMEDOWN}, + { 0x0000b280, KEY_VOLUMEUP}, + { 0x0000b2dc, KEY_POWER}, + { 0x0000b29a, KEY_MENU}, + { 0x0000b28d, KEY_SETUP}, + { 0x0000b2c5, KEY_BACK}, + { 0x0000b295, KEY_PLAYPAUSE}, + { 0x0000b2ce, KEY_ENTER}, + { 0x0000b285, KEY_CHANNELUP}, + { 0x0000b286, KEY_CHANNELDOWN}, + { 0x0000b2da, KEY_NUMERIC_STAR}, + { 0x0000b2d0, KEY_NUMERIC_POUND}, +}; + +static struct rc_map_list hisi_poplar_map = { + .map = { + .scan = hisi_poplar_keymap, + .size = ARRAY_SIZE(hisi_poplar_keymap), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_HISI_POPLAR, + } +}; + +static int __init init_rc_map_hisi_poplar(void) +{ + return rc_map_register(&hisi_poplar_map); +} + +static void __exit exit_rc_map_hisi_poplar(void) +{ + rc_map_unregister(&hisi_poplar_map); +} + +module_init(init_rc_map_hisi_poplar) +module_exit(exit_rc_map_hisi_poplar) + +MODULE_LICENSE("GPL v2"); diff --git a/include/media/rc-map.h b/include/media/rc-map.h index 6d2172b6a0ed..cc59c72ac282 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -258,6 +258,7 @@ struct rc_map *rc_map_get(const char *name); #define RC_MAP_GENIUS_TVGO_A11MCE "rc-genius-tvgo-a11mce" #define RC_MAP_GOTVIEW7135 "rc-gotview7135" #define RC_MAP_HAUPPAUGE_NEW "rc-hauppauge" +#define RC_MAP_HISI_POPLAR "rc-hisi-poplar" #define RC_MAP_HISI_TV_DEMO "rc-hisi-tv-demo" #define RC_MAP_IMON_MCE "rc-imon-mce" #define RC_MAP_IMON_PAD "rc-imon-pad" From e414b4aa890bf8559c078e606e810c5edd139b05 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 11:23:14 -0400 Subject: [PATCH 0528/1640] UPSTREAM: media: rc: Convert timers to use timer_setup() MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Maxim Levitsky Cc: James Hogan Cc: Hans Verkuil Cc: "Antti Seppälä" Cc: Heiner Kallweit Cc: "David Härdeman" Cc: Andi Shyti Signed-off-by: Kees Cook Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ene_ir.c | 7 +++---- drivers/media/rc/igorplugusb.c | 6 +++--- drivers/media/rc/img-ir/img-ir-hw.c | 13 ++++++------- drivers/media/rc/img-ir/img-ir-raw.c | 6 +++--- drivers/media/rc/imon.c | 7 +++---- drivers/media/rc/ir-mce_kbd-decoder.c | 7 +++---- drivers/media/rc/rc-ir-raw.c | 8 ++++---- drivers/media/rc/rc-main.c | 7 +++---- drivers/media/rc/sir_ir.c | 4 ++-- 9 files changed, 30 insertions(+), 35 deletions(-) diff --git a/drivers/media/rc/ene_ir.c b/drivers/media/rc/ene_ir.c index af7ba23e16e1..71b8c9bbf6c4 100644 --- a/drivers/media/rc/ene_ir.c +++ b/drivers/media/rc/ene_ir.c @@ -670,9 +670,9 @@ exit: } /* timer to simulate tx done interrupt */ -static void ene_tx_irqsim(unsigned long data) +static void ene_tx_irqsim(struct timer_list *t) { - struct ene_device *dev = (struct ene_device *)data; + struct ene_device *dev = from_timer(dev, t, tx_sim_timer); unsigned long flags; spin_lock_irqsave(&dev->hw_lock, flags); @@ -1045,8 +1045,7 @@ static int ene_probe(struct pnp_dev *pnp_dev, const struct pnp_device_id *id) if (!dev->hw_learning_and_tx_capable && txsim) { dev->hw_learning_and_tx_capable = true; - setup_timer(&dev->tx_sim_timer, ene_tx_irqsim, - (long unsigned int)dev); + timer_setup(&dev->tx_sim_timer, ene_tx_irqsim, 0); pr_warn("Simulation of TX activated\n"); } diff --git a/drivers/media/rc/igorplugusb.c b/drivers/media/rc/igorplugusb.c index 4b715eb995f8..f563ddd7f739 100644 --- a/drivers/media/rc/igorplugusb.c +++ b/drivers/media/rc/igorplugusb.c @@ -137,9 +137,9 @@ static void igorplugusb_cmd(struct igorplugusb *ir, int cmd) dev_err(ir->dev, "submit urb failed: %d", ret); } -static void igorplugusb_timer(unsigned long data) +static void igorplugusb_timer(struct timer_list *t) { - struct igorplugusb *ir = (struct igorplugusb *)data; + struct igorplugusb *ir = from_timer(ir, t, timer); igorplugusb_cmd(ir, GET_INFRACODE); } @@ -174,7 +174,7 @@ static int igorplugusb_probe(struct usb_interface *intf, ir->dev = &intf->dev; - setup_timer(&ir->timer, igorplugusb_timer, (unsigned long)ir); + timer_setup(&ir->timer, igorplugusb_timer, 0); ir->request.bRequest = GET_INFRACODE; ir->request.bRequestType = USB_TYPE_VENDOR | USB_DIR_IN; diff --git a/drivers/media/rc/img-ir/img-ir-hw.c b/drivers/media/rc/img-ir/img-ir-hw.c index 82fdf4cc0824..f54bc5d23893 100644 --- a/drivers/media/rc/img-ir/img-ir-hw.c +++ b/drivers/media/rc/img-ir/img-ir-hw.c @@ -867,9 +867,9 @@ static void img_ir_handle_data(struct img_ir_priv *priv, u32 len, u64 raw) } /* timer function to end waiting for repeat. */ -static void img_ir_end_timer(unsigned long arg) +static void img_ir_end_timer(struct timer_list *t) { - struct img_ir_priv *priv = (struct img_ir_priv *)arg; + struct img_ir_priv *priv = from_timer(priv, t, hw.end_timer); spin_lock_irq(&priv->lock); img_ir_end_repeat(priv); @@ -881,9 +881,9 @@ static void img_ir_end_timer(unsigned long arg) * cleared when invalid interrupts were generated due to a quirk in the * img-ir decoder. 
*/ -static void img_ir_suspend_timer(unsigned long arg) +static void img_ir_suspend_timer(struct timer_list *t) { - struct img_ir_priv *priv = (struct img_ir_priv *)arg; + struct img_ir_priv *priv = from_timer(priv, t, hw.suspend_timer); spin_lock_irq(&priv->lock); /* @@ -1055,9 +1055,8 @@ int img_ir_probe_hw(struct img_ir_priv *priv) img_ir_probe_hw_caps(priv); /* Set up the end timer */ - setup_timer(&hw->end_timer, img_ir_end_timer, (unsigned long)priv); - setup_timer(&hw->suspend_timer, img_ir_suspend_timer, - (unsigned long)priv); + timer_setup(&hw->end_timer, img_ir_end_timer, 0); + timer_setup(&hw->suspend_timer, img_ir_suspend_timer, 0); /* Register a clock notifier */ if (!IS_ERR(priv->clk)) { diff --git a/drivers/media/rc/img-ir/img-ir-raw.c b/drivers/media/rc/img-ir/img-ir-raw.c index 64714efc1145..6e545680d3b6 100644 --- a/drivers/media/rc/img-ir/img-ir-raw.c +++ b/drivers/media/rc/img-ir/img-ir-raw.c @@ -67,9 +67,9 @@ void img_ir_isr_raw(struct img_ir_priv *priv, u32 irq_status) * order to be assured of the final space. If there are no edges for a certain * time we use this timer to emit a final sample to satisfy them. */ -static void img_ir_echo_timer(unsigned long arg) +static void img_ir_echo_timer(struct timer_list *t) { - struct img_ir_priv *priv = (struct img_ir_priv *)arg; + struct img_ir_priv *priv = from_timer(priv, t, raw.timer); spin_lock_irq(&priv->lock); @@ -107,7 +107,7 @@ int img_ir_probe_raw(struct img_ir_priv *priv) int error; /* Set up the echo timer */ - setup_timer(&raw->timer, img_ir_echo_timer, (unsigned long)priv); + timer_setup(&raw->timer, img_ir_echo_timer, 0); /* Allocate raw decoder */ raw->rdev = rdev = rc_allocate_device(RC_DRIVER_IR_RAW); diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index e8c0e07935c0..4e695a7116f1 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -1090,9 +1090,9 @@ static void usb_tx_callback(struct urb *urb) /** * report touchscreen input */ -static void imon_touch_display_timeout(unsigned long data) +static void imon_touch_display_timeout(struct timer_list *t) { - struct imon_context *ictx = (struct imon_context *)data; + struct imon_context *ictx = from_timer(ictx, t, ttimer); if (ictx->display_type != IMON_DISPLAY_TYPE_VGA) return; @@ -2408,8 +2408,7 @@ static struct imon_context *imon_init_intf1(struct usb_interface *intf, mutex_lock(&ictx->lock); if (ictx->display_type == IMON_DISPLAY_TYPE_VGA) { - setup_timer(&ictx->ttimer, imon_touch_display_timeout, - (unsigned long)ictx); + timer_setup(&ictx->ttimer, imon_touch_display_timeout, 0); } ictx->usbdev_intf1 = usb_get_dev(interface_to_usbdev(intf)); diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 7c572a643656..69d6264d54e6 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -115,9 +115,9 @@ static unsigned char kbd_keycodes[256] = { KEY_RESERVED }; -static void mce_kbd_rx_timeout(unsigned long data) +static void mce_kbd_rx_timeout(struct timer_list *t) { - struct mce_kbd_dec *mce_kbd = (struct mce_kbd_dec *)data; + struct mce_kbd_dec *mce_kbd = from_timer(mce_kbd, t, rx_timeout); int i; unsigned char maskcode; @@ -389,8 +389,7 @@ static int ir_mce_kbd_register(struct rc_dev *dev) set_bit(EV_MSC, idev->evbit); set_bit(MSC_SCAN, idev->mscbit); - setup_timer(&mce_kbd->rx_timeout, mce_kbd_rx_timeout, - (unsigned long)mce_kbd); + timer_setup(&mce_kbd->rx_timeout, mce_kbd_rx_timeout, 0); input_set_drvdata(idev, mce_kbd); diff --git 
a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 503bc425a187..f6e5ba4fbb49 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -471,9 +471,10 @@ int ir_raw_encode_scancode(enum rc_proto protocol, u32 scancode, } EXPORT_SYMBOL(ir_raw_encode_scancode); -static void edge_handle(unsigned long arg) +static void edge_handle(struct timer_list *t) { - struct rc_dev *dev = (struct rc_dev *)arg; + struct ir_raw_event_ctrl *raw = from_timer(raw, t, edge_handle); + struct rc_dev *dev = raw->dev; ktime_t interval = ktime_sub(ktime_get(), dev->raw->last_event); if (ktime_to_ns(interval) >= dev->timeout) { @@ -513,8 +514,7 @@ int ir_raw_event_prepare(struct rc_dev *dev) dev->raw->dev = dev; dev->change_protocol = change_protocol; - setup_timer(&dev->raw->edge_handle, edge_handle, - (unsigned long)dev); + timer_setup(&dev->raw->edge_handle, edge_handle, 0); INIT_KFIFO(dev->raw->kfifo); return 0; diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index cb78e5702bef..c4b0217bd169 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -630,9 +630,9 @@ EXPORT_SYMBOL_GPL(rc_keyup); * This routine will generate a keyup event some time after a keydown event * is generated when no further activity has been detected. */ -static void ir_timer_keyup(unsigned long cookie) +static void ir_timer_keyup(struct timer_list *t) { - struct rc_dev *dev = (struct rc_dev *)cookie; + struct rc_dev *dev = from_timer(dev, t, timer_keyup); unsigned long flags; /* @@ -1570,8 +1570,7 @@ struct rc_dev *rc_allocate_device(enum rc_driver_type type) dev->input_dev->setkeycode = ir_setkeycode; input_set_drvdata(dev->input_dev, dev); - setup_timer(&dev->timer_keyup, ir_timer_keyup, - (unsigned long)dev); + timer_setup(&dev->timer_keyup, ir_timer_keyup, 0); spin_lock_init(&dev->rc_map.lock); spin_lock_init(&dev->keylock); diff --git a/drivers/media/rc/sir_ir.c b/drivers/media/rc/sir_ir.c index bc906fb128d5..76120664b700 100644 --- a/drivers/media/rc/sir_ir.c +++ b/drivers/media/rc/sir_ir.c @@ -120,7 +120,7 @@ static void add_read_queue(int flag, unsigned long val) } /* SECTION: Hardware */ -static void sir_timeout(unsigned long data) +static void sir_timeout(struct timer_list *unused) { /* * if last received signal was a pulse, but receiving stopped @@ -321,7 +321,7 @@ static int sir_ir_probe(struct platform_device *dev) rcdev->timeout = IR_DEFAULT_TIMEOUT; rcdev->dev.parent = &sir_ir_dev->dev; - setup_timer(&timerlist, sir_timeout, 0); + timer_setup(&timerlist, sir_timeout, 0); /* get I/O port access and IRQ line */ if (!devm_request_region(&sir_ir_dev->dev, io, 8, KBUILD_MODNAME)) { From 865dc6655a0dbb7b16ea44cfe78b64a575ed9c25 Mon Sep 17 00:00:00 2001 From: Oleh Kravchenko Date: Sat, 28 Oct 2017 09:38:16 -0400 Subject: [PATCH 0529/1640] UPSTREAM: media: rc: mceusb: add support for 1b80:d3b2 Evromedia USB Full Hybrid Full HD (1b80:d3b2) has IR on Interface 0. The remote controller supplied with this tuner is fully compatible with RC_MAP_MSI_DIGIVOX_III.
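How these model entries are consumed: the .driver_info value in the USB device table is used as an index into mceusb_model[] when the device probes. Roughly (a sketch of the pattern, not the driver's exact probe code):

#include <linux/usb.h>

static int example_probe(struct usb_interface *intf,
			 const struct usb_device_id *id)
{
	/* e.g. EVROMEDIA_FULL_HYBRID_FULLHD for 1b80:d3b2 */
	enum mceusb_model_type model = id->driver_info;
	const struct mceusb_model *info = &mceusb_model[model];

	/* info->rc_map, info->no_tx etc. now select the quirks */
	dev_info(&intf->dev, "model: %s\n", info->name);
	return 0;
}
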
Signed-off-by: Oleh Kravchenko Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mceusb.c | 9 +++++++++ drivers/media/usb/cx231xx/cx231xx-cards.c | 1 - 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 67c1ff099eb4..948699408764 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -188,6 +188,7 @@ enum mceusb_model_type { TIVO_KIT, MCE_GEN2_NO_TX, HAUPPAUGE_CX_HYBRID_TV, + EVROMEDIA_FULL_HYBRID_FULLHD, }; struct mceusb_model { @@ -247,6 +248,11 @@ static const struct mceusb_model mceusb_model[] = { .mce_gen2 = 1, .rc_map = RC_MAP_TIVO, }, + [EVROMEDIA_FULL_HYBRID_FULLHD] = { + .name = "Evromedia USB Full Hybrid Full HD", + .no_tx = 1, + .rc_map = RC_MAP_MSI_DIGIVOX_III, + }, }; static const struct usb_device_id mceusb_dev_table[] = { @@ -398,6 +404,9 @@ static const struct usb_device_id mceusb_dev_table[] = { .driver_info = HAUPPAUGE_CX_HYBRID_TV }, /* Adaptec / HP eHome Receiver */ { USB_DEVICE(VENDOR_ADAPTEC, 0x0094) }, + /* Evromedia USB Full Hybrid Full HD */ + { USB_DEVICE(0x1b80, 0xd3b2), + .driver_info = EVROMEDIA_FULL_HYBRID_FULLHD }, /* Terminating entry */ { } diff --git a/drivers/media/usb/cx231xx/cx231xx-cards.c b/drivers/media/usb/cx231xx/cx231xx-cards.c index c30cb0fb165d..6f906b7b682c 100644 --- a/drivers/media/usb/cx231xx/cx231xx-cards.c +++ b/drivers/media/usb/cx231xx/cx231xx-cards.c @@ -847,7 +847,6 @@ struct cx231xx_board cx231xx_boards[] = { .demod_addr = 0x64, /* 0xc8 >> 1 */ .demod_i2c_master = I2C_1_MUX_3, .has_dvb = 1, - .ir_i2c_master = I2C_0, .norm = V4L2_STD_PAL, .output_mode = OUT_MODE_VIP11, .tuner_addr = 0x60, /* 0xc0 >> 1 */ From ddb2964b08d9202dc76baf049d4c905483fe4be5 Mon Sep 17 00:00:00 2001 From: Oleh Kravchenko Date: Sat, 28 Oct 2017 09:38:17 -0400 Subject: [PATCH 0530/1640] UPSTREAM: media: rc: Add Astrometa T2hybrid keymap module Add the keymap module for Astrometa T2hybrid remote control commands. Signed-off-by: Oleh Kravchenko Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/keymaps/Makefile | 1 + .../media/rc/keymaps/rc-astrometa-t2hybrid.c | 70 +++++++++++++++++++ include/media/rc-map.h | 1 + 3 files changed, 72 insertions(+) create mode 100644 drivers/media/rc/keymaps/rc-astrometa-t2hybrid.c diff --git a/drivers/media/rc/keymaps/Makefile b/drivers/media/rc/keymaps/Makefile index 8eb58ef1bdce..50b319355edf 100644 --- a/drivers/media/rc/keymaps/Makefile +++ b/drivers/media/rc/keymaps/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_RC_MAP) += rc-adstech-dvb-t-pci.o \ rc-alink-dtu-m.o \ rc-anysee.o \ rc-apac-viewcomp.o \ + rc-astrometa-t2hybrid.o \ rc-asus-pc39.o \ rc-asus-ps3-100.o \ rc-ati-tv-wonder-hd-600.o \ diff --git a/drivers/media/rc/keymaps/rc-astrometa-t2hybrid.c b/drivers/media/rc/keymaps/rc-astrometa-t2hybrid.c new file mode 100644 index 000000000000..51690960fec4 --- /dev/null +++ b/drivers/media/rc/keymaps/rc-astrometa-t2hybrid.c @@ -0,0 +1,70 @@ +/* + * Keytable for the Astrometa T2hybrid remote controller + * + * Copyright (C) 2017 Oleh Kravchenko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include + +static struct rc_map_table t2hybrid[] = { + { 0x4d, KEY_POWER2 }, + { 0x54, KEY_VIDEO }, /* Source */ + { 0x16, KEY_MUTE }, + + { 0x4c, KEY_RECORD }, + { 0x05, KEY_CHANNELUP }, + { 0x0c, KEY_TIME}, /* Timeshift */ + + { 0x0a, KEY_VOLUMEDOWN }, + { 0x40, KEY_ZOOM }, /* Fullscreen */ + { 0x1e, KEY_VOLUMEUP }, + + { 0x12, KEY_0 }, + { 0x02, KEY_CHANNELDOWN }, + { 0x1c, KEY_AGAIN }, /* Recall */ + + { 0x09, KEY_1 }, + { 0x1d, KEY_2 }, + { 0x1f, KEY_3 }, + + { 0x0d, KEY_4 }, + { 0x19, KEY_5 }, + { 0x1b, KEY_6 }, + + { 0x11, KEY_7 }, + { 0x15, KEY_8 }, + { 0x17, KEY_9 }, +}; + +static struct rc_map_list t2hybrid_map = { + .map = { + .scan = t2hybrid, + .size = ARRAY_SIZE(t2hybrid), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_ASTROMETA_T2HYBRID, + } +}; + +static int __init init_rc_map_t2hybrid(void) +{ + return rc_map_register(&t2hybrid_map); +} + +static void __exit exit_rc_map_t2hybrid(void) +{ + rc_map_unregister(&t2hybrid_map); +} + +module_init(init_rc_map_t2hybrid) +module_exit(exit_rc_map_t2hybrid) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Oleh Kravchenko "); diff --git a/include/media/rc-map.h b/include/media/rc-map.h index cc59c72ac282..72197cb43781 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -211,6 +211,7 @@ struct rc_map *rc_map_get(const char *name); #define RC_MAP_ALINK_DTU_M "rc-alink-dtu-m" #define RC_MAP_ANYSEE "rc-anysee" #define RC_MAP_APAC_VIEWCOMP "rc-apac-viewcomp" +#define RC_MAP_ASTROMETA_T2HYBRID "rc-astrometa-t2hybrid" #define RC_MAP_ASUS_PC39 "rc-asus-pc39" #define RC_MAP_ASUS_PS3_100 "rc-asus-ps3-100" #define RC_MAP_ATI_TV_WONDER_HD_600 "rc-ati-tv-wonder-hd-600" From e87433b652f079af325aa195c357fdc5c8499493 Mon Sep 17 00:00:00 2001 From: Oleh Kravchenko Date: Sat, 28 Oct 2017 09:38:18 -0400 Subject: [PATCH 0531/1640] UPSTREAM: media: rc: mceusb: add support for 15f4:0135 Astrometa T2hybrid (15f4:0135) has IR on Interface 0. Signed-off-by: Oleh Kravchenko Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mceusb.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 948699408764..a9187b0b46a1 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -189,6 +189,7 @@ enum mceusb_model_type { MCE_GEN2_NO_TX, HAUPPAUGE_CX_HYBRID_TV, EVROMEDIA_FULL_HYBRID_FULLHD, + ASTROMETA_T2HYBRID, }; struct mceusb_model { @@ -253,6 +254,11 @@ static const struct mceusb_model mceusb_model[] = { .no_tx = 1, .rc_map = RC_MAP_MSI_DIGIVOX_III, }, + [ASTROMETA_T2HYBRID] = { + .name = "Astrometa T2Hybrid", + .no_tx = 1, + .rc_map = RC_MAP_ASTROMETA_T2HYBRID, + } }; static const struct usb_device_id mceusb_dev_table[] = { @@ -407,6 +413,9 @@ static const struct usb_device_id mceusb_dev_table[] = { /* Evromedia USB Full Hybrid Full HD */ { USB_DEVICE(0x1b80, 0xd3b2), .driver_info = EVROMEDIA_FULL_HYBRID_FULLHD }, + /* Astrometa T2hybrid */ + { USB_DEVICE(0x15f4, 0x0135), + .driver_info = ASTROMETA_T2HYBRID }, /* Terminating entry */ { } From 498bcde572c5921a79741dc3ee6a5384eb384535 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 8 Nov 2017 16:19:45 -0500 Subject: [PATCH 0532/1640] UPSTREAM: media: rc: sir_ir: detect presence of port Without this test, sir_ir clumsily claims resources for a device which does not exist. The 0-day kernel test robot reports the following errors (in a loop): sir_ir sir_ir.0: Trapped in interrupt genirq: Flags mismatch irq 4. 00000000 (ttyS0) vs.
00000000 (sir_ir) When sir_ir is loaded with the default io and irq, the following happens: - sir_ir claims irq 4 - user space opens /dev/ttyS0 - in serial8250_do_startup(), some setup is done for ttyS0, which causes irq 4 to fire (in THRE test) - sir_ir does not realise it was not for it, and spins until the "trapped in interrupt" - now serial driver calls setup_irq() and fails and we get the "Flags mismatch" error. There is no port present at 0x3e8 so simply check for the presence of a port, as suggested by Linus. Reported-by: kbuild test robot Tested-by: Fengguang Wu Signed-off-by: Sean Young Cc: # 4.12+ Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/sir_ir.c | 40 +++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/drivers/media/rc/sir_ir.c b/drivers/media/rc/sir_ir.c index 76120664b700..9ee2c9196b4d 100644 --- a/drivers/media/rc/sir_ir.c +++ b/drivers/media/rc/sir_ir.c @@ -57,7 +57,7 @@ static void add_read_queue(int flag, unsigned long val); static irqreturn_t sir_interrupt(int irq, void *dev_id); static void send_space(unsigned long len); static void send_pulse(unsigned long len); -static void init_hardware(void); +static int init_hardware(void); static void drop_hardware(void); /* Initialisation */ @@ -263,11 +263,36 @@ static void send_pulse(unsigned long len) } } -static void init_hardware(void) +static int init_hardware(void) { + u8 scratch, scratch2, scratch3; unsigned long flags; spin_lock_irqsave(&hardware_lock, flags); + + /* + * This is a simple port existence test, borrowed from the autoconfig + * function in drivers/tty/serial/8250/8250_port.c + */ + scratch = sinp(UART_IER); + soutp(UART_IER, 0); +#ifdef __i386__ + outb(0xff, 0x080); +#endif + scratch2 = sinp(UART_IER) & 0x0f; + soutp(UART_IER, 0x0f); +#ifdef __i386__ + outb(0x00, 0x080); +#endif + scratch3 = sinp(UART_IER) & 0x0f; + soutp(UART_IER, scratch); + if (scratch2 != 0 || scratch3 != 0x0f) { + /* we fail, there's nothing here */ + spin_unlock_irqrestore(&hardware_lock, flags); + pr_err("port existence test failed, cannot continue\n"); + return -ENODEV; + } + /* reset UART */ outb(0, io + UART_MCR); outb(0, io + UART_IER); @@ -285,6 +310,8 @@ static void init_hardware(void) /* turn on UART */ outb(UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2, io + UART_MCR); spin_unlock_irqrestore(&hardware_lock, flags); + + return 0; } static void drop_hardware(void) @@ -334,14 +361,19 @@ static int sir_ir_probe(struct platform_device *dev) pr_err("IRQ %d already in use.\n", irq); return retval; } + + retval = init_hardware(); + if (retval) { + del_timer_sync(&timerlist); + return retval; + } + pr_info("I/O port 0x%.4x, IRQ %d.\n", io, irq); retval = devm_rc_register_device(&sir_ir_dev->dev, rcdev); if (retval < 0) return retval; - init_hardware(); - return 0; } From 1f232c67cfa9afef6d7a5b1a9bde095011a09e70 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Nov 2017 08:35:13 -0500 Subject: [PATCH 0533/1640] UPSTREAM: media: rc: fix lots of documentation warnings Building the driver with gcc 7.2.1 and: make ARCH=i386 CF=-D__CHECK_ENDIAN__ CONFIG_DEBUG_SECTION_MISMATCH=y W=1 CHECK='' M=drivers/media now produces a lot of warnings: drivers/media/rc/rc-main.c:278: warning: No description found for parameter 'new_keycode' drivers/media/rc/rc-main.c:278: warning: Excess function parameter 'keycode' description in 'ir_update_mapping' drivers/media/rc/rc-main.c:387: warning: No description found for parameter 'ke' drivers/media/rc/rc-main.c:387: warning: No 
description found for parameter 'old_keycode' drivers/media/rc/rc-main.c:387: warning: Excess function parameter 'scancode' description in 'ir_setkeycode' drivers/media/rc/rc-main.c:387: warning: Excess function parameter 'keycode' description in 'ir_setkeycode' drivers/media/rc/rc-main.c:433: warning: Excess function parameter 'to' description in 'ir_setkeytable' drivers/media/rc/rc-main.c:506: warning: No description found for parameter 'ke' drivers/media/rc/rc-main.c:506: warning: Excess function parameter 'scancode' description in 'ir_getkeycode' drivers/media/rc/rc-main.c:506: warning: Excess function parameter 'keycode' description in 'ir_getkeycode' drivers/media/rc/rc-main.c:634: warning: No description found for parameter 't' drivers/media/rc/rc-main.c:634: warning: Excess function parameter 'cookie' description in 'ir_timer_keyup' Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 46 +++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 17950e29d4e3..c4b0217bd169 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -170,10 +170,11 @@ static struct rc_map_list empty_map = { * @name: name to assign to the table * @rc_proto: ir type to assign to the new table * @size: initial size of the table - * @return: zero on success or a negative error code * * This routine will initialize the rc_map and will allocate * memory to hold at least the specified number of elements. + * + * return: zero on success or a negative error code */ static int ir_create_table(struct rc_map *rc_map, const char *name, u64 rc_proto, size_t size) @@ -216,10 +217,11 @@ static void ir_free_table(struct rc_map *rc_map) * ir_resize_table() - resizes a scancode table if necessary * @rc_map: the rc_map to resize * @gfp_flags: gfp flags to use when allocating memory - * @return: zero on success or a negative error code * * This routine will shrink the rc_map if it has lots of * unused entries and grow it if it is full. + * + * return: zero on success or a negative error code */ static int ir_resize_table(struct rc_map *rc_map, gfp_t gfp_flags) { @@ -265,11 +267,13 @@ static int ir_resize_table(struct rc_map *rc_map, gfp_t gfp_flags) * @dev: the struct rc_dev device descriptor * @rc_map: scancode table to be adjusted * @index: index of the mapping that needs to be updated - * @keycode: the desired keycode - * @return: previous keycode assigned to the mapping + * @new_keycode: the desired keycode * * This routine is used to update scancode->keycode mapping at given * position. + * + * return: previous keycode assigned to the mapping + * */ static unsigned int ir_update_mapping(struct rc_dev *dev, struct rc_map *rc_map, @@ -320,12 +324,13 @@ static unsigned int ir_update_mapping(struct rc_dev *dev, * @scancode: the desired scancode * @resize: controls whether we allowed to resize the table to * accommodate not yet present scancodes - * @return: index of the mapping containing scancode in question - * or -1U in case of failure. * * This routine is used to locate given scancode in rc_map. * If scancode is not yet present the routine will allocate a new slot * for it. + * + * return: index of the mapping containing scancode in question + * or -1U in case of failure. 
*/ static unsigned int ir_establish_scancode(struct rc_dev *dev, struct rc_map *rc_map, @@ -375,11 +380,12 @@ static unsigned int ir_establish_scancode(struct rc_dev *dev, /** * ir_setkeycode() - set a keycode in the scancode->keycode table * @idev: the struct input_dev device descriptor - * @scancode: the desired scancode - * @keycode: result - * @return: -EINVAL if the keycode could not be inserted, otherwise zero. + * @ke: Input keymap entry + * @old_keycode: result * * This routine is used to handle evdev EVIOCSKEY ioctl. + * + * return: -EINVAL if the keycode could not be inserted, otherwise zero. */ static int ir_setkeycode(struct input_dev *idev, const struct input_keymap_entry *ke, @@ -422,11 +428,11 @@ out: /** * ir_setkeytable() - sets several entries in the scancode->keycode table * @dev: the struct rc_dev device descriptor - * @to: the struct rc_map to copy entries to * @from: the struct rc_map to copy entries from - * @return: -ENOMEM if all keycodes could not be inserted, otherwise zero. * * This routine is used to handle table initialization. + * + * return: -ENOMEM if all keycodes could not be inserted, otherwise zero. */ static int ir_setkeytable(struct rc_dev *dev, const struct rc_map *from) @@ -474,10 +480,11 @@ static int rc_map_cmp(const void *key, const void *elt) * ir_lookup_by_scancode() - locate mapping by scancode * @rc_map: the struct rc_map to search * @scancode: scancode to look for in the table - * @return: index in the table, -1U if not found * * This routine performs binary search in RC keykeymap table for * given scancode. + * + * return: index in the table, -1U if not found */ static unsigned int ir_lookup_by_scancode(const struct rc_map *rc_map, unsigned int scancode) @@ -495,11 +502,11 @@ static unsigned int ir_lookup_by_scancode(const struct rc_map *rc_map, /** * ir_getkeycode() - get a keycode from the scancode->keycode table * @idev: the struct input_dev device descriptor - * @scancode: the desired scancode - * @keycode: used to return the keycode, if found, or KEY_RESERVED - * @return: always returns zero. + * @ke: Input keymap entry * * This routine is used to handle evdev EVIOCGKEY ioctl. + * + * return: always returns zero. */ static int ir_getkeycode(struct input_dev *idev, struct input_keymap_entry *ke) @@ -556,11 +563,12 @@ out: * rc_g_keycode_from_table() - gets the keycode that corresponds to a scancode * @dev: the struct rc_dev descriptor of the device * @scancode: the scancode to look for - * @return: the corresponding keycode, or KEY_RESERVED * * This routine is used by drivers which need to convert a scancode to a * keycode. Normally it should not be used since drivers should have no * interest in keycodes. + * + * return: the corresponding keycode, or KEY_RESERVED */ u32 rc_g_keycode_from_table(struct rc_dev *dev, u32 scancode) { @@ -625,7 +633,8 @@ EXPORT_SYMBOL_GPL(rc_keyup); /** * ir_timer_keyup() - generates a keyup event after a timeout - * @cookie: a pointer to the struct rc_dev for the device + * + * @t: a pointer to the struct timer_list * * This routine will generate a keyup event some time after a keydown event * is generated when no further activity has been detected. 
@@ -780,7 +789,8 @@ EXPORT_SYMBOL_GPL(rc_keydown_notimeout); * provides sensible defaults * @dev: the struct rc_dev descriptor of the device * @filter: the scancode and mask - * @return: 0 or -EINVAL if the filter is not valid + * + * return: 0 or -EINVAL if the filter is not valid */ static int rc_validate_filter(struct rc_dev *dev, struct rc_scancode_filter *filter) From 558c1145e7ad6046033d3e48ed137dd58cfddd70 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Nov 2017 10:09:03 -0500 Subject: [PATCH 0534/1640] UPSTREAM: media: img-ir-hw: fix one kernel-doc comment Needed to suppress the following warnings: drivers/media/rc/img-ir/img-ir-hw.c:351: warning: No description found for parameter 'reg_timings' drivers/media/rc/img-ir/img-ir-hw.c:351: warning: Excess function parameter 'timings' description in 'img_ir_decoder_convert' Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/img-ir/img-ir-hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/img-ir/img-ir-hw.c b/drivers/media/rc/img-ir/img-ir-hw.c index f54bc5d23893..ec4ded84cd17 100644 --- a/drivers/media/rc/img-ir/img-ir-hw.c +++ b/drivers/media/rc/img-ir/img-ir-hw.c @@ -339,7 +339,7 @@ static void img_ir_decoder_preprocess(struct img_ir_decoder *decoder) /** * img_ir_decoder_convert() - Generate internal timings in decoder. * @decoder: Decoder to be converted to internal timings. - * @timings: Timing register values. + * @reg_timings: Timing register values. * @clock_hz: IR clock rate in Hz. * * Fills out the repeat timings and timing register values for a specific clock From bf7c13b02d54fdb2dff11a801e7233b96580658d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Nov 2017 10:19:38 -0500 Subject: [PATCH 0535/1640] UPSTREAM: media: rc-ir-raw: cleanup kernel-doc markups Cleanup those warnings: drivers/media/rc/rc-ir-raw.c:141: warning: No description found for parameter 'ev' drivers/media/rc/rc-ir-raw.c:141: warning: Excess function parameter 'type' description in 'ir_raw_event_store_with_filter' Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-ir-raw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index f6e5ba4fbb49..d78483a504c9 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(ir_raw_event_store_edge); /** * ir_raw_event_store_with_filter() - pass next pulse/space to decoders with some processing * @dev: the struct rc_dev device descriptor - * @type: the type of the event that has occurred + * @ev: the event that has occurred * * This routine (which may be called from an interrupt context) works * in similar manner to ir_raw_event_store_edge. 
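The three kernel-doc fixes above all converge on the same comment shape: one @name line for each parameter that actually appears in the function prototype, and the return value described on a plain "return:" line in the comment body rather than as an excess pseudo-parameter. As a reference, here is a minimal sketch of the resulting layout; the function name and stub body are invented for illustration and are not taken from any of the patched drivers:

#include <media/rc-core.h>

/**
 * ir_example_decode() - decode one example pulse or space
 * @dev: the struct rc_dev descriptor of the device
 * @ev: the struct ir_raw_event descriptor of the pulse/space
 *
 * Each prototype parameter has a matching name line above, so
 * scripts/kernel-doc emits neither "No description found" nor
 * "Excess function parameter" warnings for this comment.
 *
 * return: 0 on success or a negative error code
 */
static int ir_example_decode(struct rc_dev *dev, struct ir_raw_event ev)
{
	/* stub body; a real decoder would run its state machine here */
	return 0;
}

Comments that do not document their parameters at all should instead open with a plain "/*", as the imon patch and several later patches below do, so that kernel-doc skips them entirely.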
From 9c11e73dde02d12c46bf07342a679cda0753772b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Nov 2017 10:23:07 -0500 Subject: [PATCH 0536/1640] UPSTREAM: media: ir-nec-decoder: fix kernel-doc parameters Some parameters aren't correctly identified, as noticed by those warnings: drivers/media/rc/ir-nec-decoder.c:49: warning: No description found for parameter 'ev' drivers/media/rc/ir-nec-decoder.c:49: warning: Excess function parameter 'duration' description in 'ir_nec_decode' drivers/media/rc/ir-nec-decoder.c:189: warning: Excess function parameter 'raw' description in 'ir_nec_scancode_to_raw' Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-nec-decoder.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/media/rc/ir-nec-decoder.c b/drivers/media/rc/ir-nec-decoder.c index a95d09acc22a..6880c190dcd2 100644 --- a/drivers/media/rc/ir-nec-decoder.c +++ b/drivers/media/rc/ir-nec-decoder.c @@ -41,7 +41,7 @@ enum nec_state { /** * ir_nec_decode() - Decode one NEC pulse or space * @dev: the struct rc_dev descriptor of the device - * @duration: the struct ir_raw_event descriptor of the pulse/space + * @ev: the struct ir_raw_event descriptor of the pulse/space * * This function returns -EINVAL if the pulse violates the state machine */ @@ -183,7 +183,7 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) * ir_nec_scancode_to_raw() - encode an NEC scancode ready for modulation. * @protocol: specific protocol to use * @scancode: a single NEC scancode. - * @raw: raw data to be modulated. */ static u32 ir_nec_scancode_to_raw(enum rc_proto protocol, u32 scancode) { From efd6e2c039fbe3b92b9773d6cb944aabecf3c723 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Nov 2017 10:27:54 -0500 Subject: [PATCH 0537/1640] UPSTREAM: media: imon: don't use kernel-doc "/**" markups The function documentation here doesn't follow kernel-doc, as parameters aren't documented. So, stop abusing "/**" markups. Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/imon.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index 4e695a7116f1..2ae62148b765 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -492,7 +492,7 @@ static void free_imon_context(struct imon_context *ictx) dev_dbg(dev, "%s: iMON context freed\n", __func__); } -/** +/* * Called when the Display device (e.g. /dev/lcd0) * is opened by the application. */ @@ -542,7 +542,7 @@ exit: return retval; } -/** +/* * Called when the display device (e.g. /dev/lcd0) * is closed by the application. */ @@ -575,7 +575,7 @@ static int display_close(struct inode *inode, struct file *file) return retval; } -/** +/* * Sends a packet to the device -- this function must be called with * ictx->lock held, or its unlock/lock sequence while waiting for tx * to complete can/will lead to a deadlock. @@ -664,7 +664,7 @@ static int send_packet(struct imon_context *ictx) return retval; } -/** +/* * Sends an associate packet to the iMON 2.4G.
* * This might not be such a good idea, since it has an id collision with @@ -694,7 +694,7 @@ static int send_associate_24g(struct imon_context *ictx) return retval; } -/** +/* * Sends packets to setup and show clock on iMON display * * Arguments: year - last 2 digits of year, month - 1..12, @@ -781,7 +781,7 @@ static int send_set_imon_clock(struct imon_context *ictx, return retval; } -/** +/* * These are the sysfs functions to handle the association on the iMON 2.4G LT. */ static ssize_t show_associate_remote(struct device *d, @@ -823,7 +823,7 @@ static ssize_t store_associate_remote(struct device *d, return count; } -/** +/* * sysfs functions to control internal imon clock */ static ssize_t show_imon_clock(struct device *d, @@ -923,7 +923,7 @@ static const struct attribute_group imon_rf_attr_group = { .attrs = imon_rf_sysfs_entries }; -/** +/* * Writes data to the VFD. The iMON VFD is 2x16 characters * and requires data in 5 consecutive USB interrupt packets, * each packet but the last carrying 7 bytes. @@ -1008,7 +1008,7 @@ exit: return (!retval) ? n_bytes : retval; } -/** +/* * Writes data to the LCD. The iMON OEM LCD screen expects 8-byte * packets. We accept data as 16 hexadecimal digits, followed by a * newline (to make it easy to drive the device from a command-line @@ -1066,7 +1066,7 @@ exit: return (!retval) ? n_bytes : retval; } -/** +/* * Callback function for USB core API: transmit data */ static void usb_tx_callback(struct urb *urb) @@ -1087,7 +1087,7 @@ static void usb_tx_callback(struct urb *urb) complete(&ictx->tx.finished); } -/** +/* * report touchscreen input */ static void imon_touch_display_timeout(struct timer_list *t) @@ -1103,7 +1103,7 @@ static void imon_touch_display_timeout(struct timer_list *t) input_sync(ictx->touch); } -/** +/* * iMON IR receivers support two different signal sets -- those used by * the iMON remotes, and those used by the Windows MCE remotes (which is * really just RC-6), but only one or the other at a time, as the signals @@ -1188,7 +1188,7 @@ static inline int tv2int(const struct timeval *a, const struct timeval *b) return sec; } -/** +/* * The directional pad behaves a bit differently, depending on whether this is * one of the older ffdc devices or a newer device. Newer devices appear to * have a higher resolution matrix for more precise mouse movement, but it @@ -1540,7 +1540,7 @@ static void imon_pad_to_keys(struct imon_context *ictx, unsigned char *buf) } } -/** +/* * figure out if these is a press or a release. We don't actually * care about repeats, as those will be auto-generated within the IR * subsystem for repeating scancodes. @@ -1589,10 +1589,10 @@ static int imon_parse_press_type(struct imon_context *ictx, return press_type; } -/** +/* * Process the incoming packet */ -/** +/* * Convert bit count to time duration (in us) and submit * the value to lirc_dev. 
*/ @@ -1605,7 +1605,7 @@ static void submit_data(struct imon_context *context) ir_raw_event_store_with_filter(context->rdev, &ev); } -/** +/* * Process the incoming packet */ static void imon_incoming_ir_raw(struct imon_context *context, @@ -1828,7 +1828,7 @@ not_input_data: } } -/** +/* * Callback function for USB core API: receive data */ static void usb_rx_callback_intf0(struct urb *urb) @@ -2482,7 +2482,7 @@ static void imon_init_display(struct imon_context *ictx, } -/** +/* * Callback function for USB core API: Probe */ static int imon_probe(struct usb_interface *interface, @@ -2580,7 +2580,7 @@ fail: return ret; } -/** +/* * Callback function for USB core API: disconnect */ static void imon_disconnect(struct usb_interface *interface) From 70d2eac3ae29a5cfc80ddab31d6d59da9a5170b5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 29 Nov 2017 03:46:30 -0500 Subject: [PATCH 0538/1640] UPSTREAM: media: rc: fix kernel-doc parameter names There are several parameters there that are named wrong, as reported by those warnings: drivers/media/rc/ir-sharp-decoder.c:47: warning: No description found for parameter 'ev' drivers/media/rc/ir-sharp-decoder.c:47: warning: Excess function parameter 'duration' description in 'ir_sharp_decode' drivers/media/rc/ir-sanyo-decoder.c:56: warning: No description found for parameter 'ev' drivers/media/rc/ir-sanyo-decoder.c:56: warning: Excess function parameter 'duration' description in 'ir_sanyo_decode' drivers/media/rc/ir-xmp-decoder.c:43: warning: No description found for parameter 'ev' drivers/media/rc/ir-xmp-decoder.c:43: warning: Excess function parameter 'duration' description in 'ir_xmp_decode' drivers/media/rc/ir-jvc-decoder.c:47: warning: No description found for parameter 'ev' drivers/media/rc/ir-jvc-decoder.c:47: warning: Excess function parameter 'duration' description in 'ir_jvc_decode' drivers/media/rc/ir-lirc-codec.c:34: warning: No description found for parameter 'dev' drivers/media/rc/ir-lirc-codec.c:34: warning: No description found for parameter 'ev' drivers/media/rc/ir-lirc-codec.c:34: warning: Excess function parameter 'input_dev' description in 'ir_lirc_decode' drivers/media/rc/ir-lirc-codec.c:34: warning: Excess function parameter 'duration' description in 'ir_lirc_decode' Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-jvc-decoder.c | 2 +- drivers/media/rc/ir-lirc-codec.c | 4 ++-- drivers/media/rc/ir-sanyo-decoder.c | 2 +- drivers/media/rc/ir-sharp-decoder.c | 2 +- drivers/media/rc/ir-xmp-decoder.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/media/rc/ir-jvc-decoder.c b/drivers/media/rc/ir-jvc-decoder.c index e2bd68c42edf..22c8aee3df4f 100644 --- a/drivers/media/rc/ir-jvc-decoder.c +++ b/drivers/media/rc/ir-jvc-decoder.c @@ -39,7 +39,7 @@ enum jvc_state { /** * ir_jvc_decode() - Decode one JVC pulse or space * @dev: the struct rc_dev descriptor of the device - * @duration: the struct ir_raw_event descriptor of the pulse/space + * @ev: the struct ir_raw_event descriptor of the pulse/space * * This function returns -EINVAL if the pulse violates the state machine */ diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 8f2f37412fc5..4fd4521693d9 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -25,8 +25,8 @@ /** * ir_lirc_decode() - Send raw IR data to lirc_dev to be relayed to the * lircd userspace daemon for decoding. 
- * @input_dev: the struct rc_dev descriptor of the device - * @duration: the struct ir_raw_event descriptor of the pulse/space + * @dev: the struct rc_dev descriptor of the device + * @ev: the struct ir_raw_event descriptor of the pulse/space * * This function returns -EINVAL if the lirc interfaces aren't wired up. */ diff --git a/drivers/media/rc/ir-sanyo-decoder.c b/drivers/media/rc/ir-sanyo-decoder.c index 758c60956850..d94e07b02f3b 100644 --- a/drivers/media/rc/ir-sanyo-decoder.c +++ b/drivers/media/rc/ir-sanyo-decoder.c @@ -48,7 +48,7 @@ enum sanyo_state { /** * ir_sanyo_decode() - Decode one SANYO pulse or space * @dev: the struct rc_dev descriptor of the device - * @duration: the struct ir_raw_event descriptor of the pulse/space + * @ev: the struct ir_raw_event descriptor of the pulse/space * * This function returns -EINVAL if the pulse violates the state machine */ diff --git a/drivers/media/rc/ir-sharp-decoder.c b/drivers/media/rc/ir-sharp-decoder.c index 129b558acc92..7140dd6160ee 100644 --- a/drivers/media/rc/ir-sharp-decoder.c +++ b/drivers/media/rc/ir-sharp-decoder.c @@ -39,7 +39,7 @@ enum sharp_state { /** * ir_sharp_decode() - Decode one Sharp pulse or space * @dev: the struct rc_dev descriptor of the device - * @duration: the struct ir_raw_event descriptor of the pulse/space + * @ev: the struct ir_raw_event descriptor of the pulse/space * * This function returns -EINVAL if the pulse violates the state machine */ diff --git a/drivers/media/rc/ir-xmp-decoder.c b/drivers/media/rc/ir-xmp-decoder.c index 6f464be1c8d7..712bc6d76e92 100644 --- a/drivers/media/rc/ir-xmp-decoder.c +++ b/drivers/media/rc/ir-xmp-decoder.c @@ -35,7 +35,7 @@ enum xmp_state { /** * ir_xmp_decode() - Decode one XMP pulse or space * @dev: the struct rc_dev descriptor of the device - * @duration: the struct ir_raw_event descriptor of the pulse/space + * @ev: the struct ir_raw_event descriptor of the pulse/space * * This function returns -EINVAL if the pulse violates the state machine */ From 63c06efba22500e4fb94a256ab7d117433efabd8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 29 Nov 2017 08:33:45 -0500 Subject: [PATCH 0539/1640] UPSTREAM: media: drivers: remove "/**" from non-kernel-doc comments Several comments are wrongly tagged as kernel-doc, causing those warnings: drivers/media/rc/st_rc.c:98: warning: No description found for parameter 'irq' drivers/media/rc/st_rc.c:98: warning: No description found for parameter 'data' drivers/media/pci/solo6x10/solo6x10-enc.c:183: warning: No description found for parameter 'solo_dev' drivers/media/pci/solo6x10/solo6x10-enc.c:183: warning: No description found for parameter 'ch' drivers/media/pci/solo6x10/solo6x10-enc.c:183: warning: No description found for parameter 'qp' drivers/media/usb/pwc/pwc-dec23.c:652: warning: Cannot understand * on line 652 - I thought it was a doc line drivers/media/usb/dvb-usb/cinergyT2-fe.c:40: warning: No description found for parameter 'op' drivers/media/usb/dvb-usb/friio-fe.c:301: warning: Cannot understand * (reg, val) commad list to initialize this module. 
on line 301 - I thought it was a doc line drivers/media/rc/streamzap.c:201: warning: No description found for parameter 'urb' drivers/media/rc/streamzap.c:333: warning: No description found for parameter 'intf' drivers/media/rc/streamzap.c:333: warning: No description found for parameter 'id' drivers/media/rc/streamzap.c:464: warning: No description found for parameter 'interface' drivers/media/i2c/ov5647.c:432: warning: Cannot understand * @short Subdev core operations registration on line 432 - I thought it was a doc line drivers/media/usb/dvb-usb/friio.c:35: warning: No description found for parameter 'd' drivers/media/usb/dvb-usb/friio.c:35: warning: No description found for parameter 'addr' drivers/media/usb/dvb-usb/friio.c:35: warning: No description found for parameter 'wbuf' drivers/media/usb/dvb-usb/friio.c:35: warning: No description found for parameter 'wlen' drivers/media/usb/dvb-usb/friio.c:35: warning: No description found for parameter 'rbuf' drivers/media/usb/dvb-usb/friio.c:35: warning: No description found for parameter 'rlen' drivers/media/platform/vim2m.c:350: warning: No description found for parameter 'priv' drivers/media/dvb-frontends/tua6100.c:34: warning: cannot understand function prototype: 'struct tua6100_priv ' drivers/media/platform/sti/hva/hva-h264.c:140: warning: cannot understand function prototype: 'struct hva_h264_stereo_video_sei ' drivers/media/platform/sti/hva/hva-h264.c:150: warning: Cannot understand * @frame_width: width in pixels of the buffer containing the input frame on line 150 - I thought it was a doc line drivers/media/platform/sti/hva/hva-h264.c:356: warning: Cannot understand * @ slice_size: slice size on line 356 - I thought it was a doc line drivers/media/platform/sti/hva/hva-h264.c:369: warning: Cannot understand * @ bitstream_size: bitstream size on line 369 - I thought it was a doc line drivers/media/platform/sti/hva/hva-h264.c:395: warning: Cannot understand * @seq_info: sequence information buffer on line 395 - I thought it was a doc line drivers/media/dvb-frontends/sp887x.c:137: warning: No description found for parameter 'fe' drivers/media/dvb-frontends/sp887x.c:137: warning: No description found for parameter 'fw' drivers/media/dvb-frontends/sp887x.c:287: warning: No description found for parameter 'n' drivers/media/dvb-frontends/sp887x.c:287: warning: No description found for parameter 'd' drivers/media/dvb-frontends/sp887x.c:287: warning: No description found for parameter 'quotient_i' drivers/media/dvb-frontends/sp887x.c:287: warning: No description found for parameter 'quotient_f' drivers/media/usb/ttusb-budget/dvb-ttusb-budget.c:83: warning: cannot understand function prototype: 'struct ttusb ' drivers/media/platform/sh_veu.c:277: warning: No description found for parameter 'priv' drivers/media/dvb-frontends/zl10036.c:33: warning: cannot understand function prototype: 'int zl10036_debug; ' drivers/media/dvb-frontends/zl10036.c:179: warning: No description found for parameter 'state' drivers/media/dvb-frontends/zl10036.c:179: warning: No description found for parameter 'frequency' drivers/media/platform/rcar_fdp1.c:1139: warning: No description found for parameter 'priv' drivers/media/platform/ti-vpe/vpe.c:933: warning: No description found for parameter 'priv' drivers/media/usb/gspca/ov519.c:36: warning: No description found for parameter 'fmt' drivers/media/usb/dvb-usb/dib0700_devices.c:3367: warning: No description found for parameter 'adap' Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb-frontends/sp887x.c | 6 
+++--- drivers/media/dvb-frontends/tua6100.c | 2 +- drivers/media/dvb-frontends/zl10036.c | 8 ++++---- drivers/media/i2c/ov5647.c | 4 ++-- drivers/media/pci/solo6x10/solo6x10-enc.c | 2 +- drivers/media/platform/rcar_fdp1.c | 2 +- drivers/media/platform/sh_veu.c | 2 +- drivers/media/platform/sti/hva/hva-h264.c | 18 +++++++++++++----- drivers/media/platform/ti-vpe/vpe.c | 2 +- drivers/media/platform/vim2m.c | 2 +- drivers/media/rc/st_rc.c | 6 +++--- drivers/media/rc/streamzap.c | 6 +++--- drivers/media/usb/dvb-usb/cinergyT2-fe.c | 2 +- drivers/media/usb/dvb-usb/dib0700_devices.c | 8 ++++---- drivers/media/usb/dvb-usb/friio-fe.c | 2 +- drivers/media/usb/dvb-usb/friio.c | 2 +- drivers/media/usb/gspca/ov519.c | 2 +- drivers/media/usb/pwc/pwc-dec23.c | 7 +++---- .../media/usb/ttusb-budget/dvb-ttusb-budget.c | 6 +++--- 19 files changed, 48 insertions(+), 41 deletions(-) diff --git a/drivers/media/dvb-frontends/sp887x.c b/drivers/media/dvb-frontends/sp887x.c index 18352bd3c8c6..8b9618dcaa63 100644 --- a/drivers/media/dvb-frontends/sp887x.c +++ b/drivers/media/dvb-frontends/sp887x.c @@ -57,7 +57,7 @@ static int sp887x_writereg (struct sp887x_state* state, u16 reg, u16 data) int ret; if ((ret = i2c_transfer(state->i2c, &msg, 1)) != 1) { - /** + /* * in case of soft reset we ignore ACK errors... */ if (!(reg == 0xf1a && data == 0x000 && @@ -130,7 +130,7 @@ static void sp887x_setup_agc (struct sp887x_state* state) #define BLOCKSIZE 30 #define FW_SIZE 0x4000 -/** +/* * load firmware and setup MPEG interface... */ static int sp887x_initial_setup (struct dvb_frontend* fe, const struct firmware *fw) @@ -279,7 +279,7 @@ static int configure_reg0xc05(struct dtv_frontend_properties *p, u16 *reg0xc05) return 0; } -/** +/* * estimates division of two 24bit numbers, * derived from the ves1820/stv0299 driver code */ diff --git a/drivers/media/dvb-frontends/tua6100.c b/drivers/media/dvb-frontends/tua6100.c index 5541ffa4d2f3..e09b07ab6c1c 100644 --- a/drivers/media/dvb-frontends/tua6100.c +++ b/drivers/media/dvb-frontends/tua6100.c @@ -1,4 +1,4 @@ -/** +/* * Driver for Infineon tua6100 pll. 
* * (c) 2006 Andrew de Quincey diff --git a/drivers/media/dvb-frontends/zl10036.c b/drivers/media/dvb-frontends/zl10036.c index 6cd4fb8bc271..231c51acf22f 100644 --- a/drivers/media/dvb-frontends/zl10036.c +++ b/drivers/media/dvb-frontends/zl10036.c @@ -1,4 +1,4 @@ -/** +/* * Driver for Zarlink zl10036 DVB-S silicon tuner * * Copyright (C) 2006 Tino Reichardt @@ -157,7 +157,7 @@ static int zl10036_sleep(struct dvb_frontend *fe) return ret; } -/** +/* * register map of the ZL10036/ZL10038 * * reg[default] content @@ -219,7 +219,7 @@ static int zl10036_set_bandwidth(struct zl10036_state *state, u32 fbw) if (fbw <= 28820) { br = _BR_MAXIMUM; } else { - /** + /* * f(bw)=34,6MHz f(xtal)=10.111MHz * br = (10111/34600) * 63 * 1/K = 14; */ @@ -315,7 +315,7 @@ static int zl10036_set_params(struct dvb_frontend *fe) || (frequency > fe->ops.info.frequency_max)) return -EINVAL; - /** + /* * alpha = 1.35 for dvb-s * fBW = (alpha*symbolrate)/(2*0.8) * 1.35 / (2*0.8) = 27 / 32 diff --git a/drivers/media/i2c/ov5647.c b/drivers/media/i2c/ov5647.c index 95ce90fdb876..210aa822399c 100644 --- a/drivers/media/i2c/ov5647.c +++ b/drivers/media/i2c/ov5647.c @@ -407,8 +407,8 @@ static int ov5647_sensor_set_register(struct v4l2_subdev *sd, } #endif -/** - * @short Subdev core operations registration +/* + * Subdev core operations registration */ static const struct v4l2_subdev_core_ops ov5647_subdev_core_ops = { .s_power = ov5647_sensor_power, diff --git a/drivers/media/pci/solo6x10/solo6x10-enc.c b/drivers/media/pci/solo6x10/solo6x10-enc.c index d28211bb9674..58d6b5131dd0 100644 --- a/drivers/media/pci/solo6x10/solo6x10-enc.c +++ b/drivers/media/pci/solo6x10/solo6x10-enc.c @@ -175,7 +175,7 @@ out: return 0; } -/** +/* * Set channel Quality Profile (0-3). */ void solo_s_jpeg_qp(struct solo_dev *solo_dev, unsigned int ch, diff --git a/drivers/media/platform/rcar_fdp1.c b/drivers/media/platform/rcar_fdp1.c index 5965e34e36cc..f37a6bfb0609 100644 --- a/drivers/media/platform/rcar_fdp1.c +++ b/drivers/media/platform/rcar_fdp1.c @@ -1134,7 +1134,7 @@ static int fdp1_device_process(struct fdp1_ctx *ctx) * mem2mem callbacks */ -/** +/* * job_ready() - check whether an instance is ready to be scheduled to run */ static int fdp1_m2m_job_ready(void *priv) diff --git a/drivers/media/platform/sh_veu.c b/drivers/media/platform/sh_veu.c index a4f593220ef0..80c1b80f6586 100644 --- a/drivers/media/platform/sh_veu.c +++ b/drivers/media/platform/sh_veu.c @@ -267,7 +267,7 @@ static void sh_veu_process(struct sh_veu_dev *veu, sh_veu_reg_write(veu, VEU_EIER, 1); /* enable interrupt in VEU */ } -/** +/* * sh_veu_device_run() - prepares and starts the device * * This will be called by the framework when it decides to schedule a particular diff --git a/drivers/media/platform/sti/hva/hva-h264.c b/drivers/media/platform/sti/hva/hva-h264.c index e6f247a983c7..d69c58211107 100644 --- a/drivers/media/platform/sti/hva/hva-h264.c +++ b/drivers/media/platform/sti/hva/hva-h264.c @@ -134,7 +134,7 @@ enum hva_h264_sei_payload_type { SEI_FRAME_PACKING_ARRANGEMENT = 45 }; -/** +/* * stereo Video Info struct */ struct hva_h264_stereo_video_sei { @@ -146,7 +146,9 @@ struct hva_h264_stereo_video_sei { u8 right_view_self_contained_flag; }; -/** +/* + * struct hva_h264_td + * * @frame_width: width in pixels of the buffer containing the input frame * @frame_height: height in pixels of the buffer containing the input frame * @frame_num: the parameter to be written in the slice header @@ -352,7 +354,9 @@ struct hva_h264_td { u32 addr_brc_in_out_parameter; }; 
-/** +/* + * struct hva_h264_slice_po + * * @ slice_size: slice size * @ slice_start_time: start time * @ slice_stop_time: stop time @@ -365,7 +369,9 @@ struct hva_h264_slice_po { u32 slice_num; }; -/** +/* + * struct hva_h264_po + * * @ bitstream_size: bitstream size * @ dct_bitstream_size: dtc bitstream size * @ stuffing_bits: number of stuffing bits inserted by the encoder @@ -391,7 +397,9 @@ struct hva_h264_task { struct hva_h264_po po; }; -/** +/* + * struct hva_h264_ctx + * * @seq_info: sequence information buffer * @ref_frame: reference frame buffer * @rec_frame: reconstructed frame buffer diff --git a/drivers/media/platform/ti-vpe/vpe.c b/drivers/media/platform/ti-vpe/vpe.c index bbd8bb611915..74f03f9c7901 100644 --- a/drivers/media/platform/ti-vpe/vpe.c +++ b/drivers/media/platform/ti-vpe/vpe.c @@ -931,7 +931,7 @@ static struct vpe_ctx *file2ctx(struct file *file) * mem2mem callbacks */ -/** +/* * job_ready() - check whether an instance is ready to be scheduled to run */ static int job_ready(void *priv) diff --git a/drivers/media/platform/vim2m.c b/drivers/media/platform/vim2m.c index b01fba020d5f..0592f40b23f3 100644 --- a/drivers/media/platform/vim2m.c +++ b/drivers/media/platform/vim2m.c @@ -343,7 +343,7 @@ static void schedule_irq(struct vim2m_dev *dev, int msec_timeout) * mem2mem callbacks */ -/** +/* * job_ready() - check whether an instance is ready to be scheduled to run */ static int job_ready(void *priv) diff --git a/drivers/media/rc/st_rc.c b/drivers/media/rc/st_rc.c index a8e39c635f34..d2efd7b2c3bc 100644 --- a/drivers/media/rc/st_rc.c +++ b/drivers/media/rc/st_rc.c @@ -49,7 +49,7 @@ struct st_rc_device { #define IRB_RX_NOISE_SUPPR 0x5c /* noise suppression */ #define IRB_RX_POLARITY_INV 0x68 /* polarity inverter */ -/** +/* * IRQ set: Enable full FIFO 1 -> bit 3; * Enable overrun IRQ 1 -> bit 2; * Enable last symbol IRQ 1 -> bit 1: @@ -72,7 +72,7 @@ static void st_rc_send_lirc_timeout(struct rc_dev *rdev) ir_raw_event_store(rdev, &ev); } -/** +/* * RX graphical example to better understand the difference between ST IR block * output and standard definition used by LIRC (and most of the world!) * @@ -317,7 +317,7 @@ static int st_rc_probe(struct platform_device *pdev) device_init_wakeup(dev, true); dev_pm_set_wake_irq(dev, rc_dev->irq); - /** + /* * for LIRC_MODE_MODE2 or LIRC_MODE_PULSE or LIRC_MODE_RAW * lircd expects a long space first before a signal train to sync. */ diff --git a/drivers/media/rc/streamzap.c b/drivers/media/rc/streamzap.c index 4eebfcfc10f3..c9a70fda88a8 100644 --- a/drivers/media/rc/streamzap.c +++ b/drivers/media/rc/streamzap.c @@ -191,7 +191,7 @@ static void sz_push_half_space(struct streamzap_ir *sz, sz_push_full_space(sz, value & SZ_SPACE_MASK); } -/** +/* * streamzap_callback - usb IRQ handler callback * * This procedure is invoked on reception of data from @@ -321,7 +321,7 @@ out: return NULL; } -/** +/* * streamzap_probe * * Called by usb-core to associated with a candidate device @@ -450,7 +450,7 @@ free_sz: return retval; } -/** +/* * streamzap_disconnect * * Called by the usb core when the device is removed from the system. diff --git a/drivers/media/usb/dvb-usb/cinergyT2-fe.c b/drivers/media/usb/dvb-usb/cinergyT2-fe.c index f9772ad0a2a5..5a2f81311fb7 100644 --- a/drivers/media/usb/dvb-usb/cinergyT2-fe.c +++ b/drivers/media/usb/dvb-usb/cinergyT2-fe.c @@ -26,7 +26,7 @@ #include "cinergyT2.h" -/** +/* * convert linux-dvb frontend parameter set into TPS. * See ETSI ETS-300744, section 4.6.2, table 9 for details. 
* diff --git a/drivers/media/usb/dvb-usb/dib0700_devices.c b/drivers/media/usb/dvb-usb/dib0700_devices.c index 0a65884cefe3..8bc4d4d1109d 100644 --- a/drivers/media/usb/dvb-usb/dib0700_devices.c +++ b/drivers/media/usb/dvb-usb/dib0700_devices.c @@ -1678,10 +1678,10 @@ static int dib8096_set_param_override(struct dvb_frontend *fe) return -EINVAL; } - /** Update PLL if needed ratio **/ + /* Update PLL if needed ratio */ state->dib8000_ops.update_pll(fe, &dib8090_pll_config_12mhz, fe->dtv_property_cache.bandwidth_hz / 1000, 0); - /** Get optimize PLL ratio to remove spurious **/ + /* Get optimize PLL ratio to remove spurious */ pll_ratio = dib8090_compute_pll_parameters(fe); if (pll_ratio == 17) timf = 21387946; @@ -1692,7 +1692,7 @@ static int dib8096_set_param_override(struct dvb_frontend *fe) else timf = 18179756; - /** Update ratio **/ + /* Update ratio */ state->dib8000_ops.update_pll(fe, &dib8090_pll_config_12mhz, fe->dtv_property_cache.bandwidth_hz / 1000, pll_ratio); state->dib8000_ops.ctrl_timf(fe, DEMOD_TIMF_SET, timf); @@ -3378,7 +3378,7 @@ static int novatd_sleep_override(struct dvb_frontend* fe) return state->sleep(fe); } -/** +/* * novatd_frontend_attach - Nova-TD specific attach * * Nova-TD has GPIO0, 1 and 2 for LEDs. So do not fiddle with them except for diff --git a/drivers/media/usb/dvb-usb/friio-fe.c b/drivers/media/usb/dvb-usb/friio-fe.c index 0251a4e91d47..0b108071197a 100644 --- a/drivers/media/usb/dvb-usb/friio-fe.c +++ b/drivers/media/usb/dvb-usb/friio-fe.c @@ -319,7 +319,7 @@ static int jdvbt90502_set_frontend(struct dvb_frontend *fe) } -/** +/* * (reg, val) commad list to initialize this module. * captured on a Windows box. */ diff --git a/drivers/media/usb/dvb-usb/friio.c b/drivers/media/usb/dvb-usb/friio.c index 62abe6c43a32..16875945e662 100644 --- a/drivers/media/usb/dvb-usb/friio.c +++ b/drivers/media/usb/dvb-usb/friio.c @@ -21,7 +21,7 @@ MODULE_PARM_DESC(debug, DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); -/** +/* * Indirect I2C access to the PLL via FE. * whole I2C protocol data to the PLL is sent via the FE's I2C register. * This is done by a control msg to the FE with the I2C data accompanied, and diff --git a/drivers/media/usb/gspca/ov519.c b/drivers/media/usb/gspca/ov519.c index b51d2de1aca8..b78a2cf664fc 100644 --- a/drivers/media/usb/gspca/ov519.c +++ b/drivers/media/usb/gspca/ov519.c @@ -1,4 +1,4 @@ -/** +/* * OV519 driver * * Copyright (C) 2008-2011 Jean-François Moine diff --git a/drivers/media/usb/pwc/pwc-dec23.c b/drivers/media/usb/pwc/pwc-dec23.c index 3792fedff951..1283b3bd9800 100644 --- a/drivers/media/usb/pwc/pwc-dec23.c +++ b/drivers/media/usb/pwc/pwc-dec23.c @@ -649,11 +649,10 @@ static void DecompressBand23(struct pwc_dec23_private *pdec, } /** - * * Uncompress a pwc23 buffer. - * - * src: raw data - * dst: image output + * @pdev: pointer to pwc device's internal struct + * @src: raw data + * @dst: image output */ void pwc_dec23_decompress(struct pwc_device *pdev, const void *src, diff --git a/drivers/media/usb/ttusb-budget/dvb-ttusb-budget.c b/drivers/media/usb/ttusb-budget/dvb-ttusb-budget.c index b842f367249f..a142b9dc0feb 100644 --- a/drivers/media/usb/ttusb-budget/dvb-ttusb-budget.c +++ b/drivers/media/usb/ttusb-budget/dvb-ttusb-budget.c @@ -76,7 +76,7 @@ DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); #define TTUSB_REV_2_2 0x22 #define TTUSB_BUDGET_NAME "ttusb_stc_fw" -/** +/* * since we're casting (struct ttusb*) <-> (struct dvb_demux*) around * the dvb_demux field must be the first in struct!! 
*/ @@ -713,7 +713,7 @@ static void ttusb_process_frame(struct ttusb *ttusb, u8 * data, int len) } } - /** + /* * if length is valid and we reached the end: * goto next muxpack */ @@ -729,7 +729,7 @@ static void ttusb_process_frame(struct ttusb *ttusb, u8 * data, int len) } } /* maximum bytes, until we know the length */ ttusb->muxpack_len = 2; - /** + /* * no muxpacks left? * return to search-sync state */ From 74df1446135f3ca1ef36db63d3d144340839fe48 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 19 Nov 2017 16:57:27 -0500 Subject: [PATCH 0540/1640] UPSTREAM: media: rc: partial revert of "media: rc: per-protocol repeat period" Since commit d57ea877af38 ("media: rc: per-protocol repeat period"), most IR protocols have a lower keyup timeout. This causes problems on the ite-cir, which has a default IR timeout of 200ms. Since the IR decoders read the trailing space, with an IR timeout of 200ms, the last keydown will have at least a delay of 200ms. This is more than the protocol timeout of e.g. rc-6 (which is 164ms). As a result the last IR will be interpreted as a new keydown event, and we get two keypresses. Revert the protocol timeout to 250ms, except for cec which needs a timeout of 550ms. Fixes: d57ea877af38 ("media: rc: per-protocol repeat period") Cc: # 4.14 Reported-by: Matthias Reichl Signed-off-by: Sean Young Tested-by: Matthias Reichl Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index c4b0217bd169..c144b77eac98 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -39,41 +39,41 @@ static const struct { [RC_PROTO_UNKNOWN] = { .name = "unknown", .repeat_period = 250 }, [RC_PROTO_OTHER] = { .name = "other", .repeat_period = 250 }, [RC_PROTO_RC5] = { .name = "rc-5", - .scancode_bits = 0x1f7f, .repeat_period = 164 }, + .scancode_bits = 0x1f7f, .repeat_period = 250 }, [RC_PROTO_RC5X_20] = { .name = "rc-5x-20", - .scancode_bits = 0x1f7f3f, .repeat_period = 164 }, + .scancode_bits = 0x1f7f3f, .repeat_period = 250 }, [RC_PROTO_RC5_SZ] = { .name = "rc-5-sz", - .scancode_bits = 0x2fff, .repeat_period = 164 }, + .scancode_bits = 0x2fff, .repeat_period = 250 }, [RC_PROTO_JVC] = { .name = "jvc", .scancode_bits = 0xffff, .repeat_period = 250 }, [RC_PROTO_SONY12] = { .name = "sony-12", - .scancode_bits = 0x1f007f, .repeat_period = 100 }, + .scancode_bits = 0x1f007f, .repeat_period = 250 }, [RC_PROTO_SONY15] = { .name = "sony-15", - .scancode_bits = 0xff007f, .repeat_period = 100 }, + .scancode_bits = 0xff007f, .repeat_period = 250 }, [RC_PROTO_SONY20] = { .name = "sony-20", - .scancode_bits = 0x1fff7f, .repeat_period = 100 }, + .scancode_bits = 0x1fff7f, .repeat_period = 250 }, [RC_PROTO_NEC] = { .name = "nec", - .scancode_bits = 0xffff, .repeat_period = 160 }, + .scancode_bits = 0xffff, .repeat_period = 250 }, [RC_PROTO_NECX] = { .name = "nec-x", - .scancode_bits = 0xffffff, .repeat_period = 160 }, + .scancode_bits = 0xffffff, .repeat_period = 250 }, [RC_PROTO_NEC32] = { .name = "nec-32", - .scancode_bits = 0xffffffff, .repeat_period = 160 }, + .scancode_bits = 0xffffffff, .repeat_period = 250 }, [RC_PROTO_SANYO] = { .name = "sanyo", .scancode_bits = 0x1fffff, .repeat_period = 250 }, [RC_PROTO_MCIR2_KBD] = { .name = "mcir2-kbd", - .scancode_bits = 0xffff, .repeat_period = 150 }, + .scancode_bits = 0xffff, .repeat_period = 250 }, [RC_PROTO_MCIR2_MSE] = { .name = "mcir2-mse", - .scancode_bits =
0x1fffff, .repeat_period = 150 }, + .scancode_bits = 0x1fffff, .repeat_period = 250 }, [RC_PROTO_RC6_0] = { .name = "rc-6-0", - .scancode_bits = 0xffff, .repeat_period = 164 }, + .scancode_bits = 0xffff, .repeat_period = 250 }, [RC_PROTO_RC6_6A_20] = { .name = "rc-6-6a-20", - .scancode_bits = 0xfffff, .repeat_period = 164 }, + .scancode_bits = 0xfffff, .repeat_period = 250 }, [RC_PROTO_RC6_6A_24] = { .name = "rc-6-6a-24", - .scancode_bits = 0xffffff, .repeat_period = 164 }, + .scancode_bits = 0xffffff, .repeat_period = 250 }, [RC_PROTO_RC6_6A_32] = { .name = "rc-6-6a-32", - .scancode_bits = 0xffffffff, .repeat_period = 164 }, + .scancode_bits = 0xffffffff, .repeat_period = 250 }, [RC_PROTO_RC6_MCE] = { .name = "rc-6-mce", - .scancode_bits = 0xffff7fff, .repeat_period = 164 }, + .scancode_bits = 0xffff7fff, .repeat_period = 250 }, [RC_PROTO_SHARP] = { .name = "sharp", .scancode_bits = 0x1fff, .repeat_period = 250 }, [RC_PROTO_XMP] = { .name = "xmp", .repeat_period = 250 }, From 910738c7087744c9d995e7630ca6e7b6fb9393d1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 Dec 2017 08:47:08 -0500 Subject: [PATCH 0541/1640] UPSTREAM: media: rc: add SPDX identifiers to the code I wrote As we're now using SPDX identifiers, on the several media drivers I wrote, add the proper SPDX, identifying the license I meant. As we're now using the short license, it doesn't make sense to keep the original license text. Also, fix MODULE_LICENSE to properly identify GPL v2. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Philippe Ombredanne Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-nec-decoder.c | 19 ++++----------- drivers/media/rc/ir-rc5-decoder.c | 21 +++++----------- drivers/media/rc/ir-sanyo-decoder.c | 37 +++++++++++------------------ drivers/media/rc/rc-core-priv.h | 18 ++++---------- drivers/media/rc/rc-ir-raw.c | 17 ++++--------- drivers/media/rc/rc-main.c | 19 ++++----------- 6 files changed, 38 insertions(+), 93 deletions(-) diff --git a/drivers/media/rc/ir-nec-decoder.c b/drivers/media/rc/ir-nec-decoder.c index 6880c190dcd2..22eed9505244 100644 --- a/drivers/media/rc/ir-nec-decoder.c +++ b/drivers/media/rc/ir-nec-decoder.c @@ -1,16 +1,7 @@ -/* ir-nec-decoder.c - handle NEC IR Pulse/Space protocol - * - * Copyright (C) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ +// SPDX-License-Identifier: GPL-2.0 +// ir-nec-decoder.c - handle NEC IR Pulse/Space protocol +// +// Copyright (C) 2010 by Mauro Carvalho Chehab #include #include @@ -281,7 +272,7 @@ static void __exit ir_nec_decode_exit(void) module_init(ir_nec_decode_init); module_exit(ir_nec_decode_exit); -MODULE_LICENSE("GPL"); +MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Mauro Carvalho Chehab"); MODULE_AUTHOR("Red Hat Inc. 
(http://www.redhat.com)"); MODULE_DESCRIPTION("NEC IR protocol decoder"); diff --git a/drivers/media/rc/ir-rc5-decoder.c b/drivers/media/rc/ir-rc5-decoder.c index 1292f534de43..cbff3e26d481 100644 --- a/drivers/media/rc/ir-rc5-decoder.c +++ b/drivers/media/rc/ir-rc5-decoder.c @@ -1,17 +1,8 @@ -/* ir-rc5-decoder.c - decoder for RC5(x) and StreamZap protocols - * - * Copyright (C) 2010 by Mauro Carvalho Chehab - * Copyright (C) 2010 by Jarod Wilson - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ +// SPDX-License-Identifier: GPL-2.0 +// ir-rc5-decoder.c - decoder for RC5(x) and StreamZap protocols +// +// Copyright (C) 2010 by Mauro Carvalho Chehab +// Copyright (C) 2010 by Jarod Wilson /* * This decoder handles the 14 bit RC5 protocol, 15 bit "StreamZap" protocol @@ -300,7 +291,7 @@ static void __exit ir_rc5_decode_exit(void) module_init(ir_rc5_decode_init); module_exit(ir_rc5_decode_exit); -MODULE_LICENSE("GPL"); +MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Mauro Carvalho Chehab and Jarod Wilson"); MODULE_AUTHOR("Red Hat Inc. (http://www.redhat.com)"); MODULE_DESCRIPTION("RC5(x/sz) IR protocol decoder"); diff --git a/drivers/media/rc/ir-sanyo-decoder.c b/drivers/media/rc/ir-sanyo-decoder.c index d94e07b02f3b..2138f0e9472d 100644 --- a/drivers/media/rc/ir-sanyo-decoder.c +++ b/drivers/media/rc/ir-sanyo-decoder.c @@ -1,25 +1,16 @@ -/* ir-sanyo-decoder.c - handle SANYO IR Pulse/Space protocol - * - * Copyright (C) 2011 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * This protocol uses the NEC protocol timings. However, data is formatted as: - * 13 bits Custom Code - * 13 bits NOT(Custom Code) - * 8 bits Key data - * 8 bits NOT(Key data) - * - * According with LIRC, this protocol is used on Sanyo, Aiwa and Chinon - * Information for this protocol is available at the Sanyo LC7461 datasheet. - */ +// SPDX-License-Identifier: GPL-2.0 +// ir-sanyo-decoder.c - handle SANYO IR Pulse/Space protocol +// +// Copyright (C) 2011 by Mauro Carvalho Chehab +// +// This protocol uses the NEC protocol timings. However, data is formatted as: +// 13 bits Custom Code +// 13 bits NOT(Custom Code) +// 8 bits Key data +// 8 bits NOT(Key data) +// +// According with LIRC, this protocol is used on Sanyo, Aiwa and Chinon +// Information for this protocol is available at the Sanyo LC7461 datasheet. #include #include @@ -236,7 +227,7 @@ static void __exit ir_sanyo_decode_exit(void) module_init(ir_sanyo_decode_init); module_exit(ir_sanyo_decode_exit); -MODULE_LICENSE("GPL"); +MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Mauro Carvalho Chehab"); MODULE_AUTHOR("Red Hat Inc. 
(http://www.redhat.com)"); MODULE_DESCRIPTION("SANYO IR protocol decoder"); diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index ae4dd0c27731..564d6e13585e 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -1,17 +1,7 @@ -/* - * Remote Controller core raw events header - * - * Copyright (C) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ +// SPDX-License-Identifier: GPL-2.0 +// Remote Controller core raw events header +// +// Copyright (C) 2010 by Mauro Carvalho Chehab #ifndef _RC_CORE_PRIV #define _RC_CORE_PRIV diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index d78483a504c9..0616eee564a8 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -1,16 +1,7 @@ -/* rc-ir-raw.c - handle IR pulse/space events - * - * Copyright (C) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ +// SPDX-License-Identifier: GPL-2.0 +// rc-ir-raw.c - handle IR pulse/space events +// +// Copyright (C) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index c144b77eac98..372f4d61cb48 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1,16 +1,7 @@ -/* rc-main.c - Remote Controller core module - * - * Copyright (C) 2009-2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ +// SPDX-License-Identifier: GPL-2.0 +// rc-main.c - Remote Controller core module +// +// Copyright (C) 2009-2010 by Mauro Carvalho Chehab #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -1905,4 +1896,4 @@ EXPORT_SYMBOL_GPL(rc_core_debug); module_param_named(debug, rc_core_debug, int, 0644); MODULE_AUTHOR("Mauro Carvalho Chehab"); -MODULE_LICENSE("GPL"); +MODULE_LICENSE("GPL v2"); From 2bdafac1000e03c0e467172fc71e56619d576878 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 Dec 2017 08:47:11 -0500 Subject: [PATCH 0542/1640] UPSTREAM: media: rc keymaps: add SPDX identifiers to the code I wrote As we're now using SPDX identifiers, on the several RC keymap files I wrote, add the proper SPDX, identifying the license I meant. 
As we're now using the short license, it doesn't make sense to keep the original license text. Also, fix MODULE_LICENSE to identify GPL v2, as this is the minimal license requirement for those modules. Signed-off-by: Mauro Carvalho Chehab --- .../media/rc/keymaps/rc-adstech-dvb-t-pci.c | 17 ++++------- drivers/media/rc/keymaps/rc-apac-viewcomp.c | 17 ++++------- drivers/media/rc/keymaps/rc-asus-pc39.c | 17 ++++------- drivers/media/rc/keymaps/rc-asus-ps3-100.c | 17 ++++------- .../rc/keymaps/rc-ati-tv-wonder-hd-600.c | 17 ++++------- drivers/media/rc/keymaps/rc-avermedia-a16d.c | 17 ++++------- .../media/rc/keymaps/rc-avermedia-cardbus.c | 17 ++++------- drivers/media/rc/keymaps/rc-avermedia-dvbt.c | 17 ++++------- drivers/media/rc/keymaps/rc-avermedia-m135a.c | 15 ++++------ drivers/media/rc/keymaps/rc-avermedia.c | 17 ++++------- drivers/media/rc/keymaps/rc-avertv-303.c | 17 ++++------- drivers/media/rc/keymaps/rc-behold-columbus.c | 17 ++++------- drivers/media/rc/keymaps/rc-behold.c | 17 ++++------- drivers/media/rc/keymaps/rc-budget-ci-old.c | 17 ++++------- drivers/media/rc/keymaps/rc-cinergy-1400.c | 17 ++++------- drivers/media/rc/keymaps/rc-cinergy.c | 17 ++++------- drivers/media/rc/keymaps/rc-dib0700-nec.c | 27 +++++++---------- drivers/media/rc/keymaps/rc-dib0700-rc5.c | 27 +++++++---------- drivers/media/rc/keymaps/rc-dm1105-nec.c | 17 ++++------- drivers/media/rc/keymaps/rc-dntv-live-dvb-t.c | 17 ++++------- .../media/rc/keymaps/rc-dntv-live-dvbt-pro.c | 17 ++++------- drivers/media/rc/keymaps/rc-em-terratec.c | 17 ++++------- .../media/rc/keymaps/rc-encore-enltv-fm53.c | 17 ++++------- drivers/media/rc/keymaps/rc-encore-enltv.c | 17 ++++------- drivers/media/rc/keymaps/rc-encore-enltv2.c | 17 ++++------- drivers/media/rc/keymaps/rc-evga-indtube.c | 17 ++++------- drivers/media/rc/keymaps/rc-eztv.c | 17 ++++------- drivers/media/rc/keymaps/rc-flydvb.c | 17 ++++------- drivers/media/rc/keymaps/rc-flyvideo.c | 17 ++++------- drivers/media/rc/keymaps/rc-fusionhdtv-mce.c | 17 ++++------- drivers/media/rc/keymaps/rc-gadmei-rm008z.c | 17 ++++------- .../media/rc/keymaps/rc-genius-tvgo-a11mce.c | 17 ++++------- drivers/media/rc/keymaps/rc-gotview7135.c | 17 ++++------- drivers/media/rc/keymaps/rc-hauppauge.c | 29 ++++++++----------- drivers/media/rc/keymaps/rc-iodata-bctv7e.c | 17 ++++------- drivers/media/rc/keymaps/rc-kaiomy.c | 17 ++++------- drivers/media/rc/keymaps/rc-kworld-315u.c | 17 ++++------- .../rc/keymaps/rc-kworld-plus-tv-analog.c | 17 ++++------- drivers/media/rc/keymaps/rc-manli.c | 17 ++++------- .../media/rc/keymaps/rc-msi-tvanywhere-plus.c | 17 ++++------- drivers/media/rc/keymaps/rc-msi-tvanywhere.c | 17 ++++------- drivers/media/rc/keymaps/rc-nebula.c | 17 ++++------- .../rc/keymaps/rc-nec-terratec-cinergy-xs.c | 17 ++++------- drivers/media/rc/keymaps/rc-norwood.c | 17 ++++------- drivers/media/rc/keymaps/rc-npgtech.c | 17 ++++------- drivers/media/rc/keymaps/rc-pctv-sedna.c | 17 ++++------- drivers/media/rc/keymaps/rc-pinnacle-color.c | 17 ++++------- drivers/media/rc/keymaps/rc-pinnacle-grey.c | 17 ++++------- .../media/rc/keymaps/rc-pinnacle-pctv-hd.c | 17 ++++------- drivers/media/rc/keymaps/rc-pixelview-002t.c | 17 ++++------- drivers/media/rc/keymaps/rc-pixelview-mk12.c | 17 ++++------- drivers/media/rc/keymaps/rc-pixelview-new.c | 17 ++++------- drivers/media/rc/keymaps/rc-pixelview.c | 17 ++++------- .../rc/keymaps/rc-powercolor-real-angel.c | 17 ++++------- drivers/media/rc/keymaps/rc-proteus-2309.c | 17 ++++-------
drivers/media/rc/keymaps/rc-purpletv.c | 17 ++++------- drivers/media/rc/keymaps/rc-pv951.c | 17 ++++------- .../rc/keymaps/rc-real-audio-220-32-keys.c | 17 ++++------- drivers/media/rc/keymaps/rc-tbs-nec.c | 17 ++++------- .../media/rc/keymaps/rc-terratec-cinergy-xs.c | 17 ++++------- drivers/media/rc/keymaps/rc-tevii-nec.c | 17 ++++------- drivers/media/rc/keymaps/rc-tt-1500.c | 17 ++++------- drivers/media/rc/keymaps/rc-videomate-s350.c | 17 ++++------- .../media/rc/keymaps/rc-videomate-tv-pvr.c | 17 ++++------- .../rc/keymaps/rc-winfast-usbii-deluxe.c | 17 ++++------- drivers/media/rc/keymaps/rc-winfast.c | 17 ++++------- 66 files changed, 411 insertions(+), 741 deletions(-) diff --git a/drivers/media/rc/keymaps/rc-adstech-dvb-t-pci.c b/drivers/media/rc/keymaps/rc-adstech-dvb-t-pci.c index 2d303c2cee3b..732687ce0637 100644 --- a/drivers/media/rc/keymaps/rc-adstech-dvb-t-pci.c +++ b/drivers/media/rc/keymaps/rc-adstech-dvb-t-pci.c @@ -1,14 +1,9 @@ -/* adstech-dvb-t-pci.h - Keytable for adstech_dvb_t_pci Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// adstech-dvb-t-pci.h - Keytable for adstech_dvb_t_pci Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-apac-viewcomp.c b/drivers/media/rc/keymaps/rc-apac-viewcomp.c index 65bc8957d9c3..af2e7fdc7b85 100644 --- a/drivers/media/rc/keymaps/rc-apac-viewcomp.c +++ b/drivers/media/rc/keymaps/rc-apac-viewcomp.c @@ -1,14 +1,9 @@ -/* apac-viewcomp.h - Keytable for apac_viewcomp Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// apac-viewcomp.h - Keytable for apac_viewcomp Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-asus-pc39.c b/drivers/media/rc/keymaps/rc-asus-pc39.c index 530e1d1158d1..13a935c3ac59 100644 --- a/drivers/media/rc/keymaps/rc-asus-pc39.c +++ b/drivers/media/rc/keymaps/rc-asus-pc39.c @@ -1,14 +1,9 @@ -/* asus-pc39.h - Keytable for asus_pc39 Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// asus-pc39.h - Keytable for asus_pc39 Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-asus-ps3-100.c b/drivers/media/rc/keymaps/rc-asus-ps3-100.c index c91ba332984c..7f836fcc68ac 100644 --- a/drivers/media/rc/keymaps/rc-asus-ps3-100.c +++ b/drivers/media/rc/keymaps/rc-asus-ps3-100.c @@ -1,14 +1,9 @@ -/* asus-ps3-100.h - Keytable for asus_ps3_100 Remote Controller - * - * Copyright (c) 2012 by Mauro Carvalho Chehab - * - * Based on a previous patch from Remi Schwartz - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// asus-ps3-100.h - Keytable for asus_ps3_100 Remote Controller +// +// Copyright (c) 2012 by Mauro Carvalho Chehab +// +// Based on a previous patch from Remi Schwartz #include #include diff --git a/drivers/media/rc/keymaps/rc-ati-tv-wonder-hd-600.c b/drivers/media/rc/keymaps/rc-ati-tv-wonder-hd-600.c index 11b4bdd2392b..b4b7932c0c5a 100644 --- a/drivers/media/rc/keymaps/rc-ati-tv-wonder-hd-600.c +++ b/drivers/media/rc/keymaps/rc-ati-tv-wonder-hd-600.c @@ -1,14 +1,9 @@ -/* ati-tv-wonder-hd-600.h - Keytable for ati_tv_wonder_hd_600 Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// ati-tv-wonder-hd-600.h - Keytable for ati_tv_wonder_hd_600 Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-avermedia-a16d.c b/drivers/media/rc/keymaps/rc-avermedia-a16d.c index 510dc90ebf49..5549c043cfe4 100644 --- a/drivers/media/rc/keymaps/rc-avermedia-a16d.c +++ b/drivers/media/rc/keymaps/rc-avermedia-a16d.c @@ -1,14 +1,9 @@ -/* avermedia-a16d.h - Keytable for avermedia_a16d Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// avermedia-a16d.h - Keytable for avermedia_a16d Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-avermedia-cardbus.c b/drivers/media/rc/keymaps/rc-avermedia-cardbus.c index 4bbc1e68d1b8..74edcd82e685 100644 --- a/drivers/media/rc/keymaps/rc-avermedia-cardbus.c +++ b/drivers/media/rc/keymaps/rc-avermedia-cardbus.c @@ -1,14 +1,9 @@ -/* avermedia-cardbus.h - Keytable for avermedia_cardbus Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// avermedia-cardbus.h - Keytable for avermedia_cardbus Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-avermedia-dvbt.c b/drivers/media/rc/keymaps/rc-avermedia-dvbt.c index f6b8547dbad3..796184160a48 100644 --- a/drivers/media/rc/keymaps/rc-avermedia-dvbt.c +++ b/drivers/media/rc/keymaps/rc-avermedia-dvbt.c @@ -1,14 +1,9 @@ -/* avermedia-dvbt.h - Keytable for avermedia_dvbt Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// avermedia-dvbt.h - Keytable for avermedia_dvbt Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-avermedia-m135a.c b/drivers/media/rc/keymaps/rc-avermedia-m135a.c index 6d5a73b7ccec..f6977df1a75b 100644 --- a/drivers/media/rc/keymaps/rc-avermedia-m135a.c +++ b/drivers/media/rc/keymaps/rc-avermedia-m135a.c @@ -1,13 +1,8 @@ -/* avermedia-m135a.c - Keytable for Avermedia M135A Remote Controllers - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * Copyright (c) 2010 by Herton Ronaldo Krzesinski - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// avermedia-m135a.c - Keytable for Avermedia M135A Remote Controllers +// +// Copyright (c) 2010 by Mauro Carvalho Chehab +// Copyright (c) 2010 by Herton Ronaldo Krzesinski #include #include diff --git a/drivers/media/rc/keymaps/rc-avermedia.c b/drivers/media/rc/keymaps/rc-avermedia.c index 6503f11c7df5..631ff52564f0 100644 --- a/drivers/media/rc/keymaps/rc-avermedia.c +++ b/drivers/media/rc/keymaps/rc-avermedia.c @@ -1,14 +1,9 @@ -/* avermedia.h - Keytable for avermedia Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// avermedia.h - Keytable for avermedia Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-avertv-303.c b/drivers/media/rc/keymaps/rc-avertv-303.c index fbdd7ada57ce..47ca8b7ea532 100644 --- a/drivers/media/rc/keymaps/rc-avertv-303.c +++ b/drivers/media/rc/keymaps/rc-avertv-303.c @@ -1,14 +1,9 @@ -/* avertv-303.h - Keytable for avertv_303 Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// avertv-303.h - Keytable for avertv_303 Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-behold-columbus.c b/drivers/media/rc/keymaps/rc-behold-columbus.c index d256743be998..61f679fec45c 100644 --- a/drivers/media/rc/keymaps/rc-behold-columbus.c +++ b/drivers/media/rc/keymaps/rc-behold-columbus.c @@ -1,14 +1,9 @@ -/* behold-columbus.h - Keytable for behold_columbus Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// behold-columbus.h - Keytable for behold_columbus Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-behold.c b/drivers/media/rc/keymaps/rc-behold.c index 93dc795adc67..9b1b57e3c875 100644 --- a/drivers/media/rc/keymaps/rc-behold.c +++ b/drivers/media/rc/keymaps/rc-behold.c @@ -1,14 +1,9 @@ -/* behold.h - Keytable for behold Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// behold.h - Keytable for behold Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-budget-ci-old.c b/drivers/media/rc/keymaps/rc-budget-ci-old.c index 81ea1424d9e5..56f051af6154 100644 --- a/drivers/media/rc/keymaps/rc-budget-ci-old.c +++ b/drivers/media/rc/keymaps/rc-budget-ci-old.c @@ -1,14 +1,9 @@ -/* budget-ci-old.h - Keytable for budget_ci_old Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// budget-ci-old.h - Keytable for budget_ci_old Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-cinergy-1400.c b/drivers/media/rc/keymaps/rc-cinergy-1400.c index bcb96b3dda85..dacb13c53bb4 100644 --- a/drivers/media/rc/keymaps/rc-cinergy-1400.c +++ b/drivers/media/rc/keymaps/rc-cinergy-1400.c @@ -1,14 +1,9 @@ -/* cinergy-1400.h - Keytable for cinergy_1400 Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// cinergy-1400.h - Keytable for cinergy_1400 Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-cinergy.c b/drivers/media/rc/keymaps/rc-cinergy.c index fd56c402aae5..6ab2e51b764d 100644 --- a/drivers/media/rc/keymaps/rc-cinergy.c +++ b/drivers/media/rc/keymaps/rc-cinergy.c @@ -1,14 +1,9 @@ -/* cinergy.h - Keytable for cinergy Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// cinergy.h - Keytable for cinergy Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-dib0700-nec.c b/drivers/media/rc/keymaps/rc-dib0700-nec.c index 1b4df106b7b5..4ee801acb089 100644 --- a/drivers/media/rc/keymaps/rc-dib0700-nec.c +++ b/drivers/media/rc/keymaps/rc-dib0700-nec.c @@ -1,19 +1,14 @@ -/* rc-dvb0700-big.c - Keytable for devices in dvb0700 - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * TODO: This table is a real mess, as it merges RC codes from several - * devices into a big table. It also has both RC-5 and NEC codes inside. - * It should be broken into small tables, and the protocols should properly - * be identificated. - * - * The table were imported from dib0700_devices.c. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// rc-dvb0700-big.c - Keytable for devices in dvb0700 +// +// Copyright (c) 2010 by Mauro Carvalho Chehab +// +// TODO: This table is a real mess, as it merges RC codes from several +// devices into a big table. It also has both RC-5 and NEC codes inside. +// It should be broken into small tables, and the protocols should properly +// be identificated. +// +// The table were imported from dib0700_devices.c. #include #include diff --git a/drivers/media/rc/keymaps/rc-dib0700-rc5.c b/drivers/media/rc/keymaps/rc-dib0700-rc5.c index b0f8151bb824..ef4085a0fda3 100644 --- a/drivers/media/rc/keymaps/rc-dib0700-rc5.c +++ b/drivers/media/rc/keymaps/rc-dib0700-rc5.c @@ -1,19 +1,14 @@ -/* rc-dvb0700-big.c - Keytable for devices in dvb0700 - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * TODO: This table is a real mess, as it merges RC codes from several - * devices into a big table. It also has both RC-5 and NEC codes inside. - * It should be broken into small tables, and the protocols should properly - * be identificated. - * - * The table were imported from dib0700_devices.c. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// rc-dvb0700-big.c - Keytable for devices in dvb0700 +// +// Copyright (c) 2010 by Mauro Carvalho Chehab +// +// TODO: This table is a real mess, as it merges RC codes from several +// devices into a big table. It also has both RC-5 and NEC codes inside. +// It should be broken into small tables, and the protocols should properly +// be identificated. +// +// The table were imported from dib0700_devices.c. #include #include diff --git a/drivers/media/rc/keymaps/rc-dm1105-nec.c b/drivers/media/rc/keymaps/rc-dm1105-nec.c index c353445d10ed..d853cd9a0936 100644 --- a/drivers/media/rc/keymaps/rc-dm1105-nec.c +++ b/drivers/media/rc/keymaps/rc-dm1105-nec.c @@ -1,14 +1,9 @@ -/* dm1105-nec.h - Keytable for dm1105_nec Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// dm1105-nec.h - Keytable for dm1105_nec Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-dntv-live-dvb-t.c b/drivers/media/rc/keymaps/rc-dntv-live-dvb-t.c index 5bafd5b70f5e..cdc1d8c990cb 100644 --- a/drivers/media/rc/keymaps/rc-dntv-live-dvb-t.c +++ b/drivers/media/rc/keymaps/rc-dntv-live-dvb-t.c @@ -1,14 +1,9 @@ -/* dntv-live-dvb-t.h - Keytable for dntv_live_dvb_t Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// dntv-live-dvb-t.h - Keytable for dntv_live_dvb_t Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-dntv-live-dvbt-pro.c b/drivers/media/rc/keymaps/rc-dntv-live-dvbt-pro.c index 360167c8829b..38e1d1b837da 100644 --- a/drivers/media/rc/keymaps/rc-dntv-live-dvbt-pro.c +++ b/drivers/media/rc/keymaps/rc-dntv-live-dvbt-pro.c @@ -1,14 +1,9 @@ -/* dntv-live-dvbt-pro.h - Keytable for dntv_live_dvbt_pro Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// dntv-live-dvbt-pro.h - Keytable for dntv_live_dvbt_pro Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-em-terratec.c b/drivers/media/rc/keymaps/rc-em-terratec.c index 18e1a2679c20..cbbba21484fb 100644 --- a/drivers/media/rc/keymaps/rc-em-terratec.c +++ b/drivers/media/rc/keymaps/rc-em-terratec.c @@ -1,14 +1,9 @@ -/* em-terratec.h - Keytable for em_terratec Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// em-terratec.h - Keytable for em_terratec Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-encore-enltv-fm53.c b/drivers/media/rc/keymaps/rc-encore-enltv-fm53.c index 72ffd5cb0108..e4e78c1f4123 100644 --- a/drivers/media/rc/keymaps/rc-encore-enltv-fm53.c +++ b/drivers/media/rc/keymaps/rc-encore-enltv-fm53.c @@ -1,14 +1,9 @@ -/* encore-enltv-fm53.h - Keytable for encore_enltv_fm53 Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// encore-enltv-fm53.h - Keytable for encore_enltv_fm53 Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-encore-enltv.c b/drivers/media/rc/keymaps/rc-encore-enltv.c index e0381e7aa964..5b4e832d5fac 100644 --- a/drivers/media/rc/keymaps/rc-encore-enltv.c +++ b/drivers/media/rc/keymaps/rc-encore-enltv.c @@ -1,14 +1,9 @@ -/* encore-enltv.h - Keytable for encore_enltv Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// encore-enltv.h - Keytable for encore_enltv Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-encore-enltv2.c b/drivers/media/rc/keymaps/rc-encore-enltv2.c index e9b0bfba319c..c3d4437a6fda 100644 --- a/drivers/media/rc/keymaps/rc-encore-enltv2.c +++ b/drivers/media/rc/keymaps/rc-encore-enltv2.c @@ -1,14 +1,9 @@ -/* encore-enltv2.h - Keytable for encore_enltv2 Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// encore-enltv2.h - Keytable for encore_enltv2 Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-evga-indtube.c b/drivers/media/rc/keymaps/rc-evga-indtube.c index b77c5e908668..f4398444330b 100644 --- a/drivers/media/rc/keymaps/rc-evga-indtube.c +++ b/drivers/media/rc/keymaps/rc-evga-indtube.c @@ -1,14 +1,9 @@ -/* evga-indtube.h - Keytable for evga_indtube Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// evga-indtube.h - Keytable for evga_indtube Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-eztv.c b/drivers/media/rc/keymaps/rc-eztv.c index 5013b3b2aa93..0e481d51fcb5 100644 --- a/drivers/media/rc/keymaps/rc-eztv.c +++ b/drivers/media/rc/keymaps/rc-eztv.c @@ -1,14 +1,9 @@ -/* eztv.h - Keytable for eztv Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// eztv.h - Keytable for eztv Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-flydvb.c b/drivers/media/rc/keymaps/rc-flydvb.c index 418b32521273..45940d7c92d0 100644 --- a/drivers/media/rc/keymaps/rc-flydvb.c +++ b/drivers/media/rc/keymaps/rc-flydvb.c @@ -1,14 +1,9 @@ -/* flydvb.h - Keytable for flydvb Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// flydvb.h - Keytable for flydvb Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-flyvideo.c b/drivers/media/rc/keymaps/rc-flyvideo.c index 93fb87ecf061..b2d4e4c7b192 100644 --- a/drivers/media/rc/keymaps/rc-flyvideo.c +++ b/drivers/media/rc/keymaps/rc-flyvideo.c @@ -1,14 +1,9 @@ -/* flyvideo.h - Keytable for flyvideo Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// flyvideo.h - Keytable for flyvideo Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-fusionhdtv-mce.c b/drivers/media/rc/keymaps/rc-fusionhdtv-mce.c index 9ed3f749262b..1c63fc7d4576 100644 --- a/drivers/media/rc/keymaps/rc-fusionhdtv-mce.c +++ b/drivers/media/rc/keymaps/rc-fusionhdtv-mce.c @@ -1,14 +1,9 @@ -/* fusionhdtv-mce.h - Keytable for fusionhdtv_mce Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// fusionhdtv-mce.h - Keytable for fusionhdtv_mce Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-gadmei-rm008z.c b/drivers/media/rc/keymaps/rc-gadmei-rm008z.c index 3443b721d092..4a0a9786914f 100644 --- a/drivers/media/rc/keymaps/rc-gadmei-rm008z.c +++ b/drivers/media/rc/keymaps/rc-gadmei-rm008z.c @@ -1,14 +1,9 @@ -/* gadmei-rm008z.h - Keytable for gadmei_rm008z Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// gadmei-rm008z.h - Keytable for gadmei_rm008z Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-genius-tvgo-a11mce.c b/drivers/media/rc/keymaps/rc-genius-tvgo-a11mce.c index d140e8d45bcc..cc876a85cc31 100644 --- a/drivers/media/rc/keymaps/rc-genius-tvgo-a11mce.c +++ b/drivers/media/rc/keymaps/rc-genius-tvgo-a11mce.c @@ -1,14 +1,9 @@ -/* genius-tvgo-a11mce.h - Keytable for genius_tvgo_a11mce Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// genius-tvgo-a11mce.h - Keytable for genius_tvgo_a11mce Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-gotview7135.c b/drivers/media/rc/keymaps/rc-gotview7135.c index 51230fbb52ba..6b94bd39d977 100644 --- a/drivers/media/rc/keymaps/rc-gotview7135.c +++ b/drivers/media/rc/keymaps/rc-gotview7135.c @@ -1,14 +1,9 @@ -/* gotview7135.h - Keytable for gotview7135 Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// gotview7135.h - Keytable for gotview7135 Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-hauppauge.c b/drivers/media/rc/keymaps/rc-hauppauge.c index 890164b68d64..582aa9012443 100644 --- a/drivers/media/rc/keymaps/rc-hauppauge.c +++ b/drivers/media/rc/keymaps/rc-hauppauge.c @@ -1,20 +1,15 @@ -/* rc-hauppauge.c - Keytable for Hauppauge Remote Controllers - * - * keymap imported from ir-keymaps.c - * - * This map currently contains the code for four different RCs: - * - New Hauppauge Gray; - * - Old Hauppauge Gray (with a golden screen for media keys); - * - Hauppauge Black; - * - DSR-0112 remote bundled with Haupauge MiniStick. - * - * Copyright (c) 2010-2011 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// rc-hauppauge.c - Keytable for Hauppauge Remote Controllers +// +// keymap imported from ir-keymaps.c +// +// This map currently contains the code for four different RCs: +// - New Hauppauge Gray; +// - Old Hauppauge Gray (with a golden screen for media keys); +// - Hauppauge Black; +// - DSR-0112 remote bundled with Haupauge MiniStick. +// +// Copyright (c) 2010-2011 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-iodata-bctv7e.c b/drivers/media/rc/keymaps/rc-iodata-bctv7e.c index 8cf87a15c4f2..6ced43458f2a 100644 --- a/drivers/media/rc/keymaps/rc-iodata-bctv7e.c +++ b/drivers/media/rc/keymaps/rc-iodata-bctv7e.c @@ -1,14 +1,9 @@ -/* iodata-bctv7e.h - Keytable for iodata_bctv7e Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// iodata-bctv7e.h - Keytable for iodata_bctv7e Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-kaiomy.c b/drivers/media/rc/keymaps/rc-kaiomy.c index e791f1e1b43b..f0f88df18606 100644 --- a/drivers/media/rc/keymaps/rc-kaiomy.c +++ b/drivers/media/rc/keymaps/rc-kaiomy.c @@ -1,14 +1,9 @@ -/* kaiomy.h - Keytable for kaiomy Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// kaiomy.h - Keytable for kaiomy Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-kworld-315u.c b/drivers/media/rc/keymaps/rc-kworld-315u.c index 71dce0138f0e..ed0e0586dea2 100644 --- a/drivers/media/rc/keymaps/rc-kworld-315u.c +++ b/drivers/media/rc/keymaps/rc-kworld-315u.c @@ -1,14 +1,9 @@ -/* kworld-315u.h - Keytable for kworld_315u Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// kworld-315u.h - Keytable for kworld_315u Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-kworld-plus-tv-analog.c b/drivers/media/rc/keymaps/rc-kworld-plus-tv-analog.c index e0322ed16c94..453e04377de7 100644 --- a/drivers/media/rc/keymaps/rc-kworld-plus-tv-analog.c +++ b/drivers/media/rc/keymaps/rc-kworld-plus-tv-analog.c @@ -1,14 +1,9 @@ -/* kworld-plus-tv-analog.h - Keytable for kworld_plus_tv_analog Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// kworld-plus-tv-analog.h - Keytable for kworld_plus_tv_analog Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-manli.c b/drivers/media/rc/keymaps/rc-manli.c index da566902a4dd..29c9feaf413b 100644 --- a/drivers/media/rc/keymaps/rc-manli.c +++ b/drivers/media/rc/keymaps/rc-manli.c @@ -1,14 +1,9 @@ -/* manli.h - Keytable for manli Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// manli.h - Keytable for manli Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-msi-tvanywhere-plus.c b/drivers/media/rc/keymaps/rc-msi-tvanywhere-plus.c index dfa0ed1d7667..78cf2c286083 100644 --- a/drivers/media/rc/keymaps/rc-msi-tvanywhere-plus.c +++ b/drivers/media/rc/keymaps/rc-msi-tvanywhere-plus.c @@ -1,14 +1,9 @@ -/* msi-tvanywhere-plus.h - Keytable for msi_tvanywhere_plus Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// msi-tvanywhere-plus.h - Keytable for msi_tvanywhere_plus Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-msi-tvanywhere.c b/drivers/media/rc/keymaps/rc-msi-tvanywhere.c index 2111816a3f59..359a57be3a66 100644 --- a/drivers/media/rc/keymaps/rc-msi-tvanywhere.c +++ b/drivers/media/rc/keymaps/rc-msi-tvanywhere.c @@ -1,14 +1,9 @@ -/* msi-tvanywhere.h - Keytable for msi_tvanywhere Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// msi-tvanywhere.h - Keytable for msi_tvanywhere Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-nebula.c b/drivers/media/rc/keymaps/rc-nebula.c index 109b6e1a8b1a..17d7c1b324da 100644 --- a/drivers/media/rc/keymaps/rc-nebula.c +++ b/drivers/media/rc/keymaps/rc-nebula.c @@ -1,14 +1,9 @@ -/* nebula.h - Keytable for nebula Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// nebula.h - Keytable for nebula Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-nec-terratec-cinergy-xs.c b/drivers/media/rc/keymaps/rc-nec-terratec-cinergy-xs.c index bb2d3a2962c0..76beef44a8d7 100644 --- a/drivers/media/rc/keymaps/rc-nec-terratec-cinergy-xs.c +++ b/drivers/media/rc/keymaps/rc-nec-terratec-cinergy-xs.c @@ -1,14 +1,9 @@ -/* nec-terratec-cinergy-xs.h - Keytable for nec_terratec_cinergy_xs Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// nec-terratec-cinergy-xs.h - Keytable for nec_terratec_cinergy_xs Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-norwood.c b/drivers/media/rc/keymaps/rc-norwood.c index cd25df336749..3765705c5549 100644 --- a/drivers/media/rc/keymaps/rc-norwood.c +++ b/drivers/media/rc/keymaps/rc-norwood.c @@ -1,14 +1,9 @@ -/* norwood.h - Keytable for norwood Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// norwood.h - Keytable for norwood Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-npgtech.c b/drivers/media/rc/keymaps/rc-npgtech.c index 140bbc20a764..abaf7f6d4cb7 100644 --- a/drivers/media/rc/keymaps/rc-npgtech.c +++ b/drivers/media/rc/keymaps/rc-npgtech.c @@ -1,14 +1,9 @@ -/* npgtech.h - Keytable for npgtech Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// npgtech.h - Keytable for npgtech Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-pctv-sedna.c b/drivers/media/rc/keymaps/rc-pctv-sedna.c index 52b4558b7bd0..e3462c5c8984 100644 --- a/drivers/media/rc/keymaps/rc-pctv-sedna.c +++ b/drivers/media/rc/keymaps/rc-pctv-sedna.c @@ -1,14 +1,9 @@ -/* pctv-sedna.h - Keytable for pctv_sedna Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// pctv-sedna.h - Keytable for pctv_sedna Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-pinnacle-color.c b/drivers/media/rc/keymaps/rc-pinnacle-color.c index 973c9c34e304..63c2851e9dfe 100644 --- a/drivers/media/rc/keymaps/rc-pinnacle-color.c +++ b/drivers/media/rc/keymaps/rc-pinnacle-color.c @@ -1,14 +1,9 @@ -/* pinnacle-color.h - Keytable for pinnacle_color Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// pinnacle-color.h - Keytable for pinnacle_color Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-pinnacle-grey.c b/drivers/media/rc/keymaps/rc-pinnacle-grey.c index 22e44b0d2a93..31794d4180db 100644 --- a/drivers/media/rc/keymaps/rc-pinnacle-grey.c +++ b/drivers/media/rc/keymaps/rc-pinnacle-grey.c @@ -1,14 +1,9 @@ -/* pinnacle-grey.h - Keytable for pinnacle_grey Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// pinnacle-grey.h - Keytable for pinnacle_grey Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-pinnacle-pctv-hd.c b/drivers/media/rc/keymaps/rc-pinnacle-pctv-hd.c index 186dcf8e0491..876aeb6e1d9c 100644 --- a/drivers/media/rc/keymaps/rc-pinnacle-pctv-hd.c +++ b/drivers/media/rc/keymaps/rc-pinnacle-pctv-hd.c @@ -1,14 +1,9 @@ -/* pinnacle-pctv-hd.h - Keytable for pinnacle_pctv_hd Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// pinnacle-pctv-hd.h - Keytable for pinnacle_pctv_hd Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-pixelview-002t.c b/drivers/media/rc/keymaps/rc-pixelview-002t.c index b235ada2e28f..4ed85f61d0ee 100644 --- a/drivers/media/rc/keymaps/rc-pixelview-002t.c +++ b/drivers/media/rc/keymaps/rc-pixelview-002t.c @@ -1,14 +1,9 @@ -/* rc-pixelview-mk12.h - Keytable for pixelview Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// rc-pixelview-mk12.h - Keytable for pixelview Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-pixelview-mk12.c b/drivers/media/rc/keymaps/rc-pixelview-mk12.c index 453d52d663fe..6ded64b732a5 100644 --- a/drivers/media/rc/keymaps/rc-pixelview-mk12.c +++ b/drivers/media/rc/keymaps/rc-pixelview-mk12.c @@ -1,14 +1,9 @@ -/* rc-pixelview-mk12.h - Keytable for pixelview Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// rc-pixelview-mk12.h - Keytable for pixelview Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-pixelview-new.c b/drivers/media/rc/keymaps/rc-pixelview-new.c index ef97095ec8f1..791130f108ff 100644 --- a/drivers/media/rc/keymaps/rc-pixelview-new.c +++ b/drivers/media/rc/keymaps/rc-pixelview-new.c @@ -1,14 +1,9 @@ -/* pixelview-new.h - Keytable for pixelview_new Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// pixelview-new.h - Keytable for pixelview_new Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-pixelview.c b/drivers/media/rc/keymaps/rc-pixelview.c index cfd8f80d3617..988919735165 100644 --- a/drivers/media/rc/keymaps/rc-pixelview.c +++ b/drivers/media/rc/keymaps/rc-pixelview.c @@ -1,14 +1,9 @@ -/* pixelview.h - Keytable for pixelview Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// pixelview.h - Keytable for pixelview Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-powercolor-real-angel.c b/drivers/media/rc/keymaps/rc-powercolor-real-angel.c index b63f82bcf29a..4988e71c524c 100644 --- a/drivers/media/rc/keymaps/rc-powercolor-real-angel.c +++ b/drivers/media/rc/keymaps/rc-powercolor-real-angel.c @@ -1,14 +1,9 @@ -/* powercolor-real-angel.h - Keytable for powercolor_real_angel Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// powercolor-real-angel.h - Keytable for powercolor_real_angel Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-proteus-2309.c b/drivers/media/rc/keymaps/rc-proteus-2309.c index be34c517e4e1..d2c13d0e7bff 100644 --- a/drivers/media/rc/keymaps/rc-proteus-2309.c +++ b/drivers/media/rc/keymaps/rc-proteus-2309.c @@ -1,14 +1,9 @@ -/* proteus-2309.h - Keytable for proteus_2309 Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// proteus-2309.h - Keytable for proteus_2309 Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-purpletv.c b/drivers/media/rc/keymaps/rc-purpletv.c index 84c40b97ee00..c8011f4d96ea 100644 --- a/drivers/media/rc/keymaps/rc-purpletv.c +++ b/drivers/media/rc/keymaps/rc-purpletv.c @@ -1,14 +1,9 @@ -/* purpletv.h - Keytable for purpletv Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// purpletv.h - Keytable for purpletv Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-pv951.c b/drivers/media/rc/keymaps/rc-pv951.c index be190ddebfc4..5235ee899c30 100644 --- a/drivers/media/rc/keymaps/rc-pv951.c +++ b/drivers/media/rc/keymaps/rc-pv951.c @@ -1,14 +1,9 @@ -/* pv951.h - Keytable for pv951 Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// pv951.h - Keytable for pv951 Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-real-audio-220-32-keys.c b/drivers/media/rc/keymaps/rc-real-audio-220-32-keys.c index 957fa21747ea..1cf786649675 100644 --- a/drivers/media/rc/keymaps/rc-real-audio-220-32-keys.c +++ b/drivers/media/rc/keymaps/rc-real-audio-220-32-keys.c @@ -1,14 +1,9 @@ -/* real-audio-220-32-keys.h - Keytable for real_audio_220_32_keys Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// real-audio-220-32-keys.h - Keytable for real_audio_220_32_keys Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-tbs-nec.c b/drivers/media/rc/keymaps/rc-tbs-nec.c index 05facc043272..42766cb877c3 100644 --- a/drivers/media/rc/keymaps/rc-tbs-nec.c +++ b/drivers/media/rc/keymaps/rc-tbs-nec.c @@ -1,14 +1,9 @@ -/* tbs-nec.h - Keytable for tbs_nec Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// tbs-nec.h - Keytable for tbs_nec Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-terratec-cinergy-xs.c b/drivers/media/rc/keymaps/rc-terratec-cinergy-xs.c index 3d0f6f7e5bea..6cf53a56bce4 100644 --- a/drivers/media/rc/keymaps/rc-terratec-cinergy-xs.c +++ b/drivers/media/rc/keymaps/rc-terratec-cinergy-xs.c @@ -1,14 +1,9 @@ -/* terratec-cinergy-xs.h - Keytable for terratec_cinergy_xs Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// terratec-cinergy-xs.h - Keytable for terratec_cinergy_xs Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-tevii-nec.c b/drivers/media/rc/keymaps/rc-tevii-nec.c index 31f8a0fd1f2c..58fcc72f528e 100644 --- a/drivers/media/rc/keymaps/rc-tevii-nec.c +++ b/drivers/media/rc/keymaps/rc-tevii-nec.c @@ -1,14 +1,9 @@ -/* tevii-nec.h - Keytable for tevii_nec Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// tevii-nec.h - Keytable for tevii_nec Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-tt-1500.c b/drivers/media/rc/keymaps/rc-tt-1500.c index 374c230705d2..52f239d2c025 100644 --- a/drivers/media/rc/keymaps/rc-tt-1500.c +++ b/drivers/media/rc/keymaps/rc-tt-1500.c @@ -1,14 +1,9 @@ -/* tt-1500.h - Keytable for tt_1500 Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// tt-1500.h - Keytable for tt_1500 Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-videomate-s350.c b/drivers/media/rc/keymaps/rc-videomate-s350.c index b4f103269872..e4d4dff06a24 100644 --- a/drivers/media/rc/keymaps/rc-videomate-s350.c +++ b/drivers/media/rc/keymaps/rc-videomate-s350.c @@ -1,14 +1,9 @@ -/* videomate-s350.h - Keytable for videomate_s350 Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ +// SPDX-License-Identifier: GPL-2.0+ +// videomate-s350.h - Keytable for videomate_s350 Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-videomate-tv-pvr.c b/drivers/media/rc/keymaps/rc-videomate-tv-pvr.c index c431fdf44057..7c4890944407 100644 --- a/drivers/media/rc/keymaps/rc-videomate-tv-pvr.c +++ b/drivers/media/rc/keymaps/rc-videomate-tv-pvr.c @@ -1,14 +1,9 @@ -/* videomate-tv-pvr.h - Keytable for videomate_tv_pvr Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// videomate-tv-pvr.h - Keytable for videomate_tv_pvr Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c b/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c index 5a437e61bd5d..30495673cddd 100644 --- a/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c +++ b/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c @@ -1,14 +1,9 @@ -/* winfast-usbii-deluxe.h - Keytable for winfast_usbii_deluxe Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// winfast-usbii-deluxe.h - Keytable for winfast_usbii_deluxe Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include diff --git a/drivers/media/rc/keymaps/rc-winfast.c b/drivers/media/rc/keymaps/rc-winfast.c index 53685d1f9a47..ee7f4c349fd6 100644 --- a/drivers/media/rc/keymaps/rc-winfast.c +++ b/drivers/media/rc/keymaps/rc-winfast.c @@ -1,14 +1,9 @@ -/* winfast.h - Keytable for winfast Remote Controller - * - * keymap imported from ir-keymaps.c - * - * Copyright (c) 2010 by Mauro Carvalho Chehab - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ +// SPDX-License-Identifier: GPL-2.0+ +// winfast.h - Keytable for winfast Remote Controller +// +// keymap imported from ir-keymaps.c +// +// Copyright (c) 2010 by Mauro Carvalho Chehab #include #include From 18ef5ab5fac968bb43d6b11a01c05e9b1655db20 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 24 Oct 2017 14:59:31 -0400 Subject: [PATCH 0543/1640] BACKPORT: media: staging: remove lirc_zilog driver The ir-kbd-i2c driver behaves like the lirc_zilog driver, except it can send raw IR and receives scancodes rather than lirccodes. The lirc_zilog driver only polls if the lirc chardev is opened; similarly the ir-kbd-i2c driver only polls if the corresponding input device is opened, or the lirc device. Polling is disabled during IR transmission through the mutex. 
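As a rough illustration (a sketch only; the identifiers below are invented and appear in neither driver), the arrangement amounts to a polling kthread and a transmit path that take the same mutex, so a transmission in flight simply postpones the next poll:

#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/delay.h>

/* hypothetical context; a real driver would embed this in its device struct
 * and initialize the mutex with mutex_init() at probe time */
struct ir_poll_ctx {
	struct mutex lock;	/* serializes polling against transmission */
	unsigned int period_ms;	/* polling period, e.g. 402 */
};

static int ir_poll_fn(void *data)
{
	struct ir_poll_ctx *ctx = data;

	while (!kthread_should_stop()) {
		mutex_lock(&ctx->lock);
		/* read one scancode from the chip over i2c here */
		mutex_unlock(&ctx->lock);
		msleep_interruptible(ctx->period_ms);
	}
	return 0;
}

static int ir_tx(struct ir_poll_ctx *ctx)
{
	/* holding the same lock keeps the poller idle for the whole TX */
	mutex_lock(&ctx->lock);
	/* write the raw IR data to the chip here */
	mutex_unlock(&ctx->lock);
	return 0;
}

The thread itself would be created with kthread_run() on first open and torn down with kthread_stop() on final close, matching the only-polls-while-open behaviour described above.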
The polling period is 402ms in the ir-kbd-i2c driver, and 260ms in the lirc_zilog driver. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/Kconfig | 3 - drivers/staging/media/Makefile | 1 - drivers/staging/media/lirc/Kconfig | 21 - drivers/staging/media/lirc/Makefile | 6 - drivers/staging/media/lirc/TODO | 36 - drivers/staging/media/lirc/lirc_zilog.c | 1653 ----------------------- 6 files changed, 1720 deletions(-) delete mode 100644 drivers/staging/media/lirc/Kconfig delete mode 100644 drivers/staging/media/lirc/Makefile delete mode 100644 drivers/staging/media/lirc/TODO delete mode 100644 drivers/staging/media/lirc/lirc_zilog.c diff --git a/drivers/staging/media/Kconfig b/drivers/staging/media/Kconfig index f8c25ee082ef..3a09140700e6 100644 --- a/drivers/staging/media/Kconfig +++ b/drivers/staging/media/Kconfig @@ -31,7 +31,4 @@ source "drivers/staging/media/imx/Kconfig" source "drivers/staging/media/omap4iss/Kconfig" -# Keep LIRC at the end, as it has sub-menus -source "drivers/staging/media/lirc/Kconfig" - endif diff --git a/drivers/staging/media/Makefile b/drivers/staging/media/Makefile index be732cf932fd..f25327163c67 100644 --- a/drivers/staging/media/Makefile +++ b/drivers/staging/media/Makefile @@ -2,7 +2,6 @@ obj-$(CONFIG_I2C_BCM2048) += bcm2048/ obj-$(CONFIG_DVB_CXD2099) += cxd2099/ obj-$(CONFIG_VIDEO_IMX_MEDIA) += imx/ -obj-$(CONFIG_LIRC_STAGING) += lirc/ obj-$(CONFIG_VIDEO_DM365_VPFE) += davinci_vpfe/ obj-$(CONFIG_VIDEO_OMAP4) += omap4iss/ obj-$(CONFIG_INTEL_ATOMISP) += atomisp/ diff --git a/drivers/staging/media/lirc/Kconfig b/drivers/staging/media/lirc/Kconfig deleted file mode 100644 index 3e350a9922de..000000000000 --- a/drivers/staging/media/lirc/Kconfig +++ /dev/null @@ -1,21 +0,0 @@ -# -# LIRC driver(s) configuration -# -menuconfig LIRC_STAGING - bool "Linux Infrared Remote Control IR receiver/transmitter drivers" - depends on LIRC - help - Say Y here, and all supported Linux Infrared Remote Control IR and - RF receiver and transmitter drivers will be displayed. When paired - with a remote control and the lirc daemon, the receiver drivers - allow control of your Linux system via remote control. - -if LIRC_STAGING - -config LIRC_ZILOG - tristate "Zilog/Hauppauge IR Transmitter" - depends on LIRC && I2C - help - Driver for the Zilog/Hauppauge IR Transmitter, found on - PVR-150/500, HVR-1200/1250/1700/1800, HD-PVR and other cards -endif diff --git a/drivers/staging/media/lirc/Makefile b/drivers/staging/media/lirc/Makefile deleted file mode 100644 index 665562436e30..000000000000 --- a/drivers/staging/media/lirc/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -# Makefile for the lirc drivers. -# - -# Each configuration option enables a list of files. - -obj-$(CONFIG_LIRC_ZILOG) += lirc_zilog.o diff --git a/drivers/staging/media/lirc/TODO b/drivers/staging/media/lirc/TODO deleted file mode 100644 index a97800a8e127..000000000000 --- a/drivers/staging/media/lirc/TODO +++ /dev/null @@ -1,36 +0,0 @@ -1. Both ir-kbd-i2c and lirc_zilog provide support for RX events for -the chips supported by lirc_zilog. Before moving lirc_zilog out of staging: - -a. ir-kbd-i2c needs a module parameter added to allow the user to tell - ir-kbd-i2c to ignore Z8 IR units. - -b. lirc_zilog should provide Rx key presses to the rc core like ir-kbd-i2c - does. - - -2. lirc_zilog module ref-counting need examination.
It has not been -verified that cdev and lirc_dev will take the proper module references on -lirc_zilog to prevent removal of lirc_zilog when the /dev/lircN device node -is open. - -(The good news is ref-counting of lirc_zilog internal structures appears to be -complete. Testing has shown the cx18 module can be unloaded out from under -irw + lircd + lirc_dev, with the /dev/lirc0 device node open, with no adverse -effects. The cx18 module could then be reloaded and irw properly began -receiving button presses again and ir_send worked without error.) - - -3. Bridge drivers, if able, should provide a chip reset() callback -to lirc_zilog via struct IR_i2c_init_data. cx18 and ivtv already have routines -to perform Z8 chip resets via GPIO manipulations. This would allow lirc_zilog -to bring the chip back to normal when it hangs, in the same places the -original lirc_pvr150 driver code does. This is not strictly needed, so it -is not required to move lirc_zilog out of staging. - -Note: Both lirc_zilog and ir-kbd-i2c support the Zilog Z8 for IR, as programmed -and installed on Hauppauge products. When working on either module, developers -must consider at least the following bridge drivers which mention an IR Rx unit -at address 0x71 (indicative of a Z8): - - ivtv cx18 hdpvr pvrusb2 bt8xx cx88 saa7134 - diff --git a/drivers/staging/media/lirc/lirc_zilog.c b/drivers/staging/media/lirc/lirc_zilog.c deleted file mode 100644 index 6bd0717bf76e..000000000000 --- a/drivers/staging/media/lirc/lirc_zilog.c +++ /dev/null @@ -1,1653 +0,0 @@ -/* - * i2c IR lirc driver for devices with zilog IR processors - * - * Copyright (c) 2000 Gerd Knorr - * modified for PixelView (BT878P+W/FM) by - * Michal Kochanowicz - * Christoph Bartelmus - * modified for KNC ONE TV Station/Anubis Typhoon TView Tuner by - * Ulrich Mueller - * modified for Asus TV-Box and Creative/VisionTek BreakOut-Box by - * Stefan Jahn - * modified for inclusion into kernel sources by - * Jerome Brock - * modified for Leadtek Winfast PVR2000 by - * Thomas Reitmayr (treitmayr@yahoo.com) - * modified for Hauppauge PVR-150 IR TX device by - * Mark Weaver - * changed name from lirc_pvr150 to lirc_zilog, works on more than pvr-150 - * Jarod Wilson - * - * parts are cut&pasted from the lirc_i2c.c driver - * - * Numerous changes updating lirc_zilog.c in kernel 2.6.38 and later are - * Copyright (C) 2011 Andy Walls - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -/* Max transfer size done by I2C transfer functions */ -#define MAX_XFER_SIZE 64 - -struct IR; - -struct IR_rx { - struct kref ref; - struct IR *ir; - - /* RX device */ - struct mutex client_lock; - struct i2c_client *c; - - /* RX polling thread data */ - struct task_struct *task; - - /* RX read data */ - unsigned char b[3]; - bool hdpvr_data_fmt; -}; - -struct IR_tx { - struct kref ref; - struct IR *ir; - - /* TX device */ - struct mutex client_lock; - struct i2c_client *c; - - /* TX additional actions needed */ - int need_boot; - bool post_tx_ready_poll; -}; - -struct IR { - struct kref ref; - struct list_head list; - - /* FIXME spinlock access to l->features */ - struct lirc_dev *l; - struct lirc_buffer rbuf; - - struct mutex ir_lock; - atomic_t open_count; - - struct device *dev; - struct i2c_adapter *adapter; - - spinlock_t rx_ref_lock; /* struct IR_rx kref get()/put() */ - struct IR_rx *rx; - - spinlock_t tx_ref_lock; /* struct IR_tx kref get()/put() */ - struct IR_tx *tx; -}; - -/* IR transceiver instance object list */ -/* - * This lock is used for the following: - * a. ir_devices_list access, insertions, deletions - * b. struct IR kref get()s and put()s - * c. serialization of ir_probe() for the two i2c_clients for a Z8 - */ -static DEFINE_MUTEX(ir_devices_lock); -static LIST_HEAD(ir_devices_list); - -/* Block size for IR transmitter */ -#define TX_BLOCK_SIZE 99 - -/* Hauppauge IR transmitter data */ -struct tx_data_struct { - /* Boot block */ - unsigned char *boot_data; - - /* Start of binary data block */ - unsigned char *datap; - - /* End of binary data block */ - unsigned char *endp; - - /* Number of installed codesets */ - unsigned int num_code_sets; - - /* Pointers to codesets */ - unsigned char **code_sets; - - /* Global fixed data template */ - int fixed[TX_BLOCK_SIZE]; -}; - -static struct tx_data_struct *tx_data; -static struct mutex tx_data_lock; - - -/* module parameters */ -static bool debug; /* debug output */ -static bool tx_only; /* only handle the IR Tx function */ - - -/* struct IR reference counting */ -static struct IR *get_ir_device(struct IR *ir, bool ir_devices_lock_held) -{ - if (ir_devices_lock_held) { - kref_get(&ir->ref); - } else { - mutex_lock(&ir_devices_lock); - kref_get(&ir->ref); - mutex_unlock(&ir_devices_lock); - } - return ir; -} - -static void release_ir_device(struct kref *ref) -{ - struct IR *ir = container_of(ref, struct IR, ref); - - /* - * Things should be in this state by now: - * ir->rx set to NULL and deallocated - happens before ir->rx->ir put() - * ir->rx->task kthread stopped - happens before ir->rx->ir put() - * ir->tx set to NULL and deallocated - happens before ir->tx->ir put() - * ir->open_count == 0 - happens on final close() - * ir_lock, tx_ref_lock, rx_ref_lock, all released - */ - if (ir->l) - lirc_unregister_device(ir->l); - - if (kfifo_initialized(&ir->rbuf.fifo)) - lirc_buffer_free(&ir->rbuf); - list_del(&ir->list); - kfree(ir); -} - -static int put_ir_device(struct IR *ir, bool ir_devices_lock_held) -{ - int released; - - if (ir_devices_lock_held) - return kref_put(&ir->ref, release_ir_device); - - 
mutex_lock(&ir_devices_lock); - released = kref_put(&ir->ref, release_ir_device); - mutex_unlock(&ir_devices_lock); - - return released; -} - -/* struct IR_rx reference counting */ -static struct IR_rx *get_ir_rx(struct IR *ir) -{ - struct IR_rx *rx; - - spin_lock(&ir->rx_ref_lock); - rx = ir->rx; - if (rx) - kref_get(&rx->ref); - spin_unlock(&ir->rx_ref_lock); - return rx; -} - -static void destroy_rx_kthread(struct IR_rx *rx, bool ir_devices_lock_held) -{ - /* end up polling thread */ - if (!IS_ERR_OR_NULL(rx->task)) { - kthread_stop(rx->task); - rx->task = NULL; - /* Put the ir ptr that ir_probe() gave to the rx poll thread */ - put_ir_device(rx->ir, ir_devices_lock_held); - } -} - -static void release_ir_rx(struct kref *ref) -{ - struct IR_rx *rx = container_of(ref, struct IR_rx, ref); - struct IR *ir = rx->ir; - - /* - * This release function can't do all the work, as we want - * to keep the rx_ref_lock a spinlock, and killing the poll thread - * and releasing the ir reference can cause a sleep. That work is - * performed by put_ir_rx() - */ - ir->l->features &= ~LIRC_CAN_REC_LIRCCODE; - /* Don't put_ir_device(rx->ir) here; lock can't be freed yet */ - ir->rx = NULL; - /* Don't do the kfree(rx) here; we still need to kill the poll thread */ -} - -static int put_ir_rx(struct IR_rx *rx, bool ir_devices_lock_held) -{ - int released; - struct IR *ir = rx->ir; - - spin_lock(&ir->rx_ref_lock); - released = kref_put(&rx->ref, release_ir_rx); - spin_unlock(&ir->rx_ref_lock); - /* Destroy the rx kthread while not holding the spinlock */ - if (released) { - destroy_rx_kthread(rx, ir_devices_lock_held); - kfree(rx); - /* Make sure we're not still in a poll_table somewhere */ - wake_up_interruptible(&ir->rbuf.wait_poll); - } - /* Do a reference put() for the rx->ir reference, if we released rx */ - if (released) - put_ir_device(ir, ir_devices_lock_held); - return released; -} - -/* struct IR_tx reference counting */ -static struct IR_tx *get_ir_tx(struct IR *ir) -{ - struct IR_tx *tx; - - spin_lock(&ir->tx_ref_lock); - tx = ir->tx; - if (tx) - kref_get(&tx->ref); - spin_unlock(&ir->tx_ref_lock); - return tx; -} - -static void release_ir_tx(struct kref *ref) -{ - struct IR_tx *tx = container_of(ref, struct IR_tx, ref); - struct IR *ir = tx->ir; - - ir->l->features &= ~LIRC_CAN_SEND_LIRCCODE; - /* Don't put_ir_device(tx->ir) here, so our lock doesn't get freed */ - ir->tx = NULL; - kfree(tx); -} - -static int put_ir_tx(struct IR_tx *tx, bool ir_devices_lock_held) -{ - int released; - struct IR *ir = tx->ir; - - spin_lock(&ir->tx_ref_lock); - released = kref_put(&tx->ref, release_ir_tx); - spin_unlock(&ir->tx_ref_lock); - /* Do a reference put() for the tx->ir reference, if we released tx */ - if (released) - put_ir_device(ir, ir_devices_lock_held); - return released; -} - -static int add_to_buf(struct IR *ir) -{ - __u16 code; - unsigned char codes[2]; - unsigned char keybuf[6]; - int got_data = 0; - int ret; - int failures = 0; - unsigned char sendbuf[1] = { 0 }; - struct lirc_buffer *rbuf = ir->l->buf; - struct IR_rx *rx; - struct IR_tx *tx; - - if (lirc_buffer_full(rbuf)) { - dev_dbg(ir->dev, "buffer overflow\n"); - return -EOVERFLOW; - } - - rx = get_ir_rx(ir); - if (!rx) - return -ENXIO; - - /* Ensure our rx->c i2c_client remains valid for the duration */ - mutex_lock(&rx->client_lock); - if (!rx->c) { - mutex_unlock(&rx->client_lock); - put_ir_rx(rx, false); - return -ENXIO; - } - - tx = get_ir_tx(ir); - - /* - * service the device as long as it is returning - * data and we have space - */ - 
do { - if (kthread_should_stop()) { - ret = -ENODATA; - break; - } - - /* - * Lock i2c bus for the duration. RX/TX chips interfere so - * this is worth it - */ - mutex_lock(&ir->ir_lock); - - if (kthread_should_stop()) { - mutex_unlock(&ir->ir_lock); - ret = -ENODATA; - break; - } - - /* - * Send random "poll command" (?) Windows driver does this - * and it is a good point to detect chip failure. - */ - ret = i2c_master_send(rx->c, sendbuf, 1); - if (ret != 1) { - dev_err(ir->dev, "i2c_master_send failed with %d\n", - ret); - if (failures >= 3) { - mutex_unlock(&ir->ir_lock); - dev_err(ir->dev, - "unable to read from the IR chip after 3 resets, giving up\n"); - break; - } - - /* Looks like the chip crashed, reset it */ - dev_err(ir->dev, - "polling the IR receiver chip failed, trying reset\n"); - - set_current_state(TASK_UNINTERRUPTIBLE); - if (kthread_should_stop()) { - mutex_unlock(&ir->ir_lock); - ret = -ENODATA; - break; - } - schedule_timeout((100 * HZ + 999) / 1000); - if (tx) - tx->need_boot = 1; - - ++failures; - mutex_unlock(&ir->ir_lock); - ret = 0; - continue; - } - - if (kthread_should_stop()) { - mutex_unlock(&ir->ir_lock); - ret = -ENODATA; - break; - } - ret = i2c_master_recv(rx->c, keybuf, sizeof(keybuf)); - mutex_unlock(&ir->ir_lock); - if (ret != sizeof(keybuf)) { - dev_err(ir->dev, - "i2c_master_recv failed with %d -- keeping last read buffer\n", - ret); - } else { - rx->b[0] = keybuf[3]; - rx->b[1] = keybuf[4]; - rx->b[2] = keybuf[5]; - dev_dbg(ir->dev, - "key (0x%02x/0x%02x)\n", - rx->b[0], rx->b[1]); - } - - /* key pressed ? */ - if (rx->hdpvr_data_fmt) { - if (got_data && (keybuf[0] == 0x80)) { - ret = 0; - break; - } else if (got_data && (keybuf[0] == 0x00)) { - ret = -ENODATA; - break; - } - } else if ((rx->b[0] & 0x80) == 0) { - ret = got_data ? 0 : -ENODATA; - break; - } - - /* look what we have */ - code = (((__u16)rx->b[0] & 0x7f) << 6) | (rx->b[1] >> 2); - - codes[0] = (code >> 8) & 0xff; - codes[1] = code & 0xff; - - /* return it */ - lirc_buffer_write(rbuf, codes); - ++got_data; - ret = 0; - } while (!lirc_buffer_full(rbuf)); - - mutex_unlock(&rx->client_lock); - if (tx) - put_ir_tx(tx, false); - put_ir_rx(rx, false); - return ret; -} - -/* - * Main function of the polling thread -- from lirc_dev. - * We don't fit the LIRC model at all anymore. This is horrible, but - * basically we have a single RX/TX device with a nasty failure mode - * that needs to be accounted for across the pair. lirc lets us provide - * fops, but prevents us from using the internal polling, etc. if we do - * so. Hence the replication. Might be neater to extend the LIRC model - * to account for this but I'd think it's a very special case of seriously - * messed up hardware. - */ -static int lirc_thread(void *arg) -{ - struct IR *ir = arg; - struct lirc_buffer *rbuf = ir->l->buf; - - dev_dbg(ir->dev, "poll thread started\n"); - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - - /* if device not opened, we can sleep half a second */ - if (atomic_read(&ir->open_count) == 0) { - schedule_timeout(HZ / 2); - continue; - } - - /* - * This is ~113*2 + 24 + jitter (2*repeat gap + code length). - * We use this interval as the chip resets every time you poll - * it (bad!). This is therefore just sufficient to catch all - * of the button presses. It makes the remote much more - * responsive. You can see the difference by running irw and - * holding down a button. 
With 100ms, the old polling - * interval, you'll notice breaks in the repeat sequence - * corresponding to lost keypresses. - */ - schedule_timeout((260 * HZ) / 1000); - if (kthread_should_stop()) - break; - if (!add_to_buf(ir)) - wake_up_interruptible(&rbuf->wait_poll); - } - - dev_dbg(ir->dev, "poll thread ended\n"); - return 0; -} - -/* safe read of a uint32 (always network byte order) */ -static int read_uint32(unsigned char **data, - unsigned char *endp, unsigned int *val) -{ - if (*data + 4 > endp) - return 0; - *val = ((*data)[0] << 24) | ((*data)[1] << 16) | - ((*data)[2] << 8) | (*data)[3]; - *data += 4; - return 1; -} - -/* safe read of a uint8 */ -static int read_uint8(unsigned char **data, - unsigned char *endp, unsigned char *val) -{ - if (*data + 1 > endp) - return 0; - *val = *((*data)++); - return 1; -} - -/* safe skipping of N bytes */ -static int skip(unsigned char **data, - unsigned char *endp, unsigned int distance) -{ - if (*data + distance > endp) - return 0; - *data += distance; - return 1; -} - -/* decompress key data into the given buffer */ -static int get_key_data(unsigned char *buf, - unsigned int codeset, unsigned int key) -{ - unsigned char *data, *endp, *diffs, *key_block; - unsigned char keys, ndiffs, id; - unsigned int base, lim, pos, i; - - /* Binary search for the codeset */ - for (base = 0, lim = tx_data->num_code_sets; lim; lim >>= 1) { - pos = base + (lim >> 1); - data = tx_data->code_sets[pos]; - - if (!read_uint32(&data, tx_data->endp, &i)) - goto corrupt; - - if (i == codeset) { - break; - } else if (codeset > i) { - base = pos + 1; - --lim; - } - } - /* Not found? */ - if (!lim) - return -EPROTO; - - /* Set end of data block */ - endp = pos < tx_data->num_code_sets - 1 ? - tx_data->code_sets[pos + 1] : tx_data->endp; - - /* Read the block header */ - if (!read_uint8(&data, endp, &keys) || - !read_uint8(&data, endp, &ndiffs) || - ndiffs > TX_BLOCK_SIZE || keys == 0) - goto corrupt; - - /* Save diffs & skip */ - diffs = data; - if (!skip(&data, endp, ndiffs)) - goto corrupt; - - /* Read the id of the first key */ - if (!read_uint8(&data, endp, &id)) - goto corrupt; - - /* Unpack the first key's data */ - for (i = 0; i < TX_BLOCK_SIZE; ++i) { - if (tx_data->fixed[i] == -1) { - if (!read_uint8(&data, endp, &buf[i])) - goto corrupt; - } else { - buf[i] = (unsigned char)tx_data->fixed[i]; - } - } - - /* Early out key found/not found */ - if (key == id) - return 0; - if (keys == 1) - return -EPROTO; - - /* Sanity check */ - key_block = data; - if (!skip(&data, endp, (keys - 1) * (ndiffs + 1))) - goto corrupt; - - /* Binary search for the key */ - for (base = 0, lim = keys - 1; lim; lim >>= 1) { - /* Seek to block */ - unsigned char *key_data; - - pos = base + (lim >> 1); - key_data = key_block + (ndiffs + 1) * pos; - - if (*key_data == key) { - /* skip key id */ - ++key_data; - - /* found, so unpack the diffs */ - for (i = 0; i < ndiffs; ++i) { - unsigned char val; - - if (!read_uint8(&key_data, endp, &val) || - diffs[i] >= TX_BLOCK_SIZE) - goto corrupt; - buf[diffs[i]] = val; - } - - return 0; - } else if (key > *key_data) { - base = pos + 1; - --lim; - } - } - /* Key not found */ - return -EPROTO; - -corrupt: - pr_err("firmware is corrupt\n"); - return -EFAULT; -} - -/* send a block of data to the IR TX device */ -static int send_data_block(struct IR_tx *tx, unsigned char *data_block) -{ - int i, j, ret; - unsigned char buf[5]; - - for (i = 0; i < TX_BLOCK_SIZE;) { - int tosend = TX_BLOCK_SIZE - i; - - if (tosend > 4) - tosend = 4; - buf[0] = (unsigned 
char)(i + 1); - for (j = 0; j < tosend; ++j) - buf[1 + j] = data_block[i + j]; - dev_dbg(tx->ir->dev, "%*ph", 5, buf); - ret = i2c_master_send(tx->c, buf, tosend + 1); - if (ret != tosend + 1) { - dev_err(tx->ir->dev, - "i2c_master_send failed with %d\n", ret); - return ret < 0 ? ret : -EFAULT; - } - i += tosend; - } - return 0; -} - -/* send boot data to the IR TX device */ -static int send_boot_data(struct IR_tx *tx) -{ - int ret, i; - unsigned char buf[4]; - - /* send the boot block */ - ret = send_data_block(tx, tx_data->boot_data); - if (ret != 0) - return ret; - - /* Hit the go button to activate the new boot data */ - buf[0] = 0x00; - buf[1] = 0x20; - ret = i2c_master_send(tx->c, buf, 2); - if (ret != 2) { - dev_err(tx->ir->dev, "i2c_master_send failed with %d\n", ret); - return ret < 0 ? ret : -EFAULT; - } - - /* - * Wait for zilog to settle after hitting go post boot block upload. - * Without this delay, the HD-PVR and HVR-1950 both return an -EIO - * upon attempting to get firmware revision, and tx probe thus fails. - */ - for (i = 0; i < 10; i++) { - ret = i2c_master_send(tx->c, buf, 1); - if (ret == 1) - break; - udelay(100); - } - - if (ret != 1) { - dev_err(tx->ir->dev, "i2c_master_send failed with %d\n", ret); - return ret < 0 ? ret : -EFAULT; - } - - /* Here comes the firmware version... (hopefully) */ - ret = i2c_master_recv(tx->c, buf, 4); - if (ret != 4) { - dev_err(tx->ir->dev, "i2c_master_recv failed with %d\n", ret); - return 0; - } - if ((buf[0] != 0x80) && (buf[0] != 0xa0)) { - dev_err(tx->ir->dev, "unexpected IR TX init response: %02x\n", - buf[0]); - return 0; - } - dev_notice(tx->ir->dev, - "Zilog/Hauppauge IR blaster firmware version %d.%d.%d loaded\n", - buf[1], buf[2], buf[3]); - - return 0; -} - -/* unload "firmware", lock held */ -static void fw_unload_locked(void) -{ - if (tx_data) { - vfree(tx_data->code_sets); - - vfree(tx_data->datap); - - vfree(tx_data); - tx_data = NULL; - pr_debug("successfully unloaded IR blaster firmware\n"); - } -} - -/* unload "firmware" for the IR TX device */ -static void fw_unload(void) -{ - mutex_lock(&tx_data_lock); - fw_unload_locked(); - mutex_unlock(&tx_data_lock); -} - -/* load "firmware" for the IR TX device */ -static int fw_load(struct IR_tx *tx) -{ - int ret; - unsigned int i; - unsigned char *data, version, num_global_fixed; - const struct firmware *fw_entry; - - /* Already loaded? */ - mutex_lock(&tx_data_lock); - if (tx_data) { - ret = 0; - goto out; - } - - /* Request codeset data file */ - ret = request_firmware(&fw_entry, "haup-ir-blaster.bin", tx->ir->dev); - if (ret != 0) { - dev_err(tx->ir->dev, - "firmware haup-ir-blaster.bin not available (%d)\n", - ret); - ret = ret < 0 ? 
ret : -EFAULT; - goto out; - } - dev_dbg(tx->ir->dev, "firmware of size %zu loaded\n", fw_entry->size); - - /* Parse the file */ - tx_data = vmalloc(sizeof(*tx_data)); - if (!tx_data) { - release_firmware(fw_entry); - ret = -ENOMEM; - goto out; - } - tx_data->code_sets = NULL; - - /* Copy the data so hotplug doesn't get confused and timeout */ - tx_data->datap = vmalloc(fw_entry->size); - if (!tx_data->datap) { - release_firmware(fw_entry); - vfree(tx_data); - ret = -ENOMEM; - goto out; - } - memcpy(tx_data->datap, fw_entry->data, fw_entry->size); - tx_data->endp = tx_data->datap + fw_entry->size; - release_firmware(fw_entry); fw_entry = NULL; - - /* Check version */ - data = tx_data->datap; - if (!read_uint8(&data, tx_data->endp, &version)) - goto corrupt; - if (version != 1) { - dev_err(tx->ir->dev, - "unsupported code set file version (%u, expected 1) -- please upgrade to a newer driver\n", - version); - fw_unload_locked(); - ret = -EFAULT; - goto out; - } - - /* Save boot block for later */ - tx_data->boot_data = data; - if (!skip(&data, tx_data->endp, TX_BLOCK_SIZE)) - goto corrupt; - - if (!read_uint32(&data, tx_data->endp, - &tx_data->num_code_sets)) - goto corrupt; - - dev_dbg(tx->ir->dev, "%u IR blaster codesets loaded\n", - tx_data->num_code_sets); - - tx_data->code_sets = vmalloc( - tx_data->num_code_sets * sizeof(char *)); - if (!tx_data->code_sets) { - fw_unload_locked(); - ret = -ENOMEM; - goto out; - } - - for (i = 0; i < TX_BLOCK_SIZE; ++i) - tx_data->fixed[i] = -1; - - /* Read global fixed data template */ - if (!read_uint8(&data, tx_data->endp, &num_global_fixed) || - num_global_fixed > TX_BLOCK_SIZE) - goto corrupt; - for (i = 0; i < num_global_fixed; ++i) { - unsigned char pos, val; - - if (!read_uint8(&data, tx_data->endp, &pos) || - !read_uint8(&data, tx_data->endp, &val) || - pos >= TX_BLOCK_SIZE) - goto corrupt; - tx_data->fixed[pos] = (int)val; - } - - /* Filch out the position of each code set */ - for (i = 0; i < tx_data->num_code_sets; ++i) { - unsigned int id; - unsigned char keys; - unsigned char ndiffs; - - /* Save the codeset position */ - tx_data->code_sets[i] = data; - - /* Read header */ - if (!read_uint32(&data, tx_data->endp, &id) || - !read_uint8(&data, tx_data->endp, &keys) || - !read_uint8(&data, tx_data->endp, &ndiffs) || - ndiffs > TX_BLOCK_SIZE || keys == 0) - goto corrupt; - - /* skip diff positions */ - if (!skip(&data, tx_data->endp, ndiffs)) - goto corrupt; - - /* - * After the diffs we have the first key id + data - - * global fixed - */ - if (!skip(&data, tx_data->endp, - 1 + TX_BLOCK_SIZE - num_global_fixed)) - goto corrupt; - - /* Then we have keys-1 blocks of key id+diffs */ - if (!skip(&data, tx_data->endp, - (ndiffs + 1) * (keys - 1))) - goto corrupt; - } - ret = 0; - goto out; - -corrupt: - dev_err(tx->ir->dev, "firmware is corrupt\n"); - fw_unload_locked(); - ret = -EFAULT; - -out: - mutex_unlock(&tx_data_lock); - return ret; -} - -/* copied from lirc_dev */ -static ssize_t read(struct file *filep, char __user *outbuf, size_t n, - loff_t *ppos) -{ - struct IR *ir = lirc_get_pdata(filep); - struct IR_rx *rx; - struct lirc_buffer *rbuf = ir->l->buf; - int ret = 0, written = 0, retries = 0; - unsigned int m; - DECLARE_WAITQUEUE(wait, current); - - dev_dbg(ir->dev, "read called\n"); - if (n % rbuf->chunk_size) { - dev_dbg(ir->dev, "read result = -EINVAL\n"); - return -EINVAL; - } - - rx = get_ir_rx(ir); - if (!rx) - return -ENXIO; - - /* - * we add ourselves to the task queue before buffer check - * to avoid losing scan code (in case when 
queue is awaken somewhere - * between while condition checking and scheduling) - */ - add_wait_queue(&rbuf->wait_poll, &wait); - set_current_state(TASK_INTERRUPTIBLE); - - /* - * while we didn't provide 'length' bytes, device is opened in blocking - * mode and 'copy_to_user' is happy, wait for data. - */ - while (written < n && ret == 0) { - if (lirc_buffer_empty(rbuf)) { - /* - * According to the read(2) man page, 'written' can be - * returned as less than 'n', instead of blocking - * again, returning -EWOULDBLOCK, or returning - * -ERESTARTSYS - */ - if (written) - break; - if (filep->f_flags & O_NONBLOCK) { - ret = -EWOULDBLOCK; - break; - } - if (signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } else { - unsigned char buf[MAX_XFER_SIZE]; - - if (rbuf->chunk_size > sizeof(buf)) { - dev_err(ir->dev, - "chunk_size is too big (%d)!\n", - rbuf->chunk_size); - ret = -EINVAL; - break; - } - m = lirc_buffer_read(rbuf, buf); - if (m == rbuf->chunk_size) { - ret = copy_to_user(outbuf + written, buf, - rbuf->chunk_size); - written += rbuf->chunk_size; - } else { - retries++; - } - if (retries >= 5) { - dev_err(ir->dev, "Buffer read failed!\n"); - ret = -EIO; - } - } - } - - remove_wait_queue(&rbuf->wait_poll, &wait); - put_ir_rx(rx, false); - set_current_state(TASK_RUNNING); - - dev_dbg(ir->dev, "read result = %d (%s)\n", ret, - ret ? "Error" : "OK"); - - return ret ? ret : written; -} - -/* send a keypress to the IR TX device */ -static int send_code(struct IR_tx *tx, unsigned int code, unsigned int key) -{ - unsigned char data_block[TX_BLOCK_SIZE]; - unsigned char buf[2]; - int i, ret; - - /* Get data for the codeset/key */ - ret = get_key_data(data_block, code, key); - - if (ret == -EPROTO) { - dev_err(tx->ir->dev, - "failed to get data for code %u, key %u -- check lircd.conf entries\n", - code, key); - return ret; - } else if (ret != 0) { - return ret; - } - - /* Send the data block */ - ret = send_data_block(tx, data_block); - if (ret != 0) - return ret; - - /* Send data block length? */ - buf[0] = 0x00; - buf[1] = 0x40; - ret = i2c_master_send(tx->c, buf, 2); - if (ret != 2) { - dev_err(tx->ir->dev, "i2c_master_send failed with %d\n", ret); - return ret < 0 ? ret : -EFAULT; - } - - /* Give the z8 a moment to process data block */ - for (i = 0; i < 10; i++) { - ret = i2c_master_send(tx->c, buf, 1); - if (ret == 1) - break; - udelay(100); - } - - if (ret != 1) { - dev_err(tx->ir->dev, "i2c_master_send failed with %d\n", ret); - return ret < 0 ? ret : -EFAULT; - } - - /* Send finished download? */ - ret = i2c_master_recv(tx->c, buf, 1); - if (ret != 1) { - dev_err(tx->ir->dev, "i2c_master_recv failed with %d\n", ret); - return ret < 0 ? ret : -EFAULT; - } - if (buf[0] != 0xA0) { - dev_err(tx->ir->dev, "unexpected IR TX response #1: %02x\n", - buf[0]); - return -EFAULT; - } - - /* Send prepare command? */ - buf[0] = 0x00; - buf[1] = 0x80; - ret = i2c_master_send(tx->c, buf, 2); - if (ret != 2) { - dev_err(tx->ir->dev, "i2c_master_send failed with %d\n", ret); - return ret < 0 ? ret : -EFAULT; - } - - /* - * The sleep bits aren't necessary on the HD PVR, and in fact, the - * last i2c_master_recv always fails with a -5, so for now, we're - * going to skip this whole mess and say we're done on the HD PVR - */ - if (!tx->post_tx_ready_poll) { - dev_dbg(tx->ir->dev, "sent code %u, key %u\n", code, key); - return 0; - } - - /* - * This bit NAKs until the device is ready, so we retry it - * sleeping a bit each time. 
This seems to be what the windows - * driver does, approximately. - * Try for up to 1s. - */ - for (i = 0; i < 20; ++i) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout((50 * HZ + 999) / 1000); - ret = i2c_master_send(tx->c, buf, 1); - if (ret == 1) - break; - dev_dbg(tx->ir->dev, - "NAK expected: i2c_master_send failed with %d (try %d)\n", - ret, i + 1); - } - if (ret != 1) { - dev_err(tx->ir->dev, - "IR TX chip never got ready: last i2c_master_send failed with %d\n", - ret); - return ret < 0 ? ret : -EFAULT; - } - - /* Seems to be an 'ok' response */ - i = i2c_master_recv(tx->c, buf, 1); - if (i != 1) { - dev_err(tx->ir->dev, "i2c_master_recv failed with %d\n", ret); - return -EFAULT; - } - if (buf[0] != 0x80) { - dev_err(tx->ir->dev, "unexpected IR TX response #2: %02x\n", - buf[0]); - return -EFAULT; - } - - /* Oh good, it worked */ - dev_dbg(tx->ir->dev, "sent code %u, key %u\n", code, key); - return 0; -} - -/* - * Write a code to the device. We take in a 32-bit number (an int) and then - * decode this to a codeset/key index. The key data is then decompressed and - * sent to the device. We have a spin lock as per i2c documentation to prevent - * multiple concurrent sends which would probably cause the device to explode. - */ -static ssize_t write(struct file *filep, const char __user *buf, size_t n, - loff_t *ppos) -{ - struct IR *ir = lirc_get_pdata(filep); - struct IR_tx *tx; - size_t i; - int failures = 0; - - /* Validate user parameters */ - if (n % sizeof(int)) - return -EINVAL; - - /* Get a struct IR_tx reference */ - tx = get_ir_tx(ir); - if (!tx) - return -ENXIO; - - /* Ensure our tx->c i2c_client remains valid for the duration */ - mutex_lock(&tx->client_lock); - if (!tx->c) { - mutex_unlock(&tx->client_lock); - put_ir_tx(tx, false); - return -ENXIO; - } - - /* Lock i2c bus for the duration */ - mutex_lock(&ir->ir_lock); - - /* Send each keypress */ - for (i = 0; i < n;) { - int ret = 0; - int command; - - if (copy_from_user(&command, buf + i, sizeof(command))) { - mutex_unlock(&ir->ir_lock); - mutex_unlock(&tx->client_lock); - put_ir_tx(tx, false); - return -EFAULT; - } - - /* Send boot data first if required */ - if (tx->need_boot == 1) { - /* Make sure we have the 'firmware' loaded, first */ - ret = fw_load(tx); - if (ret != 0) { - mutex_unlock(&ir->ir_lock); - mutex_unlock(&tx->client_lock); - put_ir_tx(tx, false); - if (ret != -ENOMEM) - ret = -EIO; - return ret; - } - /* Prep the chip for transmitting codes */ - ret = send_boot_data(tx); - if (ret == 0) - tx->need_boot = 0; - } - - /* Send the code */ - if (ret == 0) { - ret = send_code(tx, (unsigned int)command >> 16, - (unsigned int)command & 0xFFFF); - if (ret == -EPROTO) { - mutex_unlock(&ir->ir_lock); - mutex_unlock(&tx->client_lock); - put_ir_tx(tx, false); - return ret; - } - } - - /* - * Hmm, a failure. 
If we've had a few then give up, otherwise - * try a reset - */ - if (ret != 0) { - /* Looks like the chip crashed, reset it */ - dev_err(tx->ir->dev, - "sending to the IR transmitter chip failed, trying reset\n"); - - if (failures >= 3) { - dev_err(tx->ir->dev, - "unable to send to the IR chip after 3 resets, giving up\n"); - mutex_unlock(&ir->ir_lock); - mutex_unlock(&tx->client_lock); - put_ir_tx(tx, false); - return ret; - } - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout((100 * HZ + 999) / 1000); - tx->need_boot = 1; - ++failures; - } else { - i += sizeof(int); - } - } - - /* Release i2c bus */ - mutex_unlock(&ir->ir_lock); - - mutex_unlock(&tx->client_lock); - - /* Give back our struct IR_tx reference */ - put_ir_tx(tx, false); - - /* All looks good */ - return n; -} - -/* copied from lirc_dev */ -static unsigned int poll(struct file *filep, poll_table *wait) -{ - struct IR *ir = lirc_get_pdata(filep); - struct IR_rx *rx; - struct lirc_buffer *rbuf = ir->l->buf; - unsigned int ret; - - dev_dbg(ir->dev, "%s called\n", __func__); - - rx = get_ir_rx(ir); - if (!rx) { - /* - * Revisit this, if our poll function ever reports writeable - * status for Tx - */ - dev_dbg(ir->dev, "%s result = POLLERR\n", __func__); - return POLLERR; - } - - /* - * Add our lirc_buffer's wait_queue to the poll_table. A wake up on - * that buffer's wait queue indicates we may have a new poll status. - */ - poll_wait(filep, &rbuf->wait_poll, wait); - - /* Indicate what ops could happen immediately without blocking */ - ret = lirc_buffer_empty(rbuf) ? 0 : (POLLIN | POLLRDNORM); - - dev_dbg(ir->dev, "%s result = %s\n", __func__, - ret ? "POLLIN|POLLRDNORM" : "none"); - return ret; -} - -static long ioctl(struct file *filep, unsigned int cmd, unsigned long arg) -{ - struct IR *ir = lirc_get_pdata(filep); - unsigned long __user *uptr = (unsigned long __user *)arg; - int result; - unsigned long mode, features; - - features = ir->l->features; - - switch (cmd) { - case LIRC_GET_LENGTH: - result = put_user(13UL, uptr); - break; - case LIRC_GET_FEATURES: - result = put_user(features, uptr); - break; - case LIRC_GET_REC_MODE: - if (!(features & LIRC_CAN_REC_MASK)) - return -ENOTTY; - - result = put_user(LIRC_REC2MODE - (features & LIRC_CAN_REC_MASK), - uptr); - break; - case LIRC_SET_REC_MODE: - if (!(features & LIRC_CAN_REC_MASK)) - return -ENOTTY; - - result = get_user(mode, uptr); - if (!result && !(LIRC_MODE2REC(mode) & features)) - result = -ENOTTY; - break; - case LIRC_GET_SEND_MODE: - if (!(features & LIRC_CAN_SEND_MASK)) - return -ENOTTY; - - result = put_user(LIRC_MODE_LIRCCODE, uptr); - break; - case LIRC_SET_SEND_MODE: - if (!(features & LIRC_CAN_SEND_MASK)) - return -ENOTTY; - - result = get_user(mode, uptr); - if (!result && mode != LIRC_MODE_LIRCCODE) - return -EINVAL; - break; - default: - return -EINVAL; - } - return result; -} - -/* - * Open the IR device. 
- */ -static int open(struct inode *node, struct file *filep) -{ - struct IR *ir; - - lirc_init_pdata(node, filep); - ir = lirc_get_pdata(filep); - - atomic_inc(&ir->open_count); - - nonseekable_open(node, filep); - return 0; -} - -/* Close the IR device */ -static int close(struct inode *node, struct file *filep) -{ - struct IR *ir = lirc_get_pdata(filep); - - atomic_dec(&ir->open_count); - - put_ir_device(ir, false); - return 0; -} - -static int ir_remove(struct i2c_client *client); -static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id); - -#define ID_FLAG_TX 0x01 -#define ID_FLAG_HDPVR 0x02 - -static const struct i2c_device_id ir_transceiver_id[] = { - { "ir_tx_z8f0811_haup", ID_FLAG_TX }, - { "ir_rx_z8f0811_haup", 0 }, - { "ir_tx_z8f0811_hdpvr", ID_FLAG_HDPVR | ID_FLAG_TX }, - { "ir_rx_z8f0811_hdpvr", ID_FLAG_HDPVR }, - { } -}; -MODULE_DEVICE_TABLE(i2c, ir_transceiver_id); - -static struct i2c_driver driver = { - .driver = { - .name = "Zilog/Hauppauge i2c IR", - }, - .probe = ir_probe, - .remove = ir_remove, - .id_table = ir_transceiver_id, -}; - -static const struct file_operations lirc_fops = { - .owner = THIS_MODULE, - .llseek = no_llseek, - .read = read, - .write = write, - .poll = poll, - .unlocked_ioctl = ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ioctl, -#endif - .open = open, - .release = close -}; - -static int ir_remove(struct i2c_client *client) -{ - if (strncmp("ir_tx_z8", client->name, 8) == 0) { - struct IR_tx *tx = i2c_get_clientdata(client); - - if (tx) { - mutex_lock(&tx->client_lock); - tx->c = NULL; - mutex_unlock(&tx->client_lock); - put_ir_tx(tx, false); - } - } else if (strncmp("ir_rx_z8", client->name, 8) == 0) { - struct IR_rx *rx = i2c_get_clientdata(client); - - if (rx) { - mutex_lock(&rx->client_lock); - rx->c = NULL; - mutex_unlock(&rx->client_lock); - put_ir_rx(rx, false); - } - } - return 0; -} - -/* ir_devices_lock must be held */ -static struct IR *get_ir_device_by_adapter(struct i2c_adapter *adapter) -{ - struct IR *ir; - - if (list_empty(&ir_devices_list)) - return NULL; - - list_for_each_entry(ir, &ir_devices_list, list) - if (ir->adapter == adapter) { - get_ir_device(ir, true); - return ir; - } - - return NULL; -} - -static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) -{ - struct IR *ir; - struct IR_tx *tx; - struct IR_rx *rx; - struct i2c_adapter *adap = client->adapter; - int ret; - bool tx_probe = false; - - dev_dbg(&client->dev, "%s: %s on i2c-%d (%s), client addr=0x%02x\n", - __func__, id->name, adap->nr, adap->name, client->addr); - - /* - * The IR receiver is at i2c address 0x71. - * The IR transmitter is at i2c address 0x70. - */ - - if (id->driver_data & ID_FLAG_TX) - tx_probe = true; - else if (tx_only) /* module option */ - return -ENXIO; - - pr_info("probing IR %s on %s (i2c-%d)\n", - tx_probe ? 
"Tx" : "Rx", adap->name, adap->nr); - - mutex_lock(&ir_devices_lock); - - /* Use a single struct IR instance for both the Rx and Tx functions */ - ir = get_ir_device_by_adapter(adap); - if (!ir) { - ir = kzalloc(sizeof(*ir), GFP_KERNEL); - if (!ir) { - ret = -ENOMEM; - goto out_no_ir; - } - kref_init(&ir->ref); - - /* store for use in ir_probe() again, and open() later on */ - INIT_LIST_HEAD(&ir->list); - list_add_tail(&ir->list, &ir_devices_list); - - ir->adapter = adap; - ir->dev = &adap->dev; - mutex_init(&ir->ir_lock); - atomic_set(&ir->open_count, 0); - spin_lock_init(&ir->tx_ref_lock); - spin_lock_init(&ir->rx_ref_lock); - - /* set lirc_dev stuff */ - ir->l = lirc_allocate_device(); - if (!ir->l) { - ret = -ENOMEM; - goto out_put_ir; - } - - snprintf(ir->l->name, sizeof(ir->l->name), "lirc_zilog"); - ir->l->code_length = 13; - ir->l->fops = &lirc_fops; - ir->l->owner = THIS_MODULE; - ir->l->dev.parent = &adap->dev; - - /* - * FIXME this is a pointer reference to us, but no refcount. - * - * This OK for now, since lirc_dev currently won't touch this - * buffer as we provide our own lirc_fops. - * - * Currently our own lirc_fops rely on this ir->l->buf pointer - */ - ir->l->buf = &ir->rbuf; - /* This will be returned by lirc_get_pdata() */ - ir->l->data = ir; - ret = lirc_buffer_init(ir->l->buf, 2, BUFLEN / 2); - if (ret) { - lirc_free_device(ir->l); - ir->l = NULL; - goto out_put_ir; - } - } - - if (tx_probe) { - /* Get the IR_rx instance for later, if already allocated */ - rx = get_ir_rx(ir); - - /* Set up a struct IR_tx instance */ - tx = kzalloc(sizeof(*tx), GFP_KERNEL); - if (!tx) { - ret = -ENOMEM; - goto out_put_xx; - } - kref_init(&tx->ref); - ir->tx = tx; - - ir->l->features |= LIRC_CAN_SEND_LIRCCODE; - mutex_init(&tx->client_lock); - tx->c = client; - tx->need_boot = 1; - tx->post_tx_ready_poll = - (id->driver_data & ID_FLAG_HDPVR) ? false : true; - - /* An ir ref goes to the struct IR_tx instance */ - tx->ir = get_ir_device(ir, true); - - /* A tx ref goes to the i2c_client */ - i2c_set_clientdata(client, get_ir_tx(ir)); - - /* - * Load the 'firmware'. We do this before registering with - * lirc_dev, so the first firmware load attempt does not happen - * after a open() or write() call on the device. - * - * Failure here is not deemed catastrophic, so the receiver will - * still be usable. Firmware load will be retried in write(), - * if it is needed. - */ - fw_load(tx); - - /* Proceed only if the Rx client is also ready or not needed */ - if (!rx && !tx_only) { - dev_info(tx->ir->dev, - "probe of IR Tx on %s (i2c-%d) done. Waiting on IR Rx.\n", - adap->name, adap->nr); - goto out_ok; - } - } else { - /* Get the IR_tx instance for later, if already allocated */ - tx = get_ir_tx(ir); - - /* Set up a struct IR_rx instance */ - rx = kzalloc(sizeof(*rx), GFP_KERNEL); - if (!rx) { - ret = -ENOMEM; - goto out_put_xx; - } - kref_init(&rx->ref); - ir->rx = rx; - - ir->l->features |= LIRC_CAN_REC_LIRCCODE; - mutex_init(&rx->client_lock); - rx->c = client; - rx->hdpvr_data_fmt = - (id->driver_data & ID_FLAG_HDPVR) ? true : false; - - /* An ir ref goes to the struct IR_rx instance */ - rx->ir = get_ir_device(ir, true); - - /* An rx ref goes to the i2c_client */ - i2c_set_clientdata(client, get_ir_rx(ir)); - - /* - * Start the polling thread. 
- * It will only perform an empty loop around schedule_timeout() - * until we register with lirc_dev and the first user open() - */ - /* An ir ref goes to the new rx polling kthread */ - rx->task = kthread_run(lirc_thread, get_ir_device(ir, true), - "zilog-rx-i2c-%d", adap->nr); - if (IS_ERR(rx->task)) { - ret = PTR_ERR(rx->task); - dev_err(tx->ir->dev, - "%s: could not start IR Rx polling thread\n", - __func__); - /* Failed kthread, so put back the ir ref */ - put_ir_device(ir, true); - /* Failure exit, so put back rx ref from i2c_client */ - i2c_set_clientdata(client, NULL); - put_ir_rx(rx, true); - ir->l->features &= ~LIRC_CAN_REC_LIRCCODE; - goto out_put_tx; - } - - /* Proceed only if the Tx client is also ready */ - if (!tx) { - pr_info("probe of IR Rx on %s (i2c-%d) done. Waiting on IR Tx.\n", - adap->name, adap->nr); - goto out_ok; - } - } - - /* register with lirc */ - ret = lirc_register_device(ir->l); - if (ret < 0) { - dev_err(tx->ir->dev, - "%s: lirc_register_device() failed: %i\n", - __func__, ret); - lirc_free_device(ir->l); - ir->l = NULL; - goto out_put_xx; - } - - dev_info(ir->dev, - "IR unit on %s (i2c-%d) registered as lirc%d and ready\n", - adap->name, adap->nr, ir->l->minor); - -out_ok: - if (rx) - put_ir_rx(rx, true); - if (tx) - put_ir_tx(tx, true); - put_ir_device(ir, true); - dev_info(ir->dev, - "probe of IR %s on %s (i2c-%d) done\n", - tx_probe ? "Tx" : "Rx", adap->name, adap->nr); - mutex_unlock(&ir_devices_lock); - return 0; - -out_put_xx: - if (rx) - put_ir_rx(rx, true); -out_put_tx: - if (tx) - put_ir_tx(tx, true); -out_put_ir: - put_ir_device(ir, true); -out_no_ir: - dev_err(&client->dev, - "%s: probing IR %s on %s (i2c-%d) failed with %d\n", - __func__, tx_probe ? "Tx" : "Rx", adap->name, adap->nr, ret); - mutex_unlock(&ir_devices_lock); - return ret; -} - -static int __init zilog_init(void) -{ - int ret; - - pr_notice("Zilog/Hauppauge IR driver initializing\n"); - - mutex_init(&tx_data_lock); - - request_module("firmware_class"); - - ret = i2c_add_driver(&driver); - if (ret) - pr_err("initialization failed\n"); - else - pr_notice("initialization complete\n"); - - return ret; -} - -static void __exit zilog_exit(void) -{ - i2c_del_driver(&driver); - /* if loaded */ - fw_unload(); - pr_notice("Zilog/Hauppauge IR driver unloaded\n"); -} - -module_init(zilog_init); -module_exit(zilog_exit); - -MODULE_DESCRIPTION("Zilog/Hauppauge infrared transmitter driver (i2c stack)"); -MODULE_AUTHOR("Gerd Knorr, Michal Kochanowicz, Christoph Bartelmus, Ulrich Mueller, Stefan Jahn, Jerome Brock, Mark Weaver, Andy Walls"); -MODULE_LICENSE("GPL"); -/* for compat with old name, which isn't all that accurate anymore */ -MODULE_ALIAS("lirc_pvr150"); - -module_param(debug, bool, 0644); -MODULE_PARM_DESC(debug, "Enable debugging messages"); - -module_param(tx_only, bool, 0644); -MODULE_PARM_DESC(tx_only, "Only handle the IR transmit function"); From ea5c895c6e02199916407944f1e509bcde06a060 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 8 Jun 2017 05:10:41 -0400 Subject: [PATCH 0544/1640] UPSTREAM: media: lirc: remove LIRCCODE and LIRC_GET_LENGTH LIRCCODE is a lirc mode where a driver produces driver-dependent codes for receive and transmit. No driver uses this any more. The LIRC_GET_LENGTH ioctl was used for this mode only. 
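For context, a minimal sketch of the userspace pattern being retired here,
based on the documentation removed below; /dev/lirc0 and the buffer size are
illustrative, and error handling is elided:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/lirc.h>

int main(void)
{
	__u32 bits;
	char buf[64];
	int fd = open("/dev/lirc0", O_RDONLY);

	/* length of one driver-dependent lirccode, in bits */
	ioctl(fd, LIRC_GET_LENGTH, &bits);
	/* reads must be done in whole codes, rounded up to full bytes */
	read(fd, buf, (bits + 7) / 8);
	close(fd);
	return 0;
}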
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/lirc.h.rst.exceptions | 5 +++ .../media/uapi/rc/lirc-dev-intro.rst | 15 ------- Documentation/media/uapi/rc/lirc-func.rst | 1 - .../media/uapi/rc/lirc-get-features.rst | 7 +-- .../media/uapi/rc/lirc-get-length.rst | 44 ------------------- .../media/uapi/rc/lirc-get-rec-mode.rst | 4 +- .../media/uapi/rc/lirc-get-send-mode.rst | 3 +- drivers/media/rc/ir-lirc-codec.c | 1 - drivers/media/rc/lirc_dev.c | 12 ----- include/media/lirc_dev.h | 4 -- 10 files changed, 9 insertions(+), 87 deletions(-) delete mode 100644 Documentation/media/uapi/rc/lirc-get-length.rst diff --git a/Documentation/media/lirc.h.rst.exceptions b/Documentation/media/lirc.h.rst.exceptions index c130617a9986..63ba1d341905 100644 --- a/Documentation/media/lirc.h.rst.exceptions +++ b/Documentation/media/lirc.h.rst.exceptions @@ -28,6 +28,10 @@ ignore define LIRC_CAN_SEND_MASK ignore define LIRC_CAN_REC_MASK ignore define LIRC_CAN_SET_REC_DUTY_CYCLE +# Obsolete ioctls + +ignore ioctl LIRC_GET_LENGTH + # Undocumented macros ignore define PULSE_BIT @@ -40,3 +44,4 @@ ignore define LIRC_VALUE_MASK ignore define LIRC_MODE2_MASK ignore define LIRC_MODE_RAW +ignore define LIRC_MODE_LIRCCODE diff --git a/Documentation/media/uapi/rc/lirc-dev-intro.rst b/Documentation/media/uapi/rc/lirc-dev-intro.rst index d1936eeb9ce0..3cacf9aeac40 100644 --- a/Documentation/media/uapi/rc/lirc-dev-intro.rst +++ b/Documentation/media/uapi/rc/lirc-dev-intro.rst @@ -72,21 +72,6 @@ on the following table. this packet will be sent, with the number of microseconds with no IR. -.. _lirc-mode-lirccode: - -``LIRC_MODE_LIRCCODE`` - - This mode can be used for IR receive and send. - - The IR signal is decoded internally by the receiver, or encoded by the - transmitter. The LIRC interface represents the scancode as byte string, - which might not be a u32, it can be any length. The value is entirely - driver dependent. This mode is used by some older lirc drivers. - - The length of each code depends on the driver, which can be retrieved - with :ref:`lirc_get_length`. This length is used both - for transmitting and receiving IR. - .. _lirc-mode-pulse: ``LIRC_MODE_PULSE`` diff --git a/Documentation/media/uapi/rc/lirc-func.rst b/Documentation/media/uapi/rc/lirc-func.rst index 9b5a772ec96c..ddb4620de294 100644 --- a/Documentation/media/uapi/rc/lirc-func.rst +++ b/Documentation/media/uapi/rc/lirc-func.rst @@ -18,7 +18,6 @@ LIRC Function Reference lirc-set-send-duty-cycle lirc-get-timeout lirc-set-rec-timeout - lirc-get-length lirc-set-rec-carrier lirc-set-rec-carrier-range lirc-set-send-carrier diff --git a/Documentation/media/uapi/rc/lirc-get-features.rst b/Documentation/media/uapi/rc/lirc-get-features.rst index 64f89a4f9d9c..50c2c26d8e89 100644 --- a/Documentation/media/uapi/rc/lirc-get-features.rst +++ b/Documentation/media/uapi/rc/lirc-get-features.rst @@ -62,8 +62,7 @@ LIRC features ``LIRC_CAN_REC_LIRCCODE`` - The driver is capable of receiving using - :ref:`LIRC_MODE_LIRCCODE `. + Unused. Kept just to avoid breaking uAPI. .. _LIRC-CAN-SET-SEND-CARRIER: @@ -170,9 +169,7 @@ LIRC features ``LIRC_CAN_SEND_LIRCCODE`` - The driver supports sending (also called as IR blasting or IR TX) using - :ref:`LIRC_MODE_LIRCCODE `. - + Unused. Kept just to avoid breaking uAPI. 
 Return Value
 ============
diff --git a/Documentation/media/uapi/rc/lirc-get-length.rst b/Documentation/media/uapi/rc/lirc-get-length.rst
deleted file mode 100644
index 3990af5de0e9..000000000000
--- a/Documentation/media/uapi/rc/lirc-get-length.rst
+++ /dev/null
@@ -1,44 +0,0 @@
-.. -*- coding: utf-8; mode: rst -*-
-
-.. _lirc_get_length:
-
-*********************
-ioctl LIRC_GET_LENGTH
-*********************
-
-Name
-====
-
-LIRC_GET_LENGTH - Retrieves the code length in bits.
-
-Synopsis
-========
-
-.. c:function:: int ioctl( int fd, LIRC_GET_LENGTH, __u32 *length )
-    :name: LIRC_GET_LENGTH
-
-Arguments
-=========
-
-``fd``
-    File descriptor returned by open().
-
-``length``
-    length, in bits
-
-
-Description
-===========
-
-Retrieves the code length in bits (only for
-:ref:`LIRC_MODE_LIRCCODE `).
-Reads on the device must be done in blocks matching the bit count.
-The bit could should be rounded up so that it matches full bytes.
-
-
-Return Value
-============
-
-On success 0 is returned, on error -1 and the ``errno`` variable is set
-appropriately. The generic error codes are described at the
-:ref:`Generic Error Codes ` chapter.
diff --git a/Documentation/media/uapi/rc/lirc-get-rec-mode.rst b/Documentation/media/uapi/rc/lirc-get-rec-mode.rst
index a4eb6c0a26e9..b89de9add921 100644
--- a/Documentation/media/uapi/rc/lirc-get-rec-mode.rst
+++ b/Documentation/media/uapi/rc/lirc-get-rec-mode.rst
@@ -34,9 +34,7 @@ Description
 ===========
 
 Get/set supported receive modes. Only :ref:`LIRC_MODE_MODE2 `
-and :ref:`LIRC_MODE_LIRCCODE ` are supported for IR
-receive. Use :ref:`lirc_get_features` to find out which modes the driver
-supports.
+is supported for IR receive.
 
 Return Value
 ============
diff --git a/Documentation/media/uapi/rc/lirc-get-send-mode.rst b/Documentation/media/uapi/rc/lirc-get-send-mode.rst
index a169b234290e..e686b21689a0 100644
--- a/Documentation/media/uapi/rc/lirc-get-send-mode.rst
+++ b/Documentation/media/uapi/rc/lirc-get-send-mode.rst
@@ -36,8 +36,7 @@ Description
 ===========
 
 Get/set current transmit mode.
 
-Only :ref:`LIRC_MODE_PULSE ` and
-:ref:`LIRC_MODE_LIRCCODE ` is supported by for IR send,
+Only :ref:`LIRC_MODE_PULSE ` is supported for IR send,
 depending on the driver. Use :ref:`lirc_get_features` to find out which
 modes the driver supports.
diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 4fd4521693d9..9954ad4b8e59 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -388,7 +388,6 @@ static int ir_lirc_register(struct rc_dev *dev) ldev->features = features; ldev->data = &dev->raw->lirc; ldev->buf = NULL; - ldev->code_length = sizeof(struct ir_raw_event) * 8; ldev->chunk_size = sizeof(int); ldev->buffer_size = LIRCBUF_SIZE; ldev->fops = &lirc_fops; diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index e16d1138ca48..ef7e915dc9a2 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -137,12 +137,6 @@ int lirc_register_device(struct lirc_dev *d) return -EINVAL; } - if (d->code_length < 1 || d->code_length > (BUFLEN * 8)) { - dev_err(&d->dev, "code length must be less than %d bits\n", - BUFLEN * 8); - return -EBADRQC; - } - if (!d->buf && !(d->fops && d->fops->read && d->fops->poll && d->fops->unlocked_ioctl)) { dev_err(&d->dev, "undefined read, poll, ioctl\n"); @@ -152,9 +146,6 @@ int lirc_register_device(struct lirc_dev *d) /* some safety check 8-) */ d->name[sizeof(d->name) - 1] = '\0'; - if (d->features == 0) - d->features = LIRC_CAN_REC_LIRCCODE; - if (LIRC_CAN_REC(d->features)) { err = lirc_allocate_buffer(d); if (err) @@ -343,9 +334,6 @@ long lirc_dev_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) * for now, lirc_serial doesn't support mode changing either */ break; - case LIRC_GET_LENGTH: - result = put_user(d->code_length, (__u32 __user *)arg); - break; default: result = -ENOTTY; } diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index 857da67bd931..0a03dd9e5a68 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -9,8 +9,6 @@ #ifndef _LINUX_LIRC_DEV_H #define _LINUX_LIRC_DEV_H -#define BUFLEN 16 - #include #include #include @@ -117,7 +115,6 @@ static inline unsigned int lirc_buffer_write(struct lirc_buffer *buf, * * @name: used for logging * @minor: the minor device (/dev/lircX) number for the device - * @code_length: length of a remote control key code expressed in bits * @features: lirc compatible hardware features, like LIRC_MODE_RAW, * LIRC_CAN\_\*, as defined at include/media/lirc.h. * @buffer_size: Number of FIFO buffers with @chunk_size size. @@ -142,7 +139,6 @@ static inline unsigned int lirc_buffer_write(struct lirc_buffer *buf, struct lirc_dev { char name[40]; unsigned int minor; - __u32 code_length; __u32 features; unsigned int buffer_size; /* in chunks holding one code each */ From 79812d1ad54c4608283b520763742127db8c46a9 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 25 Feb 2017 06:51:29 -0500 Subject: [PATCH 0545/1640] UPSTREAM: media: lirc: implement scancode sending This introduces a new lirc mode: scancode. Any device which can send raw IR can now also send scancodes. int main() { int mode, fd = open("/dev/lirc0", O_RDWR); mode = LIRC_MODE_SCANCODE; if (ioctl(fd, LIRC_SET_SEND_MODE, &mode)) { // kernel too old or lirc does not support transmit } struct lirc_scancode scancode = { .scancode = 0x1e3d, .rc_proto = RC_PROTO_RC5, }; write(fd, &scancode, sizeof(scancode)); close(fd); } The other fields of lirc_scancode must be set to 0. Note that toggle (rc5, rc6) and repeats (nec) are not implemented. Nor is there a method for holding down a key for a period. 
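Because the patch also advertises the new mode through
LIRC_CAN_SEND_SCANCODE in the feature mask, a sender can probe for it before
switching modes; a short sketch (error handling elided):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/lirc.h>

int main(void)
{
	__u32 features, mode = LIRC_MODE_SCANCODE;
	int fd = open("/dev/lirc0", O_RDWR);

	if (ioctl(fd, LIRC_GET_FEATURES, &features) == 0 &&
	    (features & LIRC_CAN_SEND_SCANCODE))
		ioctl(fd, LIRC_SET_SEND_MODE, &mode);
	/* otherwise stay with the default LIRC_MODE_PULSE */

	close(fd);
	return 0;
}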
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-lirc-codec.c | 95 +++++++++++++++++++++++--------- drivers/media/rc/rc-core-priv.h | 2 +- include/media/rc-map.h | 54 +----------------- include/uapi/linux/lirc.h | 82 +++++++++++++++++++++++++++ 4 files changed, 154 insertions(+), 79 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 9954ad4b8e59..0a3ec693d290 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -107,7 +107,8 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, { struct lirc_codec *lirc; struct rc_dev *dev; - unsigned int *txbuf; /* buffer with values to transmit */ + unsigned int *txbuf = NULL; + struct ir_raw_event *raw = NULL; ssize_t ret = -EINVAL; size_t count; ktime_t start; @@ -121,16 +122,50 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (!lirc) return -EFAULT; - if (n < sizeof(unsigned) || n % sizeof(unsigned)) - return -EINVAL; + if (lirc->send_mode == LIRC_MODE_SCANCODE) { + struct lirc_scancode scan; - count = n / sizeof(unsigned); - if (count > LIRCBUF_SIZE || count % 2 == 0) - return -EINVAL; + if (n != sizeof(scan)) + return -EINVAL; - txbuf = memdup_user(buf, n); - if (IS_ERR(txbuf)) - return PTR_ERR(txbuf); + if (copy_from_user(&scan, buf, sizeof(scan))) + return -EFAULT; + + if (scan.flags || scan.keycode || scan.timestamp) + return -EINVAL; + + raw = kmalloc_array(LIRCBUF_SIZE, sizeof(*raw), GFP_KERNEL); + if (!raw) + return -ENOMEM; + + ret = ir_raw_encode_scancode(scan.rc_proto, scan.scancode, + raw, LIRCBUF_SIZE); + if (ret < 0) + goto out; + + count = ret; + + txbuf = kmalloc_array(count, sizeof(unsigned int), GFP_KERNEL); + if (!txbuf) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < count; i++) + /* Convert from NS to US */ + txbuf[i] = DIV_ROUND_UP(raw[i].duration, 1000); + } else { + if (n < sizeof(unsigned int) || n % sizeof(unsigned int)) + return -EINVAL; + + count = n / sizeof(unsigned int); + if (count > LIRCBUF_SIZE || count % 2 == 0) + return -EINVAL; + + txbuf = memdup_user(buf, n); + if (IS_ERR(txbuf)) + return PTR_ERR(txbuf); + } dev = lirc->dev; if (!dev) { @@ -156,24 +191,30 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (ret < 0) goto out; - for (duration = i = 0; i < ret; i++) - duration += txbuf[i]; + if (lirc->send_mode == LIRC_MODE_SCANCODE) { + ret = n; + } else { + for (duration = i = 0; i < ret; i++) + duration += txbuf[i]; - ret *= sizeof(unsigned int); + ret *= sizeof(unsigned int); - /* - * The lircd gap calculation expects the write function to - * wait for the actual IR signal to be transmitted before - * returning. - */ - towait = ktime_us_delta(ktime_add_us(start, duration), ktime_get()); - if (towait > 0) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(usecs_to_jiffies(towait)); + /* + * The lircd gap calculation expects the write function to + * wait for the actual IR signal to be transmitted before + * returning. 
+ */ + towait = ktime_us_delta(ktime_add_us(start, duration), + ktime_get()); + if (towait > 0) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(towait)); + } } out: kfree(txbuf); + kfree(raw); return ret; } @@ -202,20 +243,22 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, switch (cmd) { - /* legacy support */ + /* mode support */ case LIRC_GET_SEND_MODE: if (!dev->tx_ir) return -ENOTTY; - val = LIRC_MODE_PULSE; + val = lirc->send_mode; break; case LIRC_SET_SEND_MODE: if (!dev->tx_ir) return -ENOTTY; - if (val != LIRC_MODE_PULSE) + if (!(val == LIRC_MODE_PULSE || val == LIRC_MODE_SCANCODE)) return -EINVAL; + + lirc->send_mode = val; return 0; /* TX settings */ @@ -361,7 +404,7 @@ static int ir_lirc_register(struct rc_dev *dev) } if (dev->tx_ir) { - features |= LIRC_CAN_SEND_PULSE; + features |= LIRC_CAN_SEND_PULSE | LIRC_CAN_SEND_SCANCODE; if (dev->s_tx_mask) features |= LIRC_CAN_SET_TRANSMITTER_MASK; if (dev->s_tx_carrier) @@ -399,6 +442,8 @@ static int ir_lirc_register(struct rc_dev *dev) if (rc < 0) goto out; + dev->raw->lirc.send_mode = LIRC_MODE_PULSE; + dev->raw->lirc.ldev = ldev; dev->raw->lirc.dev = dev; return 0; diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index 564d6e13585e..d10fc998e1db 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -103,7 +103,7 @@ struct ir_raw_event_ctrl { u64 gap_duration; bool gap; bool send_timeout_reports; - + u8 send_mode; } lirc; struct xmp_dec { int state; diff --git a/include/media/rc-map.h b/include/media/rc-map.h index 72197cb43781..7046734b3895 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -10,59 +10,7 @@ */ #include - -/** - * enum rc_proto - the Remote Controller protocol - * - * @RC_PROTO_UNKNOWN: Protocol not known - * @RC_PROTO_OTHER: Protocol known but proprietary - * @RC_PROTO_RC5: Philips RC5 protocol - * @RC_PROTO_RC5X_20: Philips RC5x 20 bit protocol - * @RC_PROTO_RC5_SZ: StreamZap variant of RC5 - * @RC_PROTO_JVC: JVC protocol - * @RC_PROTO_SONY12: Sony 12 bit protocol - * @RC_PROTO_SONY15: Sony 15 bit protocol - * @RC_PROTO_SONY20: Sony 20 bit protocol - * @RC_PROTO_NEC: NEC protocol - * @RC_PROTO_NECX: Extended NEC protocol - * @RC_PROTO_NEC32: NEC 32 bit protocol - * @RC_PROTO_SANYO: Sanyo protocol - * @RC_PROTO_MCIR2_KBD: RC6-ish MCE keyboard - * @RC_PROTO_MCIR2_MSE: RC6-ish MCE mouse - * @RC_PROTO_RC6_0: Philips RC6-0-16 protocol - * @RC_PROTO_RC6_6A_20: Philips RC6-6A-20 protocol - * @RC_PROTO_RC6_6A_24: Philips RC6-6A-24 protocol - * @RC_PROTO_RC6_6A_32: Philips RC6-6A-32 protocol - * @RC_PROTO_RC6_MCE: MCE (Philips RC6-6A-32 subtype) protocol - * @RC_PROTO_SHARP: Sharp protocol - * @RC_PROTO_XMP: XMP protocol - * @RC_PROTO_CEC: CEC protocol - */ -enum rc_proto { - RC_PROTO_UNKNOWN = 0, - RC_PROTO_OTHER = 1, - RC_PROTO_RC5 = 2, - RC_PROTO_RC5X_20 = 3, - RC_PROTO_RC5_SZ = 4, - RC_PROTO_JVC = 5, - RC_PROTO_SONY12 = 6, - RC_PROTO_SONY15 = 7, - RC_PROTO_SONY20 = 8, - RC_PROTO_NEC = 9, - RC_PROTO_NECX = 10, - RC_PROTO_NEC32 = 11, - RC_PROTO_SANYO = 12, - RC_PROTO_MCIR2_KBD = 13, - RC_PROTO_MCIR2_MSE = 14, - RC_PROTO_RC6_0 = 15, - RC_PROTO_RC6_6A_20 = 16, - RC_PROTO_RC6_6A_24 = 17, - RC_PROTO_RC6_6A_32 = 18, - RC_PROTO_RC6_MCE = 19, - RC_PROTO_SHARP = 20, - RC_PROTO_XMP = 21, - RC_PROTO_CEC = 22, -}; +#include #define RC_PROTO_BIT_NONE 0ULL #define RC_PROTO_BIT_UNKNOWN BIT_ULL(RC_PROTO_UNKNOWN) diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index c3aef4316fbf..4fe580d36e41 
100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -47,12 +47,14 @@ #define LIRC_MODE_RAW 0x00000001 #define LIRC_MODE_PULSE 0x00000002 #define LIRC_MODE_MODE2 0x00000004 +#define LIRC_MODE_SCANCODE 0x00000008 #define LIRC_MODE_LIRCCODE 0x00000010 #define LIRC_CAN_SEND_RAW LIRC_MODE2SEND(LIRC_MODE_RAW) #define LIRC_CAN_SEND_PULSE LIRC_MODE2SEND(LIRC_MODE_PULSE) #define LIRC_CAN_SEND_MODE2 LIRC_MODE2SEND(LIRC_MODE_MODE2) +#define LIRC_CAN_SEND_SCANCODE LIRC_MODE2SEND(LIRC_MODE_SCANCODE) #define LIRC_CAN_SEND_LIRCCODE LIRC_MODE2SEND(LIRC_MODE_LIRCCODE) #define LIRC_CAN_SEND_MASK 0x0000003f @@ -64,6 +66,7 @@ #define LIRC_CAN_REC_RAW LIRC_MODE2REC(LIRC_MODE_RAW) #define LIRC_CAN_REC_PULSE LIRC_MODE2REC(LIRC_MODE_PULSE) #define LIRC_CAN_REC_MODE2 LIRC_MODE2REC(LIRC_MODE_MODE2) +#define LIRC_CAN_REC_SCANCODE LIRC_MODE2REC(LIRC_MODE_SCANCODE) #define LIRC_CAN_REC_LIRCCODE LIRC_MODE2REC(LIRC_MODE_LIRCCODE) #define LIRC_CAN_REC_MASK LIRC_MODE2REC(LIRC_CAN_SEND_MASK) @@ -131,4 +134,83 @@ #define LIRC_SET_WIDEBAND_RECEIVER _IOW('i', 0x00000023, __u32) +/* + * struct lirc_scancode - decoded scancode with protocol for use with + * LIRC_MODE_SCANCODE + * + * @timestamp: Timestamp in nanoseconds using CLOCK_MONOTONIC when IR + * was decoded. + * @flags: should be 0 for transmit. When receiving scancodes, + * LIRC_SCANCODE_FLAG_TOGGLE or LIRC_SCANCODE_FLAG_REPEAT can be set + * depending on the protocol + * @rc_proto: see enum rc_proto + * @keycode: the translated keycode. Set to 0 for transmit. + * @scancode: the scancode received or to be sent + */ +struct lirc_scancode { + __u64 timestamp; + __u16 flags; + __u16 rc_proto; + __u32 keycode; + __u64 scancode; +}; + +/* Set if the toggle bit of rc-5 or rc-6 is enabled */ +#define LIRC_SCANCODE_FLAG_TOGGLE 1 +/* Set if this is a nec or sanyo repeat */ +#define LIRC_SCANCODE_FLAG_REPEAT 2 + +/** + * enum rc_proto - the Remote Controller protocol + * + * @RC_PROTO_UNKNOWN: Protocol not known + * @RC_PROTO_OTHER: Protocol known but proprietary + * @RC_PROTO_RC5: Philips RC5 protocol + * @RC_PROTO_RC5X_20: Philips RC5x 20 bit protocol + * @RC_PROTO_RC5_SZ: StreamZap variant of RC5 + * @RC_PROTO_JVC: JVC protocol + * @RC_PROTO_SONY12: Sony 12 bit protocol + * @RC_PROTO_SONY15: Sony 15 bit protocol + * @RC_PROTO_SONY20: Sony 20 bit protocol + * @RC_PROTO_NEC: NEC protocol + * @RC_PROTO_NECX: Extended NEC protocol + * @RC_PROTO_NEC32: NEC 32 bit protocol + * @RC_PROTO_SANYO: Sanyo protocol + * @RC_PROTO_MCIR2_KBD: RC6-ish MCE keyboard + * @RC_PROTO_MCIR2_MSE: RC6-ish MCE mouse + * @RC_PROTO_RC6_0: Philips RC6-0-16 protocol + * @RC_PROTO_RC6_6A_20: Philips RC6-6A-20 protocol + * @RC_PROTO_RC6_6A_24: Philips RC6-6A-24 protocol + * @RC_PROTO_RC6_6A_32: Philips RC6-6A-32 protocol + * @RC_PROTO_RC6_MCE: MCE (Philips RC6-6A-32 subtype) protocol + * @RC_PROTO_SHARP: Sharp protocol + * @RC_PROTO_XMP: XMP protocol + * @RC_PROTO_CEC: CEC protocol + */ +enum rc_proto { + RC_PROTO_UNKNOWN = 0, + RC_PROTO_OTHER = 1, + RC_PROTO_RC5 = 2, + RC_PROTO_RC5X_20 = 3, + RC_PROTO_RC5_SZ = 4, + RC_PROTO_JVC = 5, + RC_PROTO_SONY12 = 6, + RC_PROTO_SONY15 = 7, + RC_PROTO_SONY20 = 8, + RC_PROTO_NEC = 9, + RC_PROTO_NECX = 10, + RC_PROTO_NEC32 = 11, + RC_PROTO_SANYO = 12, + RC_PROTO_MCIR2_KBD = 13, + RC_PROTO_MCIR2_MSE = 14, + RC_PROTO_RC6_0 = 15, + RC_PROTO_RC6_6A_20 = 16, + RC_PROTO_RC6_6A_24 = 17, + RC_PROTO_RC6_6A_32 = 18, + RC_PROTO_RC6_MCE = 19, + RC_PROTO_SHARP = 20, + RC_PROTO_XMP = 21, + RC_PROTO_CEC = 22, +}; + #endif From 
d34f414236403192c4dff9eba05e71100e70e55e Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 25 Feb 2017 06:51:30 -0500 Subject: [PATCH 0546/1640] UPSTREAM: media: lirc: use the correct carrier for scancode transmit If the lirc device supports it, set the carrier for the protocol. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-jvc-decoder.c | 1 + drivers/media/rc/ir-lirc-codec.c | 29 ++++++++++++++++---------- drivers/media/rc/ir-mce_kbd-decoder.c | 1 + drivers/media/rc/ir-nec-decoder.c | 1 + drivers/media/rc/ir-rc5-decoder.c | 1 + drivers/media/rc/ir-rc6-decoder.c | 1 + drivers/media/rc/ir-sanyo-decoder.c | 1 + drivers/media/rc/ir-sharp-decoder.c | 1 + drivers/media/rc/ir-sony-decoder.c | 1 + drivers/media/rc/rc-core-priv.h | 1 + drivers/media/rc/rc-ir-raw.c | 30 +++++++++++++++++++++++++++ include/media/rc-core.h | 1 + 12 files changed, 58 insertions(+), 11 deletions(-) diff --git a/drivers/media/rc/ir-jvc-decoder.c b/drivers/media/rc/ir-jvc-decoder.c index 22c8aee3df4f..c03c776cfa54 100644 --- a/drivers/media/rc/ir-jvc-decoder.c +++ b/drivers/media/rc/ir-jvc-decoder.c @@ -212,6 +212,7 @@ static struct ir_raw_handler jvc_handler = { .protocols = RC_PROTO_BIT_JVC, .decode = ir_jvc_decode, .encode = ir_jvc_encode, + .carrier = 38000, }; static int __init ir_jvc_decode_init(void) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 0a3ec693d290..bdacbadac416 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -122,6 +122,17 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (!lirc) return -EFAULT; + dev = lirc->dev; + if (!dev) { + ret = -EFAULT; + goto out; + } + + if (!dev->tx_ir) { + ret = -EINVAL; + goto out; + } + if (lirc->send_mode == LIRC_MODE_SCANCODE) { struct lirc_scancode scan; @@ -154,6 +165,13 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, for (i = 0; i < count; i++) /* Convert from NS to US */ txbuf[i] = DIV_ROUND_UP(raw[i].duration, 1000); + + if (dev->s_tx_carrier) { + int carrier = ir_raw_encode_carrier(scan.rc_proto); + + if (carrier > 0) + dev->s_tx_carrier(dev, carrier); + } } else { if (n < sizeof(unsigned int) || n % sizeof(unsigned int)) return -EINVAL; @@ -167,17 +185,6 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, return PTR_ERR(txbuf); } - dev = lirc->dev; - if (!dev) { - ret = -EFAULT; - goto out; - } - - if (!dev->tx_ir) { - ret = -EINVAL; - goto out; - } - for (i = 0; i < count; i++) { if (txbuf[i] > IR_MAX_DURATION / 1000 - duration || !txbuf[i]) { ret = -EINVAL; diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 69d6264d54e6..dbc6e00bace2 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -473,6 +473,7 @@ static struct ir_raw_handler mce_kbd_handler = { .encode = ir_mce_kbd_encode, .raw_register = ir_mce_kbd_register, .raw_unregister = ir_mce_kbd_unregister, + .carrier = 36000, }; static int __init ir_mce_kbd_decode_init(void) diff --git a/drivers/media/rc/ir-nec-decoder.c b/drivers/media/rc/ir-nec-decoder.c index 22eed9505244..31d7bafe7bda 100644 --- a/drivers/media/rc/ir-nec-decoder.c +++ b/drivers/media/rc/ir-nec-decoder.c @@ -254,6 +254,7 @@ static struct ir_raw_handler nec_handler = { RC_PROTO_BIT_NEC32, .decode = ir_nec_decode, .encode = ir_nec_encode, + .carrier = 38000, }; static int __init ir_nec_decode_init(void) diff --git 
a/drivers/media/rc/ir-rc5-decoder.c b/drivers/media/rc/ir-rc5-decoder.c index cbff3e26d481..f589d99245eb 100644 --- a/drivers/media/rc/ir-rc5-decoder.c +++ b/drivers/media/rc/ir-rc5-decoder.c @@ -273,6 +273,7 @@ static struct ir_raw_handler rc5_handler = { RC_PROTO_BIT_RC5_SZ, .decode = ir_rc5_decode, .encode = ir_rc5_encode, + .carrier = 36000, }; static int __init ir_rc5_decode_init(void) diff --git a/drivers/media/rc/ir-rc6-decoder.c b/drivers/media/rc/ir-rc6-decoder.c index 5d0d2fe3b7a7..665025303c28 100644 --- a/drivers/media/rc/ir-rc6-decoder.c +++ b/drivers/media/rc/ir-rc6-decoder.c @@ -408,6 +408,7 @@ static struct ir_raw_handler rc6_handler = { RC_PROTO_BIT_RC6_MCE, .decode = ir_rc6_decode, .encode = ir_rc6_encode, + .carrier = 36000, }; static int __init ir_rc6_decode_init(void) diff --git a/drivers/media/rc/ir-sanyo-decoder.c b/drivers/media/rc/ir-sanyo-decoder.c index 2138f0e9472d..ded39cdfc6ef 100644 --- a/drivers/media/rc/ir-sanyo-decoder.c +++ b/drivers/media/rc/ir-sanyo-decoder.c @@ -209,6 +209,7 @@ static struct ir_raw_handler sanyo_handler = { .protocols = RC_PROTO_BIT_SANYO, .decode = ir_sanyo_decode, .encode = ir_sanyo_encode, + .carrier = 38000, }; static int __init ir_sanyo_decode_init(void) diff --git a/drivers/media/rc/ir-sharp-decoder.c b/drivers/media/rc/ir-sharp-decoder.c index 7140dd6160ee..df296991906c 100644 --- a/drivers/media/rc/ir-sharp-decoder.c +++ b/drivers/media/rc/ir-sharp-decoder.c @@ -226,6 +226,7 @@ static struct ir_raw_handler sharp_handler = { .protocols = RC_PROTO_BIT_SHARP, .decode = ir_sharp_decode, .encode = ir_sharp_encode, + .carrier = 38000, }; static int __init ir_sharp_decode_init(void) diff --git a/drivers/media/rc/ir-sony-decoder.c b/drivers/media/rc/ir-sony-decoder.c index a47ced763031..e4bcff21c025 100644 --- a/drivers/media/rc/ir-sony-decoder.c +++ b/drivers/media/rc/ir-sony-decoder.c @@ -221,6 +221,7 @@ static struct ir_raw_handler sony_handler = { RC_PROTO_BIT_SONY20, .decode = ir_sony_decode, .encode = ir_sony_encode, + .carrier = 40000, }; static int __init ir_sony_decode_init(void) diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index d10fc998e1db..2fab4069c023 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -19,6 +19,7 @@ struct ir_raw_handler { int (*decode)(struct rc_dev *dev, struct ir_raw_event event); int (*encode)(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max); + u32 carrier; /* These two should only be used by the lirc decoder */ int (*raw_register)(struct rc_dev *dev); diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 0616eee564a8..208db8a5adff 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -484,6 +484,36 @@ static void edge_handle(struct timer_list *t) ir_raw_event_handle(dev); } +/** + * ir_raw_encode_carrier() - Get carrier used for protocol + * + * @protocol: protocol + * + * Attempts to find the carrier for the specified protocol + * + * Returns: The carrier in Hz + * -EINVAL if the protocol is invalid, or if no + * compatible encoder was found. 
+ */ +int ir_raw_encode_carrier(enum rc_proto protocol) +{ + struct ir_raw_handler *handler; + int ret = -EINVAL; + u64 mask = BIT_ULL(protocol); + + mutex_lock(&ir_raw_handler_lock); + list_for_each_entry(handler, &ir_raw_handler_list, list) { + if (handler->protocols & mask && handler->encode) { + ret = handler->carrier; + break; + } + } + mutex_unlock(&ir_raw_handler_lock); + + return ret; +} +EXPORT_SYMBOL(ir_raw_encode_carrier); + /* * Used to (un)register raw event clients */ diff --git a/include/media/rc-core.h b/include/media/rc-core.h index 314a1edb6189..ca48632ec8e2 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -309,6 +309,7 @@ int ir_raw_event_store_with_filter(struct rc_dev *dev, void ir_raw_event_set_idle(struct rc_dev *dev, bool idle); int ir_raw_encode_scancode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max); +int ir_raw_encode_carrier(enum rc_proto protocol); static inline void ir_raw_event_reset(struct rc_dev *dev) { From f4e7eaf9e344b4e4356218359862a143a482d26c Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 25 Feb 2017 06:51:31 -0500 Subject: [PATCH 0547/1640] UPSTREAM: media: rc: auto load encoder if necessary When sending scancodes, load the encoder if we need it. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-core-priv.h | 1 + drivers/media/rc/rc-ir-raw.c | 2 ++ drivers/media/rc/rc-main.c | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index 2fab4069c023..6014f116cba2 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -261,6 +261,7 @@ void ir_raw_event_free(struct rc_dev *dev); void ir_raw_event_unregister(struct rc_dev *dev); int ir_raw_handler_register(struct ir_raw_handler *ir_raw_handler); void ir_raw_handler_unregister(struct ir_raw_handler *ir_raw_handler); +void ir_raw_load_modules(u64 *protocols); void ir_raw_init(void); /* diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 208db8a5adff..78638d1b73cc 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -448,6 +448,8 @@ int ir_raw_encode_scancode(enum rc_proto protocol, u32 scancode, int ret = -EINVAL; u64 mask = 1ULL << protocol; + ir_raw_load_modules(&mask); + mutex_lock(&ir_raw_handler_lock); list_for_each_entry(handler, &ir_raw_handler_list, list) { if (handler->protocols & mask && handler->encode) { diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 372f4d61cb48..29a90adb0f7c 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1082,7 +1082,7 @@ static int parse_protocol_change(u64 *protocols, const char *buf) return count; } -static void ir_raw_load_modules(u64 *protocols) +void ir_raw_load_modules(u64 *protocols) { u64 available; int i, ret; From a5af727929e2c04f86f890a34fd9566a58f8cdb7 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 23 Sep 2017 10:41:13 -0400 Subject: [PATCH 0548/1640] UPSTREAM: media: lirc: lirc interface should not be a raw decoder The lirc user interface exists as a raw decoder, which does not make much sense for transmit-only devices. In addition, we want to have lirc char devices for devices which do not use raw IR, i.e. scancode only devices. Note that rc-core, lirc_dev, ir-lirc-codec are now calling functions of each other, so they've been merged into one module rc-core to avoid circular dependencies.
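Condensed from the hunks above, the kernel-side scancode transmit path now reduces to roughly the following sketch. The wrapper name transmit_scancode_sketch is illustrative only and error handling is trimmed; the calls it makes (ir_raw_encode_scancode, ir_raw_encode_carrier, s_tx_carrier, tx_ir) are the ones added or used by these patches, and the real code allocates its buffers with kmalloc_array() rather than on the stack:

static int transmit_scancode_sketch(struct rc_dev *dev,
                                    const struct lirc_scancode *scan)
{
        struct ir_raw_event raw[LIRCBUF_SIZE];
        unsigned int txbuf[LIRCBUF_SIZE];
        int i, count, carrier;

        /* encode to raw IR; the encoder module is now autoloaded on demand */
        count = ir_raw_encode_scancode(scan->rc_proto, scan->scancode,
                                       raw, LIRCBUF_SIZE);
        if (count < 0)
                return count;

        /* use the protocol's nominal carrier when the hardware can set it */
        if (dev->s_tx_carrier) {
                carrier = ir_raw_encode_carrier(scan->rc_proto);
                if (carrier > 0)
                        dev->s_tx_carrier(dev, carrier);
        }

        for (i = 0; i < count; i++)     /* durations: ns -> us */
                txbuf[i] = DIV_ROUND_UP(raw[i].duration, 1000);

        return dev->tx_ir(dev, txbuf, count);
}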
Since ir-lirc-codec no longer exists as a separate codec module, there is no need for RC_DRIVER_IR_RAW_TX type drivers to call ir_raw_event_register(). Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 31 ++---- drivers/media/rc/Makefile | 5 +- drivers/media/rc/ir-lirc-codec.c | 135 +++++++------------------- drivers/media/rc/ir-mce_kbd-decoder.c | 6 -- drivers/media/rc/lirc_dev.c | 47 +++------ drivers/media/rc/rc-core-priv.h | 45 ++++++--- drivers/media/rc/rc-ir-raw.c | 24 +---- drivers/media/rc/rc-main.c | 52 +++++----- include/media/lirc_dev.h | 10 -- include/media/rc-core.h | 33 ++++--- 10 files changed, 144 insertions(+), 244 deletions(-) diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index c0b37e09f50a..f114316ccc53 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -16,34 +16,21 @@ menuconfig RC_CORE if RC_CORE source "drivers/media/rc/keymaps/Kconfig" +config LIRC + bool "LIRC user interface" + depends on RC_CORE + ---help--- + Enable this option to enable the Linux Infrared Remote + Control user interface (e.g. /dev/lirc*). This interface + passes raw IR to and from userspace, which is needed for + IR transmitting (aka "blasting") and for the lirc daemon. + menuconfig RC_DECODERS bool "Remote controller decoders" depends on RC_CORE default y if RC_DECODERS -config LIRC - tristate "LIRC interface driver" - depends on RC_CORE - - ---help--- - Enable this option to build the Linux Infrared Remote - Control (LIRC) core device interface driver. The LIRC - interface passes raw IR to and from userspace, where the - LIRC daemon handles protocol decoding for IR reception and - encoding for IR transmitting (aka "blasting"). - -config IR_LIRC_CODEC - tristate "Enable IR to LIRC bridge" - depends on RC_CORE - depends on LIRC - default y - - ---help--- - Enable this option to pass raw IR to and from userspace via - the LIRC interface. - - config IR_NEC_DECODER tristate "Enable IR raw decoder for the NEC protocol" depends on RC_CORE diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile index d07367b1674b..bcb49badfe36 100644 --- a/drivers/media/rc/Makefile +++ b/drivers/media/rc/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -rc-core-objs := rc-main.o rc-ir-raw.o obj-y += keymaps/ obj-$(CONFIG_RC_CORE) += rc-core.o -obj-$(CONFIG_LIRC) += lirc_dev.o +rc-core-y := rc-main.o rc-ir-raw.o +rc-core-$(CONFIG_LIRC) += lirc_dev.o ir-lirc-codec.o obj-$(CONFIG_IR_NEC_DECODER) += ir-nec-decoder.o obj-$(CONFIG_IR_RC5_DECODER) += ir-rc5-decoder.o obj-$(CONFIG_IR_RC6_DECODER) += ir-rc6-decoder.o @@ -13,7 +13,6 @@ obj-$(CONFIG_IR_SONY_DECODER) += ir-sony-decoder.o obj-$(CONFIG_IR_SANYO_DECODER) += ir-sanyo-decoder.o obj-$(CONFIG_IR_SHARP_DECODER) += ir-sharp-decoder.o obj-$(CONFIG_IR_MCE_KBD_DECODER) += ir-mce_kbd-decoder.o -obj-$(CONFIG_IR_LIRC_CODEC) += ir-lirc-codec.o obj-$(CONFIG_IR_XMP_DECODER) += ir-xmp-decoder.o # stand-alone IR receivers/transmitters diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index bdacbadac416..aec0109b1a69 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -23,21 +22,15 @@ #define LIRCBUF_SIZE 256 /** - * ir_lirc_decode() - Send raw IR data to lirc_dev to be relayed to the - * lircd userspace daemon for decoding. 
+ * ir_lirc_raw_event() - Send raw IR data to lirc to be relayed to userspace + * * @dev: the struct rc_dev descriptor of the device * @ev: the struct ir_raw_event descriptor of the pulse/space - * - * This function returns -EINVAL if the lirc interfaces aren't wired up. */ -static int ir_lirc_decode(struct rc_dev *dev, struct ir_raw_event ev) +void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) { - struct lirc_codec *lirc = &dev->raw->lirc; int sample; - if (!dev->raw->lirc.ldev || !dev->raw->lirc.ldev->buf) - return -EINVAL; - /* Packet start */ if (ev.reset) { /* Userspace expects a long space event before the start of @@ -56,15 +49,15 @@ static int ir_lirc_decode(struct rc_dev *dev, struct ir_raw_event ev) /* Packet end */ } else if (ev.timeout) { - if (lirc->gap) - return 0; + if (dev->gap) + return; - lirc->gap_start = ktime_get(); - lirc->gap = true; - lirc->gap_duration = ev.duration; + dev->gap_start = ktime_get(); + dev->gap = true; + dev->gap_duration = ev.duration; - if (!lirc->send_timeout_reports) - return 0; + if (!dev->send_timeout_reports) + return; sample = LIRC_TIMEOUT(ev.duration / 1000); IR_dprintk(2, "timeout report (duration: %d)\n", sample); @@ -72,21 +65,21 @@ static int ir_lirc_decode(struct rc_dev *dev, struct ir_raw_event ev) /* Normal sample */ } else { - if (lirc->gap) { + if (dev->gap) { int gap_sample; - lirc->gap_duration += ktime_to_ns(ktime_sub(ktime_get(), - lirc->gap_start)); + dev->gap_duration += ktime_to_ns(ktime_sub(ktime_get(), + dev->gap_start)); /* Convert to ms and cap by LIRC_VALUE_MASK */ - do_div(lirc->gap_duration, 1000); - lirc->gap_duration = min(lirc->gap_duration, - (u64)LIRC_VALUE_MASK); + do_div(dev->gap_duration, 1000); + dev->gap_duration = min_t(u64, dev->gap_duration, + LIRC_VALUE_MASK); - gap_sample = LIRC_SPACE(lirc->gap_duration); - lirc_buffer_write(dev->raw->lirc.ldev->buf, + gap_sample = LIRC_SPACE(dev->gap_duration); + lirc_buffer_write(dev->lirc_dev->buf, (unsigned char *)&gap_sample); - lirc->gap = false; + dev->gap = false; } sample = ev.pulse ? 
LIRC_PULSE(ev.duration / 1000) : @@ -95,18 +88,16 @@ static int ir_lirc_decode(struct rc_dev *dev, struct ir_raw_event ev) TO_US(ev.duration), TO_STR(ev.pulse)); } - lirc_buffer_write(dev->raw->lirc.ldev->buf, + lirc_buffer_write(dev->lirc_dev->buf, (unsigned char *) &sample); - wake_up(&dev->raw->lirc.ldev->buf->wait_poll); - return 0; + wake_up(&dev->lirc_dev->buf->wait_poll); } static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, size_t n, loff_t *ppos) { - struct lirc_codec *lirc; - struct rc_dev *dev; + struct rc_dev *dev = file->private_data; unsigned int *txbuf = NULL; struct ir_raw_event *raw = NULL; ssize_t ret = -EINVAL; @@ -118,22 +109,12 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, start = ktime_get(); - lirc = lirc_get_pdata(file); - if (!lirc) - return -EFAULT; - - dev = lirc->dev; - if (!dev) { - ret = -EFAULT; - goto out; - } - if (!dev->tx_ir) { ret = -EINVAL; goto out; } - if (lirc->send_mode == LIRC_MODE_SCANCODE) { + if (dev->send_mode == LIRC_MODE_SCANCODE) { struct lirc_scancode scan; if (n != sizeof(scan)) @@ -198,7 +179,7 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (ret < 0) goto out; - if (lirc->send_mode == LIRC_MODE_SCANCODE) { + if (dev->send_mode == LIRC_MODE_SCANCODE) { ret = n; } else { for (duration = i = 0; i < ret; i++) @@ -228,20 +209,11 @@ out: static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { - struct lirc_codec *lirc; - struct rc_dev *dev; + struct rc_dev *dev = filep->private_data; u32 __user *argp = (u32 __user *)(arg); int ret = 0; __u32 val = 0, tmp; - lirc = lirc_get_pdata(filep); - if (!lirc) - return -EFAULT; - - dev = lirc->dev; - if (!dev) - return -EFAULT; - if (_IOC_DIR(cmd) & _IOC_WRITE) { ret = get_user(val, argp); if (ret) @@ -255,7 +227,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, if (!dev->tx_ir) return -ENOTTY; - val = lirc->send_mode; + val = dev->send_mode; break; case LIRC_SET_SEND_MODE: @@ -265,7 +237,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, if (!(val == LIRC_MODE_PULSE || val == LIRC_MODE_SCANCODE)) return -EINVAL; - lirc->send_mode = val; + dev->send_mode = val; return 0; /* TX settings */ @@ -299,7 +271,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, return -EINVAL; return dev->s_rx_carrier_range(dev, - dev->raw->lirc.carrier_low, + dev->carrier_low, val); case LIRC_SET_REC_CARRIER_RANGE: @@ -309,7 +281,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, if (val <= 0) return -EINVAL; - dev->raw->lirc.carrier_low = val; + dev->carrier_low = val; return 0; case LIRC_GET_REC_RESOLUTION: @@ -367,7 +339,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, if (!dev->timeout) return -ENOTTY; - lirc->send_timeout_reports = !!val; + dev->send_timeout_reports = !!val; break; default: @@ -394,7 +366,7 @@ static const struct file_operations lirc_fops = { .llseek = no_llseek, }; -static int ir_lirc_register(struct rc_dev *dev) +int ir_lirc_register(struct rc_dev *dev) { struct lirc_dev *ldev; int rc = -ENOMEM; @@ -436,7 +408,6 @@ static int ir_lirc_register(struct rc_dev *dev) snprintf(ldev->name, sizeof(ldev->name), "ir-lirc-codec (%s)", dev->driver_name); ldev->features = features; - ldev->data = &dev->raw->lirc; ldev->buf = NULL; ldev->chunk_size = sizeof(int); ldev->buffer_size = LIRCBUF_SIZE; @@ -449,10 +420,8 @@ static int ir_lirc_register(struct rc_dev *dev) if (rc < 0) goto out; - 
dev->raw->lirc.send_mode = LIRC_MODE_PULSE; - - dev->raw->lirc.ldev = ldev; - dev->raw->lirc.dev = dev; + dev->send_mode = LIRC_MODE_PULSE; + dev->lirc_dev = ldev; return 0; out: @@ -460,40 +429,8 @@ out: return rc; } -static int ir_lirc_unregister(struct rc_dev *dev) +void ir_lirc_unregister(struct rc_dev *dev) { - struct lirc_codec *lirc = &dev->raw->lirc; - - lirc_unregister_device(lirc->ldev); - lirc->ldev = NULL; - - return 0; + lirc_unregister_device(dev->lirc_dev); + dev->lirc_dev = NULL; } - -static struct ir_raw_handler lirc_handler = { - .protocols = 0, - .decode = ir_lirc_decode, - .raw_register = ir_lirc_register, - .raw_unregister = ir_lirc_unregister, -}; - -static int __init ir_lirc_codec_init(void) -{ - ir_raw_handler_register(&lirc_handler); - - printk(KERN_INFO "IR LIRC bridge handler initialized\n"); - return 0; -} - -static void __exit ir_lirc_codec_exit(void) -{ - ir_raw_handler_unregister(&lirc_handler); -} - -module_init(ir_lirc_codec_init); -module_exit(ir_lirc_codec_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jarod Wilson "); -MODULE_AUTHOR("Red Hat Inc. (http://www.redhat.com)"); -MODULE_DESCRIPTION("LIRC IR handler bridge"); diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index dbc6e00bace2..2c9ee0c1f432 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -358,9 +358,6 @@ static int ir_mce_kbd_register(struct rc_dev *dev) struct input_dev *idev; int i, ret; - if (dev->driver_type == RC_DRIVER_IR_RAW_TX) - return 0; - idev = input_allocate_device(); if (!idev) return -ENOMEM; @@ -415,9 +412,6 @@ static int ir_mce_kbd_unregister(struct rc_dev *dev) struct mce_kbd_dec *mce_kbd = &dev->raw->mce_kbd; struct input_dev *idev = mce_kbd->idev; - if (dev->driver_type == RC_DRIVER_IR_RAW_TX) - return 0; - del_timer_sync(&mce_kbd->rx_timeout); input_unregister_device(idev); diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index ef7e915dc9a2..3cc95deaa84e 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -26,7 +26,7 @@ #include #include -#include +#include "rc-core-priv.h" #include #include @@ -236,7 +236,7 @@ int lirc_dev_fop_open(struct inode *inode, struct file *file) d->open++; - lirc_init_pdata(inode, file); + file->private_data = d->rdev; nonseekable_open(inode, file); mutex_unlock(&d->mutex); @@ -250,11 +250,12 @@ EXPORT_SYMBOL(lirc_dev_fop_open); int lirc_dev_fop_close(struct inode *inode, struct file *file) { - struct lirc_dev *d = file->private_data; + struct rc_dev *rcdev = file->private_data; + struct lirc_dev *d = rcdev->lirc_dev; mutex_lock(&d->mutex); - rc_close(d->rdev); + rc_close(rcdev); d->open--; mutex_unlock(&d->mutex); @@ -265,7 +266,8 @@ EXPORT_SYMBOL(lirc_dev_fop_close); unsigned int lirc_dev_fop_poll(struct file *file, poll_table *wait) { - struct lirc_dev *d = file->private_data; + struct rc_dev *rcdev = file->private_data; + struct lirc_dev *d = rcdev->lirc_dev; unsigned int ret; if (!d->attached) @@ -290,7 +292,8 @@ EXPORT_SYMBOL(lirc_dev_fop_poll); long lirc_dev_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct lirc_dev *d = file->private_data; + struct rc_dev *rcdev = file->private_data; + struct lirc_dev *d = rcdev->lirc_dev; __u32 mode; int result; @@ -349,7 +352,8 @@ ssize_t lirc_dev_fop_read(struct file *file, size_t length, loff_t *ppos) { - struct lirc_dev *d = file->private_data; + struct rc_dev *rcdev = file->private_data; + struct lirc_dev *d = rcdev->lirc_dev; unsigned 
char *buf; int ret, written = 0; DECLARE_WAITQUEUE(wait, current); @@ -448,24 +452,7 @@ out_unlocked: } EXPORT_SYMBOL(lirc_dev_fop_read); -void lirc_init_pdata(struct inode *inode, struct file *file) -{ - struct lirc_dev *d = container_of(inode->i_cdev, struct lirc_dev, cdev); - - file->private_data = d; -} -EXPORT_SYMBOL(lirc_init_pdata); - -void *lirc_get_pdata(struct file *file) -{ - struct lirc_dev *d = file->private_data; - - return d->data; -} -EXPORT_SYMBOL(lirc_get_pdata); - - -static int __init lirc_dev_init(void) +int __init lirc_dev_init(void) { int retval; @@ -489,16 +476,8 @@ static int __init lirc_dev_init(void) return 0; } -static void __exit lirc_dev_exit(void) +void __exit lirc_dev_exit(void) { class_destroy(lirc_class); unregister_chrdev_region(lirc_base_dev, LIRC_MAX_DEVICES); - pr_info("module unloaded\n"); } - -module_init(lirc_dev_init); -module_exit(lirc_dev_exit); - -MODULE_DESCRIPTION("LIRC base driver module"); -MODULE_AUTHOR("Artur Lipowski"); -MODULE_LICENSE("GPL"); diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index 6014f116cba2..face39c3a96c 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -12,6 +12,20 @@ #include #include +/** + * rc_open - Opens a RC device + * + * @rdev: pointer to struct rc_dev. + */ +int rc_open(struct rc_dev *rdev); + +/** + * rc_close - Closes a RC device + * + * @rdev: pointer to struct rc_dev. + */ +void rc_close(struct rc_dev *rdev); + struct ir_raw_handler { struct list_head list; @@ -21,7 +35,7 @@ struct ir_raw_handler { struct ir_raw_event *events, unsigned int max); u32 carrier; - /* These two should only be used by the lirc decoder */ + /* These two should only be used by the mce kbd decoder */ int (*raw_register)(struct rc_dev *dev); int (*raw_unregister)(struct rc_dev *dev); }; @@ -95,17 +109,6 @@ struct ir_raw_event_ctrl { unsigned count; unsigned wanted_bits; } mce_kbd; - struct lirc_codec { - struct rc_dev *dev; - struct lirc_dev *ldev; - int carrier_low; - - ktime_t gap_start; - u64 gap_duration; - bool gap; - bool send_timeout_reports; - u8 send_mode; - } lirc; struct xmp_dec { int state; unsigned count; @@ -264,6 +267,24 @@ void ir_raw_handler_unregister(struct ir_raw_handler *ir_raw_handler); void ir_raw_load_modules(u64 *protocols); void ir_raw_init(void); +/* + * lirc interface + */ +#ifdef CONFIG_LIRC +int lirc_dev_init(void); +void lirc_dev_exit(void); +void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev); +int ir_lirc_register(struct rc_dev *dev); +void ir_lirc_unregister(struct rc_dev *dev); +#else +static inline int lirc_dev_init(void) { return 0; } +static inline void lirc_dev_exit(void) {} +static inline void ir_lirc_raw_event(struct rc_dev *dev, + struct ir_raw_event ev) { } +static inline int ir_lirc_register(struct rc_dev *dev) { return 0; } +static inline void ir_lirc_unregister(struct rc_dev *dev) { } +#endif + /* * Decoder initialization code * diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 78638d1b73cc..3dabb783a1f0 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -31,6 +31,7 @@ static int ir_raw_event_thread(void *data) if (raw->dev->enabled_protocols & handler->protocols || !handler->protocols) handler->decode(raw->dev, ev); + ir_lirc_raw_event(raw->dev, ev); raw->prev_ev = ev; } mutex_unlock(&ir_raw_handler_lock); @@ -521,16 +522,9 @@ EXPORT_SYMBOL(ir_raw_encode_carrier); */ int ir_raw_event_prepare(struct rc_dev *dev) { - static bool raw_init; /* 'false' 
default value, raw decoders loaded? */ - if (!dev) return -EINVAL; - if (!raw_init) { - request_module("ir-lirc-codec"); - raw_init = true; - } - dev->raw = kzalloc(sizeof(*dev->raw), GFP_KERNEL); if (!dev->raw) return -ENOMEM; @@ -548,19 +542,11 @@ int ir_raw_event_register(struct rc_dev *dev) struct ir_raw_handler *handler; struct task_struct *thread; - /* - * raw transmitters do not need any event registration - * because the event is coming from userspace - */ - if (dev->driver_type != RC_DRIVER_IR_RAW_TX) { - thread = kthread_run(ir_raw_event_thread, dev->raw, "rc%u", - dev->minor); + thread = kthread_run(ir_raw_event_thread, dev->raw, "rc%u", dev->minor); + if (IS_ERR(thread)) + return PTR_ERR(thread); - if (IS_ERR(thread)) - return PTR_ERR(thread); - - dev->raw->thread = thread; - } + dev->raw->thread = thread; mutex_lock(&ir_raw_handler_lock); list_add_tail(&dev->raw->list, &ir_raw_client_list); diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 29a90adb0f7c..56b322b3d325 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -846,7 +846,6 @@ int rc_open(struct rc_dev *rdev) return rval; } -EXPORT_SYMBOL_GPL(rc_open); static int ir_open(struct input_dev *idev) { @@ -866,7 +865,6 @@ void rc_close(struct rc_dev *rdev) mutex_unlock(&rdev->lock); } } -EXPORT_SYMBOL_GPL(rc_close); static void ir_close(struct input_dev *idev) { @@ -941,23 +939,6 @@ struct rc_filter_attribute { .mask = (_mask), \ } -static bool lirc_is_present(void) -{ -#if defined(CONFIG_LIRC_MODULE) - struct module *lirc; - - mutex_lock(&module_mutex); - lirc = find_module("lirc_dev"); - mutex_unlock(&module_mutex); - - return lirc ? true : false; -#elif defined(CONFIG_LIRC) - return true; -#else - return false; -#endif -} - /** * show_protocols() - shows the current IR protocol(s) * @device: the device descriptor @@ -1002,8 +983,10 @@ static ssize_t show_protocols(struct device *device, allowed &= ~proto_names[i].type; } - if (dev->driver_type == RC_DRIVER_IR_RAW && lirc_is_present()) +#ifdef CONFIG_LIRC + if (dev->driver_type == RC_DRIVER_IR_RAW) tmp += sprintf(tmp, "[lirc] "); +#endif if (tmp != buf) tmp--; @@ -1759,8 +1742,7 @@ int rc_register_device(struct rc_dev *dev) dev->sysfs_groups[attr++] = &rc_dev_wakeup_filter_attr_grp; dev->sysfs_groups[attr++] = NULL; - if (dev->driver_type == RC_DRIVER_IR_RAW || - dev->driver_type == RC_DRIVER_IR_RAW_TX) { + if (dev->driver_type == RC_DRIVER_IR_RAW) { rc = ir_raw_event_prepare(dev); if (rc < 0) goto out_minor; @@ -1787,19 +1769,28 @@ int rc_register_device(struct rc_dev *dev) goto out_dev; } - if (dev->driver_type == RC_DRIVER_IR_RAW || - dev->driver_type == RC_DRIVER_IR_RAW_TX) { - rc = ir_raw_event_register(dev); + /* Ensure that the lirc kfifo is setup before we start the thread */ + if (dev->driver_type != RC_DRIVER_SCANCODE) { + rc = ir_lirc_register(dev); if (rc < 0) goto out_rx; } + if (dev->driver_type == RC_DRIVER_IR_RAW) { + rc = ir_raw_event_register(dev); + if (rc < 0) + goto out_lirc; + } + IR_dprintk(1, "Registered rc%u (driver: %s)\n", dev->minor, dev->driver_name ? 
dev->driver_name : "unknown"); return 0; +out_lirc: + if (dev->driver_type != RC_DRIVER_SCANCODE) + ir_lirc_unregister(dev); out_rx: rc_free_rx_device(dev); out_dev: @@ -1853,6 +1844,9 @@ void rc_unregister_device(struct rc_dev *dev) rc_free_rx_device(dev); + if (dev->driver_type != RC_DRIVER_SCANCODE) + ir_lirc_unregister(dev); + device_del(&dev->dev); ida_simple_remove(&rc_ida, dev->minor); @@ -1875,6 +1869,13 @@ static int __init rc_core_init(void) return rc; } + rc = lirc_dev_init(); + if (rc) { + pr_err("rc_core: unable to init lirc\n"); + class_unregister(&rc_class); + return rc; + } + led_trigger_register_simple("rc-feedback", &led_feedback); rc_map_register(&empty_map); @@ -1883,6 +1884,7 @@ static void __exit rc_core_exit(void) { + lirc_dev_exit(); class_unregister(&rc_class); led_trigger_unregister_simple(led_feedback); rc_map_unregister(&empty_map); diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index 0a03dd9e5a68..dd0c078796e8 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -121,7 +121,6 @@ static inline unsigned int lirc_buffer_write(struct lirc_buffer *buf, * Only used if @rbuf is NULL. * @chunk_size: Size of each FIFO buffer. * Only used if @rbuf is NULL. - * @data: private per-driver data * @buf: if %NULL, lirc_dev will allocate and manage the buffer, * otherwise allocated by the caller which will * have to write to the buffer by other means, like irq's @@ -146,7 +145,6 @@ struct lirc_dev { struct lirc_buffer *buf; bool buf_internal; - void *data; struct rc_dev *rdev; const struct file_operations *fops; struct module *owner; @@ -168,14 +166,6 @@ int lirc_register_device(struct lirc_dev *d); void lirc_unregister_device(struct lirc_dev *d); -/* Must be called in the open fop before lirc_get_pdata() can be used */ -void lirc_init_pdata(struct inode *inode, struct file *file); - -/* Returns the private data stored in the lirc_dev - * associated with the given device file pointer. - */ -void *lirc_get_pdata(struct file *file); - /* default file operations * used by drivers if they override only some operations */ diff --git a/include/media/rc-core.h b/include/media/rc-core.h index ca48632ec8e2..5d6e415c7acc 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -20,6 +20,7 @@ #include #include #include +#include #include extern int rc_core_debug; @@ -115,6 +116,15 @@ enum rc_filter_type { * @max_timeout: maximum timeout supported by device * @rx_resolution : resolution (in ns) of input sampler * @tx_resolution: resolution (in ns) of output sampler + * @lirc_dev: lirc char device + * @carrier_low: when setting the carrier range, first the low end must be + * set with an ioctl and then the high end with another ioctl + * @gap_start: time when gap starts + * @gap_duration: duration of initial gap + * @gap: true if we're in a gap + * @send_timeout_reports: report timeouts in lirc raw IR. + * @send_mode: lirc mode for sending, either LIRC_MODE_SCANCODE or + * LIRC_MODE_PULSE * @change_protocol: allow changing the protocol used on hardware decoders * @open: callback to allow drivers to enable polling/irq when IR input device * is opened. 
@@ -174,6 +184,15 @@ struct rc_dev { u32 max_timeout; u32 rx_resolution; u32 tx_resolution; +#ifdef CONFIG_LIRC + struct lirc_dev *lirc_dev; + int carrier_low; + ktime_t gap_start; + u64 gap_duration; + bool gap; + bool send_timeout_reports; + u8 send_mode; +#endif int (*change_protocol)(struct rc_dev *dev, u64 *rc_proto); int (*open)(struct rc_dev *dev); void (*close)(struct rc_dev *dev); @@ -248,20 +267,6 @@ int devm_rc_register_device(struct device *parent, struct rc_dev *dev); */ void rc_unregister_device(struct rc_dev *dev); -/** - * rc_open - Opens a RC device - * - * @rdev: pointer to struct rc_dev. - */ -int rc_open(struct rc_dev *rdev); - -/** - * rc_close - Closes a RC device - * - * @rdev: pointer to struct rc_dev. - */ -void rc_close(struct rc_dev *rdev); - void rc_repeat(struct rc_dev *dev); void rc_keydown(struct rc_dev *dev, enum rc_proto protocol, u32 scancode, u8 toggle); From ac11ad2d5c4987982803f894c314557abfc614e5 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 27 Sep 2017 16:00:49 -0400 Subject: [PATCH 0549/1640] UPSTREAM: media: lirc: validate scancode for transmit Ensure we reject an attempt to transmit invalid scancodes. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-lirc-codec.c | 10 ++++++ drivers/media/rc/rc-core-priv.h | 1 + drivers/media/rc/rc-main.c | 53 ++++++++++++++++++++------------ 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index aec0109b1a69..1ed69c9e64bf 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -126,6 +126,16 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (scan.flags || scan.keycode || scan.timestamp) return -EINVAL; + /* + * The scancode field in lirc_scancode is 64-bit simply + * to future-proof it, since there are IR protocols that + * use more than 32 bits. For now only 32-bit protocols + * are supported. + */ + if (scan.scancode > U32_MAX || + !rc_validate_scancode(scan.rc_proto, scan.scancode)) + return -EINVAL; + raw = kmalloc_array(LIRCBUF_SIZE, sizeof(*raw), GFP_KERNEL); if (!raw) return -ENOMEM; diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index face39c3a96c..6d5a36b8b550 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -150,6 +150,7 @@ static inline bool is_timing_event(struct ir_raw_event ev) #define TO_STR(is_pulse) ((is_pulse) ? 
"pulse" : "space") /* functions for IR encoders */ +bool rc_validate_scancode(enum rc_proto proto, u32 scancode); static inline void init_ir_raw_event_duration(struct ir_raw_event *ev, unsigned int pulse, diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 56b322b3d325..ce8837b1facd 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -775,6 +775,37 @@ void rc_keydown_notimeout(struct rc_dev *dev, enum rc_proto protocol, } EXPORT_SYMBOL_GPL(rc_keydown_notimeout); +/** + * rc_validate_scancode() - checks that a scancode is valid for a protocol + * @proto: protocol + * @scancode: scancode + */ +bool rc_validate_scancode(enum rc_proto proto, u32 scancode) +{ + switch (proto) { + case RC_PROTO_NECX: + if ((((scancode >> 16) ^ ~(scancode >> 8)) & 0xff) == 0) + return false; + break; + case RC_PROTO_NEC32: + if ((((scancode >> 24) ^ ~(scancode >> 16)) & 0xff) == 0) + return false; + break; + case RC_PROTO_RC6_MCE: + if ((scancode & 0xffff0000) != 0x800f0000) + return false; + break; + case RC_PROTO_RC6_6A_32: + if ((scancode & 0xffff0000) == 0x800f0000) + return false; + break; + default: + break; + } + + return true; +} + /** * rc_validate_filter() - checks that the scancode and mask are valid and * provides sensible defaults @@ -794,26 +825,8 @@ static int rc_validate_filter(struct rc_dev *dev, mask = protocols[protocol].scancode_bits; - switch (protocol) { - case RC_PROTO_NECX: - if ((((s >> 16) ^ ~(s >> 8)) & 0xff) == 0) - return -EINVAL; - break; - case RC_PROTO_NEC32: - if ((((s >> 24) ^ ~(s >> 16)) & 0xff) == 0) - return -EINVAL; - break; - case RC_PROTO_RC6_MCE: - if ((s & 0xffff0000) != 0x800f0000) - return -EINVAL; - break; - case RC_PROTO_RC6_6A_32: - if ((s & 0xffff0000) == 0x800f0000) - return -EINVAL; - break; - default: - break; - } + if (!rc_validate_scancode(protocol, s)) + return -EINVAL; filter->data &= mask; filter->mask &= mask; From 0982abe6cf4e3338e1e45b2ad911f2bb04d75026 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 9 Oct 2017 16:32:41 -0400 Subject: [PATCH 0550/1640] UPSTREAM: media: rc: document and fix rc_validate_scancode() For some IR protocols, some scancode values are not valid, i.e. they're part of a different protocol variant. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index ce8837b1facd..e944d28b96d2 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -776,21 +776,35 @@ void rc_keydown_notimeout(struct rc_dev *dev, enum rc_proto protocol, EXPORT_SYMBOL_GPL(rc_keydown_notimeout); /** - * rc_validate_scancode() - checks that a scancode is valid for a protocol + * rc_validate_scancode() - checks that a scancode is valid for a protocol. + * For nec, it should do the opposite of ir_nec_bytes_to_scancode() * @proto: protocol * @scancode: scancode */ bool rc_validate_scancode(enum rc_proto proto, u32 scancode) { switch (proto) { + /* + * NECX has a 16-bit address; if the lower 8 bits match the upper + * 8 bits inverted, then the address would match regular nec. + */ case RC_PROTO_NECX: if ((((scancode >> 16) ^ ~(scancode >> 8)) & 0xff) == 0) return false; break; + /* + * NEC32 has a 16 bit address and 16 bit command. If the lower 8 bits + * of the command match the upper 8 bits inverted, then it would + * be either NEC or NECX. 
+ */ + case RC_PROTO_NEC32: - if ((((scancode >> 24) ^ ~(scancode >> 16)) & 0xff) == 0) + if ((((scancode >> 8) ^ ~scancode) & 0xff) == 0) return false; break; + /* + * If the customer code (top 16 bits) is 0x800f, it is MCE, else it + * is regular mode-6a 32 bit + */ case RC_PROTO_RC6_MCE: if ((scancode & 0xffff0000) != 0x800f0000) return false; From 942d728ca193b3d2eaf3939afdb805edd31cf0e2 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 23 Sep 2017 12:05:59 -0400 Subject: [PATCH 0551/1640] UPSTREAM: media: lirc: merge lirc_dev_fop_ioctl and ir_lirc_ioctl Calculate lirc features when necessary, and add LIRC_{S,G}ET_REC_MODE cases to ir_lirc_ioctl. This makes lirc_dev_fop_ioctl() unnecessary since all cases are already handled by ir_lirc_ioctl(). Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-lirc-codec.c | 79 +++++++++++++++++++------------- drivers/media/rc/lirc_dev.c | 62 ++-----------------------| include/media/lirc_dev.h | 4 -- 3 files changed, 50 insertions(+), 95 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 1ed69c9e64bf..f933e7617882 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -231,8 +231,54 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, } switch (cmd) { + case LIRC_GET_FEATURES: + if (dev->driver_type == RC_DRIVER_IR_RAW) { + val |= LIRC_CAN_REC_MODE2; + if (dev->rx_resolution) + val |= LIRC_CAN_GET_REC_RESOLUTION; + } + + if (dev->tx_ir) { + val |= LIRC_CAN_SEND_PULSE | LIRC_CAN_SEND_SCANCODE; + if (dev->s_tx_mask) + val |= LIRC_CAN_SET_TRANSMITTER_MASK; + if (dev->s_tx_carrier) + val |= LIRC_CAN_SET_SEND_CARRIER; + if (dev->s_tx_duty_cycle) + val |= LIRC_CAN_SET_SEND_DUTY_CYCLE; + } + + if (dev->s_rx_carrier_range) + val |= LIRC_CAN_SET_REC_CARRIER | + LIRC_CAN_SET_REC_CARRIER_RANGE; + + if (dev->s_learning_mode) + val |= LIRC_CAN_USE_WIDEBAND_RECEIVER; + + if (dev->s_carrier_report) + val |= LIRC_CAN_MEASURE_CARRIER; + + if (dev->max_timeout) + val |= LIRC_CAN_SET_REC_TIMEOUT; + + break; /* mode support */ + case LIRC_GET_REC_MODE: + if (dev->driver_type == RC_DRIVER_IR_RAW_TX) + return -ENOTTY; + + val = LIRC_MODE_MODE2; + break; + + case LIRC_SET_REC_MODE: + if (dev->driver_type == RC_DRIVER_IR_RAW_TX) + return -ENOTTY; + + if (val != LIRC_MODE_MODE2) + return -EINVAL; + return 0; + case LIRC_GET_SEND_MODE: if (!dev->tx_ir) return -ENOTTY; @@ -353,7 +399,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, break; default: - return lirc_dev_fop_ioctl(filep, cmd, arg); + return -ENOTTY; } if (_IOC_DIR(cmd) & _IOC_READ) @@ -380,44 +426,13 @@ int ir_lirc_register(struct rc_dev *dev) { struct lirc_dev *ldev; int rc = -ENOMEM; - unsigned long features = 0; ldev = lirc_allocate_device(); if (!ldev) return rc; - if (dev->driver_type != RC_DRIVER_IR_RAW_TX) { - features |= LIRC_CAN_REC_MODE2; - if (dev->rx_resolution) - features |= LIRC_CAN_GET_REC_RESOLUTION; - } - - if (dev->tx_ir) { - features |= LIRC_CAN_SEND_PULSE | LIRC_CAN_SEND_SCANCODE; - if (dev->s_tx_mask) - features |= LIRC_CAN_SET_TRANSMITTER_MASK; - if (dev->s_tx_carrier) - features |= LIRC_CAN_SET_SEND_CARRIER; - if (dev->s_tx_duty_cycle) - features |= LIRC_CAN_SET_SEND_DUTY_CYCLE; - } - - if (dev->s_rx_carrier_range) - features |= LIRC_CAN_SET_REC_CARRIER | - LIRC_CAN_SET_REC_CARRIER_RANGE; - - if (dev->s_learning_mode) - features |= LIRC_CAN_USE_WIDEBAND_RECEIVER; - - if (dev->s_carrier_report) - features |= LIRC_CAN_MEASURE_CARRIER; - - if 
(dev->max_timeout) - features |= LIRC_CAN_SET_REC_TIMEOUT; - snprintf(ldev->name, sizeof(ldev->name), "ir-lirc-codec (%s)", dev->driver_name); - ldev->features = features; ldev->buf = NULL; ldev->chunk_size = sizeof(int); ldev->buffer_size = LIRCBUF_SIZE; diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 3cc95deaa84e..95058ea01e62 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -109,6 +109,7 @@ EXPORT_SYMBOL(lirc_free_device); int lirc_register_device(struct lirc_dev *d) { + struct rc_dev *rcdev = d->rdev; int minor; int err; @@ -146,7 +147,7 @@ int lirc_register_device(struct lirc_dev *d) /* some safety check 8-) */ d->name[sizeof(d->name) - 1] = '\0'; - if (LIRC_CAN_REC(d->features)) { + if (rcdev->driver_type == RC_DRIVER_IR_RAW) { err = lirc_allocate_buffer(d); if (err) return err; @@ -290,63 +291,6 @@ unsigned int lirc_dev_fop_poll(struct file *file, poll_table *wait) } EXPORT_SYMBOL(lirc_dev_fop_poll); -long lirc_dev_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct rc_dev *rcdev = file->private_data; - struct lirc_dev *d = rcdev->lirc_dev; - __u32 mode; - int result; - - dev_dbg(&d->dev, LOGHEAD "ioctl called (0x%x)\n", - d->name, d->minor, cmd); - - result = mutex_lock_interruptible(&d->mutex); - if (result) - return result; - - if (!d->attached) { - result = -ENODEV; - goto out; - } - - switch (cmd) { - case LIRC_GET_FEATURES: - result = put_user(d->features, (__u32 __user *)arg); - break; - case LIRC_GET_REC_MODE: - if (!LIRC_CAN_REC(d->features)) { - result = -ENOTTY; - break; - } - - result = put_user(LIRC_REC2MODE - (d->features & LIRC_CAN_REC_MASK), - (__u32 __user *)arg); - break; - case LIRC_SET_REC_MODE: - if (!LIRC_CAN_REC(d->features)) { - result = -ENOTTY; - break; - } - - result = get_user(mode, (__u32 __user *)arg); - if (!result && !(LIRC_MODE2REC(mode) & d->features)) - result = -EINVAL; - /* - * FIXME: We should actually set the mode somehow but - * for now, lirc_serial doesn't support mode changing either - */ - break; - default: - result = -ENOTTY; - } - -out: - mutex_unlock(&d->mutex); - return result; -} -EXPORT_SYMBOL(lirc_dev_fop_ioctl); - ssize_t lirc_dev_fop_read(struct file *file, char __user *buffer, size_t length, @@ -375,7 +319,7 @@ ssize_t lirc_dev_fop_read(struct file *file, goto out_locked; } - if (!LIRC_CAN_REC(d->features)) { + if (rcdev->driver_type != RC_DRIVER_IR_RAW) { ret = -EINVAL; goto out_locked; } diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index dd0c078796e8..86a3cf798775 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -115,8 +115,6 @@ static inline unsigned int lirc_buffer_write(struct lirc_buffer *buf, * * @name: used for logging * @minor: the minor device (/dev/lircX) number for the device - * @features: lirc compatible hardware features, like LIRC_MODE_RAW, - * LIRC_CAN\_\*, as defined at include/media/lirc.h. * @buffer_size: Number of FIFO buffers with @chunk_size size. * Only used if @rbuf is NULL. * @chunk_size: Size of each FIFO buffer. 
@@ -138,7 +136,6 @@ static inline unsigned int lirc_buffer_write(struct lirc_buffer *buf, struct lirc_dev { char name[40]; unsigned int minor; - __u32 features; unsigned int buffer_size; /* in chunks holding one code each */ unsigned int chunk_size; @@ -172,7 +169,6 @@ void lirc_unregister_device(struct lirc_dev *d); int lirc_dev_fop_open(struct inode *inode, struct file *file); int lirc_dev_fop_close(struct inode *inode, struct file *file); unsigned int lirc_dev_fop_poll(struct file *file, poll_table *wait); -long lirc_dev_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg); ssize_t lirc_dev_fop_read(struct file *file, char __user *buffer, size_t length, loff_t *ppos); #endif From 407c45bd29b6c62934751ce22e728e3f657d14d6 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 23 Sep 2017 14:44:18 -0400 Subject: [PATCH 0552/1640] UPSTREAM: media: lirc: use kfifo rather than lirc_buffer for raw IR Since the only mode lirc devices can handle is raw IR, handle this in a plain kfifo. Remove lirc_buffer since this is no longer needed. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-lirc-codec.c | 81 ++++++++++--- drivers/media/rc/lirc_dev.c | 199 +++---------------------------- include/media/lirc_dev.h | 109 ----------------- include/media/rc-core.h | 4 + 4 files changed, 87 insertions(+), 306 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index f933e7617882..2fa1f905a266 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -66,8 +66,6 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) } else { if (dev->gap) { - int gap_sample; - dev->gap_duration += ktime_to_ns(ktime_sub(ktime_get(), dev->gap_start)); @@ -76,9 +74,7 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) dev->gap_duration = min_t(u64, dev->gap_duration, LIRC_VALUE_MASK); - gap_sample = LIRC_SPACE(dev->gap_duration); - lirc_buffer_write(dev->lirc_dev->buf, - (unsigned char *)&gap_sample); + kfifo_put(&dev->rawir, LIRC_SPACE(dev->gap_duration)); dev->gap = false; } @@ -88,10 +84,8 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) TO_US(ev.duration), TO_STR(ev.pulse)); } - lirc_buffer_write(dev->lirc_dev->buf, - (unsigned char *) &sample); - - wake_up(&dev->lirc_dev->buf->wait_poll); + kfifo_put(&dev->rawir, sample); + wake_up_poll(&dev->wait_poll, POLLIN | POLLRDNORM); } static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, @@ -408,6 +402,68 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, return ret; } +static unsigned int ir_lirc_poll(struct file *file, + struct poll_table_struct *wait) +{ + struct rc_dev *rcdev = file->private_data; + struct lirc_dev *d = rcdev->lirc_dev; + unsigned int events = 0; + + poll_wait(file, &rcdev->wait_poll, wait); + + if (!d->attached) + events = POLLHUP | POLLERR; + else if (rcdev->driver_type == RC_DRIVER_IR_RAW && + !kfifo_is_empty(&rcdev->rawir)) + events = POLLIN | POLLRDNORM; + + return events; +} + +static ssize_t ir_lirc_read(struct file *file, char __user *buffer, + size_t length, loff_t *ppos) +{ + struct rc_dev *rcdev = file->private_data; + struct lirc_dev *d = rcdev->lirc_dev; + unsigned int copied; + int ret; + + if (rcdev->driver_type == RC_DRIVER_IR_RAW_TX) + return -EINVAL; + + if (length < sizeof(unsigned int) || length % sizeof(unsigned int)) + return -EINVAL; + + if (!d->attached) + return -ENODEV; + + do { + if (kfifo_is_empty(&rcdev->rawir)) { + if 
(file->f_flags & O_NONBLOCK) + return -EAGAIN; + + ret = wait_event_interruptible(rcdev->wait_poll, + !kfifo_is_empty(&rcdev->rawir) || + !d->attached); + if (ret) + return ret; + } + + if (!d->attached) + return -ENODEV; + + ret = mutex_lock_interruptible(&rcdev->lock); + if (ret) + return ret; + ret = kfifo_to_user(&rcdev->rawir, buffer, length, &copied); + mutex_unlock(&rcdev->lock); + if (ret) + return ret; + } while (copied == 0); + + return copied; +} + static const struct file_operations lirc_fops = { .owner = THIS_MODULE, .write = ir_lirc_transmit_ir, @@ -415,8 +471,8 @@ static const struct file_operations lirc_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ir_lirc_ioctl, #endif - .read = lirc_dev_fop_read, - .poll = lirc_dev_fop_poll, + .read = ir_lirc_read, + .poll = ir_lirc_poll, .open = lirc_dev_fop_open, .release = lirc_dev_fop_close, .llseek = no_llseek, @@ -433,9 +489,6 @@ int ir_lirc_register(struct rc_dev *dev) snprintf(ldev->name, sizeof(ldev->name), "ir-lirc-codec (%s)", dev->driver_name); - ldev->buf = NULL; - ldev->chunk_size = sizeof(int); - ldev->buffer_size = LIRCBUF_SIZE; ldev->fops = &lirc_fops; ldev->dev.parent = &dev->dev; ldev->rdev = dev; diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 95058ea01e62..9a0ad8d9a0cb 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -44,40 +44,14 @@ static struct class *lirc_class; static void lirc_release_device(struct device *ld) { struct lirc_dev *d = container_of(ld, struct lirc_dev, dev); + struct rc_dev *rcdev = d->rdev; - put_device(d->dev.parent); + if (rcdev->driver_type == RC_DRIVER_IR_RAW) + kfifo_free(&rcdev->rawir); - if (d->buf_internal) { - lirc_buffer_free(d->buf); - kfree(d->buf); - d->buf = NULL; - } kfree(d); module_put(THIS_MODULE); -} - -static int lirc_allocate_buffer(struct lirc_dev *d) -{ - int err; - - if (d->buf) { - d->buf_internal = false; - return 0; - } - - d->buf = kmalloc(sizeof(*d->buf), GFP_KERNEL); - if (!d->buf) - return -ENOMEM; - - err = lirc_buffer_init(d->buf, d->chunk_size, d->buffer_size); - if (err) { - kfree(d->buf); - d->buf = NULL; - return err; - } - - d->buf_internal = true; - return 0; + put_device(d->dev.parent); } struct lirc_dev * @@ -128,31 +102,16 @@ int lirc_register_device(struct lirc_dev *d) return -EINVAL; } - if (!d->buf && d->chunk_size < 1) { - pr_err("chunk_size must be set!\n"); - return -EINVAL; - } - - if (!d->buf && d->buffer_size < 1) { - pr_err("buffer_size must be set!\n"); - return -EINVAL; - } - - if (!d->buf && !(d->fops && d->fops->read && - d->fops->poll && d->fops->unlocked_ioctl)) { - dev_err(&d->dev, "undefined read, poll, ioctl\n"); - return -EBADRQC; - } - /* some safety check 8-) */ d->name[sizeof(d->name) - 1] = '\0'; if (rcdev->driver_type == RC_DRIVER_IR_RAW) { - err = lirc_allocate_buffer(d); - if (err) - return err; + if (kfifo_alloc(&rcdev->rawir, MAX_IR_EVENT_SIZE, GFP_KERNEL)) + return -ENOMEM; } + init_waitqueue_head(&rcdev->wait_poll); + minor = ida_simple_get(&lirc_ida, 0, LIRC_MAX_DEVICES, GFP_KERNEL); if (minor < 0) return minor; @@ -182,9 +141,13 @@ EXPORT_SYMBOL(lirc_register_device); void lirc_unregister_device(struct lirc_dev *d) { + struct rc_dev *rcdev; + if (!d) return; + rcdev = d->rdev; + dev_dbg(&d->dev, "lirc_dev: driver %s unregistered from minor = %d\n", d->name, d->minor); @@ -194,7 +157,7 @@ void lirc_unregister_device(struct lirc_dev *d) if (d->open) { dev_dbg(&d->dev, LOGHEAD "releasing opened driver\n", d->name, d->minor); - wake_up_interruptible(&d->buf->wait_poll); 
+ wake_up_poll(&rcdev->wait_poll, POLLHUP); } mutex_unlock(&d->mutex); @@ -208,6 +171,7 @@ EXPORT_SYMBOL(lirc_unregister_device); int lirc_dev_fop_open(struct inode *inode, struct file *file) { struct lirc_dev *d = container_of(inode->i_cdev, struct lirc_dev, cdev); + struct rc_dev *rcdev = d->rdev; int retval; dev_dbg(&d->dev, LOGHEAD "open called\n", d->name, d->minor); @@ -232,8 +196,8 @@ int lirc_dev_fop_open(struct inode *inode, struct file *file) goto out; } - if (d->buf) - lirc_buffer_clear(d->buf); + if (rcdev->driver_type == RC_DRIVER_IR_RAW) + kfifo_reset_out(&rcdev->rawir); d->open++; @@ -265,137 +229,6 @@ int lirc_dev_fop_close(struct inode *inode, struct file *file) } EXPORT_SYMBOL(lirc_dev_fop_close); -unsigned int lirc_dev_fop_poll(struct file *file, poll_table *wait) -{ - struct rc_dev *rcdev = file->private_data; - struct lirc_dev *d = rcdev->lirc_dev; - unsigned int ret; - - if (!d->attached) - return POLLHUP | POLLERR; - - if (d->buf) { - poll_wait(file, &d->buf->wait_poll, wait); - - if (lirc_buffer_empty(d->buf)) - ret = 0; - else - ret = POLLIN | POLLRDNORM; - } else { - ret = POLLERR; - } - - dev_dbg(&d->dev, LOGHEAD "poll result = %d\n", d->name, d->minor, ret); - - return ret; -} -EXPORT_SYMBOL(lirc_dev_fop_poll); - -ssize_t lirc_dev_fop_read(struct file *file, - char __user *buffer, - size_t length, - loff_t *ppos) -{ - struct rc_dev *rcdev = file->private_data; - struct lirc_dev *d = rcdev->lirc_dev; - unsigned char *buf; - int ret, written = 0; - DECLARE_WAITQUEUE(wait, current); - - buf = kzalloc(d->buf->chunk_size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - dev_dbg(&d->dev, LOGHEAD "read called\n", d->name, d->minor); - - ret = mutex_lock_interruptible(&d->mutex); - if (ret) { - kfree(buf); - return ret; - } - - if (!d->attached) { - ret = -ENODEV; - goto out_locked; - } - - if (rcdev->driver_type != RC_DRIVER_IR_RAW) { - ret = -EINVAL; - goto out_locked; - } - - if (length % d->buf->chunk_size) { - ret = -EINVAL; - goto out_locked; - } - - /* - * we add ourselves to the task queue before buffer check - * to avoid losing scan code (in case when queue is awaken somewhere - * between while condition checking and scheduling) - */ - add_wait_queue(&d->buf->wait_poll, &wait); - - /* - * while we didn't provide 'length' bytes, device is opened in blocking - * mode and 'copy_to_user' is happy, wait for data. - */ - while (written < length && ret == 0) { - if (lirc_buffer_empty(d->buf)) { - /* According to the read(2) man page, 'written' can be - * returned as less than 'length', instead of blocking - * again, returning -EWOULDBLOCK, or returning - * -ERESTARTSYS - */ - if (written) - break; - if (file->f_flags & O_NONBLOCK) { - ret = -EWOULDBLOCK; - break; - } - if (signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - - mutex_unlock(&d->mutex); - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - set_current_state(TASK_RUNNING); - - ret = mutex_lock_interruptible(&d->mutex); - if (ret) { - remove_wait_queue(&d->buf->wait_poll, &wait); - goto out_unlocked; - } - - if (!d->attached) { - ret = -ENODEV; - goto out_locked; - } - } else { - lirc_buffer_read(d->buf, buf); - ret = copy_to_user((void __user *)buffer+written, buf, - d->buf->chunk_size); - if (!ret) - written += d->buf->chunk_size; - else - ret = -EFAULT; - } - } - - remove_wait_queue(&d->buf->wait_poll, &wait); - -out_locked: - mutex_unlock(&d->mutex); - -out_unlocked: - kfree(buf); - - return ret ? 
ret : written; -} -EXPORT_SYMBOL(lirc_dev_fop_read); - int __init lirc_dev_init(void) { int retval; diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index 86a3cf798775..14d3eb36672e 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -18,112 +18,11 @@ #include #include -struct lirc_buffer { - wait_queue_head_t wait_poll; - spinlock_t fifo_lock; - unsigned int chunk_size; - unsigned int size; /* in chunks */ - /* Using chunks instead of bytes pretends to simplify boundary checking - * And should allow for some performance fine tunning later */ - struct kfifo fifo; -}; - -static inline void lirc_buffer_clear(struct lirc_buffer *buf) -{ - unsigned long flags; - - if (kfifo_initialized(&buf->fifo)) { - spin_lock_irqsave(&buf->fifo_lock, flags); - kfifo_reset(&buf->fifo); - spin_unlock_irqrestore(&buf->fifo_lock, flags); - } else - WARN(1, "calling %s on an uninitialized lirc_buffer\n", - __func__); -} - -static inline int lirc_buffer_init(struct lirc_buffer *buf, - unsigned int chunk_size, - unsigned int size) -{ - int ret; - - init_waitqueue_head(&buf->wait_poll); - spin_lock_init(&buf->fifo_lock); - buf->chunk_size = chunk_size; - buf->size = size; - ret = kfifo_alloc(&buf->fifo, size * chunk_size, GFP_KERNEL); - - return ret; -} - -static inline void lirc_buffer_free(struct lirc_buffer *buf) -{ - if (kfifo_initialized(&buf->fifo)) { - kfifo_free(&buf->fifo); - } else - WARN(1, "calling %s on an uninitialized lirc_buffer\n", - __func__); -} - -static inline int lirc_buffer_len(struct lirc_buffer *buf) -{ - int len; - unsigned long flags; - - spin_lock_irqsave(&buf->fifo_lock, flags); - len = kfifo_len(&buf->fifo); - spin_unlock_irqrestore(&buf->fifo_lock, flags); - - return len; -} - -static inline int lirc_buffer_full(struct lirc_buffer *buf) -{ - return lirc_buffer_len(buf) == buf->size * buf->chunk_size; -} - -static inline int lirc_buffer_empty(struct lirc_buffer *buf) -{ - return !lirc_buffer_len(buf); -} - -static inline unsigned int lirc_buffer_read(struct lirc_buffer *buf, - unsigned char *dest) -{ - unsigned int ret = 0; - - if (lirc_buffer_len(buf) >= buf->chunk_size) - ret = kfifo_out_locked(&buf->fifo, dest, buf->chunk_size, - &buf->fifo_lock); - return ret; - -} - -static inline unsigned int lirc_buffer_write(struct lirc_buffer *buf, - unsigned char *orig) -{ - unsigned int ret; - - ret = kfifo_in_locked(&buf->fifo, orig, buf->chunk_size, - &buf->fifo_lock); - - return ret; -} - /** * struct lirc_dev - represents a LIRC device * * @name: used for logging * @minor: the minor device (/dev/lircX) number for the device - * @buffer_size: Number of FIFO buffers with @chunk_size size. - * Only used if @rbuf is NULL. - * @chunk_size: Size of each FIFO buffer. - * Only used if @rbuf is NULL. - * @buf: if %NULL, lirc_dev will allocate and manage the buffer, - * otherwise allocated by the caller which will - * have to write to the buffer by other means, like irq's - * (see also lirc_serial.c). 
- * @buf_internal: whether lirc_dev has allocated the read buffer or not * @rdev: &struct rc_dev associated with the device * @fops: &struct file_operations for the device * @owner: the module owning this struct @@ -137,11 +36,6 @@ struct lirc_dev { char name[40]; unsigned int minor; - unsigned int buffer_size; /* in chunks holding one code each */ - unsigned int chunk_size; - struct lirc_buffer *buf; - bool buf_internal; - struct rc_dev *rdev; const struct file_operations *fops; struct module *owner; @@ -168,7 +62,4 @@ void lirc_unregister_device(struct lirc_dev *d); */ int lirc_dev_fop_open(struct inode *inode, struct file *file); int lirc_dev_fop_close(struct inode *inode, struct file *file); -unsigned int lirc_dev_fop_poll(struct file *file, poll_table *wait); -ssize_t lirc_dev_fop_read(struct file *file, char __user *buffer, size_t length, - loff_t *ppos); #endif diff --git a/include/media/rc-core.h b/include/media/rc-core.h index 5d6e415c7acc..fb91666bf881 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -123,6 +123,8 @@ enum rc_filter_type { * @gap_duration: duration of initial gap * @gap: true if we're in a gap * @send_timeout_reports: report timeouts in lirc raw IR. + * @rawir: queue for incoming raw IR + * @wait_poll: poll struct for lirc device * @send_mode: lirc mode for sending, either LIRC_MODE_SCANCODE or * LIRC_MODE_PULSE * @change_protocol: allow changing the protocol used on hardware decoders @@ -191,6 +193,8 @@ struct rc_dev { u64 gap_duration; bool gap; bool send_timeout_reports; + DECLARE_KFIFO_PTR(rawir, unsigned int); + wait_queue_head_t wait_poll; u8 send_mode; #endif int (*change_protocol)(struct rc_dev *dev, u64 *rc_proto); From de2e8aaab9315c5e6091f0ddd671fc853278c806 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 26 Sep 2017 07:31:29 -0400 Subject: [PATCH 0553/1640] UPSTREAM: media: lirc: move lirc_dev->attached to rc_dev->registered This is done to further remove the lirc kernel api. Ensure that every fops checks for this. 
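From userspace the change is observable directly: once the rc device is
unregistered (e.g. the USB receiver is unplugged), poll() on the lirc
chardev reports POLLHUP | POLLERR, and read()/write()/ioctl() fail with
ENODEV. A minimal reader illustrating this, assuming a raw IR receiver
on /dev/lirc0 in the default LIRC_MODE_MODE2 (a sketch only, not code
added by this patch):

	#include <errno.h>
	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		unsigned int sample;
		struct pollfd pfd = { .events = POLLIN };
		ssize_t n;

		pfd.fd = open("/dev/lirc0", O_RDONLY);
		if (pfd.fd == -1)
			return 1;

		while (poll(&pfd, 1, -1) == 1) {
			if (pfd.revents & (POLLHUP | POLLERR))
				break;	/* device was unregistered */

			n = read(pfd.fd, &sample, sizeof(sample));
			if (n == sizeof(sample))
				printf("mode2 sample: 0x%x\n", sample);
			else if (n == -1 && errno == ENODEV)
				break;	/* unregistered during read */
		}

		close(pfd.fd);
		return 0;
	}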
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-lirc-codec.c | 16 ++++++++++------ drivers/media/rc/lirc_dev.c | 4 +--- drivers/media/rc/rc-main.c | 10 ++++++++++ include/media/lirc_dev.h | 2 -- include/media/rc-core.h | 3 +++ 5 files changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 2fa1f905a266..ff74a5d7a0f3 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -101,6 +101,9 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, unsigned int duration = 0; /* signal duration in us */ int i; + if (!dev->registered) + return -ENODEV; + start = ktime_get(); if (!dev->tx_ir) { @@ -224,6 +227,9 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, return ret; } + if (!dev->registered) + return -ENODEV; + switch (cmd) { case LIRC_GET_FEATURES: if (dev->driver_type == RC_DRIVER_IR_RAW) { @@ -406,12 +412,11 @@ static unsigned int ir_lirc_poll(struct file *file, struct poll_table_struct *wait) { struct rc_dev *rcdev = file->private_data; - struct lirc_dev *d = rcdev->lirc_dev; unsigned int events = 0; poll_wait(file, &rcdev->wait_poll, wait); - if (!d->attached) + if (!rcdev->registered) events = POLLHUP | POLLERR; else if (rcdev->driver_type == RC_DRIVER_IR_RAW && !kfifo_is_empty(&rcdev->rawir)) @@ -424,7 +429,6 @@ static ssize_t ir_lirc_read(struct file *file, char __user *buffer, size_t length, loff_t *ppos) { struct rc_dev *rcdev = file->private_data; - struct lirc_dev *d = rcdev->lirc_dev; unsigned int copied; int ret; @@ -434,7 +438,7 @@ static ssize_t ir_lirc_read(struct file *file, char __user *buffer, if (length < sizeof(unsigned int) || length % sizeof(unsigned int)) return -EINVAL; - if (!d->attached) + if (!rcdev->registered) return -ENODEV; do { @@ -444,12 +448,12 @@ static ssize_t ir_lirc_read(struct file *file, char __user *buffer, ret = wait_event_interruptible(rcdev->wait_poll, !kfifo_is_empty(&rcdev->rawir) || - !d->attached); + !rcdev->registered); if (ret) return ret; } - if (!d->attached) + if (!rcdev->registered) return -ENODEV; ret = mutex_lock_interruptible(&rcdev->lock); diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 9a0ad8d9a0cb..22171267aa90 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -122,7 +122,6 @@ int lirc_register_device(struct lirc_dev *d) cdev_init(&d->cdev, d->fops); d->cdev.owner = d->owner; - d->attached = true; err = cdev_device_add(&d->cdev, &d->dev); if (err) { @@ -153,7 +152,6 @@ void lirc_unregister_device(struct lirc_dev *d) mutex_lock(&d->mutex); - d->attached = false; if (d->open) { dev_dbg(&d->dev, LOGHEAD "releasing opened driver\n", d->name, d->minor); @@ -180,7 +178,7 @@ int lirc_dev_fop_open(struct inode *inode, struct file *file) if (retval) return retval; - if (!d->attached) { + if (!rcdev->registered) { retval = -ENODEV; goto out; } diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index e944d28b96d2..8b1b20e7a3c3 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1809,6 +1809,8 @@ int rc_register_device(struct rc_dev *dev) goto out_lirc; } + dev->registered = true; + IR_dprintk(1, "Registered rc%u (driver: %s)\n", dev->minor, dev->driver_name ? 
dev->driver_name : "unknown"); @@ -1871,6 +1873,14 @@ void rc_unregister_device(struct rc_dev *dev) rc_free_rx_device(dev); + mutex_lock(&dev->lock); + dev->registered = false; + mutex_unlock(&dev->lock); + + /* + * lirc device should be freed with dev->registered = false, so + * that userspace polling will get notified. + */ if (dev->driver_type != RC_DRIVER_SCANCODE) ir_lirc_unregister(dev); diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index 14d3eb36672e..5782add67edd 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -26,7 +26,6 @@ * @rdev: &struct rc_dev associated with the device * @fops: &struct file_operations for the device * @owner: the module owning this struct - * @attached: if the device is still live * @open: open count for the device's chardev * @mutex: serialises file_operations calls * @dev: &struct device assigned to the device @@ -40,7 +39,6 @@ struct lirc_dev { const struct file_operations *fops; struct module *owner; - bool attached; int open; struct mutex mutex; /* protect from simultaneous accesses */ diff --git a/include/media/rc-core.h b/include/media/rc-core.h index fb91666bf881..b6d719734744 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -127,6 +127,8 @@ enum rc_filter_type { * @wait_poll: poll struct for lirc device * @send_mode: lirc mode for sending, either LIRC_MODE_SCANCODE or * LIRC_MODE_PULSE + * @registered: set to true by rc_register_device(), false by + * rc_unregister_device * @change_protocol: allow changing the protocol used on hardware decoders * @open: callback to allow drivers to enable polling/irq when IR input device * is opened. @@ -197,6 +199,7 @@ struct rc_dev { wait_queue_head_t wait_poll; u8 send_mode; #endif + bool registered; int (*change_protocol)(struct rc_dev *dev, u64 *rc_proto); int (*open)(struct rc_dev *dev); void (*close)(struct rc_dev *dev); From 4f57009d64876cc44b94da7bd84e443e1f14fb12 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 23 Sep 2017 17:44:03 -0400 Subject: [PATCH 0554/1640] UPSTREAM: media: lirc: do not call close() or open() on unregistered devices If a lirc chardev is held open after a device is unplugged, rc_close() will be called after rc_unregister_device(). The driver is not expecting any calls at this point, and the iguanair driver causes an oops in this scenario. rc_open() can be called when the device is removed too, by calling open on the chardev whilst the device is being removed. 
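The userspace-visible effect is that an open() racing with device
removal now fails cleanly with ENODEV instead of calling into the
half-torn-down driver. Roughly (a sketch only; the device node path is
illustrative):

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/*
		 * If the receiver disappears between the device node
		 * becoming visible and this call, rc_open() now returns
		 * -ENODEV rather than invoking the driver's open callback
		 * on an unregistered device.
		 */
		int fd = open("/dev/lirc0", O_RDWR);

		if (fd == -1) {
			if (errno == ENODEV)
				fprintf(stderr, "device already removed\n");
			return 1;
		}

		/*
		 * Likewise, if the device is removed while the chardev is
		 * held open, this close() no longer calls the driver's
		 * close callback.
		 */
		close(fd);
		return 0;
	}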
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 8b1b20e7a3c3..ace00e77c96a 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -863,11 +863,15 @@ int rc_open(struct rc_dev *rdev) mutex_lock(&rdev->lock); - if (!rdev->users++ && rdev->open != NULL) - rval = rdev->open(rdev); + if (!rdev->registered) { + rval = -ENODEV; + } else { + if (!rdev->users++ && rdev->open) + rval = rdev->open(rdev); - if (rval) - rdev->users--; + if (rval) + rdev->users--; + } mutex_unlock(&rdev->lock); @@ -886,7 +890,7 @@ void rc_close(struct rc_dev *rdev) if (rdev) { mutex_lock(&rdev->lock); - if (!--rdev->users && rdev->close != NULL) + if (!--rdev->users && rdev->close && rdev->registered) rdev->close(rdev); mutex_unlock(&rdev->lock); From f5378488097b4a67838560573d976cefa5f7889a Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 26 Sep 2017 07:44:20 -0400 Subject: [PATCH 0555/1640] UPSTREAM: media: lirc: create rc-core open and close lirc functions Replace the generic kernel lirc api with ones which use rc-core, further reducing the lirc_dev members. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-lirc-codec.c | 59 ++++++++++++++++++++++++++- drivers/media/rc/lirc_dev.c | 68 ++------------------------------ include/media/lirc_dev.h | 11 ------ include/media/rc-core.h | 2 + 4 files changed, 62 insertions(+), 78 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index ff74a5d7a0f3..1e921e4f8839 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -88,6 +88,61 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) wake_up_poll(&dev->wait_poll, POLLIN | POLLRDNORM); } +static int ir_lirc_open(struct inode *inode, struct file *file) +{ + struct lirc_dev *d = container_of(inode->i_cdev, struct lirc_dev, cdev); + struct rc_dev *dev = d->rdev; + int retval; + + retval = rc_open(dev); + if (retval) + return retval; + + retval = mutex_lock_interruptible(&dev->lock); + if (retval) + goto out_rc; + + if (!dev->registered) { + retval = -ENODEV; + goto out_unlock; + } + + if (dev->lirc_open) { + retval = -EBUSY; + goto out_unlock; + } + + if (dev->driver_type == RC_DRIVER_IR_RAW) + kfifo_reset_out(&dev->rawir); + + dev->lirc_open++; + file->private_data = dev; + + nonseekable_open(inode, file); + mutex_unlock(&dev->lock); + + return 0; + +out_unlock: + mutex_unlock(&dev->lock); +out_rc: + rc_close(dev); + return retval; +} + +static int ir_lirc_close(struct inode *inode, struct file *file) +{ + struct rc_dev *dev = file->private_data; + + mutex_lock(&dev->lock); + dev->lirc_open--; + mutex_unlock(&dev->lock); + + rc_close(dev); + + return 0; +} + static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, size_t n, loff_t *ppos) { @@ -477,8 +532,8 @@ static const struct file_operations lirc_fops = { #endif .read = ir_lirc_read, .poll = ir_lirc_poll, - .open = lirc_dev_fop_open, - .release = lirc_dev_fop_close, + .open = ir_lirc_open, + .release = ir_lirc_close, .llseek = no_llseek, }; diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 22171267aa90..32124fb5c88e 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -61,7 +61,6 @@ lirc_allocate_device(void) d = kzalloc(sizeof(*d), GFP_KERNEL); if (d) { - 
mutex_init(&d->mutex); device_initialize(&d->dev); d->dev.class = lirc_class; d->dev.release = lirc_release_device; @@ -150,15 +149,15 @@ void lirc_unregister_device(struct lirc_dev *d) dev_dbg(&d->dev, "lirc_dev: driver %s unregistered from minor = %d\n", d->name, d->minor); - mutex_lock(&d->mutex); + mutex_lock(&rcdev->lock); - if (d->open) { + if (rcdev->lirc_open) { dev_dbg(&d->dev, LOGHEAD "releasing opened driver\n", d->name, d->minor); wake_up_poll(&rcdev->wait_poll, POLLHUP); } - mutex_unlock(&d->mutex); + mutex_unlock(&rcdev->lock); cdev_device_del(&d->cdev, &d->dev); ida_simple_remove(&lirc_ida, d->minor); @@ -166,67 +165,6 @@ void lirc_unregister_device(struct lirc_dev *d) } EXPORT_SYMBOL(lirc_unregister_device); -int lirc_dev_fop_open(struct inode *inode, struct file *file) -{ - struct lirc_dev *d = container_of(inode->i_cdev, struct lirc_dev, cdev); - struct rc_dev *rcdev = d->rdev; - int retval; - - dev_dbg(&d->dev, LOGHEAD "open called\n", d->name, d->minor); - - retval = mutex_lock_interruptible(&d->mutex); - if (retval) - return retval; - - if (!rcdev->registered) { - retval = -ENODEV; - goto out; - } - - if (d->open) { - retval = -EBUSY; - goto out; - } - - if (d->rdev) { - retval = rc_open(d->rdev); - if (retval) - goto out; - } - - if (rcdev->driver_type == RC_DRIVER_IR_RAW) - kfifo_reset_out(&rcdev->rawir); - - d->open++; - - file->private_data = d->rdev; - nonseekable_open(inode, file); - mutex_unlock(&d->mutex); - - return 0; - -out: - mutex_unlock(&d->mutex); - return retval; -} -EXPORT_SYMBOL(lirc_dev_fop_open); - -int lirc_dev_fop_close(struct inode *inode, struct file *file) -{ - struct rc_dev *rcdev = file->private_data; - struct lirc_dev *d = rcdev->lirc_dev; - - mutex_lock(&d->mutex); - - rc_close(rcdev); - d->open--; - - mutex_unlock(&d->mutex); - - return 0; -} -EXPORT_SYMBOL(lirc_dev_fop_close); - int __init lirc_dev_init(void) { int retval; diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index 5782add67edd..b45af81b4633 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -26,8 +26,6 @@ * @rdev: &struct rc_dev associated with the device * @fops: &struct file_operations for the device * @owner: the module owning this struct - * @open: open count for the device's chardev - * @mutex: serialises file_operations calls * @dev: &struct device assigned to the device * @cdev: &struct cdev assigned to the device */ @@ -39,10 +37,6 @@ struct lirc_dev { const struct file_operations *fops; struct module *owner; - int open; - - struct mutex mutex; /* protect from simultaneous accesses */ - struct device dev; struct cdev cdev; }; @@ -55,9 +49,4 @@ int lirc_register_device(struct lirc_dev *d); void lirc_unregister_device(struct lirc_dev *d); -/* default file operations - * used by drivers if they override only some operations - */ -int lirc_dev_fop_open(struct inode *inode, struct file *file); -int lirc_dev_fop_close(struct inode *inode, struct file *file); #endif diff --git a/include/media/rc-core.h b/include/media/rc-core.h index b6d719734744..4f585bff1347 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -117,6 +117,7 @@ enum rc_filter_type { * @rx_resolution : resolution (in ns) of input sampler * @tx_resolution: resolution (in ns) of output sampler * @lirc_dev: lirc char device + * @lirc_open: count of the number of times the device has been opened * @carrier_low: when setting the carrier range, first the low end must be * set with an ioctl and then the high end with another ioctl * @gap_start: time when gap 
starts @@ -190,6 +191,7 @@ struct rc_dev { u32 tx_resolution; #ifdef CONFIG_LIRC struct lirc_dev *lirc_dev; + int lirc_open; int carrier_low; ktime_t gap_start; u64 gap_duration; From f68f606cde011e9977c9f60f840f68396cc2c08d Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 26 Sep 2017 07:56:39 -0400 Subject: [PATCH 0556/1640] UPSTREAM: media: lirc: remove name from lirc_dev This is a duplicate of rcdev->driver_name. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/rc/lirc-dev-intro.rst | 2 +- drivers/media/rc/ir-lirc-codec.c | 2 -- drivers/media/rc/lirc_dev.c | 9 +++------ include/media/lirc_dev.h | 2 -- 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/Documentation/media/uapi/rc/lirc-dev-intro.rst b/Documentation/media/uapi/rc/lirc-dev-intro.rst index 3cacf9aeac40..a3fa3c1ef169 100644 --- a/Documentation/media/uapi/rc/lirc-dev-intro.rst +++ b/Documentation/media/uapi/rc/lirc-dev-intro.rst @@ -18,7 +18,7 @@ Example dmesg output upon a driver registering w/LIRC: $ dmesg |grep lirc_dev lirc_dev: IR Remote Control driver registered, major 248 - rc rc0: lirc_dev: driver ir-lirc-codec (mceusb) registered at minor = 0 + rc rc0: lirc_dev: driver mceusb registered at minor = 0 What you should see for a chardev: diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 1e921e4f8839..6435306f385b 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -546,8 +546,6 @@ int ir_lirc_register(struct rc_dev *dev) if (!ldev) return rc; - snprintf(ldev->name, sizeof(ldev->name), "ir-lirc-codec (%s)", - dev->driver_name); ldev->fops = &lirc_fops; ldev->dev.parent = &dev->dev; ldev->rdev = dev; diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 32124fb5c88e..4ac74fd86fd4 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -101,9 +101,6 @@ int lirc_register_device(struct lirc_dev *d) return -EINVAL; } - /* some safety check 8-) */ - d->name[sizeof(d->name) - 1] = '\0'; - if (rcdev->driver_type == RC_DRIVER_IR_RAW) { if (kfifo_alloc(&rcdev->rawir, MAX_IR_EVENT_SIZE, GFP_KERNEL)) return -ENOMEM; @@ -131,7 +128,7 @@ int lirc_register_device(struct lirc_dev *d) get_device(d->dev.parent); dev_info(&d->dev, "lirc_dev: driver %s registered at minor = %d\n", - d->name, d->minor); + rcdev->driver_name, d->minor); return 0; } @@ -147,13 +144,13 @@ void lirc_unregister_device(struct lirc_dev *d) rcdev = d->rdev; dev_dbg(&d->dev, "lirc_dev: driver %s unregistered from minor = %d\n", - d->name, d->minor); + rcdev->driver_name, d->minor); mutex_lock(&rcdev->lock); if (rcdev->lirc_open) { dev_dbg(&d->dev, LOGHEAD "releasing opened driver\n", - d->name, d->minor); + rcdev->driver_name, d->minor); wake_up_poll(&rcdev->wait_poll, POLLHUP); } diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h index b45af81b4633..d12e1d1c3d67 100644 --- a/include/media/lirc_dev.h +++ b/include/media/lirc_dev.h @@ -21,7 +21,6 @@ /** * struct lirc_dev - represents a LIRC device * - * @name: used for logging * @minor: the minor device (/dev/lircX) number for the device * @rdev: &struct rc_dev associated with the device * @fops: &struct file_operations for the device @@ -30,7 +29,6 @@ * @cdev: &struct cdev assigned to the device */ struct lirc_dev { - char name[40]; unsigned int minor; struct rc_dev *rdev; From 0411d547fe1f8876e87200f48a4cabc11d5e8138 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 26 Sep 2017 09:34:47 -0400 Subject: [PATCH 0557/1640] 
UPSTREAM: media: lirc: remove last remnants of lirc kapi rc-core has replaced the lirc kapi many years ago, and now with the last driver ported to rc-core, we can finally remove it. Note this has no effect on userspace. All future IR drivers should use the rc-core api. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/kapi/rc-core.rst | 5 - drivers/media/rc/ir-lirc-codec.c | 41 +------- drivers/media/rc/lirc_dev.c | 149 +++++++++------------------ drivers/media/rc/rc-core-priv.h | 3 + drivers/media/rc/rc-main.c | 1 - include/media/lirc_dev.h | 50 --------- include/media/rc-core.h | 8 +- 7 files changed, 62 insertions(+), 195 deletions(-) delete mode 100644 include/media/lirc_dev.h diff --git a/Documentation/media/kapi/rc-core.rst b/Documentation/media/kapi/rc-core.rst index a45895886257..41c2256dbf6a 100644 --- a/Documentation/media/kapi/rc-core.rst +++ b/Documentation/media/kapi/rc-core.rst @@ -7,8 +7,3 @@ Remote Controller core .. kernel-doc:: include/media/rc-core.h .. kernel-doc:: include/media/rc-map.h - -LIRC -~~~~ - -.. kernel-doc:: include/media/lirc_dev.h diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 6435306f385b..52daac9bc470 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -12,10 +12,10 @@ * GNU General Public License for more details. */ +#include #include #include #include -#include #include #include "rc-core-priv.h" @@ -90,8 +90,8 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) static int ir_lirc_open(struct inode *inode, struct file *file) { - struct lirc_dev *d = container_of(inode->i_cdev, struct lirc_dev, cdev); - struct rc_dev *dev = d->rdev; + struct rc_dev *dev = container_of(inode->i_cdev, struct rc_dev, + lirc_cdev); int retval; retval = rc_open(dev); @@ -523,7 +523,7 @@ static ssize_t ir_lirc_read(struct file *file, char __user *buffer, return copied; } -static const struct file_operations lirc_fops = { +const struct file_operations lirc_fops = { .owner = THIS_MODULE, .write = ir_lirc_transmit_ir, .unlocked_ioctl = ir_lirc_ioctl, @@ -536,36 +536,3 @@ static const struct file_operations lirc_fops = { .release = ir_lirc_close, .llseek = no_llseek, }; - -int ir_lirc_register(struct rc_dev *dev) -{ - struct lirc_dev *ldev; - int rc = -ENOMEM; - - ldev = lirc_allocate_device(); - if (!ldev) - return rc; - - ldev->fops = &lirc_fops; - ldev->dev.parent = &dev->dev; - ldev->rdev = dev; - ldev->owner = THIS_MODULE; - - rc = lirc_register_device(ldev); - if (rc < 0) - goto out; - - dev->send_mode = LIRC_MODE_PULSE; - dev->lirc_dev = ldev; - return 0; - -out: - lirc_free_device(ldev); - return rc; -} - -void ir_lirc_unregister(struct rc_dev *dev) -{ - lirc_unregister_device(dev->lirc_dev); - dev->lirc_dev = NULL; -} diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 4ac74fd86fd4..155a4de249a0 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -18,24 +18,19 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include -#include -#include #include #include -#include #include +#include #include "rc-core-priv.h" #include -#include #define LOGHEAD "lirc_dev (%s[%d]): " static dev_t lirc_base_dev; /* Used to keep track of allocated lirc devices */ -#define LIRC_MAX_DEVICES 256 static DEFINE_IDA(lirc_ida); /* Only used for sysfs but defined to void otherwise */ @@ -43,124 +38,80 @@ static struct class *lirc_class; static void lirc_release_device(struct device *ld) { - struct lirc_dev *d = 
container_of(ld, struct lirc_dev, dev); - struct rc_dev *rcdev = d->rdev; + struct rc_dev *rcdev = container_of(ld, struct rc_dev, lirc_dev); if (rcdev->driver_type == RC_DRIVER_IR_RAW) kfifo_free(&rcdev->rawir); - kfree(d); - module_put(THIS_MODULE); - put_device(d->dev.parent); + put_device(&rcdev->dev); } -struct lirc_dev * -lirc_allocate_device(void) +int ir_lirc_register(struct rc_dev *dev) { - struct lirc_dev *d; + int err, minor; - d = kzalloc(sizeof(*d), GFP_KERNEL); - if (d) { - device_initialize(&d->dev); - d->dev.class = lirc_class; - d->dev.release = lirc_release_device; - __module_get(THIS_MODULE); - } + device_initialize(&dev->lirc_dev); + dev->lirc_dev.class = lirc_class; + dev->lirc_dev.release = lirc_release_device; + dev->send_mode = LIRC_MODE_PULSE; - return d; -} -EXPORT_SYMBOL(lirc_allocate_device); - -void lirc_free_device(struct lirc_dev *d) -{ - if (!d) - return; - - put_device(&d->dev); -} -EXPORT_SYMBOL(lirc_free_device); - -int lirc_register_device(struct lirc_dev *d) -{ - struct rc_dev *rcdev = d->rdev; - int minor; - int err; - - if (!d) { - pr_err("driver pointer must be not NULL!\n"); - return -EBADRQC; - } - - if (!d->dev.parent) { - pr_err("dev parent pointer not filled in!\n"); - return -EINVAL; - } - - if (!d->fops) { - pr_err("fops pointer not filled in!\n"); - return -EINVAL; - } - - if (rcdev->driver_type == RC_DRIVER_IR_RAW) { - if (kfifo_alloc(&rcdev->rawir, MAX_IR_EVENT_SIZE, GFP_KERNEL)) + if (dev->driver_type == RC_DRIVER_IR_RAW) { + if (kfifo_alloc(&dev->rawir, MAX_IR_EVENT_SIZE, GFP_KERNEL)) return -ENOMEM; } - init_waitqueue_head(&rcdev->wait_poll); + init_waitqueue_head(&dev->wait_poll); - minor = ida_simple_get(&lirc_ida, 0, LIRC_MAX_DEVICES, GFP_KERNEL); - if (minor < 0) - return minor; - - d->minor = minor; - d->dev.devt = MKDEV(MAJOR(lirc_base_dev), d->minor); - dev_set_name(&d->dev, "lirc%d", d->minor); - - cdev_init(&d->cdev, d->fops); - d->cdev.owner = d->owner; - - err = cdev_device_add(&d->cdev, &d->dev); - if (err) { - ida_simple_remove(&lirc_ida, minor); - return err; + minor = ida_simple_get(&lirc_ida, 0, RC_DEV_MAX, GFP_KERNEL); + if (minor < 0) { + err = minor; + goto out_kfifo; } - get_device(d->dev.parent); + dev->lirc_dev.parent = &dev->dev; + dev->lirc_dev.devt = MKDEV(MAJOR(lirc_base_dev), minor); + dev_set_name(&dev->lirc_dev, "lirc%d", minor); - dev_info(&d->dev, "lirc_dev: driver %s registered at minor = %d\n", - rcdev->driver_name, d->minor); + cdev_init(&dev->lirc_cdev, &lirc_fops); + + err = cdev_device_add(&dev->lirc_cdev, &dev->lirc_dev); + if (err) + goto out_ida; + + get_device(&dev->dev); + + dev_info(&dev->dev, "lirc_dev: driver %s registered at minor = %d", + dev->driver_name, minor); return 0; + +out_ida: + ida_simple_remove(&lirc_ida, minor); +out_kfifo: + if (dev->driver_type == RC_DRIVER_IR_RAW) + kfifo_free(&dev->rawir); + return err; } -EXPORT_SYMBOL(lirc_register_device); -void lirc_unregister_device(struct lirc_dev *d) +void ir_lirc_unregister(struct rc_dev *dev) { - struct rc_dev *rcdev; + dev_dbg(&dev->dev, "lirc_dev: driver %s unregistered from minor = %d\n", + dev->driver_name, MINOR(dev->lirc_dev.devt)); - if (!d) - return; + mutex_lock(&dev->lock); - rcdev = d->rdev; - - dev_dbg(&d->dev, "lirc_dev: driver %s unregistered from minor = %d\n", - rcdev->driver_name, d->minor); - - mutex_lock(&rcdev->lock); - - if (rcdev->lirc_open) { - dev_dbg(&d->dev, LOGHEAD "releasing opened driver\n", - rcdev->driver_name, d->minor); - wake_up_poll(&rcdev->wait_poll, POLLHUP); + if (dev->lirc_open) { + 
dev_dbg(&dev->dev, LOGHEAD "releasing opened driver\n", + dev->driver_name, MINOR(dev->lirc_dev.devt)); + wake_up_poll(&dev->wait_poll, POLLHUP); } - mutex_unlock(&rcdev->lock); + mutex_unlock(&dev->lock); - cdev_device_del(&d->cdev, &d->dev); - ida_simple_remove(&lirc_ida, d->minor); - put_device(&d->dev); + cdev_device_del(&dev->lirc_cdev, &dev->lirc_dev); + ida_simple_remove(&lirc_ida, MINOR(dev->lirc_dev.devt)); + put_device(&dev->lirc_dev); } -EXPORT_SYMBOL(lirc_unregister_device); int __init lirc_dev_init(void) { @@ -172,7 +123,7 @@ int __init lirc_dev_init(void) return PTR_ERR(lirc_class); } - retval = alloc_chrdev_region(&lirc_base_dev, 0, LIRC_MAX_DEVICES, + retval = alloc_chrdev_region(&lirc_base_dev, 0, RC_DEV_MAX, "BaseRemoteCtl"); if (retval) { class_destroy(lirc_class); @@ -189,5 +140,5 @@ int __init lirc_dev_init(void) void __exit lirc_dev_exit(void) { class_destroy(lirc_class); - unregister_chrdev_region(lirc_base_dev, LIRC_MAX_DEVICES); + unregister_chrdev_region(lirc_base_dev, RC_DEV_MAX); } diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index 6d5a36b8b550..9af43d16ca29 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -6,6 +6,7 @@ #ifndef _RC_CORE_PRIV #define _RC_CORE_PRIV +#define RC_DEV_MAX 256 /* Define the max number of pulse/space transitions to buffer */ #define MAX_IR_EVENT_SIZE 512 @@ -277,6 +278,8 @@ void lirc_dev_exit(void); void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev); int ir_lirc_register(struct rc_dev *dev); void ir_lirc_unregister(struct rc_dev *dev); + +extern const struct file_operations lirc_fops; #else static inline int lirc_dev_init(void) { return 0; } static inline void lirc_dev_exit(void) {} diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index ace00e77c96a..12ff6d87b113 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -20,7 +20,6 @@ /* Sizes are in bytes, 256 bytes allows for 32 entries on x64 */ #define IR_TAB_MIN_SIZE 256 #define IR_TAB_MAX_SIZE 8192 -#define RC_DEV_MAX 256 static const struct { const char *name; diff --git a/include/media/lirc_dev.h b/include/media/lirc_dev.h deleted file mode 100644 index d12e1d1c3d67..000000000000 --- a/include/media/lirc_dev.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * LIRC base driver - * - * by Artur Lipowski - * This code is licensed under GNU GPL - * - */ - -#ifndef _LINUX_LIRC_DEV_H -#define _LINUX_LIRC_DEV_H - -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * struct lirc_dev - represents a LIRC device - * - * @minor: the minor device (/dev/lircX) number for the device - * @rdev: &struct rc_dev associated with the device - * @fops: &struct file_operations for the device - * @owner: the module owning this struct - * @dev: &struct device assigned to the device - * @cdev: &struct cdev assigned to the device - */ -struct lirc_dev { - unsigned int minor; - - struct rc_dev *rdev; - const struct file_operations *fops; - struct module *owner; - - struct device dev; - struct cdev cdev; -}; - -struct lirc_dev *lirc_allocate_device(void); - -void lirc_free_device(struct lirc_dev *d); - -int lirc_register_device(struct lirc_dev *d); - -void lirc_unregister_device(struct lirc_dev *d); - -#endif diff --git a/include/media/rc-core.h b/include/media/rc-core.h index 4f585bff1347..2d24c88652aa 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -17,10 +17,10 @@ #define _RC_CORE #include +#include #include #include #include -#include 
#include extern int rc_core_debug; @@ -116,7 +116,8 @@ enum rc_filter_type { * @max_timeout: maximum timeout supported by device * @rx_resolution : resolution (in ns) of input sampler * @tx_resolution: resolution (in ns) of output sampler - * @lirc_dev: lirc char device + * @lirc_dev: lirc device + * @lirc_cdev: lirc char cdev * @lirc_open: count of the number of times the device has been opened * @carrier_low: when setting the carrier range, first the low end must be * set with an ioctl and then the high end with another ioctl @@ -190,7 +191,8 @@ struct rc_dev { u32 rx_resolution; u32 tx_resolution; #ifdef CONFIG_LIRC - struct lirc_dev *lirc_dev; + struct device lirc_dev; + struct cdev lirc_cdev; int lirc_open; int carrier_low; ktime_t gap_start; From 1ce652213d7236bd2b1797cafb73085038aa5681 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 25 Feb 2017 06:51:32 -0500 Subject: [PATCH 0558/1640] UPSTREAM: media: lirc: implement reading scancode This implements LIRC_MODE_SCANCODE reading from the lirc device. The scancode can be read from the input device too, but with this interface you get the rc protocol, keycode, toggle and repeat status in addition to just the scancode. int main() { int fd, mode, rc; fd = open("/dev/lirc0", O_RDWR); mode = LIRC_MODE_SCANCODE; if (ioctl(fd, LIRC_SET_REC_MODE, &mode)) { // kernel too old or lirc does not support transmit } struct lirc_scancode scancode; while (read(fd, &scancode, sizeof(scancode)) == sizeof(scancode)) { printf("protocol:%d scancode:0x%x toggle:%d repeat:%d\n", scancode.rc_proto, scancode.scancode, !!(scancode.flags & LIRC_SCANCODE_FLAG_TOGGLE), !!(scancode.flags & LIRC_SCANCODE_FLAG_REPEAT)); } close(fd); } Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-lirc-codec.c | 104 ++++++++++++++++++++++---- drivers/media/rc/ir-mce_kbd-decoder.c | 5 ++ drivers/media/rc/lirc_dev.c | 13 ++++ drivers/media/rc/rc-core-priv.h | 3 + drivers/media/rc/rc-main.c | 7 ++ include/media/rc-core.h | 5 ++ 6 files changed, 122 insertions(+), 15 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 52daac9bc470..817258c87b5c 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -88,6 +88,21 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) wake_up_poll(&dev->wait_poll, POLLIN | POLLRDNORM); } +/** + * ir_lirc_scancode_event() - Send scancode data to lirc to be relayed to + * userspace + * @dev: the struct rc_dev descriptor of the device + * @lsc: the struct lirc_scancode describing the decoded scancode + */ +void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc) +{ + lsc->timestamp = ktime_get_ns(); + + if (kfifo_put(&dev->scancodes, *lsc)) + wake_up_poll(&dev->wait_poll, POLLIN | POLLRDNORM); +} +EXPORT_SYMBOL_GPL(ir_lirc_scancode_event); + static int ir_lirc_open(struct inode *inode, struct file *file) { struct rc_dev *dev = container_of(inode->i_cdev, struct rc_dev, @@ -114,6 +129,8 @@ static int ir_lirc_open(struct inode *inode, struct file *file) if (dev->driver_type == RC_DRIVER_IR_RAW) kfifo_reset_out(&dev->rawir); + if (dev->driver_type != RC_DRIVER_IR_RAW_TX) + kfifo_reset_out(&dev->scancodes); dev->lirc_open++; file->private_data = dev; @@ -288,7 +305,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, switch (cmd) { case LIRC_GET_FEATURES: if (dev->driver_type == RC_DRIVER_IR_RAW) { - val |= LIRC_CAN_REC_MODE2; + val |= LIRC_CAN_REC_MODE2 | LIRC_CAN_REC_SCANCODE; if 
(dev->rx_resolution) val |= LIRC_CAN_GET_REC_RESOLUTION; } @@ -323,15 +340,17 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, if (dev->driver_type == RC_DRIVER_IR_RAW_TX) return -ENOTTY; - val = LIRC_MODE_MODE2; + val = dev->rec_mode; break; case LIRC_SET_REC_MODE: if (dev->driver_type == RC_DRIVER_IR_RAW_TX) return -ENOTTY; - if (val != LIRC_MODE_MODE2) + if (!(val == LIRC_MODE_MODE2 || val == LIRC_MODE_SCANCODE)) return -EINVAL; + + dev->rec_mode = val; return 0; case LIRC_GET_SEND_MODE: @@ -471,31 +490,31 @@ static unsigned int ir_lirc_poll(struct file *file, poll_wait(file, &rcdev->wait_poll, wait); - if (!rcdev->registered) + if (!rcdev->registered) { events = POLLHUP | POLLERR; - else if (rcdev->driver_type == RC_DRIVER_IR_RAW && - !kfifo_is_empty(&rcdev->rawir)) - events = POLLIN | POLLRDNORM; + } else if (rcdev->driver_type != RC_DRIVER_IR_RAW_TX) { + if (rcdev->rec_mode == LIRC_MODE_SCANCODE && + !kfifo_is_empty(&rcdev->scancodes)) + events = POLLIN | POLLRDNORM; + + if (rcdev->rec_mode == LIRC_MODE_MODE2 && + !kfifo_is_empty(&rcdev->rawir)) + events = POLLIN | POLLRDNORM; + } return events; } -static ssize_t ir_lirc_read(struct file *file, char __user *buffer, - size_t length, loff_t *ppos) +static ssize_t ir_lirc_read_mode2(struct file *file, char __user *buffer, + size_t length) { struct rc_dev *rcdev = file->private_data; unsigned int copied; int ret; - if (rcdev->driver_type == RC_DRIVER_IR_RAW_TX) - return -EINVAL; - if (length < sizeof(unsigned int) || length % sizeof(unsigned int)) return -EINVAL; - if (!rcdev->registered) - return -ENODEV; - do { if (kfifo_is_empty(&rcdev->rawir)) { if (file->f_flags & O_NONBLOCK) @@ -523,6 +542,61 @@ static ssize_t ir_lirc_read(struct file *file, char __user *buffer, return copied; } +static ssize_t ir_lirc_read_scancode(struct file *file, char __user *buffer, + size_t length) +{ + struct rc_dev *rcdev = file->private_data; + unsigned int copied; + int ret; + + if (length < sizeof(struct lirc_scancode) || + length % sizeof(struct lirc_scancode)) + return -EINVAL; + + do { + if (kfifo_is_empty(&rcdev->scancodes)) { + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + ret = wait_event_interruptible(rcdev->wait_poll, + !kfifo_is_empty(&rcdev->scancodes) || + !rcdev->registered); + if (ret) + return ret; + } + + if (!rcdev->registered) + return -ENODEV; + + ret = mutex_lock_interruptible(&rcdev->lock); + if (ret) + return ret; + ret = kfifo_to_user(&rcdev->scancodes, buffer, length, &copied); + mutex_unlock(&rcdev->lock); + if (ret) + return ret; + } while (copied == 0); + + return copied; +} + +static ssize_t ir_lirc_read(struct file *file, char __user *buffer, + size_t length, loff_t *ppos) +{ + struct rc_dev *rcdev = file->private_data; + + if (rcdev->driver_type == RC_DRIVER_IR_RAW_TX) + return -EINVAL; + + if (!rcdev->registered) + return -ENODEV; + + if (rcdev->rec_mode == LIRC_MODE_MODE2) + return ir_lirc_read_mode2(file, buffer, length); + else /* LIRC_MODE_SCANCODE */ + return ir_lirc_read_scancode(file, buffer, length); +} + const struct file_operations lirc_fops = { .owner = THIS_MODULE, .write = ir_lirc_transmit_ir, diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 2c9ee0c1f432..8cf4cf358052 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -215,6 +215,7 @@ static int ir_mce_kbd_decode(struct rc_dev *dev, struct ir_raw_event ev) struct mce_kbd_dec *data = &dev->raw->mce_kbd; u32 scancode; unsigned long delay; + 
struct lirc_scancode lsc = {}; if (!is_timing_event(ev)) { if (ev.reset) @@ -326,18 +327,22 @@ again: mod_timer(&data->rx_timeout, jiffies + delay); /* Pass data to keyboard buffer parser */ ir_mce_kbd_process_keyboard_data(data->idev, scancode); + lsc.rc_proto = RC_PROTO_MCIR2_KBD; break; case MCIR2_MOUSE_NBITS: scancode = data->body & 0x1fffff; IR_dprintk(1, "mouse data 0x%06x\n", scancode); /* Pass data to mouse buffer parser */ ir_mce_kbd_process_mouse_data(data->idev, scancode); + lsc.rc_proto = RC_PROTO_MCIR2_MSE; break; default: IR_dprintk(1, "not keyboard or mouse data\n"); goto out; } + lsc.scancode = scancode; + ir_lirc_scancode_event(dev, &lsc); data->state = STATE_INACTIVE; input_event(data->idev, EV_MSC, MSC_SCAN, scancode); input_sync(data->idev); diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 155a4de249a0..d766abcffeac 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -42,6 +42,8 @@ static void lirc_release_device(struct device *ld) if (rcdev->driver_type == RC_DRIVER_IR_RAW) kfifo_free(&rcdev->rawir); + if (rcdev->driver_type != RC_DRIVER_IR_RAW_TX) + kfifo_free(&rcdev->scancodes); put_device(&rcdev->dev); } @@ -55,11 +57,20 @@ int ir_lirc_register(struct rc_dev *dev) dev->lirc_dev.release = lirc_release_device; dev->send_mode = LIRC_MODE_PULSE; + dev->rec_mode = LIRC_MODE_MODE2; + if (dev->driver_type == RC_DRIVER_IR_RAW) { if (kfifo_alloc(&dev->rawir, MAX_IR_EVENT_SIZE, GFP_KERNEL)) return -ENOMEM; } + if (dev->driver_type != RC_DRIVER_IR_RAW_TX) { + if (kfifo_alloc(&dev->scancodes, 32, GFP_KERNEL)) { + kfifo_free(&dev->rawir); + return -ENOMEM; + } + } + init_waitqueue_head(&dev->wait_poll); minor = ida_simple_get(&lirc_ida, 0, RC_DEV_MAX, GFP_KERNEL); @@ -90,6 +101,8 @@ out_ida: out_kfifo: if (dev->driver_type == RC_DRIVER_IR_RAW) kfifo_free(&dev->rawir); + if (dev->driver_type != RC_DRIVER_IR_RAW_TX) + kfifo_free(&dev->scancodes); return err; } diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index 9af43d16ca29..2a5e9cc3ddb3 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -276,6 +276,7 @@ void ir_raw_init(void); int lirc_dev_init(void); void lirc_dev_exit(void); void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev); +void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc); int ir_lirc_register(struct rc_dev *dev); void ir_lirc_unregister(struct rc_dev *dev); @@ -285,6 +286,8 @@ static inline int lirc_dev_init(void) { return 0; } static inline void lirc_dev_exit(void) {} static inline void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) { } +static inline void ir_lirc_scancode_event(struct rc_dev *dev, + struct lirc_scancode *lsc) { } static inline int ir_lirc_register(struct rc_dev *dev) { return 0; } static inline void ir_lirc_unregister(struct rc_dev *dev) { } #endif diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 12ff6d87b113..b22443fe8c34 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -697,6 +697,13 @@ static void ir_do_keydown(struct rc_dev *dev, enum rc_proto protocol, dev->last_protocol != protocol || dev->last_scancode != scancode || dev->last_toggle != toggle); + struct lirc_scancode sc = { + .scancode = scancode, .rc_proto = protocol, + .flags = toggle ? 
LIRC_SCANCODE_FLAG_TOGGLE : 0, + .keycode = keycode + }; + + ir_lirc_scancode_event(dev, &sc); if (new_event && dev->keypressed) ir_do_keyup(dev, false); diff --git a/include/media/rc-core.h b/include/media/rc-core.h index 2d24c88652aa..fbf1648d2ec9 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -126,9 +126,12 @@ enum rc_filter_type { * @gap: true if we're in a gap * @send_timeout_reports: report timeouts in lirc raw IR. * @rawir: queue for incoming raw IR + * @scancodes: queue for incoming decoded scancodes * @wait_poll: poll struct for lirc device * @send_mode: lirc mode for sending, either LIRC_MODE_SCANCODE or * LIRC_MODE_PULSE + * @rec_mode: lirc mode for receiving, either LIRC_MODE_SCANCODE or + * LIRC_MODE_MODE2 * @registered: set to true by rc_register_device(), false by * rc_unregister_device * @change_protocol: allow changing the protocol used on hardware decoders @@ -200,8 +203,10 @@ struct rc_dev { bool gap; bool send_timeout_reports; DECLARE_KFIFO_PTR(rawir, unsigned int); + DECLARE_KFIFO_PTR(scancodes, struct lirc_scancode); wait_queue_head_t wait_poll; u8 send_mode; + u8 rec_mode; #endif bool registered; int (*change_protocol)(struct rc_dev *dev, u64 *rc_proto); From b9b6efee5a6d4d2c5d9fa1ce669107ce675725fc Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 30 Sep 2017 07:13:37 -0400 Subject: [PATCH 0559/1640] UPSTREAM: media: lirc: ensure lirc device receives nec repeats The lirc device should get lirc repeats whether there is a keymap match or not. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index b22443fe8c34..058807bc80dc 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -662,19 +662,25 @@ void rc_repeat(struct rc_dev *dev) { unsigned long flags; unsigned int timeout = protocols[dev->last_protocol].repeat_period; + struct lirc_scancode sc = { + .scancode = dev->last_scancode, .rc_proto = dev->last_protocol, + .keycode = dev->keypressed ? dev->last_keycode : KEY_RESERVED, + .flags = LIRC_SCANCODE_FLAG_REPEAT | + (dev->last_toggle ? 
LIRC_SCANCODE_FLAG_TOGGLE : 0) + }; + + ir_lirc_scancode_event(dev, &sc); spin_lock_irqsave(&dev->keylock, flags); - if (!dev->keypressed) - goto out; - input_event(dev->input_dev, EV_MSC, MSC_SCAN, dev->last_scancode); input_sync(dev->input_dev); - dev->keyup_jiffies = jiffies + msecs_to_jiffies(timeout); - mod_timer(&dev->timer_keyup, dev->keyup_jiffies); + if (dev->keypressed) { + dev->keyup_jiffies = jiffies + msecs_to_jiffies(timeout); + mod_timer(&dev->timer_keyup, dev->keyup_jiffies); + } -out: spin_unlock_irqrestore(&dev->keylock, flags); } EXPORT_SYMBOL_GPL(rc_repeat); @@ -710,13 +716,14 @@ static void ir_do_keydown(struct rc_dev *dev, enum rc_proto protocol, input_event(dev->input_dev, EV_MSC, MSC_SCAN, scancode); + dev->last_protocol = protocol; + dev->last_scancode = scancode; + dev->last_toggle = toggle; + dev->last_keycode = keycode; + if (new_event && keycode != KEY_RESERVED) { /* Register a keypress */ dev->keypressed = true; - dev->last_protocol = protocol; - dev->last_scancode = scancode; - dev->last_toggle = toggle; - dev->last_keycode = keycode; IR_dprintk(1, "%s: key down event, key 0x%04x, protocol 0x%04x, scancode 0x%08x\n", dev->device_name, keycode, protocol, scancode); From 8a084bea0e77743db05a968226d85bd02e00a9dd Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 25 Feb 2017 06:51:34 -0500 Subject: [PATCH 0560/1640] UPSTREAM: media: lirc: document LIRC_MODE_SCANCODE Lirc supports a new mode which requires documentation. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/lirc.h.rst.exceptions | 26 ++++++++++ .../media/uapi/rc/lirc-dev-intro.rst | 51 +++++++++++++++++-- .../media/uapi/rc/lirc-get-features.rst | 16 ++++++ .../media/uapi/rc/lirc-get-rec-mode.rst | 3 +- .../media/uapi/rc/lirc-get-send-mode.rst | 3 +- Documentation/media/uapi/rc/lirc-read.rst | 15 ++++-- Documentation/media/uapi/rc/lirc-write.rst | 19 +++++-- 7 files changed, 118 insertions(+), 15 deletions(-) diff --git a/Documentation/media/lirc.h.rst.exceptions b/Documentation/media/lirc.h.rst.exceptions index 63ba1d341905..c6e3a35d2c4e 100644 --- a/Documentation/media/lirc.h.rst.exceptions +++ b/Documentation/media/lirc.h.rst.exceptions @@ -32,6 +32,32 @@ ignore define LIRC_CAN_SET_REC_DUTY_CYCLE ignore ioctl LIRC_GET_LENGTH +# rc protocols + +ignore symbol RC_PROTO_UNKNOWN +ignore symbol RC_PROTO_OTHER +ignore symbol RC_PROTO_RC5 +ignore symbol RC_PROTO_RC5X_20 +ignore symbol RC_PROTO_RC5_SZ +ignore symbol RC_PROTO_JVC +ignore symbol RC_PROTO_SONY12 +ignore symbol RC_PROTO_SONY15 +ignore symbol RC_PROTO_SONY20 +ignore symbol RC_PROTO_NEC +ignore symbol RC_PROTO_NECX +ignore symbol RC_PROTO_NEC32 +ignore symbol RC_PROTO_SANYO +ignore symbol RC_PROTO_MCIR2_KBD +ignore symbol RC_PROTO_MCIR2_MSE +ignore symbol RC_PROTO_RC6_0 +ignore symbol RC_PROTO_RC6_6A_20 +ignore symbol RC_PROTO_RC6_6A_24 +ignore symbol RC_PROTO_RC6_6A_32 +ignore symbol RC_PROTO_RC6_MCE +ignore symbol RC_PROTO_SHARP +ignore symbol RC_PROTO_XMP +ignore symbol RC_PROTO_CEC + # Undocumented macros ignore define PULSE_BIT diff --git a/Documentation/media/uapi/rc/lirc-dev-intro.rst b/Documentation/media/uapi/rc/lirc-dev-intro.rst index a3fa3c1ef169..47c6c218e72a 100644 --- a/Documentation/media/uapi/rc/lirc-dev-intro.rst +++ b/Documentation/media/uapi/rc/lirc-dev-intro.rst @@ -6,11 +6,12 @@ Introduction ************ -The LIRC device interface is a bi-directional interface for transporting -raw IR data between userspace and kernelspace. 
Fundamentally, it is just -a chardev (/dev/lircX, for X = 0, 1, 2, ...), with a number of standard -struct file_operations defined on it. With respect to transporting raw -IR data to and fro, the essential fops are read, write and ioctl. +LIRC stands for Linux Infrared Remote Control. The LIRC device interface is +a bi-directional interface for transporting raw IR and decoded scancodes +data between userspace and kernelspace. Fundamentally, it is just a chardev +(/dev/lircX, for X = 0, 1, 2, ...), with a number of standard struct +file_operations defined on it. With respect to transporting raw IR and +decoded scancodes to and fro, the essential fops are read, write and ioctl. Example dmesg output upon a driver registering w/LIRC: @@ -36,6 +37,46 @@ LIRC modes LIRC supports some modes of receiving and sending IR codes, as shown on the following table. +.. _lirc-mode-scancode: +.. _lirc-scancode-flag-toggle: +.. _lirc-scancode-flag-repeat: + +``LIRC_MODE_SCANCODE`` + + This mode is for both sending and receiving IR. + + For transmitting (aka sending), create a ``struct lirc_scancode`` with + the desired scancode set in the ``scancode`` member, ``rc_proto`` set + the IR protocol, and all other members set to 0. Write this struct to + the lirc device. + + For receiving, you read ``struct lirc_scancode`` from the lirc device, + with ``scancode`` set to the received scancode and the IR protocol + ``rc_proto``. If the scancode maps to a valid key code, this is set + in the ``keycode`` field, else it is set to ``KEY_RESERVED``. + + The ``flags`` can have ``LIRC_SCANCODE_FLAG_TOGGLE`` set if the toggle + bit is set in protocols that support it (e.g. rc-5 and rc-6), or + ``LIRC_SCANCODE_FLAG_REPEAT`` for when a repeat is received for protocols + that support it (e.g. nec). + + In the Sanyo and NEC protocol, if you hold a button on remote, rather than + repeating the entire scancode, the remote sends a shorter message with + no scancode, which just means button is held, a "repeat". When this is + received, the ``LIRC_SCANCODE_FLAG_REPEAT`` is set and the scancode and + keycode is repeated. + + With nec, there is no way to distinguish "button hold" from "repeatedly + pressing the same button". The rc-5 and rc-6 protocols have a toggle bit. + When a button is released and pressed again, the toggle bit is inverted. + If the toggle bit is set, the ``LIRC_SCANCODE_FLAG_TOGGLE`` is set. + + The ``timestamp`` field is filled with the time nanoseconds + (in ``CLOCK_MONOTONIC``) when the scancode was decoded. + + An ``enum rc_proto`` in the :ref:`lirc_header` lists all the supported + IR protocols. + .. _lirc-mode-mode2: ``LIRC_MODE_MODE2`` diff --git a/Documentation/media/uapi/rc/lirc-get-features.rst b/Documentation/media/uapi/rc/lirc-get-features.rst index 50c2c26d8e89..3ee44067de63 100644 --- a/Documentation/media/uapi/rc/lirc-get-features.rst +++ b/Documentation/media/uapi/rc/lirc-get-features.rst @@ -64,6 +64,14 @@ LIRC features Unused. Kept just to avoid breaking uAPI. +.. _LIRC-CAN-REC-SCANCODE: + +``LIRC_CAN_REC_SCANCODE`` + + The driver is capable of receiving using + :ref:`LIRC_MODE_SCANCODE `. + + .. _LIRC-CAN-SET-SEND-CARRIER: ``LIRC_CAN_SET_SEND_CARRIER`` @@ -171,6 +179,14 @@ LIRC features Unused. Kept just to avoid breaking uAPI. +.. _LIRC-CAN-SEND-SCANCODE: + +``LIRC_CAN_SEND_SCANCODE`` + + The driver supports sending (also called as IR blasting or IR TX) using + :ref:`LIRC_MODE_SCANCODE `. 
+ + Return Value ============ diff --git a/Documentation/media/uapi/rc/lirc-get-rec-mode.rst b/Documentation/media/uapi/rc/lirc-get-rec-mode.rst index b89de9add921..34919feaf392 100644 --- a/Documentation/media/uapi/rc/lirc-get-rec-mode.rst +++ b/Documentation/media/uapi/rc/lirc-get-rec-mode.rst @@ -34,7 +34,8 @@ Description =========== Get/set supported receive modes. Only :ref:`LIRC_MODE_MODE2 ` -is supported for IR receive. +and :ref:`LIRC_MODE_SCANCODE ` are supported. +Use :ref:`lirc_get_features` to find out which modes the driver supports. Return Value ============ diff --git a/Documentation/media/uapi/rc/lirc-get-send-mode.rst b/Documentation/media/uapi/rc/lirc-get-send-mode.rst index e686b21689a0..e39383f08e21 100644 --- a/Documentation/media/uapi/rc/lirc-get-send-mode.rst +++ b/Documentation/media/uapi/rc/lirc-get-send-mode.rst @@ -36,7 +36,8 @@ Description Get/set current transmit mode. -Only :ref:`LIRC_MODE_PULSE ` is supported by for IR send, +Only :ref:`LIRC_MODE_PULSE ` and +:ref:`LIRC_MODE_SCANCODE ` are supported by for IR send, depending on the driver. Use :ref:`lirc_get_features` to find out which modes the driver supports. diff --git a/Documentation/media/uapi/rc/lirc-read.rst b/Documentation/media/uapi/rc/lirc-read.rst index ff14a69104e5..51d37ed10194 100644 --- a/Documentation/media/uapi/rc/lirc-read.rst +++ b/Documentation/media/uapi/rc/lirc-read.rst @@ -45,13 +45,20 @@ descriptor ``fd`` into the buffer starting at ``buf``. If ``count`` is zero, is greater than ``SSIZE_MAX``, the result is unspecified. The exact format of the data depends on what :ref:`lirc_modes` a driver -uses. Use :ref:`lirc_get_features` to get the supported mode. +uses. Use :ref:`lirc_get_features` to get the supported mode, and use +:ref:`lirc_set_rec_mode` set the current active mode. -The generally preferred mode for receive is -:ref:`LIRC_MODE_MODE2 `, -in which packets containing an int value describing an IR signal are +The mode :ref:`LIRC_MODE_MODE2 ` is for raw IR, +in which packets containing an unsigned int value describing an IR signal are read from the chardev. +Alternatively, :ref:`LIRC_MODE_SCANCODE ` can be available, +in this mode scancodes which are either decoded by software decoders, or +by hardware decoders. The ``rc_proto`` member is set to the +protocol used for transmission, and ``scancode`` to the decoded scancode, +and the ``keycode`` set to the keycode or ``KEY_RESERVED``. + + Return Value ============ diff --git a/Documentation/media/uapi/rc/lirc-write.rst b/Documentation/media/uapi/rc/lirc-write.rst index 2aad0fef4a5b..3d7541bad8b9 100644 --- a/Documentation/media/uapi/rc/lirc-write.rst +++ b/Documentation/media/uapi/rc/lirc-write.rst @@ -42,21 +42,32 @@ Description referenced by the file descriptor ``fd`` from the buffer starting at ``buf``. -The exact format of the data depends on what mode a driver uses, use -:ref:`lirc_get_features` to get the supported mode. +The exact format of the data depends on what mode a driver is in, use +:ref:`lirc_get_features` to get the supported modes and use +:ref:`lirc_set_send_mode` set the mode. When in :ref:`LIRC_MODE_PULSE ` mode, the data written to the chardev is a pulse/space sequence of integer values. Pulses and spaces are only marked implicitly by their position. The data must start and end with a pulse, therefore, the data must always include an uneven number of -samples. The write function must block until the data has been transmitted +samples. 
The write function blocks until the data has been transmitted by the hardware. If more data is provided than the hardware can send, the driver returns ``EINVAL``. +When in :ref:`LIRC_MODE_SCANCODE ` mode, one +``struct lirc_scancode`` must be written to the chardev at a time, else +``EINVAL`` is returned. Set the desired scancode in the ``scancode`` member, +and the protocol in the ``rc_proto`` member. All other members must be set +to 0, else ``EINVAL`` is returned. If there is no protocol encoder +for the protocol or the scancode is not valid for the specified protocol, +``EINVAL`` is returned. The write function may not wait until the scancode +is transmitted. + + Return Value ============ -On success, the number of bytes read is returned. It is not an error if +On success, the number of bytes written is returned. It is not an error if this number is smaller than the number of bytes requested, or the amount of data required for one frame. On error, -1 is returned, and the ``errno`` variable is set appropriately. The generic error codes are described at the From d79ddab013d858566cf323a037b85ab3cd0cd52e Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 24 Sep 2017 12:43:24 -0400 Subject: [PATCH 0561/1640] UPSTREAM: media: lirc: scancode rc devices should have a lirc device too Now that the lirc interface supports scancodes, RC scancode devices can also have a lirc device. The only receiving feature they will have enabled is LIRC_CAN_REC_SCANCODE. Note that CEC devices have no lirc device, since they can be controlled from their /dev/cecN chardev. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-lirc-codec.c | 19 +++++++++++++++---- drivers/media/rc/lirc_dev.c | 5 ++++- drivers/media/rc/rc-main.c | 6 +++--- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index 817258c87b5c..8c5df6e8579e 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -304,6 +304,9 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, switch (cmd) { case LIRC_GET_FEATURES: + if (dev->driver_type == RC_DRIVER_SCANCODE) + val |= LIRC_CAN_REC_SCANCODE; + if (dev->driver_type == RC_DRIVER_IR_RAW) { val |= LIRC_CAN_REC_MODE2 | LIRC_CAN_REC_SCANCODE; if (dev->rx_resolution) @@ -344,11 +347,19 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, break; case LIRC_SET_REC_MODE: - if (dev->driver_type == RC_DRIVER_IR_RAW_TX) + switch (dev->driver_type) { + case RC_DRIVER_IR_RAW_TX: return -ENOTTY; - - if (!(val == LIRC_MODE_MODE2 || val == LIRC_MODE_SCANCODE)) - return -EINVAL; + case RC_DRIVER_SCANCODE: + if (val != LIRC_MODE_SCANCODE) + return -EINVAL; + break; + case RC_DRIVER_IR_RAW: + if (!(val == LIRC_MODE_MODE2 || + val == LIRC_MODE_SCANCODE)) + return -EINVAL; + break; + } dev->rec_mode = val; return 0; diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index d766abcffeac..2a0c48698309 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -57,7 +57,10 @@ int ir_lirc_register(struct rc_dev *dev) dev->lirc_dev.release = lirc_release_device; dev->send_mode = LIRC_MODE_PULSE; - dev->rec_mode = LIRC_MODE_MODE2; + if (dev->driver_type == RC_DRIVER_SCANCODE) + dev->rec_mode = LIRC_MODE_SCANCODE; + else + dev->rec_mode = LIRC_MODE_MODE2; if (dev->driver_type == RC_DRIVER_IR_RAW) { if (kfifo_alloc(&dev->rawir, MAX_IR_EVENT_SIZE, GFP_KERNEL)) diff --git a/drivers/media/rc/rc-main.c 
b/drivers/media/rc/rc-main.c index 058807bc80dc..5830cb2c5943 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1814,7 +1814,7 @@ int rc_register_device(struct rc_dev *dev) } /* Ensure that the lirc kfifo is setup before we start the thread */ - if (dev->driver_type != RC_DRIVER_SCANCODE) { + if (dev->allowed_protocols != RC_PROTO_BIT_CEC) { rc = ir_lirc_register(dev); if (rc < 0) goto out_rx; @@ -1835,7 +1835,7 @@ int rc_register_device(struct rc_dev *dev) return 0; out_lirc: - if (dev->driver_type != RC_DRIVER_SCANCODE) + if (dev->allowed_protocols != RC_PROTO_BIT_CEC) ir_lirc_unregister(dev); out_rx: rc_free_rx_device(dev); @@ -1898,7 +1898,7 @@ void rc_unregister_device(struct rc_dev *dev) * lirc device should be freed with dev->registered = false, so * that userspace polling will get notified. */ - if (dev->driver_type != RC_DRIVER_SCANCODE) + if (dev->allowed_protocols != RC_PROTO_BIT_CEC) ir_lirc_unregister(dev); device_del(&dev->dev);
From 30833ef8313652eb3c25f87f4625c3410f0b49be Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 2 Nov 2017 16:39:16 -0400 Subject: [PATCH 0562/1640] UPSTREAM: media: rc: move ir-lirc-codec.c contents into lirc_dev.c
Since removing the lirc kapi, ir-lirc-codec.c only contains lirc fops so the file name is no longer correct. By moving its content into lirc_dev.c the ugly extern struct lirc_fops is no longer needed, and everything lirc related is in one file.
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Makefile | 2 +- drivers/media/rc/ir-lirc-codec.c | 623 ------------------------------- drivers/media/rc/lirc_dev.c | 604 ++++++++++++++++++++++++++++++ drivers/media/rc/rc-core-priv.h | 2 - 4 files changed, 605 insertions(+), 626 deletions(-) delete mode 100644 drivers/media/rc/ir-lirc-codec.c
diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile index bcb49badfe36..740abd46b1be 100644 --- a/drivers/media/rc/Makefile +++ b/drivers/media/rc/Makefile @@ -4,7 +4,7 @@ obj-y += keymaps/ obj-$(CONFIG_RC_CORE) += rc-core.o rc-core-y := rc-main.o rc-ir-raw.o -rc-core-$(CONFIG_LIRC) += lirc_dev.o ir-lirc-codec.o +rc-core-$(CONFIG_LIRC) += lirc_dev.o obj-$(CONFIG_IR_NEC_DECODER) += ir-nec-decoder.o obj-$(CONFIG_IR_RC5_DECODER) += ir-rc5-decoder.o obj-$(CONFIG_IR_RC6_DECODER) += ir-rc6-decoder.o
diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c deleted file mode 100644 index 8c5df6e8579e..000000000000 --- a/drivers/media/rc/ir-lirc-codec.c +++ /dev/null @@ -1,623 +0,0 @@ -/* ir-lirc-codec.c - rc-core to classic lirc interface bridge - * - * Copyright (C) 2010 by Jarod Wilson - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details.
- */ - -#include -#include -#include -#include -#include -#include "rc-core-priv.h" - -#define LIRCBUF_SIZE 256 - -/** - * ir_lirc_raw_event() - Send raw IR data to lirc to be relayed to userspace - * - * @dev: the struct rc_dev descriptor of the device - * @ev: the struct ir_raw_event descriptor of the pulse/space - */ -void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) -{ - int sample; - - /* Packet start */ - if (ev.reset) { - /* Userspace expects a long space event before the start of - * the signal to use as a sync. This may be done with repeat - * packets and normal samples. But if a reset has been sent - * then we assume that a long time has passed, so we send a - * space with the maximum time value. */ - sample = LIRC_SPACE(LIRC_VALUE_MASK); - IR_dprintk(2, "delivering reset sync space to lirc_dev\n"); - - /* Carrier reports */ - } else if (ev.carrier_report) { - sample = LIRC_FREQUENCY(ev.carrier); - IR_dprintk(2, "carrier report (freq: %d)\n", sample); - - /* Packet end */ - } else if (ev.timeout) { - - if (dev->gap) - return; - - dev->gap_start = ktime_get(); - dev->gap = true; - dev->gap_duration = ev.duration; - - if (!dev->send_timeout_reports) - return; - - sample = LIRC_TIMEOUT(ev.duration / 1000); - IR_dprintk(2, "timeout report (duration: %d)\n", sample); - - /* Normal sample */ - } else { - - if (dev->gap) { - dev->gap_duration += ktime_to_ns(ktime_sub(ktime_get(), - dev->gap_start)); - - /* Convert to ms and cap by LIRC_VALUE_MASK */ - do_div(dev->gap_duration, 1000); - dev->gap_duration = min_t(u64, dev->gap_duration, - LIRC_VALUE_MASK); - - kfifo_put(&dev->rawir, LIRC_SPACE(dev->gap_duration)); - dev->gap = false; - } - - sample = ev.pulse ? LIRC_PULSE(ev.duration / 1000) : - LIRC_SPACE(ev.duration / 1000); - IR_dprintk(2, "delivering %uus %s to lirc_dev\n", - TO_US(ev.duration), TO_STR(ev.pulse)); - } - - kfifo_put(&dev->rawir, sample); - wake_up_poll(&dev->wait_poll, POLLIN | POLLRDNORM); -} - -/** - * ir_lirc_scancode_event() - Send scancode data to lirc to be relayed to - * userspace - * @dev: the struct rc_dev descriptor of the device - * @lsc: the struct lirc_scancode describing the decoded scancode - */ -void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc) -{ - lsc->timestamp = ktime_get_ns(); - - if (kfifo_put(&dev->scancodes, *lsc)) - wake_up_poll(&dev->wait_poll, POLLIN | POLLRDNORM); -} -EXPORT_SYMBOL_GPL(ir_lirc_scancode_event); - -static int ir_lirc_open(struct inode *inode, struct file *file) -{ - struct rc_dev *dev = container_of(inode->i_cdev, struct rc_dev, - lirc_cdev); - int retval; - - retval = rc_open(dev); - if (retval) - return retval; - - retval = mutex_lock_interruptible(&dev->lock); - if (retval) - goto out_rc; - - if (!dev->registered) { - retval = -ENODEV; - goto out_unlock; - } - - if (dev->lirc_open) { - retval = -EBUSY; - goto out_unlock; - } - - if (dev->driver_type == RC_DRIVER_IR_RAW) - kfifo_reset_out(&dev->rawir); - if (dev->driver_type != RC_DRIVER_IR_RAW_TX) - kfifo_reset_out(&dev->scancodes); - - dev->lirc_open++; - file->private_data = dev; - - nonseekable_open(inode, file); - mutex_unlock(&dev->lock); - - return 0; - -out_unlock: - mutex_unlock(&dev->lock); -out_rc: - rc_close(dev); - return retval; -} - -static int ir_lirc_close(struct inode *inode, struct file *file) -{ - struct rc_dev *dev = file->private_data; - - mutex_lock(&dev->lock); - dev->lirc_open--; - mutex_unlock(&dev->lock); - - rc_close(dev); - - return 0; -} - -static ssize_t ir_lirc_transmit_ir(struct file *file, const char 
__user *buf, - size_t n, loff_t *ppos) -{ - struct rc_dev *dev = file->private_data; - unsigned int *txbuf = NULL; - struct ir_raw_event *raw = NULL; - ssize_t ret = -EINVAL; - size_t count; - ktime_t start; - s64 towait; - unsigned int duration = 0; /* signal duration in us */ - int i; - - if (!dev->registered) - return -ENODEV; - - start = ktime_get(); - - if (!dev->tx_ir) { - ret = -EINVAL; - goto out; - } - - if (dev->send_mode == LIRC_MODE_SCANCODE) { - struct lirc_scancode scan; - - if (n != sizeof(scan)) - return -EINVAL; - - if (copy_from_user(&scan, buf, sizeof(scan))) - return -EFAULT; - - if (scan.flags || scan.keycode || scan.timestamp) - return -EINVAL; - - /* - * The scancode field in lirc_scancode is 64-bit simply - * to future-proof it, since there are IR protocols encode - * use more than 32 bits. For now only 32-bit protocols - * are supported. - */ - if (scan.scancode > U32_MAX || - !rc_validate_scancode(scan.rc_proto, scan.scancode)) - return -EINVAL; - - raw = kmalloc_array(LIRCBUF_SIZE, sizeof(*raw), GFP_KERNEL); - if (!raw) - return -ENOMEM; - - ret = ir_raw_encode_scancode(scan.rc_proto, scan.scancode, - raw, LIRCBUF_SIZE); - if (ret < 0) - goto out; - - count = ret; - - txbuf = kmalloc_array(count, sizeof(unsigned int), GFP_KERNEL); - if (!txbuf) { - ret = -ENOMEM; - goto out; - } - - for (i = 0; i < count; i++) - /* Convert from NS to US */ - txbuf[i] = DIV_ROUND_UP(raw[i].duration, 1000); - - if (dev->s_tx_carrier) { - int carrier = ir_raw_encode_carrier(scan.rc_proto); - - if (carrier > 0) - dev->s_tx_carrier(dev, carrier); - } - } else { - if (n < sizeof(unsigned int) || n % sizeof(unsigned int)) - return -EINVAL; - - count = n / sizeof(unsigned int); - if (count > LIRCBUF_SIZE || count % 2 == 0) - return -EINVAL; - - txbuf = memdup_user(buf, n); - if (IS_ERR(txbuf)) - return PTR_ERR(txbuf); - } - - for (i = 0; i < count; i++) { - if (txbuf[i] > IR_MAX_DURATION / 1000 - duration || !txbuf[i]) { - ret = -EINVAL; - goto out; - } - - duration += txbuf[i]; - } - - ret = dev->tx_ir(dev, txbuf, count); - if (ret < 0) - goto out; - - if (dev->send_mode == LIRC_MODE_SCANCODE) { - ret = n; - } else { - for (duration = i = 0; i < ret; i++) - duration += txbuf[i]; - - ret *= sizeof(unsigned int); - - /* - * The lircd gap calculation expects the write function to - * wait for the actual IR signal to be transmitted before - * returning. 
- */ - towait = ktime_us_delta(ktime_add_us(start, duration), - ktime_get()); - if (towait > 0) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(usecs_to_jiffies(towait)); - } - } - -out: - kfree(txbuf); - kfree(raw); - return ret; -} - -static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, - unsigned long arg) -{ - struct rc_dev *dev = filep->private_data; - u32 __user *argp = (u32 __user *)(arg); - int ret = 0; - __u32 val = 0, tmp; - - if (_IOC_DIR(cmd) & _IOC_WRITE) { - ret = get_user(val, argp); - if (ret) - return ret; - } - - if (!dev->registered) - return -ENODEV; - - switch (cmd) { - case LIRC_GET_FEATURES: - if (dev->driver_type == RC_DRIVER_SCANCODE) - val |= LIRC_CAN_REC_SCANCODE; - - if (dev->driver_type == RC_DRIVER_IR_RAW) { - val |= LIRC_CAN_REC_MODE2 | LIRC_CAN_REC_SCANCODE; - if (dev->rx_resolution) - val |= LIRC_CAN_GET_REC_RESOLUTION; - } - - if (dev->tx_ir) { - val |= LIRC_CAN_SEND_PULSE | LIRC_CAN_SEND_SCANCODE; - if (dev->s_tx_mask) - val |= LIRC_CAN_SET_TRANSMITTER_MASK; - if (dev->s_tx_carrier) - val |= LIRC_CAN_SET_SEND_CARRIER; - if (dev->s_tx_duty_cycle) - val |= LIRC_CAN_SET_SEND_DUTY_CYCLE; - } - - if (dev->s_rx_carrier_range) - val |= LIRC_CAN_SET_REC_CARRIER | - LIRC_CAN_SET_REC_CARRIER_RANGE; - - if (dev->s_learning_mode) - val |= LIRC_CAN_USE_WIDEBAND_RECEIVER; - - if (dev->s_carrier_report) - val |= LIRC_CAN_MEASURE_CARRIER; - - if (dev->max_timeout) - val |= LIRC_CAN_SET_REC_TIMEOUT; - - break; - - /* mode support */ - case LIRC_GET_REC_MODE: - if (dev->driver_type == RC_DRIVER_IR_RAW_TX) - return -ENOTTY; - - val = dev->rec_mode; - break; - - case LIRC_SET_REC_MODE: - switch (dev->driver_type) { - case RC_DRIVER_IR_RAW_TX: - return -ENOTTY; - case RC_DRIVER_SCANCODE: - if (val != LIRC_MODE_SCANCODE) - return -EINVAL; - break; - case RC_DRIVER_IR_RAW: - if (!(val == LIRC_MODE_MODE2 || - val == LIRC_MODE_SCANCODE)) - return -EINVAL; - break; - } - - dev->rec_mode = val; - return 0; - - case LIRC_GET_SEND_MODE: - if (!dev->tx_ir) - return -ENOTTY; - - val = dev->send_mode; - break; - - case LIRC_SET_SEND_MODE: - if (!dev->tx_ir) - return -ENOTTY; - - if (!(val == LIRC_MODE_PULSE || val == LIRC_MODE_SCANCODE)) - return -EINVAL; - - dev->send_mode = val; - return 0; - - /* TX settings */ - case LIRC_SET_TRANSMITTER_MASK: - if (!dev->s_tx_mask) - return -ENOTTY; - - return dev->s_tx_mask(dev, val); - - case LIRC_SET_SEND_CARRIER: - if (!dev->s_tx_carrier) - return -ENOTTY; - - return dev->s_tx_carrier(dev, val); - - case LIRC_SET_SEND_DUTY_CYCLE: - if (!dev->s_tx_duty_cycle) - return -ENOTTY; - - if (val <= 0 || val >= 100) - return -EINVAL; - - return dev->s_tx_duty_cycle(dev, val); - - /* RX settings */ - case LIRC_SET_REC_CARRIER: - if (!dev->s_rx_carrier_range) - return -ENOTTY; - - if (val <= 0) - return -EINVAL; - - return dev->s_rx_carrier_range(dev, - dev->carrier_low, - val); - - case LIRC_SET_REC_CARRIER_RANGE: - if (!dev->s_rx_carrier_range) - return -ENOTTY; - - if (val <= 0) - return -EINVAL; - - dev->carrier_low = val; - return 0; - - case LIRC_GET_REC_RESOLUTION: - if (!dev->rx_resolution) - return -ENOTTY; - - val = dev->rx_resolution / 1000; - break; - - case LIRC_SET_WIDEBAND_RECEIVER: - if (!dev->s_learning_mode) - return -ENOTTY; - - return dev->s_learning_mode(dev, !!val); - - case LIRC_SET_MEASURE_CARRIER_MODE: - if (!dev->s_carrier_report) - return -ENOTTY; - - return dev->s_carrier_report(dev, !!val); - - /* Generic timeout support */ - case LIRC_GET_MIN_TIMEOUT: - if (!dev->max_timeout) - return -ENOTTY; - val = 
DIV_ROUND_UP(dev->min_timeout, 1000); - break; - - case LIRC_GET_MAX_TIMEOUT: - if (!dev->max_timeout) - return -ENOTTY; - val = dev->max_timeout / 1000; - break; - - case LIRC_SET_REC_TIMEOUT: - if (!dev->max_timeout) - return -ENOTTY; - - /* Check for multiply overflow */ - if (val > U32_MAX / 1000) - return -EINVAL; - - tmp = val * 1000; - - if (tmp < dev->min_timeout || tmp > dev->max_timeout) - return -EINVAL; - - if (dev->s_timeout) - ret = dev->s_timeout(dev, tmp); - if (!ret) - dev->timeout = tmp; - break; - - case LIRC_SET_REC_TIMEOUT_REPORTS: - if (!dev->timeout) - return -ENOTTY; - - dev->send_timeout_reports = !!val; - break; - - default: - return -ENOTTY; - } - - if (_IOC_DIR(cmd) & _IOC_READ) - ret = put_user(val, argp); - - return ret; -} - -static unsigned int ir_lirc_poll(struct file *file, - struct poll_table_struct *wait) -{ - struct rc_dev *rcdev = file->private_data; - unsigned int events = 0; - - poll_wait(file, &rcdev->wait_poll, wait); - - if (!rcdev->registered) { - events = POLLHUP | POLLERR; - } else if (rcdev->driver_type != RC_DRIVER_IR_RAW_TX) { - if (rcdev->rec_mode == LIRC_MODE_SCANCODE && - !kfifo_is_empty(&rcdev->scancodes)) - events = POLLIN | POLLRDNORM; - - if (rcdev->rec_mode == LIRC_MODE_MODE2 && - !kfifo_is_empty(&rcdev->rawir)) - events = POLLIN | POLLRDNORM; - } - - return events; -} - -static ssize_t ir_lirc_read_mode2(struct file *file, char __user *buffer, - size_t length) -{ - struct rc_dev *rcdev = file->private_data; - unsigned int copied; - int ret; - - if (length < sizeof(unsigned int) || length % sizeof(unsigned int)) - return -EINVAL; - - do { - if (kfifo_is_empty(&rcdev->rawir)) { - if (file->f_flags & O_NONBLOCK) - return -EAGAIN; - - ret = wait_event_interruptible(rcdev->wait_poll, - !kfifo_is_empty(&rcdev->rawir) || - !rcdev->registered); - if (ret) - return ret; - } - - if (!rcdev->registered) - return -ENODEV; - - ret = mutex_lock_interruptible(&rcdev->lock); - if (ret) - return ret; - ret = kfifo_to_user(&rcdev->rawir, buffer, length, &copied); - mutex_unlock(&rcdev->lock); - if (ret) - return ret; - } while (copied == 0); - - return copied; -} - -static ssize_t ir_lirc_read_scancode(struct file *file, char __user *buffer, - size_t length) -{ - struct rc_dev *rcdev = file->private_data; - unsigned int copied; - int ret; - - if (length < sizeof(struct lirc_scancode) || - length % sizeof(struct lirc_scancode)) - return -EINVAL; - - do { - if (kfifo_is_empty(&rcdev->scancodes)) { - if (file->f_flags & O_NONBLOCK) - return -EAGAIN; - - ret = wait_event_interruptible(rcdev->wait_poll, - !kfifo_is_empty(&rcdev->scancodes) || - !rcdev->registered); - if (ret) - return ret; - } - - if (!rcdev->registered) - return -ENODEV; - - ret = mutex_lock_interruptible(&rcdev->lock); - if (ret) - return ret; - ret = kfifo_to_user(&rcdev->scancodes, buffer, length, &copied); - mutex_unlock(&rcdev->lock); - if (ret) - return ret; - } while (copied == 0); - - return copied; -} - -static ssize_t ir_lirc_read(struct file *file, char __user *buffer, - size_t length, loff_t *ppos) -{ - struct rc_dev *rcdev = file->private_data; - - if (rcdev->driver_type == RC_DRIVER_IR_RAW_TX) - return -EINVAL; - - if (!rcdev->registered) - return -ENODEV; - - if (rcdev->rec_mode == LIRC_MODE_MODE2) - return ir_lirc_read_mode2(file, buffer, length); - else /* LIRC_MODE_SCANCODE */ - return ir_lirc_read_scancode(file, buffer, length); -} - -const struct file_operations lirc_fops = { - .owner = THIS_MODULE, - .write = ir_lirc_transmit_ir, - .unlocked_ioctl = ir_lirc_ioctl, 
-#ifdef CONFIG_COMPAT - .compat_ioctl = ir_lirc_ioctl, -#endif - .read = ir_lirc_read, - .poll = ir_lirc_poll, - .open = ir_lirc_open, - .release = ir_lirc_close, - .llseek = no_llseek, -}; diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 2a0c48698309..97d60f0b5836 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -22,11 +22,14 @@ #include #include #include +#include +#include #include "rc-core-priv.h" #include #define LOGHEAD "lirc_dev (%s[%d]): " +#define LIRCBUF_SIZE 256 static dev_t lirc_base_dev; @@ -36,6 +39,607 @@ static DEFINE_IDA(lirc_ida); /* Only used for sysfs but defined to void otherwise */ static struct class *lirc_class; +/** + * ir_lirc_raw_event() - Send raw IR data to lirc to be relayed to userspace + * + * @dev: the struct rc_dev descriptor of the device + * @ev: the struct ir_raw_event descriptor of the pulse/space + */ +void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) +{ + int sample; + + /* Packet start */ + if (ev.reset) { + /* + * Userspace expects a long space event before the start of + * the signal to use as a sync. This may be done with repeat + * packets and normal samples. But if a reset has been sent + * then we assume that a long time has passed, so we send a + * space with the maximum time value. + */ + sample = LIRC_SPACE(LIRC_VALUE_MASK); + IR_dprintk(2, "delivering reset sync space to lirc_dev\n"); + + /* Carrier reports */ + } else if (ev.carrier_report) { + sample = LIRC_FREQUENCY(ev.carrier); + IR_dprintk(2, "carrier report (freq: %d)\n", sample); + + /* Packet end */ + } else if (ev.timeout) { + if (dev->gap) + return; + + dev->gap_start = ktime_get(); + dev->gap = true; + dev->gap_duration = ev.duration; + + if (!dev->send_timeout_reports) + return; + + sample = LIRC_TIMEOUT(ev.duration / 1000); + IR_dprintk(2, "timeout report (duration: %d)\n", sample); + + /* Normal sample */ + } else { + if (dev->gap) { + dev->gap_duration += ktime_to_ns(ktime_sub(ktime_get(), + dev->gap_start)); + + /* Convert to ms and cap by LIRC_VALUE_MASK */ + do_div(dev->gap_duration, 1000); + dev->gap_duration = min_t(u64, dev->gap_duration, + LIRC_VALUE_MASK); + + kfifo_put(&dev->rawir, LIRC_SPACE(dev->gap_duration)); + dev->gap = false; + } + + sample = ev.pulse ? 
LIRC_PULSE(ev.duration / 1000) : + LIRC_SPACE(ev.duration / 1000); + IR_dprintk(2, "delivering %uus %s to lirc_dev\n", + TO_US(ev.duration), TO_STR(ev.pulse)); + } + + kfifo_put(&dev->rawir, sample); + wake_up_poll(&dev->wait_poll, POLLIN | POLLRDNORM); +} + +/** + * ir_lirc_scancode_event() - Send scancode data to lirc to be relayed to + * userspace + * @dev: the struct rc_dev descriptor of the device + * @lsc: the struct lirc_scancode describing the decoded scancode + */ +void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc) +{ + lsc->timestamp = ktime_get_ns(); + + if (kfifo_put(&dev->scancodes, *lsc)) + wake_up_poll(&dev->wait_poll, POLLIN | POLLRDNORM); +} +EXPORT_SYMBOL_GPL(ir_lirc_scancode_event); + +static int ir_lirc_open(struct inode *inode, struct file *file) +{ + struct rc_dev *dev = container_of(inode->i_cdev, struct rc_dev, + lirc_cdev); + int retval; + + retval = rc_open(dev); + if (retval) + return retval; + + retval = mutex_lock_interruptible(&dev->lock); + if (retval) + goto out_rc; + + if (!dev->registered) { + retval = -ENODEV; + goto out_unlock; + } + + if (dev->lirc_open) { + retval = -EBUSY; + goto out_unlock; + } + + if (dev->driver_type == RC_DRIVER_IR_RAW) + kfifo_reset_out(&dev->rawir); + if (dev->driver_type != RC_DRIVER_IR_RAW_TX) + kfifo_reset_out(&dev->scancodes); + + dev->lirc_open++; + file->private_data = dev; + + nonseekable_open(inode, file); + mutex_unlock(&dev->lock); + + return 0; + +out_unlock: + mutex_unlock(&dev->lock); +out_rc: + rc_close(dev); + return retval; +} + +static int ir_lirc_close(struct inode *inode, struct file *file) +{ + struct rc_dev *dev = file->private_data; + + mutex_lock(&dev->lock); + dev->lirc_open--; + mutex_unlock(&dev->lock); + + rc_close(dev); + + return 0; +} + +static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, + size_t n, loff_t *ppos) +{ + struct rc_dev *dev = file->private_data; + unsigned int *txbuf = NULL; + struct ir_raw_event *raw = NULL; + ssize_t ret = -EINVAL; + size_t count; + ktime_t start; + s64 towait; + unsigned int duration = 0; /* signal duration in us */ + int i; + + if (!dev->registered) + return -ENODEV; + + start = ktime_get(); + + if (!dev->tx_ir) { + ret = -EINVAL; + goto out; + } + + if (dev->send_mode == LIRC_MODE_SCANCODE) { + struct lirc_scancode scan; + + if (n != sizeof(scan)) + return -EINVAL; + + if (copy_from_user(&scan, buf, sizeof(scan))) + return -EFAULT; + + if (scan.flags || scan.keycode || scan.timestamp) + return -EINVAL; + + /* + * The scancode field in lirc_scancode is 64-bit simply + * to future-proof it, since there are IR protocols encode + * use more than 32 bits. For now only 32-bit protocols + * are supported. 
+ */ + if (scan.scancode > U32_MAX || + !rc_validate_scancode(scan.rc_proto, scan.scancode)) + return -EINVAL; + + raw = kmalloc_array(LIRCBUF_SIZE, sizeof(*raw), GFP_KERNEL); + if (!raw) + return -ENOMEM; + + ret = ir_raw_encode_scancode(scan.rc_proto, scan.scancode, + raw, LIRCBUF_SIZE); + if (ret < 0) + goto out; + + count = ret; + + txbuf = kmalloc_array(count, sizeof(unsigned int), GFP_KERNEL); + if (!txbuf) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < count; i++) + /* Convert from NS to US */ + txbuf[i] = DIV_ROUND_UP(raw[i].duration, 1000); + + if (dev->s_tx_carrier) { + int carrier = ir_raw_encode_carrier(scan.rc_proto); + + if (carrier > 0) + dev->s_tx_carrier(dev, carrier); + } + } else { + if (n < sizeof(unsigned int) || n % sizeof(unsigned int)) + return -EINVAL; + + count = n / sizeof(unsigned int); + if (count > LIRCBUF_SIZE || count % 2 == 0) + return -EINVAL; + + txbuf = memdup_user(buf, n); + if (IS_ERR(txbuf)) + return PTR_ERR(txbuf); + } + + for (i = 0; i < count; i++) { + if (txbuf[i] > IR_MAX_DURATION / 1000 - duration || !txbuf[i]) { + ret = -EINVAL; + goto out; + } + + duration += txbuf[i]; + } + + ret = dev->tx_ir(dev, txbuf, count); + if (ret < 0) + goto out; + + if (dev->send_mode == LIRC_MODE_SCANCODE) { + ret = n; + } else { + for (duration = i = 0; i < ret; i++) + duration += txbuf[i]; + + ret *= sizeof(unsigned int); + + /* + * The lircd gap calculation expects the write function to + * wait for the actual IR signal to be transmitted before + * returning. + */ + towait = ktime_us_delta(ktime_add_us(start, duration), + ktime_get()); + if (towait > 0) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(towait)); + } + } + +out: + kfree(txbuf); + kfree(raw); + return ret; +} + +static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + struct rc_dev *dev = filep->private_data; + u32 __user *argp = (u32 __user *)(arg); + int ret = 0; + __u32 val = 0, tmp; + + if (_IOC_DIR(cmd) & _IOC_WRITE) { + ret = get_user(val, argp); + if (ret) + return ret; + } + + if (!dev->registered) + return -ENODEV; + + switch (cmd) { + case LIRC_GET_FEATURES: + if (dev->driver_type == RC_DRIVER_SCANCODE) + val |= LIRC_CAN_REC_SCANCODE; + + if (dev->driver_type == RC_DRIVER_IR_RAW) { + val |= LIRC_CAN_REC_MODE2 | LIRC_CAN_REC_SCANCODE; + if (dev->rx_resolution) + val |= LIRC_CAN_GET_REC_RESOLUTION; + } + + if (dev->tx_ir) { + val |= LIRC_CAN_SEND_PULSE | LIRC_CAN_SEND_SCANCODE; + if (dev->s_tx_mask) + val |= LIRC_CAN_SET_TRANSMITTER_MASK; + if (dev->s_tx_carrier) + val |= LIRC_CAN_SET_SEND_CARRIER; + if (dev->s_tx_duty_cycle) + val |= LIRC_CAN_SET_SEND_DUTY_CYCLE; + } + + if (dev->s_rx_carrier_range) + val |= LIRC_CAN_SET_REC_CARRIER | + LIRC_CAN_SET_REC_CARRIER_RANGE; + + if (dev->s_learning_mode) + val |= LIRC_CAN_USE_WIDEBAND_RECEIVER; + + if (dev->s_carrier_report) + val |= LIRC_CAN_MEASURE_CARRIER; + + if (dev->max_timeout) + val |= LIRC_CAN_SET_REC_TIMEOUT; + + break; + + /* mode support */ + case LIRC_GET_REC_MODE: + if (dev->driver_type == RC_DRIVER_IR_RAW_TX) + return -ENOTTY; + + val = dev->rec_mode; + break; + + case LIRC_SET_REC_MODE: + switch (dev->driver_type) { + case RC_DRIVER_IR_RAW_TX: + return -ENOTTY; + case RC_DRIVER_SCANCODE: + if (val != LIRC_MODE_SCANCODE) + return -EINVAL; + break; + case RC_DRIVER_IR_RAW: + if (!(val == LIRC_MODE_MODE2 || + val == LIRC_MODE_SCANCODE)) + return -EINVAL; + break; + } + + dev->rec_mode = val; + return 0; + + case LIRC_GET_SEND_MODE: + if (!dev->tx_ir) + return 
-ENOTTY; + + val = dev->send_mode; + break; + + case LIRC_SET_SEND_MODE: + if (!dev->tx_ir) + return -ENOTTY; + + if (!(val == LIRC_MODE_PULSE || val == LIRC_MODE_SCANCODE)) + return -EINVAL; + + dev->send_mode = val; + return 0; + + /* TX settings */ + case LIRC_SET_TRANSMITTER_MASK: + if (!dev->s_tx_mask) + return -ENOTTY; + + return dev->s_tx_mask(dev, val); + + case LIRC_SET_SEND_CARRIER: + if (!dev->s_tx_carrier) + return -ENOTTY; + + return dev->s_tx_carrier(dev, val); + + case LIRC_SET_SEND_DUTY_CYCLE: + if (!dev->s_tx_duty_cycle) + return -ENOTTY; + + if (val <= 0 || val >= 100) + return -EINVAL; + + return dev->s_tx_duty_cycle(dev, val); + + /* RX settings */ + case LIRC_SET_REC_CARRIER: + if (!dev->s_rx_carrier_range) + return -ENOTTY; + + if (val <= 0) + return -EINVAL; + + return dev->s_rx_carrier_range(dev, + dev->carrier_low, + val); + + case LIRC_SET_REC_CARRIER_RANGE: + if (!dev->s_rx_carrier_range) + return -ENOTTY; + + if (val <= 0) + return -EINVAL; + + dev->carrier_low = val; + return 0; + + case LIRC_GET_REC_RESOLUTION: + if (!dev->rx_resolution) + return -ENOTTY; + + val = dev->rx_resolution / 1000; + break; + + case LIRC_SET_WIDEBAND_RECEIVER: + if (!dev->s_learning_mode) + return -ENOTTY; + + return dev->s_learning_mode(dev, !!val); + + case LIRC_SET_MEASURE_CARRIER_MODE: + if (!dev->s_carrier_report) + return -ENOTTY; + + return dev->s_carrier_report(dev, !!val); + + /* Generic timeout support */ + case LIRC_GET_MIN_TIMEOUT: + if (!dev->max_timeout) + return -ENOTTY; + val = DIV_ROUND_UP(dev->min_timeout, 1000); + break; + + case LIRC_GET_MAX_TIMEOUT: + if (!dev->max_timeout) + return -ENOTTY; + val = dev->max_timeout / 1000; + break; + + case LIRC_SET_REC_TIMEOUT: + if (!dev->max_timeout) + return -ENOTTY; + + /* Check for multiply overflow */ + if (val > U32_MAX / 1000) + return -EINVAL; + + tmp = val * 1000; + + if (tmp < dev->min_timeout || tmp > dev->max_timeout) + return -EINVAL; + + if (dev->s_timeout) + ret = dev->s_timeout(dev, tmp); + if (!ret) + dev->timeout = tmp; + break; + + case LIRC_SET_REC_TIMEOUT_REPORTS: + if (!dev->timeout) + return -ENOTTY; + + dev->send_timeout_reports = !!val; + break; + + default: + return -ENOTTY; + } + + if (_IOC_DIR(cmd) & _IOC_READ) + ret = put_user(val, argp); + + return ret; +} + +static unsigned int ir_lirc_poll(struct file *file, + struct poll_table_struct *wait) +{ + struct rc_dev *rcdev = file->private_data; + unsigned int events = 0; + + poll_wait(file, &rcdev->wait_poll, wait); + + if (!rcdev->registered) { + events = POLLHUP | POLLERR; + } else if (rcdev->driver_type != RC_DRIVER_IR_RAW_TX) { + if (rcdev->rec_mode == LIRC_MODE_SCANCODE && + !kfifo_is_empty(&rcdev->scancodes)) + events = POLLIN | POLLRDNORM; + + if (rcdev->rec_mode == LIRC_MODE_MODE2 && + !kfifo_is_empty(&rcdev->rawir)) + events = POLLIN | POLLRDNORM; + } + + return events; +} + +static ssize_t ir_lirc_read_mode2(struct file *file, char __user *buffer, + size_t length) +{ + struct rc_dev *rcdev = file->private_data; + unsigned int copied; + int ret; + + if (length < sizeof(unsigned int) || length % sizeof(unsigned int)) + return -EINVAL; + + do { + if (kfifo_is_empty(&rcdev->rawir)) { + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + ret = wait_event_interruptible(rcdev->wait_poll, + !kfifo_is_empty(&rcdev->rawir) || + !rcdev->registered); + if (ret) + return ret; + } + + if (!rcdev->registered) + return -ENODEV; + + ret = mutex_lock_interruptible(&rcdev->lock); + if (ret) + return ret; + ret = kfifo_to_user(&rcdev->rawir, buffer, length, 
&copied); + mutex_unlock(&rcdev->lock); + if (ret) + return ret; + } while (copied == 0); + + return copied; +} + +static ssize_t ir_lirc_read_scancode(struct file *file, char __user *buffer, + size_t length) +{ + struct rc_dev *rcdev = file->private_data; + unsigned int copied; + int ret; + + if (length < sizeof(struct lirc_scancode) || + length % sizeof(struct lirc_scancode)) + return -EINVAL; + + do { + if (kfifo_is_empty(&rcdev->scancodes)) { + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + ret = wait_event_interruptible(rcdev->wait_poll, + !kfifo_is_empty(&rcdev->scancodes) || + !rcdev->registered); + if (ret) + return ret; + } + + if (!rcdev->registered) + return -ENODEV; + + ret = mutex_lock_interruptible(&rcdev->lock); + if (ret) + return ret; + ret = kfifo_to_user(&rcdev->scancodes, buffer, length, &copied); + mutex_unlock(&rcdev->lock); + if (ret) + return ret; + } while (copied == 0); + + return copied; +} + +static ssize_t ir_lirc_read(struct file *file, char __user *buffer, + size_t length, loff_t *ppos) +{ + struct rc_dev *rcdev = file->private_data; + + if (rcdev->driver_type == RC_DRIVER_IR_RAW_TX) + return -EINVAL; + + if (!rcdev->registered) + return -ENODEV; + + if (rcdev->rec_mode == LIRC_MODE_MODE2) + return ir_lirc_read_mode2(file, buffer, length); + else /* LIRC_MODE_SCANCODE */ + return ir_lirc_read_scancode(file, buffer, length); +} + +static const struct file_operations lirc_fops = { + .owner = THIS_MODULE, + .write = ir_lirc_transmit_ir, + .unlocked_ioctl = ir_lirc_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ir_lirc_ioctl, +#endif + .read = ir_lirc_read, + .poll = ir_lirc_poll, + .open = ir_lirc_open, + .release = ir_lirc_close, + .llseek = no_llseek, +}; + static void lirc_release_device(struct device *ld) { struct rc_dev *rcdev = container_of(ld, struct rc_dev, lirc_dev); diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index 2a5e9cc3ddb3..915434855a63 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -279,8 +279,6 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev); void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc); int ir_lirc_register(struct rc_dev *dev); void ir_lirc_unregister(struct rc_dev *dev); - -extern const struct file_operations lirc_fops; #else static inline int lirc_dev_init(void) { return 0; } static inline void lirc_dev_exit(void) {} From 0a63269fa0e8a153d011a6cf69ad47b5047e2b17 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 2 Nov 2017 16:44:21 -0400 Subject: [PATCH 0563/1640] UPSTREAM: media: rc: include rather than This removes the need for include/media/lirc.h, which just includes the uapi file. 
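Since the uapi header is also what userspace compiles against, a minimal sketch of a scancode reader built only on that header may be useful here; the device path /dev/lirc0 and the program structure are illustrative assumptions, not part of the patch, and it assumes a driver that accepts LIRC_MODE_SCANCODE as documented in the lirc-read changes earlier in this series:

/*
 * Hedged userspace sketch (not from the kernel tree): switch a lirc
 * chardev into scancode mode and print decoded scancodes.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/lirc.h>

int main(void)
{
	unsigned int mode = LIRC_MODE_SCANCODE;
	struct lirc_scancode sc;
	int fd = open("/dev/lirc0", O_RDONLY);	/* assumed device node */

	if (fd < 0 || ioctl(fd, LIRC_SET_REC_MODE, &mode) < 0)
		return 1;

	/* each sizeof(sc) read yields one decoded IR frame */
	while (read(fd, &sc, sizeof(sc)) == sizeof(sc))
		printf("rc_proto=%u scancode=0x%llx keycode=%u\n",
		       sc.rc_proto, (unsigned long long)sc.scancode,
		       sc.keycode);

	close(fd);
	return 0;
}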
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 2 +- include/media/lirc.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 include/media/lirc.h diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 97d60f0b5836..8a0e9f74329a 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -26,7 +26,7 @@ #include #include "rc-core-priv.h" -#include +#include #define LOGHEAD "lirc_dev (%s[%d]): " #define LIRCBUF_SIZE 256 diff --git a/include/media/lirc.h b/include/media/lirc.h deleted file mode 100644 index 554988c860c1..000000000000 --- a/include/media/lirc.h +++ /dev/null @@ -1 +0,0 @@ -#include From 7dc846ab7f4816d9bf57df443e60919802a59aa3 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 2 Nov 2017 17:21:13 -0400 Subject: [PATCH 0564/1640] UPSTREAM: media: lirc: allow lirc device to be opened more than once This makes it possible for lircd to read from a lirc chardev, and not keep it busy. Note that this changes the default for timeout reports to on. lircd already enables timeout reports when it opens a lirc device, leaving them on until the next reboot. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 247 +++++++++++++++++++----------------- include/media/rc-core.h | 50 +++++--- 2 files changed, 165 insertions(+), 132 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 8a0e9f74329a..7b9246fb9652 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -28,7 +28,6 @@ #include "rc-core-priv.h" #include -#define LOGHEAD "lirc_dev (%s[%d]): " #define LIRCBUF_SIZE 256 static dev_t lirc_base_dev; @@ -47,6 +46,8 @@ static struct class *lirc_class; */ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) { + unsigned long flags; + struct lirc_fh *fh; int sample; /* Packet start */ @@ -75,9 +76,6 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) dev->gap = true; dev->gap_duration = ev.duration; - if (!dev->send_timeout_reports) - return; - sample = LIRC_TIMEOUT(ev.duration / 1000); IR_dprintk(2, "timeout report (duration: %d)\n", sample); @@ -92,7 +90,11 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) dev->gap_duration = min_t(u64, dev->gap_duration, LIRC_VALUE_MASK); - kfifo_put(&dev->rawir, LIRC_SPACE(dev->gap_duration)); + spin_lock_irqsave(&dev->lirc_fh_lock, flags); + list_for_each_entry(fh, &dev->lirc_fh, list) + kfifo_put(&fh->rawir, + LIRC_SPACE(dev->gap_duration)); + spin_unlock_irqrestore(&dev->lirc_fh_lock, flags); dev->gap = false; } @@ -102,22 +104,35 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) TO_US(ev.duration), TO_STR(ev.pulse)); } - kfifo_put(&dev->rawir, sample); - wake_up_poll(&dev->wait_poll, POLLIN | POLLRDNORM); + spin_lock_irqsave(&dev->lirc_fh_lock, flags); + list_for_each_entry(fh, &dev->lirc_fh, list) { + if (LIRC_IS_TIMEOUT(sample) && !fh->send_timeout_reports) + continue; + if (kfifo_put(&fh->rawir, sample)) + wake_up_poll(&fh->wait_poll, POLLIN | POLLRDNORM); + } + spin_unlock_irqrestore(&dev->lirc_fh_lock, flags); } /** * ir_lirc_scancode_event() - Send scancode data to lirc to be relayed to - * userspace + * userspace. This can be called in atomic context. 
* @dev: the struct rc_dev descriptor of the device * @lsc: the struct lirc_scancode describing the decoded scancode */ void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc) { + unsigned long flags; + struct lirc_fh *fh; + lsc->timestamp = ktime_get_ns(); - if (kfifo_put(&dev->scancodes, *lsc)) - wake_up_poll(&dev->wait_poll, POLLIN | POLLRDNORM); + spin_lock_irqsave(&dev->lirc_fh_lock, flags); + list_for_each_entry(fh, &dev->lirc_fh, list) { + if (kfifo_put(&fh->scancodes, *lsc)) + wake_up_poll(&fh->wait_poll, POLLIN | POLLRDNORM); + } + spin_unlock_irqrestore(&dev->lirc_fh_lock, flags); } EXPORT_SYMBOL_GPL(ir_lirc_scancode_event); @@ -125,55 +140,88 @@ static int ir_lirc_open(struct inode *inode, struct file *file) { struct rc_dev *dev = container_of(inode->i_cdev, struct rc_dev, lirc_cdev); + struct lirc_fh *fh = kzalloc(sizeof(*fh), GFP_KERNEL); + unsigned long flags; int retval; - retval = rc_open(dev); - if (retval) - return retval; + if (!fh) + return -ENOMEM; - retval = mutex_lock_interruptible(&dev->lock); - if (retval) - goto out_rc; + get_device(&dev->dev); if (!dev->registered) { retval = -ENODEV; - goto out_unlock; + goto out_fh; } - if (dev->lirc_open) { - retval = -EBUSY; - goto out_unlock; + if (dev->driver_type == RC_DRIVER_IR_RAW) { + if (kfifo_alloc(&fh->rawir, MAX_IR_EVENT_SIZE, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_fh; + } } - if (dev->driver_type == RC_DRIVER_IR_RAW) - kfifo_reset_out(&dev->rawir); - if (dev->driver_type != RC_DRIVER_IR_RAW_TX) - kfifo_reset_out(&dev->scancodes); + if (dev->driver_type != RC_DRIVER_IR_RAW_TX) { + if (kfifo_alloc(&fh->scancodes, 32, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_rawir; + } + } - dev->lirc_open++; - file->private_data = dev; + fh->send_mode = LIRC_MODE_PULSE; + fh->rc = dev; + fh->send_timeout_reports = true; + + if (dev->driver_type == RC_DRIVER_SCANCODE) + fh->rec_mode = LIRC_MODE_SCANCODE; + else + fh->rec_mode = LIRC_MODE_MODE2; + + retval = rc_open(dev); + if (retval) + goto out_kfifo; + + init_waitqueue_head(&fh->wait_poll); + + file->private_data = fh; + spin_lock_irqsave(&dev->lirc_fh_lock, flags); + list_add(&fh->list, &dev->lirc_fh); + spin_unlock_irqrestore(&dev->lirc_fh_lock, flags); nonseekable_open(inode, file); - mutex_unlock(&dev->lock); return 0; +out_kfifo: + if (dev->driver_type != RC_DRIVER_IR_RAW_TX) + kfifo_free(&fh->scancodes); +out_rawir: + if (dev->driver_type == RC_DRIVER_IR_RAW) + kfifo_free(&fh->rawir); +out_fh: + kfree(fh); + put_device(&dev->dev); -out_unlock: - mutex_unlock(&dev->lock); -out_rc: - rc_close(dev); return retval; } static int ir_lirc_close(struct inode *inode, struct file *file) { - struct rc_dev *dev = file->private_data; + struct lirc_fh *fh = file->private_data; + struct rc_dev *dev = fh->rc; + unsigned long flags; - mutex_lock(&dev->lock); - dev->lirc_open--; - mutex_unlock(&dev->lock); + spin_lock_irqsave(&dev->lirc_fh_lock, flags); + list_del(&fh->list); + spin_unlock_irqrestore(&dev->lirc_fh_lock, flags); + + if (dev->driver_type == RC_DRIVER_IR_RAW) + kfifo_free(&fh->rawir); + if (dev->driver_type != RC_DRIVER_IR_RAW_TX) + kfifo_free(&fh->scancodes); + kfree(fh); rc_close(dev); + put_device(&dev->dev); return 0; } @@ -181,7 +229,8 @@ static int ir_lirc_close(struct inode *inode, struct file *file) static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, size_t n, loff_t *ppos) { - struct rc_dev *dev = file->private_data; + struct lirc_fh *fh = file->private_data; + struct rc_dev *dev = fh->rc; unsigned int *txbuf = NULL; 
struct ir_raw_event *raw = NULL; ssize_t ret = -EINVAL; @@ -201,7 +250,7 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, goto out; } - if (dev->send_mode == LIRC_MODE_SCANCODE) { + if (fh->send_mode == LIRC_MODE_SCANCODE) { struct lirc_scancode scan; if (n != sizeof(scan)) @@ -276,7 +325,7 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (ret < 0) goto out; - if (dev->send_mode == LIRC_MODE_SCANCODE) { + if (fh->send_mode == LIRC_MODE_SCANCODE) { ret = n; } else { for (duration = i = 0; i < ret; i++) @@ -303,10 +352,11 @@ out: return ret; } -static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, +static long ir_lirc_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct rc_dev *dev = filep->private_data; + struct lirc_fh *fh = file->private_data; + struct rc_dev *dev = fh->rc; u32 __user *argp = (u32 __user *)(arg); int ret = 0; __u32 val = 0, tmp; @@ -361,7 +411,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, if (dev->driver_type == RC_DRIVER_IR_RAW_TX) return -ENOTTY; - val = dev->rec_mode; + val = fh->rec_mode; break; case LIRC_SET_REC_MODE: @@ -379,14 +429,14 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, break; } - dev->rec_mode = val; + fh->rec_mode = val; return 0; case LIRC_GET_SEND_MODE: if (!dev->tx_ir) return -ENOTTY; - val = dev->send_mode; + val = fh->send_mode; break; case LIRC_SET_SEND_MODE: @@ -396,7 +446,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, if (!(val == LIRC_MODE_PULSE || val == LIRC_MODE_SCANCODE)) return -EINVAL; - dev->send_mode = val; + fh->send_mode = val; return 0; /* TX settings */ @@ -430,7 +480,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, return -EINVAL; return dev->s_rx_carrier_range(dev, - dev->carrier_low, + fh->carrier_low, val); case LIRC_SET_REC_CARRIER_RANGE: @@ -440,7 +490,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, if (val <= 0) return -EINVAL; - dev->carrier_low = val; + fh->carrier_low = val; return 0; case LIRC_GET_REC_RESOLUTION: @@ -498,7 +548,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, if (!dev->timeout) return -ENOTTY; - dev->send_timeout_reports = !!val; + fh->send_timeout_reports = !!val; break; default: @@ -514,20 +564,21 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, static unsigned int ir_lirc_poll(struct file *file, struct poll_table_struct *wait) { - struct rc_dev *rcdev = file->private_data; + struct lirc_fh *fh = file->private_data; + struct rc_dev *rcdev = fh->rc; unsigned int events = 0; - poll_wait(file, &rcdev->wait_poll, wait); + poll_wait(file, &fh->wait_poll, wait); if (!rcdev->registered) { events = POLLHUP | POLLERR; } else if (rcdev->driver_type != RC_DRIVER_IR_RAW_TX) { - if (rcdev->rec_mode == LIRC_MODE_SCANCODE && - !kfifo_is_empty(&rcdev->scancodes)) + if (fh->rec_mode == LIRC_MODE_SCANCODE && + !kfifo_is_empty(&fh->scancodes)) events = POLLIN | POLLRDNORM; - if (rcdev->rec_mode == LIRC_MODE_MODE2 && - !kfifo_is_empty(&rcdev->rawir)) + if (fh->rec_mode == LIRC_MODE_MODE2 && + !kfifo_is_empty(&fh->rawir)) events = POLLIN | POLLRDNORM; } @@ -537,7 +588,8 @@ static unsigned int ir_lirc_poll(struct file *file, static ssize_t ir_lirc_read_mode2(struct file *file, char __user *buffer, size_t length) { - struct rc_dev *rcdev = file->private_data; + struct lirc_fh *fh = file->private_data; + struct rc_dev *rcdev = fh->rc; unsigned int copied; int ret; @@ -545,12 +597,12 @@ 
static ssize_t ir_lirc_read_mode2(struct file *file, char __user *buffer, return -EINVAL; do { - if (kfifo_is_empty(&rcdev->rawir)) { + if (kfifo_is_empty(&fh->rawir)) { if (file->f_flags & O_NONBLOCK) return -EAGAIN; - ret = wait_event_interruptible(rcdev->wait_poll, - !kfifo_is_empty(&rcdev->rawir) || + ret = wait_event_interruptible(fh->wait_poll, + !kfifo_is_empty(&fh->rawir) || !rcdev->registered); if (ret) return ret; @@ -562,7 +614,7 @@ static ssize_t ir_lirc_read_mode2(struct file *file, char __user *buffer, ret = mutex_lock_interruptible(&rcdev->lock); if (ret) return ret; - ret = kfifo_to_user(&rcdev->rawir, buffer, length, &copied); + ret = kfifo_to_user(&fh->rawir, buffer, length, &copied); mutex_unlock(&rcdev->lock); if (ret) return ret; @@ -574,7 +626,8 @@ static ssize_t ir_lirc_read_mode2(struct file *file, char __user *buffer, static ssize_t ir_lirc_read_scancode(struct file *file, char __user *buffer, size_t length) { - struct rc_dev *rcdev = file->private_data; + struct lirc_fh *fh = file->private_data; + struct rc_dev *rcdev = fh->rc; unsigned int copied; int ret; @@ -583,12 +636,12 @@ static ssize_t ir_lirc_read_scancode(struct file *file, char __user *buffer, return -EINVAL; do { - if (kfifo_is_empty(&rcdev->scancodes)) { + if (kfifo_is_empty(&fh->scancodes)) { if (file->f_flags & O_NONBLOCK) return -EAGAIN; - ret = wait_event_interruptible(rcdev->wait_poll, - !kfifo_is_empty(&rcdev->scancodes) || + ret = wait_event_interruptible(fh->wait_poll, + !kfifo_is_empty(&fh->scancodes) || !rcdev->registered); if (ret) return ret; @@ -600,7 +653,7 @@ static ssize_t ir_lirc_read_scancode(struct file *file, char __user *buffer, ret = mutex_lock_interruptible(&rcdev->lock); if (ret) return ret; - ret = kfifo_to_user(&rcdev->scancodes, buffer, length, &copied); + ret = kfifo_to_user(&fh->scancodes, buffer, length, &copied); mutex_unlock(&rcdev->lock); if (ret) return ret; @@ -612,7 +665,8 @@ static ssize_t ir_lirc_read_scancode(struct file *file, char __user *buffer, static ssize_t ir_lirc_read(struct file *file, char __user *buffer, size_t length, loff_t *ppos) { - struct rc_dev *rcdev = file->private_data; + struct lirc_fh *fh = file->private_data; + struct rc_dev *rcdev = fh->rc; if (rcdev->driver_type == RC_DRIVER_IR_RAW_TX) return -EINVAL; @@ -620,7 +674,7 @@ static ssize_t ir_lirc_read(struct file *file, char __user *buffer, if (!rcdev->registered) return -ENODEV; - if (rcdev->rec_mode == LIRC_MODE_MODE2) + if (fh->rec_mode == LIRC_MODE_MODE2) return ir_lirc_read_mode2(file, buffer, length); else /* LIRC_MODE_SCANCODE */ return ir_lirc_read_scancode(file, buffer, length); @@ -644,11 +698,6 @@ static void lirc_release_device(struct device *ld) { struct rc_dev *rcdev = container_of(ld, struct rc_dev, lirc_dev); - if (rcdev->driver_type == RC_DRIVER_IR_RAW) - kfifo_free(&rcdev->rawir); - if (rcdev->driver_type != RC_DRIVER_IR_RAW_TX) - kfifo_free(&rcdev->scancodes); - put_device(&rcdev->dev); } @@ -656,40 +705,20 @@ int ir_lirc_register(struct rc_dev *dev) { int err, minor; + minor = ida_simple_get(&lirc_ida, 0, RC_DEV_MAX, GFP_KERNEL); + if (minor < 0) + return minor; + device_initialize(&dev->lirc_dev); dev->lirc_dev.class = lirc_class; - dev->lirc_dev.release = lirc_release_device; - dev->send_mode = LIRC_MODE_PULSE; - - if (dev->driver_type == RC_DRIVER_SCANCODE) - dev->rec_mode = LIRC_MODE_SCANCODE; - else - dev->rec_mode = LIRC_MODE_MODE2; - - if (dev->driver_type == RC_DRIVER_IR_RAW) { - if (kfifo_alloc(&dev->rawir, MAX_IR_EVENT_SIZE, GFP_KERNEL)) - return -ENOMEM; - } - 
- if (dev->driver_type != RC_DRIVER_IR_RAW_TX) { - if (kfifo_alloc(&dev->scancodes, 32, GFP_KERNEL)) { - kfifo_free(&dev->rawir); - return -ENOMEM; - } - } - - init_waitqueue_head(&dev->wait_poll); - - minor = ida_simple_get(&lirc_ida, 0, RC_DEV_MAX, GFP_KERNEL); - if (minor < 0) { - err = minor; - goto out_kfifo; - } - dev->lirc_dev.parent = &dev->dev; + dev->lirc_dev.release = lirc_release_device; dev->lirc_dev.devt = MKDEV(MAJOR(lirc_base_dev), minor); dev_set_name(&dev->lirc_dev, "lirc%d", minor); + INIT_LIST_HEAD(&dev->lirc_fh); + spin_lock_init(&dev->lirc_fh_lock); + cdev_init(&dev->lirc_cdev, &lirc_fops); err = cdev_device_add(&dev->lirc_cdev, &dev->lirc_dev); @@ -705,32 +734,24 @@ int ir_lirc_register(struct rc_dev *dev) out_ida: ida_simple_remove(&lirc_ida, minor); -out_kfifo: - if (dev->driver_type == RC_DRIVER_IR_RAW) - kfifo_free(&dev->rawir); - if (dev->driver_type != RC_DRIVER_IR_RAW_TX) - kfifo_free(&dev->scancodes); return err; } void ir_lirc_unregister(struct rc_dev *dev) { + unsigned long flags; + struct lirc_fh *fh; + dev_dbg(&dev->dev, "lirc_dev: driver %s unregistered from minor = %d\n", dev->driver_name, MINOR(dev->lirc_dev.devt)); - mutex_lock(&dev->lock); - - if (dev->lirc_open) { - dev_dbg(&dev->dev, LOGHEAD "releasing opened driver\n", - dev->driver_name, MINOR(dev->lirc_dev.devt)); - wake_up_poll(&dev->wait_poll, POLLHUP); - } - - mutex_unlock(&dev->lock); + spin_lock_irqsave(&dev->lirc_fh_lock, flags); + list_for_each_entry(fh, &dev->lirc_fh, list) + wake_up_poll(&fh->wait_poll, POLLHUP | POLLERR); + spin_unlock_irqrestore(&dev->lirc_fh_lock, flags); cdev_device_del(&dev->lirc_cdev, &dev->lirc_dev); ida_simple_remove(&lirc_ida, MINOR(dev->lirc_dev.devt)); - put_device(&dev->lirc_dev); } int __init lirc_dev_init(void) diff --git a/include/media/rc-core.h b/include/media/rc-core.h index fbf1648d2ec9..3a47a25a6593 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -68,6 +68,33 @@ enum rc_filter_type { RC_FILTER_MAX }; +/** + * struct lirc_fh - represents an open lirc file + * @list: list of open file handles + * @rc: rcdev for this lirc chardev + * @carrier_low: when setting the carrier range, first the low end must be + * set with an ioctl and then the high end with another ioctl + * @send_timeout_reports: report timeouts in lirc raw IR. + * @rawir: queue for incoming raw IR + * @scancodes: queue for incoming decoded scancodes + * @wait_poll: poll struct for lirc device + * @send_mode: lirc mode for sending, either LIRC_MODE_SCANCODE or + * LIRC_MODE_PULSE + * @rec_mode: lirc mode for receiving, either LIRC_MODE_SCANCODE or + * LIRC_MODE_MODE2 + */ +struct lirc_fh { + struct list_head list; + struct rc_dev *rc; + int carrier_low; + bool send_timeout_reports; + DECLARE_KFIFO_PTR(rawir, unsigned int); + DECLARE_KFIFO_PTR(scancodes, struct lirc_scancode); + wait_queue_head_t wait_poll; + u8 send_mode; + u8 rec_mode; +}; + /** * struct rc_dev - represents a remote control device * @dev: driver model's view of this device @@ -118,20 +145,11 @@ enum rc_filter_type { * @tx_resolution: resolution (in ns) of output sampler * @lirc_dev: lirc device * @lirc_cdev: lirc char cdev - * @lirc_open: count of the number of times the device has been opened - * @carrier_low: when setting the carrier range, first the low end must be - * set with an ioctl and then the high end with another ioctl * @gap_start: time when gap starts * @gap_duration: duration of initial gap * @gap: true if we're in a gap - * @send_timeout_reports: report timeouts in lirc raw IR. 
- * @rawir: queue for incoming raw IR - * @scancodes: queue for incoming decoded scancodes - * @wait_poll: poll struct for lirc device - * @send_mode: lirc mode for sending, either LIRC_MODE_SCANCODE or - * LIRC_MODE_PULSE - * @rec_mode: lirc mode for receiving, either LIRC_MODE_SCANCODE or - * LIRC_MODE_MODE2 + * @lirc_fh_lock: protects lirc_fh list + * @lirc_fh: list of open files * @registered: set to true by rc_register_device(), false by * rc_unregister_device * @change_protocol: allow changing the protocol used on hardware decoders @@ -196,17 +214,11 @@ struct rc_dev { #ifdef CONFIG_LIRC struct device lirc_dev; struct cdev lirc_cdev; - int lirc_open; - int carrier_low; ktime_t gap_start; u64 gap_duration; bool gap; - bool send_timeout_reports; - DECLARE_KFIFO_PTR(rawir, unsigned int); - DECLARE_KFIFO_PTR(scancodes, struct lirc_scancode); - wait_queue_head_t wait_poll; - u8 send_mode; - u8 rec_mode; + spinlock_t lirc_fh_lock; + struct list_head lirc_fh; #endif bool registered; int (*change_protocol)(struct rc_dev *dev, u64 *rc_proto);
From 2cc4ebfdb96091d5389e96c349b53195f0e47e93 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 4 Nov 2017 08:30:45 -0400 Subject: [PATCH 0565/1640] UPSTREAM: media: lirc: improve locking
Once rc_unregister_device() has been called, no driver function should be called. This prevents some nasty race conditions where an ioctl calls driver functions after the driver-specific data has been freed.
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 237 +++++++++++++++++++++--------------- 1 file changed, 136 insertions(+), 101 deletions(-)
diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 7b9246fb9652..218658917cf6 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -233,15 +233,21 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, struct rc_dev *dev = fh->rc; unsigned int *txbuf = NULL; struct ir_raw_event *raw = NULL; - ssize_t ret = -EINVAL; + ssize_t ret; size_t count; ktime_t start; s64 towait; unsigned int duration = 0; /* signal duration in us */ int i; - if (!dev->registered) - return -ENODEV; + ret = mutex_lock_interruptible(&dev->lock); + if (ret) + return ret; + + if (!dev->registered) { + ret = -ENODEV; + goto out; + } start = ktime_get(); @@ -253,14 +259,20 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (fh->send_mode == LIRC_MODE_SCANCODE) { struct lirc_scancode scan; - if (n != sizeof(scan)) - return -EINVAL; + if (n != sizeof(scan)) { + ret = -EINVAL; + goto out; + } - if (copy_from_user(&scan, buf, sizeof(scan))) - return -EFAULT; + if (copy_from_user(&scan, buf, sizeof(scan))) { + ret = -EFAULT; + goto out; + } - if (scan.flags || scan.keycode || scan.timestamp) - return -EINVAL; + if (scan.flags || scan.keycode || scan.timestamp) { + ret = -EINVAL; + goto out; + } /* * The scancode field in lirc_scancode is 64-bit simply @@ -269,12 +281,16 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, * are supported.
*/ if (scan.scancode > U32_MAX || - !rc_validate_scancode(scan.rc_proto, scan.scancode)) - return -EINVAL; + !rc_validate_scancode(scan.rc_proto, scan.scancode)) { + ret = -EINVAL; + goto out; + } raw = kmalloc_array(LIRCBUF_SIZE, sizeof(*raw), GFP_KERNEL); - if (!raw) - return -ENOMEM; + if (!raw) { + ret = -ENOMEM; + goto out; + } ret = ir_raw_encode_scancode(scan.rc_proto, scan.scancode, raw, LIRCBUF_SIZE); @@ -300,16 +316,22 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, dev->s_tx_carrier(dev, carrier); } } else { - if (n < sizeof(unsigned int) || n % sizeof(unsigned int)) - return -EINVAL; + if (n < sizeof(unsigned int) || n % sizeof(unsigned int)) { + ret = -EINVAL; + goto out; + } count = n / sizeof(unsigned int); - if (count > LIRCBUF_SIZE || count % 2 == 0) - return -EINVAL; + if (count > LIRCBUF_SIZE || count % 2 == 0) { + ret = -EINVAL; + goto out; + } txbuf = memdup_user(buf, n); - if (IS_ERR(txbuf)) - return PTR_ERR(txbuf); + if (IS_ERR(txbuf)) { + ret = PTR_ERR(txbuf); + goto out; + } } for (i = 0; i < count; i++) { @@ -347,6 +369,7 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, } out: + mutex_unlock(&dev->lock); kfree(txbuf); kfree(raw); return ret; @@ -358,8 +381,8 @@ static long ir_lirc_ioctl(struct file *file, unsigned int cmd, struct lirc_fh *fh = file->private_data; struct rc_dev *dev = fh->rc; u32 __user *argp = (u32 __user *)(arg); - int ret = 0; - __u32 val = 0, tmp; + u32 val = 0; + int ret; if (_IOC_DIR(cmd) & _IOC_WRITE) { ret = get_user(val, argp); @@ -367,8 +390,14 @@ static long ir_lirc_ioctl(struct file *file, unsigned int cmd, return ret; } - if (!dev->registered) - return -ENODEV; + ret = mutex_lock_interruptible(&dev->lock); + if (ret) + return ret; + + if (!dev->registered) { + ret = -ENODEV; + goto out; + } switch (cmd) { case LIRC_GET_FEATURES: @@ -409,155 +438,161 @@ static long ir_lirc_ioctl(struct file *file, unsigned int cmd, /* mode support */ case LIRC_GET_REC_MODE: if (dev->driver_type == RC_DRIVER_IR_RAW_TX) - return -ENOTTY; - - val = fh->rec_mode; + ret = -ENOTTY; + else + val = fh->rec_mode; break; case LIRC_SET_REC_MODE: switch (dev->driver_type) { case RC_DRIVER_IR_RAW_TX: - return -ENOTTY; + ret = -ENOTTY; + break; case RC_DRIVER_SCANCODE: if (val != LIRC_MODE_SCANCODE) - return -EINVAL; + ret = -EINVAL; break; case RC_DRIVER_IR_RAW: if (!(val == LIRC_MODE_MODE2 || val == LIRC_MODE_SCANCODE)) - return -EINVAL; + ret = -EINVAL; break; } - fh->rec_mode = val; - return 0; + if (!ret) + fh->rec_mode = val; + break; case LIRC_GET_SEND_MODE: if (!dev->tx_ir) - return -ENOTTY; - - val = fh->send_mode; + ret = -ENOTTY; + else + val = fh->send_mode; break; case LIRC_SET_SEND_MODE: if (!dev->tx_ir) - return -ENOTTY; - - if (!(val == LIRC_MODE_PULSE || val == LIRC_MODE_SCANCODE)) - return -EINVAL; - - fh->send_mode = val; - return 0; + ret = -ENOTTY; + else if (!(val == LIRC_MODE_PULSE || val == LIRC_MODE_SCANCODE)) + ret = -EINVAL; + else + fh->send_mode = val; + break; /* TX settings */ case LIRC_SET_TRANSMITTER_MASK: if (!dev->s_tx_mask) - return -ENOTTY; - - return dev->s_tx_mask(dev, val); + ret = -ENOTTY; + else + ret = dev->s_tx_mask(dev, val); + break; case LIRC_SET_SEND_CARRIER: if (!dev->s_tx_carrier) - return -ENOTTY; - - return dev->s_tx_carrier(dev, val); + ret = -ENOTTY; + else + ret = dev->s_tx_carrier(dev, val); + break; case LIRC_SET_SEND_DUTY_CYCLE: if (!dev->s_tx_duty_cycle) - return -ENOTTY; - - if (val <= 0 || val >= 100) - return -EINVAL; - - return 
dev->s_tx_duty_cycle(dev, val); + ret = -ENOTTY; + else if (val <= 0 || val >= 100) + ret = -EINVAL; + else + ret = dev->s_tx_duty_cycle(dev, val); + break; /* RX settings */ case LIRC_SET_REC_CARRIER: if (!dev->s_rx_carrier_range) - return -ENOTTY; - - if (val <= 0) - return -EINVAL; - - return dev->s_rx_carrier_range(dev, - fh->carrier_low, - val); + ret = -ENOTTY; + else if (val <= 0) + ret = -EINVAL; + else + ret = dev->s_rx_carrier_range(dev, fh->carrier_low, + val); + break;
case LIRC_SET_REC_CARRIER_RANGE: if (!dev->s_rx_carrier_range) - return -ENOTTY; - - if (val <= 0) - return -EINVAL; - - fh->carrier_low = val; - return 0; + ret = -ENOTTY; + else if (val <= 0) + ret = -EINVAL; + else + fh->carrier_low = val; + break; case LIRC_GET_REC_RESOLUTION: if (!dev->rx_resolution) - return -ENOTTY; - - val = dev->rx_resolution / 1000; + ret = -ENOTTY; + else + val = dev->rx_resolution / 1000; break;
case LIRC_SET_WIDEBAND_RECEIVER: if (!dev->s_learning_mode) - return -ENOTTY; - - return dev->s_learning_mode(dev, !!val); + ret = -ENOTTY; + else + ret = dev->s_learning_mode(dev, !!val); + break; case LIRC_SET_MEASURE_CARRIER_MODE: if (!dev->s_carrier_report) - return -ENOTTY; - - return dev->s_carrier_report(dev, !!val); + ret = -ENOTTY; + else + ret = dev->s_carrier_report(dev, !!val); + break;
/* Generic timeout support */ case LIRC_GET_MIN_TIMEOUT: if (!dev->max_timeout) - return -ENOTTY; - val = DIV_ROUND_UP(dev->min_timeout, 1000); + ret = -ENOTTY; + else + val = DIV_ROUND_UP(dev->min_timeout, 1000); break; case LIRC_GET_MAX_TIMEOUT: if (!dev->max_timeout) - return -ENOTTY; - val = dev->max_timeout / 1000; + ret = -ENOTTY; + else + val = dev->max_timeout / 1000; break;
case LIRC_SET_REC_TIMEOUT: - if (!dev->max_timeout) - return -ENOTTY; + if (!dev->max_timeout) { + ret = -ENOTTY; + } else if (val > U32_MAX / 1000) { + /* Check for multiply overflow */ + ret = -EINVAL; + } else { + u32 tmp = val * 1000; - /* Check for multiply overflow */ - if (val > U32_MAX / 1000) - return -EINVAL; - - tmp = val * 1000; - - if (tmp < dev->min_timeout || tmp > dev->max_timeout) - return -EINVAL; - - if (dev->s_timeout) - ret = dev->s_timeout(dev, tmp); - if (!ret) - dev->timeout = tmp; + if (tmp < dev->min_timeout || tmp > dev->max_timeout) + ret = -EINVAL; + else if (dev->s_timeout) + ret = dev->s_timeout(dev, tmp); + else if (!ret) + dev->timeout = tmp; + } break;
case LIRC_SET_REC_TIMEOUT_REPORTS: if (!dev->timeout) - return -ENOTTY; - - fh->send_timeout_reports = !!val; + ret = -ENOTTY; + else + fh->send_timeout_reports = !!val; break; default: - return -ENOTTY; + ret = -ENOTTY; } - if (_IOC_DIR(cmd) & _IOC_READ) + if (!ret && _IOC_DIR(cmd) & _IOC_READ) ret = put_user(val, argp); +out: + mutex_unlock(&dev->lock); return ret; }
From b6ad4b7148e8b0595467f3dc630db6b6d75581f2 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Mon, 6 Nov 2017 09:06:10 -0500 Subject: [PATCH 0566/1640] UPSTREAM: media: rc: Replace timeval with ktime_t in imon.c
This patch changes the 32-bit time type (timeval) to the 64-bit one (ktime_t), since 32-bit time types will break in the year 2038. All uses of timeval in imon.c are replaced with ktime_t. This patch also changes do_gettimeofday() to ktime_get() accordingly, since ktime_get() returns a ktime_t while do_gettimeofday() returns a struct timeval, and because ktime_get() uses the monotonic clock.
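The conversion in stabilize() and imon_incoming_scancode() follows a single idiom; a minimal sketch of it is below, where within_window() is a made-up helper name and only the ktime_* calls mirror the patch:

/*
 * Sketch of the timeval -> ktime_t idiom used by this patch; the
 * helper and its state are illustrative, not imon.c code.
 */
#include <linux/ktime.h>
#include <linux/types.h>

static ktime_t prev_time;

static bool within_window(long window_ms)
{
	ktime_t now = ktime_get();	/* monotonic and 64-bit, so y2038-safe */
	long msec = ktime_ms_delta(now, prev_time);	/* elapsed milliseconds */

	prev_time = now;
	return msec < window_ms;
}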
Signed-off-by: Chunyan Zhang Signed-off-by: Arnd Bergmann Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/imon.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index 2ae62148b765..c2fa14ed76f8 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,6 @@ #include #include -#include #include #define MOD_AUTHOR "Jarod Wilson " @@ -1198,16 +1198,16 @@ static inline int tv2int(const struct timeval *a, const struct timeval *b) */ static int stabilize(int a, int b, u16 timeout, u16 threshold) { - struct timeval ct; - static struct timeval prev_time = {0, 0}; - static struct timeval hit_time = {0, 0}; + ktime_t ct; + static ktime_t prev_time; + static ktime_t hit_time; static int x, y, prev_result, hits; int result = 0; - int msec, msec_hit; + long msec, msec_hit; - do_gettimeofday(&ct); - msec = tv2int(&ct, &prev_time); - msec_hit = tv2int(&ct, &hit_time); + ct = ktime_get(); + msec = ktime_ms_delta(ct, prev_time); + msec_hit = ktime_ms_delta(ct, hit_time); if (msec > 100) { x = 0; @@ -1685,9 +1685,9 @@ static void imon_incoming_scancode(struct imon_context *ictx, u32 kc; u64 scancode; int press_type = 0; - int msec; - struct timeval t; - static struct timeval prev_time = { 0, 0 }; + long msec; + ktime_t t; + static ktime_t prev_time; u8 ktype; /* filter out junk data on the older 0xffdc imon devices */ @@ -1780,10 +1780,10 @@ static void imon_incoming_scancode(struct imon_context *ictx, /* Only panel type events left to process now */ spin_lock_irqsave(&ictx->kc_lock, flags); - do_gettimeofday(&t); + t = ktime_get(); /* KEY_MUTE repeats from knob need to be suppressed */ if (ictx->kc == KEY_MUTE && ictx->kc == ictx->last_keycode) { - msec = tv2int(&t, &prev_time); + msec = ktime_ms_delta(t, prev_time); if (msec < ictx->idev->rep[REP_DELAY]) { spin_unlock_irqrestore(&ictx->kc_lock, flags); return; From cf465a58e97c0ea3a2ed0f4141c2c595f5118b8b Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 15 Nov 2017 23:37:51 -0500 Subject: [PATCH 0567/1640] UPSTREAM: media: winbond-cir: Fix pnp_irq's error checking for wbcir_probe The pnp_irq() function returns -1 if an error occurs. pnp_irq() error checking for zero is not correct. Signed-off-by: Arvind Yadav Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/winbond-cir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/winbond-cir.c b/drivers/media/rc/winbond-cir.c index 3ca7ab48293d..0adf0991f5ab 100644 --- a/drivers/media/rc/winbond-cir.c +++ b/drivers/media/rc/winbond-cir.c @@ -1044,7 +1044,7 @@ wbcir_probe(struct pnp_dev *device, const struct pnp_device_id *dev_id) data->irq = pnp_irq(device, 0); if (data->wbase == 0 || data->ebase == 0 || - data->sbase == 0 || data->irq == 0) { + data->sbase == 0 || data->irq == -1) { err = -ENODEV; dev_err(dev, "Invalid resources\n"); goto exit_free_data; From c5de00819159fdf98582806b08e01d97b3a69959 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 21 Nov 2017 15:51:39 -0500 Subject: [PATCH 0568/1640] UPSTREAM: media: imon: auto-config ffdc 30 device Another device with the 0xffdc device id, this one with 0x30 in the config byte. It's an iMON VFD + iMON IR (it does not understand rc6).
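The 0xffdc auto-config cases accumulate one config byte at a time (0x24 and 0x85 already, 0x30 here, 0x26 in a later patch in this series). A table-driven lookup is one possible alternative shape for this; the sketch below is purely illustrative and its names are hypothetical, not the driver's:

/*
 * Illustrative sketch only: a table-driven alternative to the growing
 * switch in imon_get_ffdc_type(). All names here are hypothetical.
 */
#include <stdio.h>

enum display_type { DISP_NONE, DISP_VFD, DISP_LCD };

struct ffdc_quirk {
        unsigned char config_byte;
        enum display_type display;
        const char *desc;
};

static const struct ffdc_quirk ffdc_quirks[] = {
        { 0x24, DISP_VFD, "iMON VFD, iMON IR" },
        { 0x30, DISP_VFD, "iMON VFD, iMON IR" },  /* added by this patch */
        { 0x85, DISP_VFD, "iMON VFD, iMON IR" },
};

static const struct ffdc_quirk *ffdc_lookup(unsigned char cfg)
{
        size_t i;

        for (i = 0; i < sizeof(ffdc_quirks) / sizeof(ffdc_quirks[0]); i++)
                if (ffdc_quirks[i].config_byte == cfg)
                        return &ffdc_quirks[i];
        return NULL;  /* fall back to the driver's default case */
}

int main(void)
{
        const struct ffdc_quirk *q = ffdc_lookup(0x30);

        printf("%s\n", q ? q->desc : "unknown 0xffdc device");
        return 0;
}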
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/imon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index c2fa14ed76f8..b2bf661874ac 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -1953,6 +1953,7 @@ static void imon_get_ffdc_type(struct imon_context *ictx) break; /* iMON VFD, iMON IR */ case 0x24: + case 0x30: case 0x85: dev_info(ictx->dev, "0xffdc iMON VFD, iMON IR"); detected_display_type = IMON_DISPLAY_TYPE_VFD; From c8c4722ca2f1587cd2657b91740b4a4598e004cc Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 23 Nov 2017 17:37:10 -0500 Subject: [PATCH 0569/1640] UPSTREAM: media: cec: move cec autorepeat handling to rc-core CEC autorepeat is different from other protocols. Autorepeat is triggered by the first repeated 'User Control Pressed' CEC message, rather than a fixed REP_DELAY. This change also does away with the KEY_UP event directly after the first KEY_DOWN event, which was used to stop autorepeat from starting. See commit a9a249a2c997 ("media: cec: fix remote control passthrough") for the original change. Acked-by: Hans Verkuil Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-adap.c | 56 ++++-------------------------------- drivers/media/cec/cec-core.c | 12 -------- drivers/media/rc/rc-main.c | 49 ++++++++++++++++++++++++++++++- include/media/cec.h | 5 ---- include/media/rc-core.h | 3 ++ 5 files changed, 56 insertions(+), 69 deletions(-) diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index 2765ca732b95..852b184ccaac 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -1790,9 +1790,6 @@ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, int la_idx = cec_log_addr2idx(adap, dest_laddr); bool from_unregistered = init_laddr == 0xf; struct cec_msg tx_cec_msg = { }; -#ifdef CONFIG_MEDIA_CEC_RC - int scancode; -#endif dprintk(2, "%s: %*ph\n", __func__, msg->len, msg->msg); @@ -1888,9 +1885,11 @@ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, */ case 0x60: if (msg->len == 2) - scancode = msg->msg[2]; + rc_keydown(adap->rc, RC_PROTO_CEC, + msg->msg[2], 0); else - scancode = msg->msg[2] << 8 | msg->msg[3]; + rc_keydown(adap->rc, RC_PROTO_CEC, + msg->msg[2] << 8 | msg->msg[3], 0); break; /* * Other function messages that are not handled.
@@ -1903,54 +1902,11 @@ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, */ case 0x56: case 0x57: case 0x67: case 0x68: case 0x69: case 0x6a: - scancode = -1; break; default: - scancode = msg->msg[2]; + rc_keydown(adap->rc, RC_PROTO_CEC, msg->msg[2], 0); break; } - - /* Was repeating, but keypress timed out */ - if (adap->rc_repeating && !adap->rc->keypressed) { - adap->rc_repeating = false; - adap->rc_last_scancode = -1; - } - /* Different keypress from last time, ends repeat mode */ - if (adap->rc_last_scancode != scancode) { - rc_keyup(adap->rc); - adap->rc_repeating = false; - } - /* We can't handle this scancode */ - if (scancode < 0) { - adap->rc_last_scancode = scancode; - break; - } - - /* Send key press */ - rc_keydown(adap->rc, RC_PROTO_CEC, scancode, 0); - - /* When in repeating mode, we're done */ - if (adap->rc_repeating) - break; - - /* - * We are not repeating, but the new scancode is - * the same as the last one, and this second key press is - * within 550 ms (the 'Follower Safety Timeout') from the - * previous key press, so we now enable the repeating mode. - */ - if (adap->rc_last_scancode == scancode && - msg->rx_ts - adap->rc_last_keypress < 550 * NSEC_PER_MSEC) { - adap->rc_repeating = true; - break; - } - /* - * Not in repeating mode, so avoid triggering repeat mode - * by calling keyup. - */ - rc_keyup(adap->rc); - adap->rc_last_scancode = scancode; - adap->rc_last_keypress = msg->rx_ts; #endif break; @@ -1960,8 +1916,6 @@ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, break; #ifdef CONFIG_MEDIA_CEC_RC rc_keyup(adap->rc); - adap->rc_repeating = false; - adap->rc_last_scancode = -1; #endif break; diff --git a/drivers/media/cec/cec-core.c b/drivers/media/cec/cec-core.c index dc7fd6f80bc0..fb59a2bb6b97 100644 --- a/drivers/media/cec/cec-core.c +++ b/drivers/media/cec/cec-core.c @@ -280,7 +280,6 @@ struct cec_adapter *cec_allocate_adapter(const struct cec_adap_ops *ops, adap->rc->priv = adap; adap->rc->map_name = RC_MAP_CEC; adap->rc->timeout = MS_TO_NS(100); - adap->rc_last_scancode = -1; #endif return adap; } @@ -312,17 +311,6 @@ int cec_register_adapter(struct cec_adapter *adap, adap->rc = NULL; return res; } - /* - * The REP_DELAY for CEC is really the time between the initial - * 'User Control Pressed' message and the second. The first - * keypress is always seen as non-repeating, the second - * (provided it has the same UI Command) will start the 'Press - * and Hold' (aka repeat) behavior. By setting REP_DELAY to the - * same value as REP_PERIOD the expected CEC behavior is - * reproduced. - */ - adap->rc->input_dev->rep[REP_DELAY] = - adap->rc->input_dev->rep[REP_PERIOD]; } #endif diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 5830cb2c5943..1870b7999062 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -597,6 +597,7 @@ static void ir_do_keyup(struct rc_dev *dev, bool sync) return; IR_dprintk(1, "keyup key 0x%04x\n", dev->last_keycode); + del_timer_sync(&dev->timer_repeat); input_report_key(dev->input_dev, dev->last_keycode, 0); led_trigger_event(led_feedback, LED_OFF); if (sync) @@ -650,6 +651,31 @@ static void ir_timer_keyup(struct timer_list *t) spin_unlock_irqrestore(&dev->keylock, flags); } +/** + * ir_timer_repeat() - generates a repeat event after a timeout + * + * @t: a pointer to the struct timer_list + * + * This routine will generate a soft repeat event every REP_PERIOD + * milliseconds. 
+ */ +static void ir_timer_repeat(struct timer_list *t) +{ + struct rc_dev *dev = from_timer(dev, t, timer_repeat); + struct input_dev *input = dev->input_dev; + unsigned long flags; + + spin_lock_irqsave(&dev->keylock, flags); + if (dev->keypressed) { + input_event(input, EV_KEY, dev->last_keycode, 2); + input_sync(input); + if (input->rep[REP_PERIOD]) + mod_timer(&dev->timer_repeat, jiffies + + msecs_to_jiffies(input->rep[REP_PERIOD])); + } + spin_unlock_irqrestore(&dev->keylock, flags); +} + /** * rc_repeat() - signals that a key is still pressed * @dev: the struct rc_dev descriptor of the device @@ -732,6 +758,22 @@ static void ir_do_keydown(struct rc_dev *dev, enum rc_proto protocol, led_trigger_event(led_feedback, LED_FULL); } + /* + * For CEC, start sending repeat messages as soon as the first + * repeated message is sent, as long as REP_DELAY = 0 and REP_PERIOD + * is non-zero. Otherwise, the input layer will generate repeat + * messages. + */ + if (!new_event && keycode != KEY_RESERVED && + dev->allowed_protocols == RC_PROTO_BIT_CEC && + !timer_pending(&dev->timer_repeat) && + dev->input_dev->rep[REP_PERIOD] && + !dev->input_dev->rep[REP_DELAY]) { + input_event(dev->input_dev, EV_KEY, keycode, 2); + mod_timer(&dev->timer_repeat, jiffies + + msecs_to_jiffies(dev->input_dev->rep[REP_PERIOD])); + } + input_sync(dev->input_dev); } @@ -1599,6 +1641,7 @@ struct rc_dev *rc_allocate_device(enum rc_driver_type type) input_set_drvdata(dev->input_dev, dev); timer_setup(&dev->timer_keyup, ir_timer_keyup, 0); + timer_setup(&dev->timer_repeat, ir_timer_repeat, 0); spin_lock_init(&dev->rc_map.lock); spin_lock_init(&dev->keylock); @@ -1732,7 +1775,10 @@ static int rc_setup_rx_device(struct rc_dev *dev) * to avoid wrong repetition of the keycodes. Note that this must be * set after the call to input_register_device(). */ - dev->input_dev->rep[REP_DELAY] = 500; + if (dev->allowed_protocols == RC_PROTO_BIT_CEC) + dev->input_dev->rep[REP_DELAY] = 0; + else + dev->input_dev->rep[REP_DELAY] = 500; /* * As a repeat event on protocols like RC-5 and NEC take as long as @@ -1884,6 +1930,7 @@ void rc_unregister_device(struct rc_dev *dev) return; del_timer_sync(&dev->timer_keyup); + del_timer_sync(&dev->timer_repeat); if (dev->driver_type == RC_DRIVER_IR_RAW) ir_raw_event_unregister(dev); diff --git a/include/media/cec.h b/include/media/cec.h index b7339cc6fd3d..ca7ecd199aa7 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -191,11 +191,6 @@ struct cec_adapter { u32 tx_timeouts; -#ifdef CONFIG_MEDIA_CEC_RC - bool rc_repeating; - int rc_last_scancode; - u64 rc_last_keypress; -#endif #ifdef CONFIG_CEC_NOTIFIER struct cec_notifier *notifier; #endif diff --git a/include/media/rc-core.h b/include/media/rc-core.h index 3a47a25a6593..0a4026cf64f3 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -134,6 +134,8 @@ struct lirc_fh { * @keypressed: whether a key is currently pressed * @keyup_jiffies: time (in jiffies) when the current keypress should be released * @timer_keyup: timer for releasing a keypress + * @timer_repeat: timer for autorepeat events. This is needed for CEC, which + * has non-standard repeats. 
* @last_keycode: keycode of last keypress * @last_protocol: protocol of last keypress * @last_scancode: scancode of last keypress @@ -202,6 +204,7 @@ struct rc_dev { bool keypressed; unsigned long keyup_jiffies; struct timer_list timer_keyup; + struct timer_list timer_repeat; u32 last_keycode; enum rc_proto last_protocol; u32 last_scancode; From e65f5837c8805881b4ef882aea9ef14186499383 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Nov 2017 08:00:45 -0500 Subject: [PATCH 0570/1640] UPSTREAM: media: RC docs: add enum rc_proto description at the docs This is part of the uAPI. Add it to the documentation again, and fix cross-references. Signed-off-by: Mauro Carvalho Chehab Acked-by: Sean Young --- .../media/uapi/rc/lirc-dev-intro.rst | 19 +++++++++++++------ Documentation/media/uapi/rc/lirc-read.rst | 2 +- Documentation/media/uapi/rc/lirc-write.rst | 4 ++-- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/Documentation/media/uapi/rc/lirc-dev-intro.rst b/Documentation/media/uapi/rc/lirc-dev-intro.rst index 47c6c218e72a..3a74fec66d69 100644 --- a/Documentation/media/uapi/rc/lirc-dev-intro.rst +++ b/Documentation/media/uapi/rc/lirc-dev-intro.rst @@ -46,13 +46,13 @@ on the following table. This mode is for both sending and receiving IR. For transmitting (aka sending), create a ``struct lirc_scancode`` with - the desired scancode set in the ``scancode`` member, ``rc_proto`` set - the IR protocol, and all other members set to 0. Write this struct to + the desired scancode set in the ``scancode`` member, :c:type:`rc_proto` + set the IR protocol, and all other members set to 0. Write this struct to the lirc device. For receiving, you read ``struct lirc_scancode`` from the lirc device, with ``scancode`` set to the received scancode and the IR protocol - ``rc_proto``. If the scancode maps to a valid key code, this is set + :c:type:`rc_proto`. If the scancode maps to a valid key code, this is set in the ``keycode`` field, else it is set to ``KEY_RESERVED``. The ``flags`` can have ``LIRC_SCANCODE_FLAG_TOGGLE`` set if the toggle @@ -74,9 +74,6 @@ on the following table. The ``timestamp`` field is filled with the time nanoseconds (in ``CLOCK_MONOTONIC``) when the scancode was decoded. - An ``enum rc_proto`` in the :ref:`lirc_header` lists all the supported - IR protocols. - .. _lirc-mode-mode2: ``LIRC_MODE_MODE2`` @@ -125,3 +122,13 @@ on the following table. of entries. This mode is used only for IR send. + + +************************** +Remote Controller protocol +************************** + +An enum :c:type:`rc_proto` in the :ref:`lirc_header` lists all the +supported IR protocols: + +.. kernel-doc:: include/uapi/linux/lirc.h diff --git a/Documentation/media/uapi/rc/lirc-read.rst b/Documentation/media/uapi/rc/lirc-read.rst index 51d37ed10194..c024aaffb8ad 100644 --- a/Documentation/media/uapi/rc/lirc-read.rst +++ b/Documentation/media/uapi/rc/lirc-read.rst @@ -54,7 +54,7 @@ read from the chardev. Alternatively, :ref:`LIRC_MODE_SCANCODE ` can be available, in this mode scancodes which are either decoded by software decoders, or -by hardware decoders. The ``rc_proto`` member is set to the +by hardware decoders. The :c:type:`rc_proto` member is set to the protocol used for transmission, and ``scancode`` to the decoded scancode, and the ``keycode`` set to the keycode or ``KEY_RESERVED``. 
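The lirc-read.rst text above maps directly onto a small amount of userspace code. A minimal sketch of reading decoded scancodes, assuming recent kernel headers and /dev/lirc0 as an example device path:

/*
 * Sketch of reading decoded scancodes as described in lirc-read.rst
 * above. Assumes recent kernel headers; /dev/lirc0 is an example path.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/lirc.h>

int main(void)
{
        unsigned int mode = LIRC_MODE_SCANCODE;
        struct lirc_scancode sc;
        int fd = open("/dev/lirc0", O_RDONLY);

        if (fd < 0 || ioctl(fd, LIRC_SET_REC_MODE, &mode))
                return 1;

        while (read(fd, &sc, sizeof(sc)) == (ssize_t)sizeof(sc))
                printf("protocol %u scancode 0x%llx keycode %u%s\n",
                       sc.rc_proto, (unsigned long long)sc.scancode,
                       sc.keycode,
                       (sc.flags & LIRC_SCANCODE_FLAG_TOGGLE) ?
                       " (toggle)" : "");

        close(fd);
        return 0;
}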
diff --git a/Documentation/media/uapi/rc/lirc-write.rst b/Documentation/media/uapi/rc/lirc-write.rst index 3d7541bad8b9..dd3d1fe807a6 100644 --- a/Documentation/media/uapi/rc/lirc-write.rst +++ b/Documentation/media/uapi/rc/lirc-write.rst @@ -57,8 +57,8 @@ driver returns ``EINVAL``. When in :ref:`LIRC_MODE_SCANCODE ` mode, one ``struct lirc_scancode`` must be written to the chardev at a time, else ``EINVAL`` is returned. Set the desired scancode in the ``scancode`` member, -and the protocol in the ``rc_proto`` member. All other members must be set -to 0, else ``EINVAL`` is returned. If there is no protocol encoder +and the protocol in the :c:type:`rc_proto`: member. All other members must be +set to 0, else ``EINVAL`` is returned. If there is no protocol encoder for the protocol or the scancode is not valid for the specified protocol, ``EINVAL`` is returned. The write function may not wait until the scancode is transmitted. From 28e77825edf94c4069ec1c24bc03bf4fd00c9995 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sun, 24 Sep 2017 05:24:58 -0400 Subject: [PATCH 0571/1640] UPSTREAM: media: rc-core.h: minor adjustments at rc_driver_type doc The description of this enum doesn't match what it actually represents. Adjust it. Acked-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- include/media/rc-core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/media/rc-core.h b/include/media/rc-core.h index 0a4026cf64f3..aed4272d47f5 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -31,9 +31,9 @@ do { \ } while (0) /** - * enum rc_driver_type - type of the RC output + * enum rc_driver_type - type of the RC driver. * - * @RC_DRIVER_SCANCODE: Driver or hardware generates a scancode + * @RC_DRIVER_SCANCODE: Driver or hardware generates a scancode. * @RC_DRIVER_IR_RAW: Driver or hardware generates pulse/space sequences. * It needs a Infra-Red pulse/space decoder * @RC_DRIVER_IR_RAW_TX: Device transmitter only, From 80259f3ee8c871acee0ea3aed9920502557a87f0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 18 Dec 2017 15:15:53 -0500 Subject: [PATCH 0572/1640] BACKPORT: media: fix SPDX comment on some header files The agreed format is to use /* */ comments inside header files. Unfortunately, I ended up using // on a few of them. [Linux4: Discard everything but drivers/media/rc/*] Reported-by: Andi Shyti Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-core-priv.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index 915434855a63..3c3d2620f0e8 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -1,7 +1,9 @@ -// SPDX-License-Identifier: GPL-2.0 -// Remote Controller core raw events header -// -// Copyright (C) 2010 by Mauro Carvalho Chehab +/* + * SPDX-License-Identifier: GPL-2.0 + * Remote Controller core raw events header + * + * Copyright (C) 2010 by Mauro Carvalho Chehab + */ #ifndef _RC_CORE_PRIV #define _RC_CORE_PRIV From 81fa6589c510a194dc0a48d11741b2b43f80dfce Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 2 Dec 2017 06:10:34 -0500 Subject: [PATCH 0573/1640] UPSTREAM: media: imon: auto-config ffdc 26 device Another device with the 0xffdc device id, this one with 0x26 in the config byte. It's an iMON Inside + iMON IR. It does respond to rc-6, but seems to produce random garbage rather than a scancode.
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/imon.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index b2bf661874ac..783df40d3aba 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -1972,6 +1972,11 @@ static void imon_get_ffdc_type(struct imon_context *ictx) detected_display_type = IMON_DISPLAY_TYPE_LCD; allowed_protos = RC_PROTO_BIT_RC6_MCE; break; + /* no display, iMON IR */ + case 0x26: + dev_info(ictx->dev, "0xffdc iMON Inside, iMON IR"); + ictx->display_supported = false; + break; default: dev_info(ictx->dev, "Unknown 0xffdc device, defaulting to VFD and iMON IR"); detected_display_type = IMON_DISPLAY_TYPE_VFD; From 51c2b33fb6e0c8531647c8a29110b1bebfadaa37 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 2 Dec 2017 07:47:16 -0500 Subject: [PATCH 0574/1640] UPSTREAM: media: imon: remove unused function tv2int Since commit 9c7fd60e951d ("media: rc: Replace timeval with ktime_t in imon.c"), the function tv2int() is no longer used. Remove it. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/imon.c | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index 783df40d3aba..ac67bd64486f 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -1165,29 +1165,6 @@ out: return retval; } -static inline int tv2int(const struct timeval *a, const struct timeval *b) -{ - int usecs = 0; - int sec = 0; - - if (b->tv_usec > a->tv_usec) { - usecs = 1000000; - sec--; - } - - usecs += a->tv_usec - b->tv_usec; - - sec += a->tv_sec - b->tv_sec; - sec *= 1000; - usecs /= 1000; - sec += usecs; - - if (sec < 0) - sec = 1000; - - return sec; -} - /* * The directional pad behaves a bit differently, depending on whether this is * one of the older ffdc devices or a newer device. Newer devices appear to From a3e5b46775653a092ad762e3e865fddff31f90e9 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 3 Dec 2017 08:55:24 -0500 Subject: [PATCH 0575/1640] UPSTREAM: media: rc: bang in ir_do_keyup rc_keydown() can be called from interrupt context, by e.g. an rc scancode driver. Since commit b2c96ba352b5 ("media: cec: move cec autorepeat handling to rc-core"), the del_timer_sync() call is not happy about being called in interrupt context. del_timer() will suffice.
WARNING: CPU: 0 PID: 0 at kernel/time/timer.c:1285 del_timer_sync+0x1d/0x40 CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W 4.15.0-rc1+ #1 Hardware name: /DG45ID, BIOS IDG4510H.86A.0135.2011.0225.1100 02/25/2011 task: ffffffffa3e10480 task.stack: ffffffffa3e00000 RIP: 0010:del_timer_sync+0x1d/0x40 RSP: 0018:ffff8b396bc03db0 EFLAGS: 00010046 RAX: 0000000080010000 RBX: ffff8b394d70e410 RCX: 0000000000000073 RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff8b394d70e410 RBP: 0000000000000001 R08: ffffffffc0616000 R09: ffff8b396bfa3000 R10: 0000000000000000 R11: 0000000000000390 R12: ffff8b394f003800 R13: 0000000000000000 R14: ffff8b3771c19630 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8b396bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1944469000 CR3: 00000001ebe09000 CR4: 00000000000006f0 Call Trace: ir_do_keyup.part.5+0x22/0x90 [rc_core] rc_keyup+0x37/0x50 [rc_core] usb_rx_callback_intf0+0x79/0x90 [imon] __usb_hcd_giveback_urb+0x90/0x130 uhci_giveback_urb+0xab/0x250 uhci_scan_schedule.part.34+0x806/0xb00 uhci_irq+0xab/0x150 usb_hcd_irq+0x22/0x30 __handle_irq_event_percpu+0x3a/0x180 handle_irq_event_percpu+0x30/0x70 handle_irq_event+0x27/0x50 handle_fasteoi_irq+0x6b/0x110 handle_irq+0xa5/0x100 do_IRQ+0x41/0xc0 common_interrupt+0x96/0x96 RIP: 0010:cpuidle_enter_state+0x9a/0x2d0 RSP: 0018:ffffffffa3e03e88 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffda RAX: ffff8b396bc1a000 RBX: 00000010da7bcd63 RCX: 00000010da7bccf6 RDX: 00000010da7bcd63 RSI: 00000010da7bcd63 RDI: 0000000000000000 RBP: ffff8b394f587400 R08: 0000000000000000 R09: 0000000000000002 R10: ffffffffa3e03e48 R11: 0000000000000390 R12: 0000000000000003 R13: ffffffffa3ebf018 R14: 0000000000000000 R15: 00000010da7ba772 ? cpuidle_enter_state+0x8d/0x2d0 do_idle+0x17b/0x1d0 cpu_startup_entry+0x6f/0x80 start_kernel+0x4a7/0x4c7 secondary_startup_64+0xa5/0xb0 Code: e7 5b 5d 41 5c e9 84 88 05 00 0f 1f 40 00 66 66 66 66 90 65 8b 05 e4 6f ef 5c a9 00 00 0f 00 53 48 89 fb 74 16 f6 47 22 20 75 10 <0f> ff 48 89 df e8 89 f1 ff ff 85 c0 79 0e f3 90 48 89 df e8 7b Fixes: b2c96ba352b5 ("media: cec: move cec autorepeat handling to rc-core") Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 1870b7999062..1db8d38fed7c 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -597,7 +597,7 @@ static void ir_do_keyup(struct rc_dev *dev, bool sync) return; IR_dprintk(1, "keyup key 0x%04x\n", dev->last_keycode); - del_timer_sync(&dev->timer_repeat); + del_timer(&dev->timer_repeat); input_report_key(dev->input_dev, dev->last_keycode, 0); led_trigger_event(led_feedback, LED_OFF); if (sync) From fbd0657844787fcd6ac8eabd444a6276c532ed12 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 11 Dec 2017 17:12:09 -0500 Subject: [PATCH 0576/1640] UPSTREAM: media: lirc: when transmitting scancodes, block until transmit is done The semantics for lirc IR transmit with raw IR is that the write call should block until the IR is transmitted. Some drivers have no idea when this actually is (e.g. mceusb), so there is a wait. This is useful for userspace, as it might want to send a IR button press, a gap of a predefined number of milliseconds, and then send a repeat message. It turns out that for transmitting scancodes this feature is even more useful, as user space has no idea how long the IR is. 
So, maintain the existing semantics for IR scancode transmit. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/rc/lirc-write.rst | 4 ++-- drivers/media/rc/lirc_dev.c | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Documentation/media/uapi/rc/lirc-write.rst b/Documentation/media/uapi/rc/lirc-write.rst index dd3d1fe807a6..d4566b0a2015 100644 --- a/Documentation/media/uapi/rc/lirc-write.rst +++ b/Documentation/media/uapi/rc/lirc-write.rst @@ -60,8 +60,8 @@ When in :ref:`LIRC_MODE_SCANCODE ` mode, one and the protocol in the :c:type:`rc_proto`: member. All other members must be set to 0, else ``EINVAL`` is returned. If there is no protocol encoder for the protocol or the scancode is not valid for the specified protocol, -``EINVAL`` is returned. The write function may not wait until the scancode -is transmitted. +``EINVAL`` is returned. The write function blocks until the scancode +is transmitted by the hardware. Return Value diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 218658917cf6..6cedb546c3e0 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -354,18 +354,18 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, duration += txbuf[i]; ret *= sizeof(unsigned int); + } - /* - * The lircd gap calculation expects the write function to - * wait for the actual IR signal to be transmitted before - * returning. - */ - towait = ktime_us_delta(ktime_add_us(start, duration), - ktime_get()); - if (towait > 0) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(usecs_to_jiffies(towait)); - } + /* + * The lircd gap calculation expects the write function to + * wait for the actual IR signal to be transmitted before + * returning. + */ + towait = ktime_us_delta(ktime_add_us(start, duration), + ktime_get()); + if (towait > 0) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(usecs_to_jiffies(towait)); } out: From f6fd96c07963a0d6f38726f820437f40be31d789 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 11 Dec 2017 17:21:28 -0500 Subject: [PATCH 0577/1640] UPSTREAM: media: rc: iguanair: simplify tx loop The TX loop is more complex than it should be. Simplify it. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/iguanair.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/media/rc/iguanair.c b/drivers/media/rc/iguanair.c index 30e24da67226..7daac8bab83b 100644 --- a/drivers/media/rc/iguanair.c +++ b/drivers/media/rc/iguanair.c @@ -347,26 +347,23 @@ static int iguanair_set_tx_mask(struct rc_dev *dev, uint32_t mask) static int iguanair_tx(struct rc_dev *dev, unsigned *txbuf, unsigned count) { struct iguanair *ir = dev->priv; - uint8_t space; - unsigned i, size, periods, bytes; + unsigned int i, size, p, periods; int rc; mutex_lock(&ir->lock); /* convert from us to carrier periods */ - for (i = space = size = 0; i < count; i++) { + for (i = size = 0; i < count; i++) { periods = DIV_ROUND_CLOSEST(txbuf[i] * ir->carrier, 1000000); - bytes = DIV_ROUND_UP(periods, 127); - if (size + bytes > ir->bufsize) { - rc = -EINVAL; - goto out; - } while (periods) { - unsigned p = min(periods, 127u); - ir->packet->payload[size++] = p | space; + p = min(periods, 127u); + if (size >= ir->bufsize) { + rc = -EINVAL; + goto out; + } + ir->packet->payload[size++] = p | ((i & 1) ?
0x80 : 0); periods -= p; } - space ^= 0x80; } ir->packet->header.start = 0; From 44a8b12213c1666d9a777964b5e1e1bc30b5b18f Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 13 Dec 2017 16:09:21 -0500 Subject: [PATCH 0578/1640] UPSTREAM: media: lirc: do not pass ERR_PTR to kfree If memdup_user() fails, txbuf will be an error pointer and passed to kfree. Reported-by: Dan Carpenter Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 6cedb546c3e0..8618aba152c6 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -231,7 +231,7 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, { struct lirc_fh *fh = file->private_data; struct rc_dev *dev = fh->rc; - unsigned int *txbuf = NULL; + unsigned int *txbuf; struct ir_raw_event *raw = NULL; ssize_t ret; size_t count; @@ -246,14 +246,14 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (!dev->registered) { ret = -ENODEV; - goto out; + goto out_unlock; } start = ktime_get(); if (!dev->tx_ir) { ret = -EINVAL; - goto out; + goto out_unlock; } if (fh->send_mode == LIRC_MODE_SCANCODE) { @@ -261,17 +261,17 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (n != sizeof(scan)) { ret = -EINVAL; - goto out; + goto out_unlock; } if (copy_from_user(&scan, buf, sizeof(scan))) { ret = -EFAULT; - goto out; + goto out_unlock; } if (scan.flags || scan.keycode || scan.timestamp) { ret = -EINVAL; - goto out; + goto out_unlock; } /* @@ -283,26 +283,26 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (scan.scancode > U32_MAX || !rc_validate_scancode(scan.rc_proto, scan.scancode)) { ret = -EINVAL; - goto out; + goto out_unlock; } raw = kmalloc_array(LIRCBUF_SIZE, sizeof(*raw), GFP_KERNEL); if (!raw) { ret = -ENOMEM; - goto out; + goto out_unlock; } ret = ir_raw_encode_scancode(scan.rc_proto, scan.scancode, raw, LIRCBUF_SIZE); if (ret < 0) - goto out; + goto out_kfree; count = ret; txbuf = kmalloc_array(count, sizeof(unsigned int), GFP_KERNEL); if (!txbuf) { ret = -ENOMEM; - goto out; + goto out_kfree; } for (i = 0; i < count; i++) @@ -318,26 +318,26 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, } else { if (n < sizeof(unsigned int) || n % sizeof(unsigned int)) { ret = -EINVAL; - goto out; + goto out_unlock; } count = n / sizeof(unsigned int); if (count > LIRCBUF_SIZE || count % 2 == 0) { ret = -EINVAL; - goto out; + goto out_unlock; } txbuf = memdup_user(buf, n); if (IS_ERR(txbuf)) { ret = PTR_ERR(txbuf); - goto out; + goto out_unlock; } } for (i = 0; i < count; i++) { if (txbuf[i] > IR_MAX_DURATION / 1000 - duration || !txbuf[i]) { ret = -EINVAL; - goto out; + goto out_kfree; } duration += txbuf[i]; @@ -345,7 +345,7 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, ret = dev->tx_ir(dev, txbuf, count); if (ret < 0) - goto out; + goto out_kfree; if (fh->send_mode == LIRC_MODE_SCANCODE) { ret = n; @@ -368,10 +368,11 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, schedule_timeout(usecs_to_jiffies(towait)); } -out: - mutex_unlock(&dev->lock); +out_kfree: kfree(txbuf); kfree(raw); +out_unlock: + mutex_unlock(&dev->lock); return ret; } From eaf1382e69e08115cd9d312e800f888921fbdcd0 Mon Sep 17 00:00:00 2001 From: Sean 
Young Date: Wed, 13 Dec 2017 16:17:44 -0500 Subject: [PATCH 0579/1640] UPSTREAM: media: lirc: no need to recalculate duration This is code existed for when drivers would send less than the whole buffer; no driver does this any more, so this is redundant. Drivers should return -EINVAL if they cannot send the entire buffer. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 8618aba152c6..1fc1fd665bce 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -347,15 +347,6 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (ret < 0) goto out_kfree; - if (fh->send_mode == LIRC_MODE_SCANCODE) { - ret = n; - } else { - for (duration = i = 0; i < ret; i++) - duration += txbuf[i]; - - ret *= sizeof(unsigned int); - } - /* * The lircd gap calculation expects the write function to * wait for the actual IR signal to be transmitted before @@ -368,6 +359,7 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, schedule_timeout(usecs_to_jiffies(towait)); } + ret = n; out_kfree: kfree(txbuf); kfree(raw); From 2baf8ec816c726e1479c3cc8895ab9a72d8b92d5 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 13 Dec 2017 16:30:22 -0500 Subject: [PATCH 0580/1640] UPSTREAM: media: lirc: release lock before sleep There is no reason to hold the lock while we wait for the IR to transmit. Reported-by: Dan Carpenter Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 1fc1fd665bce..713d42e4b661 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -347,6 +347,10 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (ret < 0) goto out_kfree; + kfree(txbuf); + kfree(raw); + mutex_unlock(&dev->lock); + /* * The lircd gap calculation expects the write function to * wait for the actual IR signal to be transmitted before @@ -359,7 +363,7 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, schedule_timeout(usecs_to_jiffies(towait)); } - ret = n; + return n; out_kfree: kfree(txbuf); kfree(raw); From 19aa976c8b98fe37b898b2eae7304fa534dbd443 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Tue, 12 Dec 2017 02:47:20 -0500 Subject: [PATCH 0581/1640] UPSTREAM: media: ir-spi: add SPDX identifier Replace the original license statement with the SPDX identifier. Update also the copyright owner adding myself as co-owner of the copyright. Signed-off-by: Andi Shyti Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-spi.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/media/rc/ir-spi.c b/drivers/media/rc/ir-spi.c index 29ed0638cb74..a32a84ae2d0b 100644 --- a/drivers/media/rc/ir-spi.c +++ b/drivers/media/rc/ir-spi.c @@ -1,13 +1,8 @@ -/* - * Copyright (c) 2016 Samsung Electronics Co., Ltd. - * Author: Andi Shyti - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * SPI driven IR LED device driver - */ +// SPDX-License-Identifier: GPL-2.0 +// SPI driven IR LED device driver +// +// Copyright (c) 2016 Samsung Electronics Co., Ltd. +// Copyright (c) Andi Shyti #include #include From 8a0e4548fe02e4332b35fa3af5c92a83e7f91175 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 4 Jan 2018 06:47:28 -0500 Subject: [PATCH 0582/1640] BACKPORT: media: fix usage of whitespaces and on indentation On several places, whitespaces are being used for indentation, or even at the end of the line. Fix them. [Linux4: Discard everything but drivers/media/rc/Kconfig] Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index f114316ccc53..64b56663884e 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -26,7 +26,7 @@ config LIRC IR transmitting (aka "blasting") and for the lirc daemon. menuconfig RC_DECODERS - bool "Remote controller decoders" + bool "Remote controller decoders" depends on RC_CORE default y @@ -452,9 +452,9 @@ config IR_SERIAL_TRANSMITTER Serial Port Transmitter support config IR_SIR - tristate "Built-in SIR IrDA port" - depends on RC_CORE - ---help--- + tristate "Built-in SIR IrDA port" + depends on RC_CORE + ---help--- Say Y if you want to use a IrDA SIR port Transceivers. To compile this driver as a module, choose M here: the module will From 4800f2a33d8cc6bff1e5152047e1761d6a79c190 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 4 Jan 2018 13:08:56 -0500 Subject: [PATCH 0583/1640] BACKPORT: media: replace all occurrences There are a lot of places where sequences of space/tabs are found. Get rid of all spaces before tabs. [Linux4: Discard everything but drivers/media/rc/*] Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/keymaps/rc-behold-columbus.c | 6 +++--- drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/media/rc/keymaps/rc-behold-columbus.c b/drivers/media/rc/keymaps/rc-behold-columbus.c index 61f679fec45c..e73057945bd1 100644 --- a/drivers/media/rc/keymaps/rc-behold-columbus.c +++ b/drivers/media/rc/keymaps/rc-behold-columbus.c @@ -30,12 +30,12 @@ static struct rc_map_table behold_columbus[] = { /* 0x01 0x02 0x03 0x0D * * 1 2 3 Stereo * - * * + * * * 0x04 0x05 0x06 0x19 * * 4 5 6 Snapshot * - * * + * * * 0x07 0x08 0x09 0x10 * - * 7 8 9 Zoom * + * 7 8 9 Zoom * * */ { 0x01, KEY_1 }, { 0x02, KEY_2 }, diff --git a/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c b/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c index 30495673cddd..e443192dbe14 100644 --- a/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c +++ b/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c @@ -37,7 +37,7 @@ static struct rc_map_table winfast_usbii_deluxe[] = { { 0x60, KEY_CHANNELDOWN}, /* CHANNELDOWN */ { 0x61, KEY_LAST}, /* LAST CHANNEL (RECALL) */ - { 0x72, KEY_VIDEO}, /* INPUT MODES (TV/FM) */ + { 0x72, KEY_VIDEO}, /* INPUT MODES (TV/FM) */ { 0x70, KEY_POWER2}, /* TV ON/OFF */ From 818eb05808c5df8cacd1571b4d3ed92d16a939c6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 19 Dec 2017 11:48:25 -0500 Subject: [PATCH 0584/1640] UPSTREAM: media: lirc: don't kfree the uninitialized pointer txbuf The current error exit path if ir_raw_encode_scancode fails is via the label out_kfree which kfree's an uninitialized pointer txbuf. Fix this by exiting via a new exit path that does not kfree txbuf. 
Also exit via this new exit path for a failed allocation of txbuf to avoid a redundant kfree on a NULL pointer (to save a bunch of CPU cycles). Detected by: CoverityScan, CID#1463070 ("Uninitialized pointer read") Fixes: f81a8158d4fb ("media: lirc: release lock before sleep") Signed-off-by: Colin Ian King Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 713d42e4b661..c04c546bf092 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -295,14 +295,14 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, ret = ir_raw_encode_scancode(scan.rc_proto, scan.scancode, raw, LIRCBUF_SIZE); if (ret < 0) - goto out_kfree; + goto out_kfree_raw; count = ret; txbuf = kmalloc_array(count, sizeof(unsigned int), GFP_KERNEL); if (!txbuf) { ret = -ENOMEM; - goto out_kfree; + goto out_kfree_raw; } for (i = 0; i < count; i++) @@ -366,6 +366,7 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, return n; out_kfree: kfree(txbuf); +out_kfree_raw: kfree(raw); out_unlock: mutex_unlock(&dev->lock); From 37279dd30f6a51e5fe892da0f8bce1cea7197b6f Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 28 Dec 2017 14:45:12 -0500 Subject: [PATCH 0585/1640] UPSTREAM: media: lirc: add module alias for lirc_dev Since commit a60d64b15c20 ("media: lirc: lirc interface should not be a raw decoder"), there is no lirc_dev module any more. On Ubuntu 16.10, the /etc/init.d/lirc startup script attempts to load the lirc_dev module. Since this module does not exist any more, this script fails. Add an alias so the correct module is loaded. Fixes: a60d64b15c20 ("media: lirc: lirc interface should not be a raw decoder") Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index c04c546bf092..13ef0475d901 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -816,3 +816,5 @@ void __exit lirc_dev_exit(void) class_destroy(lirc_class); unregister_chrdev_region(lirc_base_dev, RC_DEV_MAX); } + +MODULE_ALIAS("lirc_dev"); From 5d2457b56d8f49cc6f4865e2f4a34e20f157393f Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 28 Dec 2017 14:58:26 -0500 Subject: [PATCH 0586/1640] UPSTREAM: media: lirc: lirc daemon fails to detect raw IR device Since commit 9b6192589be7 ("media: lirc: implement scancode sending"), and commit de142c324106 ("media: lirc: implement reading scancode") the lirc features ioctl for raw IR devices advertises two modes for sending and receiving. The lirc daemon now fails to detect a raw IR device, both for transmit and receive. To fix this, do not advertise the scancode mode in the lirc features for raw IR devices (however do keep it for scancode devices). The mode can still be used via the LIRC_SET_{REC,SEND}_MODE ioctl. 
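What this means for userspace probing: a raw IR receiver now advertises only LIRC_CAN_REC_MODE2 in its feature bits, and whether scancode mode is usable is discovered by actually trying LIRC_SET_REC_MODE. A minimal sketch of that check, assuming /dev/lirc0 as an example path:

/*
 * Sketch of the feature probe the commit message describes. A raw IR
 * receiver advertises LIRC_CAN_REC_MODE2 only; scancode support is
 * found out by switching to it. /dev/lirc0 is an example path.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/lirc.h>

int main(void)
{
        unsigned int features, mode = LIRC_MODE_SCANCODE;
        int fd = open("/dev/lirc0", O_RDONLY);

        if (fd < 0 || ioctl(fd, LIRC_GET_FEATURES, &features))
                return 1;

        if ((features & LIRC_CAN_REC_MODE2) &&
            ioctl(fd, LIRC_SET_REC_MODE, &mode) == 0)
                printf("raw IR device, scancode mode also usable\n");

        close(fd);
        return 0;
}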
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- .../media/uapi/rc/lirc-get-features.rst | 24 +++++++++---------- drivers/media/rc/lirc_dev.c | 4 ++-- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/Documentation/media/uapi/rc/lirc-get-features.rst b/Documentation/media/uapi/rc/lirc-get-features.rst index 3ee44067de63..889a8807037b 100644 --- a/Documentation/media/uapi/rc/lirc-get-features.rst +++ b/Documentation/media/uapi/rc/lirc-get-features.rst @@ -55,8 +55,11 @@ LIRC features ``LIRC_CAN_REC_MODE2`` - The driver is capable of receiving using - :ref:`LIRC_MODE_MODE2 `. + This is raw IR driver for receiving. This means that + :ref:`LIRC_MODE_MODE2 ` is used. This also implies + that :ref:`LIRC_MODE_SCANCODE ` is also supported, + as long as the kernel is recent enough. Use the + :ref:`lirc_set_rec_mode` to switch modes. .. _LIRC-CAN-REC-LIRCCODE: @@ -68,9 +71,8 @@ LIRC features ``LIRC_CAN_REC_SCANCODE`` - The driver is capable of receiving using - :ref:`LIRC_MODE_SCANCODE `. - + This is a scancode driver for receiving. This means that + :ref:`LIRC_MODE_SCANCODE ` is used. .. _LIRC-CAN-SET-SEND-CARRIER: @@ -164,7 +166,10 @@ LIRC features ``LIRC_CAN_SEND_PULSE`` The driver supports sending (also called as IR blasting or IR TX) using - :ref:`LIRC_MODE_PULSE `. + :ref:`LIRC_MODE_PULSE `. This implies that + :ref:`LIRC_MODE_SCANCODE ` is also supported for + transmit, as long as the kernel is recent enough. Use the + :ref:`lirc_set_send_mode` to switch modes. .. _LIRC-CAN-SEND-MODE2: @@ -179,13 +184,6 @@ LIRC features Unused. Kept just to avoid breaking uAPI. -.. _LIRC-CAN-SEND-SCANCODE: - -``LIRC_CAN_SEND_SCANCODE`` - - The driver supports sending (also called as IR blasting or IR TX) using - :ref:`LIRC_MODE_SCANCODE `. - Return Value ============ diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 13ef0475d901..6ef5b24eb1d8 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -403,13 +403,13 @@ static long ir_lirc_ioctl(struct file *file, unsigned int cmd, val |= LIRC_CAN_REC_SCANCODE; if (dev->driver_type == RC_DRIVER_IR_RAW) { - val |= LIRC_CAN_REC_MODE2 | LIRC_CAN_REC_SCANCODE; + val |= LIRC_CAN_REC_MODE2; if (dev->rx_resolution) val |= LIRC_CAN_GET_REC_RESOLUTION; } if (dev->tx_ir) { - val |= LIRC_CAN_SEND_PULSE | LIRC_CAN_SEND_SCANCODE; + val |= LIRC_CAN_SEND_PULSE; if (dev->s_tx_mask) val |= LIRC_CAN_SET_TRANSMITTER_MASK; if (dev->s_tx_carrier) From 58ae2193c2db24bebbf24e1412b6af4b1bb4f3d6 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 5 Jan 2018 08:26:51 -0500 Subject: [PATCH 0587/1640] UPSTREAM: media: rc: clean up leader pulse/space for manchester encoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The IR rc6 encoder sends the header using manchester encoding using 0 bits, which causes the following: UBSAN: Undefined behaviour in drivers/media/rc/rc-ir-raw.c:247:6 shift exponent 4294967295 is too large for 64-bit type 'long long unsigned int' So, allow the leader code to send a pulse and space and remove the unused pulse_space_start field. 
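For reference, a simplified standalone sketch of Manchester modulation with a leader pulse and optional leader space, in the spirit of ir_raw_gen_manchester() but not the kernel implementation; the run-length merging of adjacent same-level half-bits is what produces the final pulse/space train:

/*
 * Simplified sketch of Manchester modulation with an optional leader.
 * Emits a run-length encoded pulse/space train; illustrative only,
 * with its own invert convention stated below.
 */
#include <stdio.h>

#define CLOCK_US 444            /* one RC6 unit, for illustration */
#define MAX_EDGES 128

struct edge { int pulse; unsigned int us; };

static int emit(struct edge *out, int n, int pulse, unsigned int us)
{
        if (n > 0 && out[n - 1].pulse == pulse) {
                out[n - 1].us += us;    /* merge same-level neighbours */
                return n;
        }
        out[n].pulse = pulse;
        out[n].us = us;
        return n + 1;
}

static int manchester(struct edge *out, unsigned int leader_pulse,
                      unsigned int leader_space, int invert,
                      unsigned int bits, unsigned int data)
{
        int n = 0;

        if (leader_pulse)
                n = emit(out, n, 1, leader_pulse);
        if (leader_space)
                n = emit(out, n, 0, leader_space);

        while (bits--) {
                int b = (data >> bits) & 1;

                /* invert=1: a set bit is pulse then space (RC6 style) */
                n = emit(out, n, b ^ !invert, CLOCK_US);
                n = emit(out, n, !(b ^ !invert), CLOCK_US);
        }
        return n;
}

int main(void)
{
        struct edge buf[MAX_EDGES];
        int i, n = manchester(buf, 6 * CLOCK_US, 2 * CLOCK_US, 1, 4, 0x9);

        for (i = 0; i < n; i++)
                printf("%s %u\n", buf[i].pulse ? "pulse" : "space",
                       buf[i].us);
        return 0;
}

Called with a 6-unit leader pulse and a 2-unit leader space it produces an RC6-style preamble; with a 1-unit leader pulse and no leader space it matches the RC5 shape this patch switches to.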
Cc: Antti Seppälä Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-mce_kbd-decoder.c | 2 +- drivers/media/rc/ir-rc5-decoder.c | 9 +++---- drivers/media/rc/ir-rc6-decoder.c | 35 ++++++++------------------- drivers/media/rc/rc-core-priv.h | 10 ++++---- drivers/media/rc/rc-ir-raw.c | 12 ++++----- 5 files changed, 24 insertions(+), 44 deletions(-) diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 8cf4cf358052..2a279b3b9c0a 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -424,7 +424,7 @@ static int ir_mce_kbd_unregister(struct rc_dev *dev) } static const struct ir_raw_timings_manchester ir_mce_kbd_timings = { - .leader = MCIR2_PREFIX_PULSE, + .leader_pulse = MCIR2_PREFIX_PULSE, .invert = 1, .clock = MCIR2_UNIT, .trailer_space = MCIR2_UNIT * 10, diff --git a/drivers/media/rc/ir-rc5-decoder.c b/drivers/media/rc/ir-rc5-decoder.c index f589d99245eb..a1d6c955ffc8 100644 --- a/drivers/media/rc/ir-rc5-decoder.c +++ b/drivers/media/rc/ir-rc5-decoder.c @@ -173,16 +173,14 @@ out: } static const struct ir_raw_timings_manchester ir_rc5_timings = { - .leader = RC5_UNIT, - .pulse_space_start = 0, + .leader_pulse = RC5_UNIT, .clock = RC5_UNIT, .trailer_space = RC5_UNIT * 10, }; static const struct ir_raw_timings_manchester ir_rc5x_timings[2] = { { - .leader = RC5_UNIT, - .pulse_space_start = 0, + .leader_pulse = RC5_UNIT, .clock = RC5_UNIT, .trailer_space = RC5X_SPACE, }, @@ -193,8 +191,7 @@ static const struct ir_raw_timings_manchester ir_rc5x_timings[2] = { }; static const struct ir_raw_timings_manchester ir_rc5_sz_timings = { - .leader = RC5_UNIT, - .pulse_space_start = 0, + .leader_pulse = RC5_UNIT, .clock = RC5_UNIT, .trailer_space = RC5_UNIT * 10, }; diff --git a/drivers/media/rc/ir-rc6-decoder.c b/drivers/media/rc/ir-rc6-decoder.c index 665025303c28..422dec08738c 100644 --- a/drivers/media/rc/ir-rc6-decoder.c +++ b/drivers/media/rc/ir-rc6-decoder.c @@ -288,13 +288,8 @@ out: static const struct ir_raw_timings_manchester ir_rc6_timings[4] = { { - .leader = RC6_PREFIX_PULSE, - .pulse_space_start = 0, - .clock = RC6_UNIT, - .invert = 1, - .trailer_space = RC6_PREFIX_SPACE, - }, - { + .leader_pulse = RC6_PREFIX_PULSE, + .leader_space = RC6_PREFIX_SPACE, .clock = RC6_UNIT, .invert = 1, }, @@ -329,27 +324,22 @@ static int ir_rc6_encode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *e = events; if (protocol == RC_PROTO_RC6_0) { - /* Modulate the preamble */ - ret = ir_raw_gen_manchester(&e, max, &ir_rc6_timings[0], 0, 0); - if (ret < 0) - return ret; - /* Modulate the header (Start Bit & Mode-0) */ ret = ir_raw_gen_manchester(&e, max - (e - events), - &ir_rc6_timings[1], - RC6_HEADER_NBITS, (1 << 3)); + &ir_rc6_timings[0], + RC6_HEADER_NBITS + 1, (1 << 3)); if (ret < 0) return ret; /* Modulate Trailer Bit */ ret = ir_raw_gen_manchester(&e, max - (e - events), - &ir_rc6_timings[2], 1, 0); + &ir_rc6_timings[1], 1, 0); if (ret < 0) return ret; /* Modulate rest of the data */ ret = ir_raw_gen_manchester(&e, max - (e - events), - &ir_rc6_timings[3], RC6_0_NBITS, + &ir_rc6_timings[2], RC6_0_NBITS, scancode); if (ret < 0) return ret; @@ -372,27 +362,22 @@ static int ir_rc6_encode(enum rc_proto protocol, u32 scancode, return -EINVAL; } - /* Modulate the preamble */ - ret = ir_raw_gen_manchester(&e, max, &ir_rc6_timings[0], 0, 0); - if (ret < 0) - return ret; - /* Modulate the header (Start Bit & Header-version 6 */ ret = ir_raw_gen_manchester(&e, max - (e - events), - 
&ir_rc6_timings[1], - RC6_HEADER_NBITS, (1 << 3 | 6)); + &ir_rc6_timings[0], + RC6_HEADER_NBITS + 1, (1 << 3 | 6)); if (ret < 0) return ret; /* Modulate Trailer Bit */ ret = ir_raw_gen_manchester(&e, max - (e - events), - &ir_rc6_timings[2], 1, 0); + &ir_rc6_timings[1], 1, 0); if (ret < 0) return ret; /* Modulate rest of the data */ ret = ir_raw_gen_manchester(&e, max - (e - events), - &ir_rc6_timings[3], + &ir_rc6_timings[2], bits, scancode); if (ret < 0) diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index 3c3d2620f0e8..458e9eb2d6a9 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -166,17 +166,17 @@ static inline void init_ir_raw_event_duration(struct ir_raw_event *ev, /** * struct ir_raw_timings_manchester - Manchester coding timings - * @leader: duration of leader pulse (if any) 0 if continuing - * existing signal (see @pulse_space_start) - * @pulse_space_start: 1 for starting with pulse (0 for starting with space) + * @leader_pulse: duration of leader pulse (if any) 0 if continuing + * existing signal + * @leader_space: duration of leader space (if any) * @clock: duration of each pulse/space in ns * @invert: if set clock logic is inverted * (0 = space + pulse, 1 = pulse + space) * @trailer_space: duration of trailer space in ns */ struct ir_raw_timings_manchester { - unsigned int leader; - unsigned int pulse_space_start:1; + unsigned int leader_pulse; + unsigned int leader_space; unsigned int clock; unsigned int invert:1; unsigned int trailer_space; diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 3dabb783a1f0..8500b57923c0 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -246,17 +246,15 @@ int ir_raw_gen_manchester(struct ir_raw_event **ev, unsigned int max, i = BIT_ULL(n - 1); - if (timings->leader) { + if (timings->leader_pulse) { if (!max--) return ret; - if (timings->pulse_space_start) { - init_ir_raw_event_duration((*ev)++, 1, timings->leader); - + init_ir_raw_event_duration((*ev), 1, timings->leader_pulse); + if (timings->leader_space) { if (!max--) return ret; - init_ir_raw_event_duration((*ev), 0, timings->leader); - } else { - init_ir_raw_event_duration((*ev), 1, timings->leader); + init_ir_raw_event_duration(++(*ev), 0, + timings->leader_space); } i >>= 1; } else { From a9f9fafd3f0a73616b96b15c2699d05327281db4 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 5 Jan 2018 08:38:43 -0500 Subject: [PATCH 0588/1640] UPSTREAM: media: rc: do not remove first bit if leader pulse is present MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rc5 protocol does not have a leading pulse or space, but we encode the first bit using a single leading pulse. For other protocols, the leading pulse or space does not represent any bit. So, don't remove the first bit if a leading pulse is present. 
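Concretely for RC5: the protocol opens with a '1' start bit whose mark half is the leader pulse itself, so after this patch the encoder modulates RC5_NBITS - 1 bits. A small sketch with illustrative values, packing the fields the same way as the encoder in the diff below:

/*
 * Sketch of the idea in this patch: RC5 has no separate leader, so the
 * leader pulse doubles as the first (start) bit and only NBITS - 1
 * bits are Manchester-modulated. Values are illustrative.
 */
#include <stdio.h>

#define RC5_NBITS 14

int main(void)
{
        unsigned int system = 0x05, command = 0x35;
        /* second start bit (inverted command bit 6), toggle 0,
         * 5-bit address, 6-bit command */
        unsigned int data = !(command >> 6) << 12 |
                            (system & 0x1f) << 6 | (command & 0x3f);

        printf("modulate %d bits after the leader: 0x%04x\n",
               RC5_NBITS - 1, data);
        return 0;
}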
Cc: Antti Seppälä Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-mce_kbd-decoder.c | 4 ++-- drivers/media/rc/ir-rc5-decoder.c | 13 ++++++++----- drivers/media/rc/ir-rc6-decoder.c | 4 ++-- drivers/media/rc/rc-ir-raw.c | 1 - 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 2a279b3b9c0a..2c3df02e05ff 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -452,11 +452,11 @@ static int ir_mce_kbd_encode(enum rc_proto protocol, u32 scancode, if (protocol == RC_PROTO_MCIR2_KBD) { raw = scancode | ((u64)MCIR2_KEYBOARD_HEADER << MCIR2_KEYBOARD_NBITS); - len = MCIR2_KEYBOARD_NBITS + MCIR2_HEADER_NBITS + 1; + len = MCIR2_KEYBOARD_NBITS + MCIR2_HEADER_NBITS; } else { raw = scancode | ((u64)MCIR2_MOUSE_HEADER << MCIR2_MOUSE_NBITS); - len = MCIR2_MOUSE_NBITS + MCIR2_HEADER_NBITS + 1; + len = MCIR2_MOUSE_NBITS + MCIR2_HEADER_NBITS; } ret = ir_raw_gen_manchester(&e, max, &ir_mce_kbd_timings, len, raw); diff --git a/drivers/media/rc/ir-rc5-decoder.c b/drivers/media/rc/ir-rc5-decoder.c index a1d6c955ffc8..11a28f8772da 100644 --- a/drivers/media/rc/ir-rc5-decoder.c +++ b/drivers/media/rc/ir-rc5-decoder.c @@ -225,9 +225,9 @@ static int ir_rc5_encode(enum rc_proto protocol, u32 scancode, /* encode data */ data = !commandx << 12 | system << 6 | command; - /* Modulate the data */ + /* First bit is encoded by leader_pulse */ ret = ir_raw_gen_manchester(&e, max, &ir_rc5_timings, - RC5_NBITS, data); + RC5_NBITS - 1, data); if (ret < 0) return ret; } else if (protocol == RC_PROTO_RC5X_20) { @@ -240,10 +240,11 @@ static int ir_rc5_encode(enum rc_proto protocol, u32 scancode, /* encode data */ data = commandx << 18 | system << 12 | command << 6 | xdata; - /* Modulate the data */ + /* First bit is encoded by leader_pulse */ pre_space_data = data >> (RC5X_NBITS - CHECK_RC5X_NBITS); ret = ir_raw_gen_manchester(&e, max, &ir_rc5x_timings[0], - CHECK_RC5X_NBITS, pre_space_data); + CHECK_RC5X_NBITS - 1, + pre_space_data); if (ret < 0) return ret; ret = ir_raw_gen_manchester(&e, max - (e - events), @@ -254,8 +255,10 @@ static int ir_rc5_encode(enum rc_proto protocol, u32 scancode, return ret; } else if (protocol == RC_PROTO_RC5_SZ) { /* RC5-SZ scancode is raw enough for Manchester as it is */ + /* First bit is encoded by leader_pulse */ ret = ir_raw_gen_manchester(&e, max, &ir_rc5_sz_timings, - RC5_SZ_NBITS, scancode & 0x2fff); + RC5_SZ_NBITS - 1, + scancode & 0x2fff); if (ret < 0) return ret; } else { diff --git a/drivers/media/rc/ir-rc6-decoder.c b/drivers/media/rc/ir-rc6-decoder.c index 422dec08738c..55bb19bbd4e9 100644 --- a/drivers/media/rc/ir-rc6-decoder.c +++ b/drivers/media/rc/ir-rc6-decoder.c @@ -327,7 +327,7 @@ static int ir_rc6_encode(enum rc_proto protocol, u32 scancode, /* Modulate the header (Start Bit & Mode-0) */ ret = ir_raw_gen_manchester(&e, max - (e - events), &ir_rc6_timings[0], - RC6_HEADER_NBITS + 1, (1 << 3)); + RC6_HEADER_NBITS, (1 << 3)); if (ret < 0) return ret; @@ -365,7 +365,7 @@ static int ir_rc6_encode(enum rc_proto protocol, u32 scancode, /* Modulate the header (Start Bit & Header-version 6 */ ret = ir_raw_gen_manchester(&e, max - (e - events), &ir_rc6_timings[0], - RC6_HEADER_NBITS + 1, (1 << 3 | 6)); + RC6_HEADER_NBITS, (1 << 3 | 6)); if (ret < 0) return ret; diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 8500b57923c0..18504870b9f0 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ 
b/drivers/media/rc/rc-ir-raw.c @@ -256,7 +256,6 @@ int ir_raw_gen_manchester(struct ir_raw_event **ev, unsigned int max, init_ir_raw_event_duration(++(*ev), 0, timings->leader_space); } - i >>= 1; } else { /* continue existing signal */ --(*ev); From 0c1977327b202cede776c4457aa5de5bcf7901a8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 11 Feb 2018 14:34:03 -0800 Subject: [PATCH 0589/1640] BACKPORT: vfs: do bulk POLL* -> EPOLL* replacement This is the mindless scripted replacement of kernel use of POLL* variables as described by Al, done by this script: for V in IN OUT PRI ERR RDNORM RDBAND WRNORM WRBAND HUP RDHUP NVAL MSG; do L=`git grep -l -w POLL$V | grep -v '^t' | grep -v /um/ | grep -v '^sa' | grep -v '/poll.h$'|grep -v '^D'` for f in $L; do sed -i "-es/^\([^\"]*\)\(\\)/\\1E\\2/" $f; done done with de-mangling cleanups yet to come. NOTE! On almost all architectures, the EPOLL* constants have the same values as the POLL* constants do. But they keyword here is "almost". For various bad reasons they aren't the same, and epoll() doesn't actually work quite correctly in some cases due to this on Sparc et al. The next patch from Al will sort out the final differences, and we should be all done. [Linux4: Only change drivers/media/rc/lirc_dev.c] Scripted-by: Al Viro Signed-off-by: Linus Torvalds --- drivers/media/rc/lirc_dev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 6ef5b24eb1d8..694846026be6 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -109,7 +109,7 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) if (LIRC_IS_TIMEOUT(sample) && !fh->send_timeout_reports) continue; if (kfifo_put(&fh->rawir, sample)) - wake_up_poll(&fh->wait_poll, POLLIN | POLLRDNORM); + wake_up_poll(&fh->wait_poll, EPOLLIN | EPOLLRDNORM); } spin_unlock_irqrestore(&dev->lirc_fh_lock, flags); } @@ -130,7 +130,7 @@ void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc) spin_lock_irqsave(&dev->lirc_fh_lock, flags); list_for_each_entry(fh, &dev->lirc_fh, list) { if (kfifo_put(&fh->scancodes, *lsc)) - wake_up_poll(&fh->wait_poll, POLLIN | POLLRDNORM); + wake_up_poll(&fh->wait_poll, EPOLLIN | EPOLLRDNORM); } spin_unlock_irqrestore(&dev->lirc_fh_lock, flags); } @@ -604,15 +604,15 @@ static unsigned int ir_lirc_poll(struct file *file, poll_wait(file, &fh->wait_poll, wait); if (!rcdev->registered) { - events = POLLHUP | POLLERR; + events = EPOLLHUP | EPOLLERR; } else if (rcdev->driver_type != RC_DRIVER_IR_RAW_TX) { if (fh->rec_mode == LIRC_MODE_SCANCODE && !kfifo_is_empty(&fh->scancodes)) - events = POLLIN | POLLRDNORM; + events = EPOLLIN | EPOLLRDNORM; if (fh->rec_mode == LIRC_MODE_MODE2 && !kfifo_is_empty(&fh->rawir)) - events = POLLIN | POLLRDNORM; + events = EPOLLIN | EPOLLRDNORM; } return events; @@ -780,7 +780,7 @@ void ir_lirc_unregister(struct rc_dev *dev) spin_lock_irqsave(&dev->lirc_fh_lock, flags); list_for_each_entry(fh, &dev->lirc_fh, list) - wake_up_poll(&fh->wait_poll, POLLHUP | POLLERR); + wake_up_poll(&fh->wait_poll, EPOLLHUP | EPOLLERR); spin_unlock_irqrestore(&dev->lirc_fh_lock, flags); cdev_device_del(&dev->lirc_cdev, &dev->lirc_dev); From 0bf329d95e505ffa419c143c1b182384d5977422 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 25 Jan 2018 17:19:08 -0500 Subject: [PATCH 0590/1640] UPSTREAM: media: rc: don't mark IR decoders default y I usually update my config with make oldconfig and pressing return, trusting that whoever 
updates Kconfig sets sensible defaults. But my recent kernels ended up with all kinds of IR decoders built in that are not used by anything because they are all marked with default y. default y should only be set for something that prevents booting on common systems, never for some random weirdo driver feature like this. Remove all the "default y" in drivers/media/rc/Kconfig Signed-off-by: Andi Kleen Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 64b56663884e..2a52dd4defd4 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -28,14 +28,12 @@ config LIRC menuconfig RC_DECODERS bool "Remote controller decoders" depends on RC_CORE - default y if RC_DECODERS config IR_NEC_DECODER tristate "Enable IR raw decoder for the NEC protocol" depends on RC_CORE select BITREVERSE - default y ---help--- Enable this option if you have IR with NEC protocol, and @@ -45,7 +43,6 @@ config IR_RC5_DECODER tristate "Enable IR raw decoder for the RC-5 protocol" depends on RC_CORE select BITREVERSE - default y ---help--- Enable this option if you have IR with RC-5 protocol, and @@ -55,7 +52,6 @@ config IR_RC6_DECODER tristate "Enable IR raw decoder for the RC6 protocol" depends on RC_CORE select BITREVERSE - default y ---help--- Enable this option if you have an infrared remote control which @@ -65,7 +61,6 @@ config IR_JVC_DECODER tristate "Enable IR raw decoder for the JVC protocol" depends on RC_CORE select BITREVERSE - default y ---help--- Enable this option if you have an infrared remote control which @@ -75,7 +70,6 @@ config IR_SONY_DECODER tristate "Enable IR raw decoder for the Sony protocol" depends on RC_CORE select BITREVERSE - default y ---help--- Enable this option if you have an infrared remote control which @@ -84,7 +78,6 @@ config IR_SONY_DECODER config IR_SANYO_DECODER tristate "Enable IR raw decoder for the Sanyo protocol" depends on RC_CORE - default y ---help--- Enable this option if you have an infrared remote control which @@ -94,7 +87,6 @@ config IR_SANYO_DECODER config IR_SHARP_DECODER tristate "Enable IR raw decoder for the Sharp protocol" depends on RC_CORE - default y ---help--- Enable this option if you have an infrared remote control which @@ -105,7 +97,6 @@ config IR_MCE_KBD_DECODER tristate "Enable IR raw decoder for the MCE keyboard/mouse protocol" depends on RC_CORE select BITREVERSE - default y ---help--- Enable this option if you have a Microsoft Remote Keyboard for @@ -116,7 +107,6 @@ config IR_XMP_DECODER tristate "Enable IR raw decoder for the XMP protocol" depends on RC_CORE select BITREVERSE - default y ---help--- Enable this option if you have IR with XMP protocol, and @@ -446,7 +436,6 @@ config IR_SERIAL config IR_SERIAL_TRANSMITTER bool "Serial Port Transmitter" - default y depends on IR_SERIAL ---help--- Serial Port Transmitter support From e6e4feae4895f876ccb1ac120718bcd4189bf947 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Fri, 26 Jan 2018 17:10:17 -0500 Subject: [PATCH 0591/1640] UPSTREAM: media: rc: ir-hix5hd2: fix error handling of clk_prepare_enable() Return code of clk_prepare_enable() is ignored in many places. The patch adds error handling for all of them. Found by Linux Driver Verification project (linuxtesting.org). 
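The shape of this fix is the usual check-and-unwind pattern. Below is a minimal user-space sketch of the same control flow; enable_clock() and configure_hw() are illustrative stand-ins for clk_prepare_enable() and hix5hd2_ir_config(), not the driver's real API:

    #include <stdio.h>

    /* Stand-ins for the real driver calls; non-zero means failure. */
    static int enable_clock(void)   { return 0;  }  /* pretend success */
    static void disable_clock(void) { }
    static int configure_hw(void)   { return -1; }  /* pretend failure */

    static int device_open(void)
    {
            int ret;

            ret = enable_clock();
            if (ret)
                    return ret;             /* nothing to undo yet */

            ret = configure_hw();
            if (ret) {
                    disable_clock();        /* unwind step 1 first */
                    return ret;
            }
            return 0;
    }

    int main(void)
    {
            printf("open: %d\n", device_open());
            return 0;
    }

Every step that can fail is checked, and already-completed steps are rolled back in reverse order before the error is propagated, which is what the reworked hix5hd2_ir_open() and hix5hd2_ir_resume() below now do.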
Signed-off-by: Alexey Khoroshilov Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-hix5hd2.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/drivers/media/rc/ir-hix5hd2.c b/drivers/media/rc/ir-hix5hd2.c index 0ce11c41dfae..700ab4c563d0 100644 --- a/drivers/media/rc/ir-hix5hd2.c +++ b/drivers/media/rc/ir-hix5hd2.c @@ -71,9 +71,10 @@ struct hix5hd2_ir_priv { unsigned long rate; }; -static void hix5hd2_ir_enable(struct hix5hd2_ir_priv *dev, bool on) +static int hix5hd2_ir_enable(struct hix5hd2_ir_priv *dev, bool on) { u32 val; + int ret = 0; if (dev->regmap) { regmap_read(dev->regmap, IR_CLK, &val); @@ -87,10 +88,11 @@ static void hix5hd2_ir_enable(struct hix5hd2_ir_priv *dev, bool on) regmap_write(dev->regmap, IR_CLK, val); } else { if (on) - clk_prepare_enable(dev->clock); + ret = clk_prepare_enable(dev->clock); else clk_disable_unprepare(dev->clock); } + return ret; } static int hix5hd2_ir_config(struct hix5hd2_ir_priv *priv) @@ -127,9 +129,18 @@ static int hix5hd2_ir_config(struct hix5hd2_ir_priv *priv) static int hix5hd2_ir_open(struct rc_dev *rdev) { struct hix5hd2_ir_priv *priv = rdev->priv; + int ret; - hix5hd2_ir_enable(priv, true); - return hix5hd2_ir_config(priv); + ret = hix5hd2_ir_enable(priv, true); + if (ret) + return ret; + + ret = hix5hd2_ir_config(priv); + if (ret) { + hix5hd2_ir_enable(priv, false); + return ret; + } + return 0; } static void hix5hd2_ir_close(struct rc_dev *rdev) @@ -239,7 +250,9 @@ static int hix5hd2_ir_probe(struct platform_device *pdev) ret = PTR_ERR(priv->clock); goto err; } - clk_prepare_enable(priv->clock); + ret = clk_prepare_enable(priv->clock); + if (ret) + goto err; priv->rate = clk_get_rate(priv->clock); rdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; @@ -309,9 +322,17 @@ static int hix5hd2_ir_suspend(struct device *dev) static int hix5hd2_ir_resume(struct device *dev) { struct hix5hd2_ir_priv *priv = dev_get_drvdata(dev); + int ret; - hix5hd2_ir_enable(priv, true); - clk_prepare_enable(priv->clock); + ret = hix5hd2_ir_enable(priv, true); + if (ret) + return ret; + + ret = clk_prepare_enable(priv->clock); + if (ret) { + hix5hd2_ir_enable(priv, false); + return ret; + } writel_relaxed(0x01, priv->base + IR_ENABLE); writel_relaxed(0x00, priv->base + IR_INTM); From b204ddd898e68eef0a6f9ce54eac1f2022d3ec12 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 27 Jan 2018 09:05:37 -0500 Subject: [PATCH 0592/1640] UPSTREAM: media: rc: ir-spi: fix duty cycle Calculate the pulse rather than having a few preset values. 
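The replacement maps the requested duty cycle onto the 16-bit word sent per carrier period instead of rounding to the nearest preset. A user-space sketch of the new arithmetic, with GENMASK() redefined locally since it normally comes from the kernel headers:

    #include <stdio.h>
    #include <stdint.h>

    /* Local stand-in for the kernel's GENMASK(h, l): bits l..h set. */
    #define GENMASK(h, l) (((~0u) >> (31 - (h))) & ((~0u) << (l)))

    int main(void)
    {
            for (unsigned int duty = 10; duty <= 90; duty += 10) {
                    unsigned int bits = duty * 15 / 100;    /* index of highest set bit, 0..14 */
                    uint16_t pulse = GENMASK(bits, 0);      /* bits + 1 low bits set */

                    printf("%2u%% -> 0x%04x (%u of 16 bits high)\n",
                           duty, (unsigned int)pulse, bits + 1);
            }
            return 0;
    }

A 50% request yields 0x00ff, i.e. 8 of the 16 bits high, the same ratio as the old IR_SPI_PULSE_DC_50 preset, while every intermediate duty cycle now gets a proportional pattern instead of being rounded down.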
Signed-off-by: Sean Young Acked-by: Andi Shyti Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-spi.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/drivers/media/rc/ir-spi.c b/drivers/media/rc/ir-spi.c index a32a84ae2d0b..7163d5ce2e64 100644 --- a/drivers/media/rc/ir-spi.c +++ b/drivers/media/rc/ir-spi.c @@ -15,21 +15,11 @@ #define IR_SPI_DRIVER_NAME "ir-spi" -/* pulse value for different duty cycles */ -#define IR_SPI_PULSE_DC_50 0xff00 -#define IR_SPI_PULSE_DC_60 0xfc00 -#define IR_SPI_PULSE_DC_70 0xf800 -#define IR_SPI_PULSE_DC_75 0xf000 -#define IR_SPI_PULSE_DC_80 0xc000 -#define IR_SPI_PULSE_DC_90 0x8000 - #define IR_SPI_DEFAULT_FREQUENCY 38000 -#define IR_SPI_BIT_PER_WORD 8 #define IR_SPI_MAX_BUFSIZE 4096 struct ir_spi_data { u32 freq; - u8 duty_cycle; bool negated; u16 tx_buf[IR_SPI_MAX_BUFSIZE]; @@ -105,19 +95,9 @@ static int ir_spi_set_tx_carrier(struct rc_dev *dev, u32 carrier) static int ir_spi_set_duty_cycle(struct rc_dev *dev, u32 duty_cycle) { struct ir_spi_data *idata = dev->priv; + int bits = (duty_cycle * 15) / 100; - if (duty_cycle >= 90) - idata->pulse = IR_SPI_PULSE_DC_90; - else if (duty_cycle >= 80) - idata->pulse = IR_SPI_PULSE_DC_80; - else if (duty_cycle >= 75) - idata->pulse = IR_SPI_PULSE_DC_75; - else if (duty_cycle >= 70) - idata->pulse = IR_SPI_PULSE_DC_70; - else if (duty_cycle >= 60) - idata->pulse = IR_SPI_PULSE_DC_60; - else - idata->pulse = IR_SPI_PULSE_DC_50; + idata->pulse = GENMASK(bits, 0); if (idata->negated) { idata->pulse = ~idata->pulse; From 342856bb592418e2a1478d256b8dd8e680962db9 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 12 Feb 2018 07:20:52 -0500 Subject: [PATCH 0593/1640] UPSTREAM: media: rc: replace IR_dprintk() with dev_dbg in IR decoders Use dev_dbg() rather than custom debug function. 
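The practical difference is where the on/off gate lives. A user-space sketch: IR_dprintk below mirrors the module-wide macro being removed, while dev_dbg_sketch is only an illustrative model of dev_dbg(), which in the real kernel is gated per call site and per device by dynamic debug:

    #include <stdio.h>

    struct dev { const char *name; int dbg; };

    /* Old scheme: one module-wide level gates every message. */
    static int rc_core_debug;
    #define IR_dprintk(level, ...) \
            do { if (rc_core_debug >= (level)) printf(__VA_ARGS__); } while (0)

    /* New scheme, modelled: the gate travels with the device. */
    #define dev_dbg_sketch(d, ...) \
            do { if ((d)->dbg) { printf("%s: ", (d)->name); printf(__VA_ARGS__); } } while (0)

    int main(void)
    {
            struct dev rc0 = { "rc0", 1 }, rc1 = { "rc1", 0 };

            IR_dprintk(1, "hidden unless the whole module is verbose\n");
            dev_dbg_sketch(&rc0, "scancode 0x1234\n");      /* printed */
            dev_dbg_sketch(&rc1, "scancode 0x5678\n");      /* suppressed */
            return 0;
    }

With dev_dbg() the messages are also automatically prefixed with the device name, so logs from multiple receivers stay distinguishable.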
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-jvc-decoder.c | 14 +++---- drivers/media/rc/ir-mce_kbd-decoder.c | 60 ++++++++++++++------------- drivers/media/rc/ir-nec-decoder.c | 20 ++++----- drivers/media/rc/ir-rc5-decoder.c | 12 +++--- drivers/media/rc/ir-rc6-decoder.c | 26 ++++++------ drivers/media/rc/ir-sanyo-decoder.c | 18 ++++---- drivers/media/rc/ir-sharp-decoder.c | 17 ++++---- drivers/media/rc/ir-sony-decoder.c | 14 ++++--- drivers/media/rc/ir-xmp-decoder.c | 29 ++++++------- 9 files changed, 106 insertions(+), 104 deletions(-) diff --git a/drivers/media/rc/ir-jvc-decoder.c b/drivers/media/rc/ir-jvc-decoder.c index c03c776cfa54..8cb68ae43282 100644 --- a/drivers/media/rc/ir-jvc-decoder.c +++ b/drivers/media/rc/ir-jvc-decoder.c @@ -56,8 +56,8 @@ static int ir_jvc_decode(struct rc_dev *dev, struct ir_raw_event ev) if (!geq_margin(ev.duration, JVC_UNIT, JVC_UNIT / 2)) goto out; - IR_dprintk(2, "JVC decode started at state %d (%uus %s)\n", - data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "JVC decode started at state %d (%uus %s)\n", + data->state, TO_US(ev.duration), TO_STR(ev.pulse)); again: switch (data->state) { @@ -136,15 +136,15 @@ again: u32 scancode; scancode = (bitrev8((data->bits >> 8) & 0xff) << 8) | (bitrev8((data->bits >> 0) & 0xff) << 0); - IR_dprintk(1, "JVC scancode 0x%04x\n", scancode); + dev_dbg(&dev->dev, "JVC scancode 0x%04x\n", scancode); rc_keydown(dev, RC_PROTO_JVC, scancode, data->toggle); data->first = false; data->old_bits = data->bits; } else if (data->bits == data->old_bits) { - IR_dprintk(1, "JVC repeat\n"); + dev_dbg(&dev->dev, "JVC repeat\n"); rc_repeat(dev); } else { - IR_dprintk(1, "JVC invalid repeat msg\n"); + dev_dbg(&dev->dev, "JVC invalid repeat msg\n"); break; } @@ -164,8 +164,8 @@ again: } out: - IR_dprintk(1, "JVC decode failed at state %d (%uus %s)\n", - data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "JVC decode failed at state %d (%uus %s)\n", + data->state, TO_US(ev.duration), TO_STR(ev.pulse)); data->state = STATE_INACTIVE; return -EINVAL; } diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 2c3df02e05ff..c110984ca671 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -117,19 +117,19 @@ static unsigned char kbd_keycodes[256] = { static void mce_kbd_rx_timeout(struct timer_list *t) { - struct mce_kbd_dec *mce_kbd = from_timer(mce_kbd, t, rx_timeout); - int i; + struct ir_raw_event_ctrl *raw = from_timer(raw, t, mce_kbd.rx_timeout); unsigned char maskcode; + int i; - IR_dprintk(2, "timer callback clearing all keys\n"); + dev_dbg(&raw->dev->dev, "timer callback clearing all keys\n"); for (i = 0; i < 7; i++) { maskcode = kbd_keycodes[MCIR2_MASK_KEYS_START + i]; - input_report_key(mce_kbd->idev, maskcode, 0); + input_report_key(raw->mce_kbd.idev, maskcode, 0); } for (i = 0; i < MCIR2_MASK_KEYS_START; i++) - input_report_key(mce_kbd->idev, kbd_keycodes[i], 0); + input_report_key(raw->mce_kbd.idev, kbd_keycodes[i], 0); } static enum mce_kbd_mode mce_kbd_mode(struct mce_kbd_dec *data) @@ -144,16 +144,16 @@ static enum mce_kbd_mode mce_kbd_mode(struct mce_kbd_dec *data) } } -static void ir_mce_kbd_process_keyboard_data(struct input_dev *idev, - u32 scancode) +static void ir_mce_kbd_process_keyboard_data(struct rc_dev *dev, u32 scancode) { + struct mce_kbd_dec *data = &dev->raw->mce_kbd; u8 keydata = (scancode >> 8) & 0xff; u8 shiftmask = scancode & 0xff; unsigned 
char keycode, maskcode; int i, keystate; - IR_dprintk(1, "keyboard: keydata = 0x%02x, shiftmask = 0x%02x\n", - keydata, shiftmask); + dev_dbg(&dev->dev, "keyboard: keydata = 0x%02x, shiftmask = 0x%02x\n", + keydata, shiftmask); for (i = 0; i < 7; i++) { maskcode = kbd_keycodes[MCIR2_MASK_KEYS_START + i]; @@ -161,20 +161,21 @@ static void ir_mce_kbd_process_keyboard_data(struct input_dev *idev, keystate = 1; else keystate = 0; - input_report_key(idev, maskcode, keystate); + input_report_key(data->idev, maskcode, keystate); } if (keydata) { keycode = kbd_keycodes[keydata]; - input_report_key(idev, keycode, 1); + input_report_key(data->idev, keycode, 1); } else { for (i = 0; i < MCIR2_MASK_KEYS_START; i++) - input_report_key(idev, kbd_keycodes[i], 0); + input_report_key(data->idev, kbd_keycodes[i], 0); } } -static void ir_mce_kbd_process_mouse_data(struct input_dev *idev, u32 scancode) +static void ir_mce_kbd_process_mouse_data(struct rc_dev *dev, u32 scancode) { + struct mce_kbd_dec *data = &dev->raw->mce_kbd; /* raw mouse coordinates */ u8 xdata = (scancode >> 7) & 0x7f; u8 ydata = (scancode >> 14) & 0x7f; @@ -193,14 +194,14 @@ static void ir_mce_kbd_process_mouse_data(struct input_dev *idev, u32 scancode) else y = ydata; - IR_dprintk(1, "mouse: x = %d, y = %d, btns = %s%s\n", - x, y, left ? "L" : "", right ? "R" : ""); + dev_dbg(&dev->dev, "mouse: x = %d, y = %d, btns = %s%s\n", + x, y, left ? "L" : "", right ? "R" : ""); - input_report_rel(idev, REL_X, x); - input_report_rel(idev, REL_Y, y); + input_report_rel(data->idev, REL_X, x); + input_report_rel(data->idev, REL_Y, y); - input_report_key(idev, BTN_LEFT, left); - input_report_key(idev, BTN_RIGHT, right); + input_report_key(data->idev, BTN_LEFT, left); + input_report_key(data->idev, BTN_RIGHT, right); } /** @@ -227,8 +228,8 @@ static int ir_mce_kbd_decode(struct rc_dev *dev, struct ir_raw_event ev) goto out; again: - IR_dprintk(2, "started at state %i (%uus %s)\n", - data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "started at state %i (%uus %s)\n", + data->state, TO_US(ev.duration), TO_STR(ev.pulse)); if (!geq_margin(ev.duration, MCIR2_UNIT, MCIR2_UNIT / 2)) return 0; @@ -280,7 +281,7 @@ again: data->wanted_bits = MCIR2_MOUSE_NBITS; break; default: - IR_dprintk(1, "not keyboard or mouse data\n"); + dev_dbg(&dev->dev, "not keyboard or mouse data\n"); goto out; } @@ -319,25 +320,26 @@ again: switch (data->wanted_bits) { case MCIR2_KEYBOARD_NBITS: scancode = data->body & 0xffff; - IR_dprintk(1, "keyboard data 0x%08x\n", data->body); + dev_dbg(&dev->dev, "keyboard data 0x%08x\n", + data->body); if (dev->timeout) delay = usecs_to_jiffies(dev->timeout / 1000); else delay = msecs_to_jiffies(100); mod_timer(&data->rx_timeout, jiffies + delay); /* Pass data to keyboard buffer parser */ - ir_mce_kbd_process_keyboard_data(data->idev, scancode); + ir_mce_kbd_process_keyboard_data(dev, scancode); lsc.rc_proto = RC_PROTO_MCIR2_KBD; break; case MCIR2_MOUSE_NBITS: scancode = data->body & 0x1fffff; - IR_dprintk(1, "mouse data 0x%06x\n", scancode); + dev_dbg(&dev->dev, "mouse data 0x%06x\n", scancode); /* Pass data to mouse buffer parser */ - ir_mce_kbd_process_mouse_data(data->idev, scancode); + ir_mce_kbd_process_mouse_data(dev, scancode); lsc.rc_proto = RC_PROTO_MCIR2_MSE; break; default: - IR_dprintk(1, "not keyboard or mouse data\n"); + dev_dbg(&dev->dev, "not keyboard or mouse data\n"); goto out; } @@ -350,8 +352,8 @@ again: } out: - IR_dprintk(1, "failed at state %i (%uus %s)\n", - data->state, TO_US(ev.duration), 
TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "failed at state %i (%uus %s)\n", + data->state, TO_US(ev.duration), TO_STR(ev.pulse)); data->state = STATE_INACTIVE; input_sync(data->idev); return -EINVAL; diff --git a/drivers/media/rc/ir-nec-decoder.c b/drivers/media/rc/ir-nec-decoder.c index 31d7bafe7bda..21647b809e6f 100644 --- a/drivers/media/rc/ir-nec-decoder.c +++ b/drivers/media/rc/ir-nec-decoder.c @@ -49,8 +49,8 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) return 0; } - IR_dprintk(2, "NEC decode started at state %d (%uus %s)\n", - data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "NEC decode started at state %d (%uus %s)\n", + data->state, TO_US(ev.duration), TO_STR(ev.pulse)); switch (data->state) { @@ -99,13 +99,11 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) break; if (data->necx_repeat && data->count == NECX_REPEAT_BITS && - geq_margin(ev.duration, - NEC_TRAILER_SPACE, NEC_UNIT / 2)) { - IR_dprintk(1, "Repeat last key\n"); - rc_repeat(dev); - data->state = STATE_INACTIVE; - return 0; - + geq_margin(ev.duration, NEC_TRAILER_SPACE, NEC_UNIT / 2)) { + dev_dbg(&dev->dev, "Repeat last key\n"); + rc_repeat(dev); + data->state = STATE_INACTIVE; + return 0; } else if (data->count > NECX_REPEAT_BITS) data->necx_repeat = false; @@ -164,8 +162,8 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) return 0; } - IR_dprintk(1, "NEC decode failed at count %d state %d (%uus %s)\n", - data->count, data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "NEC decode failed at count %d state %d (%uus %s)\n", + data->count, data->state, TO_US(ev.duration), TO_STR(ev.pulse)); data->state = STATE_INACTIVE; return -EINVAL; } diff --git a/drivers/media/rc/ir-rc5-decoder.c b/drivers/media/rc/ir-rc5-decoder.c index 11a28f8772da..74d3b859c3a2 100644 --- a/drivers/media/rc/ir-rc5-decoder.c +++ b/drivers/media/rc/ir-rc5-decoder.c @@ -54,8 +54,8 @@ static int ir_rc5_decode(struct rc_dev *dev, struct ir_raw_event ev) goto out; again: - IR_dprintk(2, "RC5(x/sz) decode started at state %i (%uus %s)\n", - data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "RC5(x/sz) decode started at state %i (%uus %s)\n", + data->state, TO_US(ev.duration), TO_STR(ev.pulse)); if (!geq_margin(ev.duration, RC5_UNIT, RC5_UNIT / 2)) return 0; @@ -157,8 +157,8 @@ again: } else break; - IR_dprintk(1, "RC5(x/sz) scancode 0x%06x (p: %u, t: %u)\n", - scancode, protocol, toggle); + dev_dbg(&dev->dev, "RC5(x/sz) scancode 0x%06x (p: %u, t: %u)\n", + scancode, protocol, toggle); rc_keydown(dev, protocol, scancode, toggle); data->state = STATE_INACTIVE; @@ -166,8 +166,8 @@ again: } out: - IR_dprintk(1, "RC5(x/sz) decode failed at state %i count %d (%uus %s)\n", - data->state, data->count, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "RC5(x/sz) decode failed at state %i count %d (%uus %s)\n", + data->state, data->count, TO_US(ev.duration), TO_STR(ev.pulse)); data->state = STATE_INACTIVE; return -EINVAL; } diff --git a/drivers/media/rc/ir-rc6-decoder.c b/drivers/media/rc/ir-rc6-decoder.c index 55bb19bbd4e9..8314da32453f 100644 --- a/drivers/media/rc/ir-rc6-decoder.c +++ b/drivers/media/rc/ir-rc6-decoder.c @@ -100,8 +100,8 @@ static int ir_rc6_decode(struct rc_dev *dev, struct ir_raw_event ev) goto out; again: - IR_dprintk(2, "RC6 decode started at state %i (%uus %s)\n", - data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "RC6 decode started at state %i (%uus %s)\n", + data->state, 
TO_US(ev.duration), TO_STR(ev.pulse)); if (!geq_margin(ev.duration, RC6_UNIT, RC6_UNIT / 2)) return 0; @@ -170,7 +170,7 @@ again: break; if (!(data->header & RC6_STARTBIT_MASK)) { - IR_dprintk(1, "RC6 invalid start bit\n"); + dev_dbg(&dev->dev, "RC6 invalid start bit\n"); break; } @@ -187,7 +187,7 @@ again: data->wanted_bits = RC6_6A_NBITS; break; default: - IR_dprintk(1, "RC6 unknown mode\n"); + dev_dbg(&dev->dev, "RC6 unknown mode\n"); goto out; } goto again; @@ -230,13 +230,13 @@ again: scancode = data->body; toggle = data->toggle; protocol = RC_PROTO_RC6_0; - IR_dprintk(1, "RC6(0) scancode 0x%04x (toggle: %u)\n", - scancode, toggle); + dev_dbg(&dev->dev, "RC6(0) scancode 0x%04x (toggle: %u)\n", + scancode, toggle); break; case RC6_MODE_6A: if (data->count > CHAR_BIT * sizeof data->body) { - IR_dprintk(1, "RC6 too many (%u) data bits\n", + dev_dbg(&dev->dev, "RC6 too many (%u) data bits\n", data->count); goto out; } @@ -262,15 +262,15 @@ again: } break; default: - IR_dprintk(1, "RC6(6A) unsupported length\n"); + dev_dbg(&dev->dev, "RC6(6A) unsupported length\n"); goto out; } - IR_dprintk(1, "RC6(6A) proto 0x%04x, scancode 0x%08x (toggle: %u)\n", - protocol, scancode, toggle); + dev_dbg(&dev->dev, "RC6(6A) proto 0x%04x, scancode 0x%08x (toggle: %u)\n", + protocol, scancode, toggle); break; default: - IR_dprintk(1, "RC6 unknown mode\n"); + dev_dbg(&dev->dev, "RC6 unknown mode\n"); goto out; } @@ -280,8 +280,8 @@ again: } out: - IR_dprintk(1, "RC6 decode failed at state %i (%uus %s)\n", - data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "RC6 decode failed at state %i (%uus %s)\n", + data->state, TO_US(ev.duration), TO_STR(ev.pulse)); data->state = STATE_INACTIVE; return -EINVAL; } diff --git a/drivers/media/rc/ir-sanyo-decoder.c b/drivers/media/rc/ir-sanyo-decoder.c index ded39cdfc6ef..4efe6db5376a 100644 --- a/drivers/media/rc/ir-sanyo-decoder.c +++ b/drivers/media/rc/ir-sanyo-decoder.c @@ -52,14 +52,14 @@ static int ir_sanyo_decode(struct rc_dev *dev, struct ir_raw_event ev) if (!is_timing_event(ev)) { if (ev.reset) { - IR_dprintk(1, "SANYO event reset received. reset to state 0\n"); + dev_dbg(&dev->dev, "SANYO event reset received. 
reset to state 0\n"); data->state = STATE_INACTIVE; } return 0; } - IR_dprintk(2, "SANYO decode started at state %d (%uus %s)\n", - data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "SANYO decode started at state %d (%uus %s)\n", + data->state, TO_US(ev.duration), TO_STR(ev.pulse)); switch (data->state) { @@ -102,7 +102,7 @@ static int ir_sanyo_decode(struct rc_dev *dev, struct ir_raw_event ev) if (!data->count && geq_margin(ev.duration, SANYO_REPEAT_SPACE, SANYO_UNIT / 2)) { rc_repeat(dev); - IR_dprintk(1, "SANYO repeat last key\n"); + dev_dbg(&dev->dev, "SANYO repeat last key\n"); data->state = STATE_INACTIVE; return 0; } @@ -144,21 +144,21 @@ static int ir_sanyo_decode(struct rc_dev *dev, struct ir_raw_event ev) not_command = bitrev8((data->bits >> 0) & 0xff); if ((command ^ not_command) != 0xff) { - IR_dprintk(1, "SANYO checksum error: received 0x%08Lx\n", - data->bits); + dev_dbg(&dev->dev, "SANYO checksum error: received 0x%08llx\n", + data->bits); data->state = STATE_INACTIVE; return 0; } scancode = address << 8 | command; - IR_dprintk(1, "SANYO scancode: 0x%06x\n", scancode); + dev_dbg(&dev->dev, "SANYO scancode: 0x%06x\n", scancode); rc_keydown(dev, RC_PROTO_SANYO, scancode, 0); data->state = STATE_INACTIVE; return 0; } - IR_dprintk(1, "SANYO decode failed at count %d state %d (%uus %s)\n", - data->count, data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "SANYO decode failed at count %d state %d (%uus %s)\n", + data->count, data->state, TO_US(ev.duration), TO_STR(ev.pulse)); data->state = STATE_INACTIVE; return -EINVAL; } diff --git a/drivers/media/rc/ir-sharp-decoder.c b/drivers/media/rc/ir-sharp-decoder.c index df296991906c..6a38c50566a4 100644 --- a/drivers/media/rc/ir-sharp-decoder.c +++ b/drivers/media/rc/ir-sharp-decoder.c @@ -54,8 +54,8 @@ static int ir_sharp_decode(struct rc_dev *dev, struct ir_raw_event ev) return 0; } - IR_dprintk(2, "Sharp decode started at state %d (%uus %s)\n", - data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "Sharp decode started at state %d (%uus %s)\n", + data->state, TO_US(ev.duration), TO_STR(ev.pulse)); switch (data->state) { @@ -149,9 +149,9 @@ static int ir_sharp_decode(struct rc_dev *dev, struct ir_raw_event ev) msg = (data->bits >> 15) & 0x7fff; echo = data->bits & 0x7fff; if ((msg ^ echo) != 0x3ff) { - IR_dprintk(1, - "Sharp checksum error: received 0x%04x, 0x%04x\n", - msg, echo); + dev_dbg(&dev->dev, + "Sharp checksum error: received 0x%04x, 0x%04x\n", + msg, echo); break; } @@ -159,16 +159,15 @@ static int ir_sharp_decode(struct rc_dev *dev, struct ir_raw_event ev) command = bitrev8((msg >> 2) & 0xff); scancode = address << 8 | command; - IR_dprintk(1, "Sharp scancode 0x%04x\n", scancode); + dev_dbg(&dev->dev, "Sharp scancode 0x%04x\n", scancode); rc_keydown(dev, RC_PROTO_SHARP, scancode, 0); data->state = STATE_INACTIVE; return 0; } - IR_dprintk(1, "Sharp decode failed at count %d state %d (%uus %s)\n", - data->count, data->state, TO_US(ev.duration), - TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "Sharp decode failed at count %d state %d (%uus %s)\n", + data->count, data->state, TO_US(ev.duration), TO_STR(ev.pulse)); data->state = STATE_INACTIVE; return -EINVAL; } diff --git a/drivers/media/rc/ir-sony-decoder.c b/drivers/media/rc/ir-sony-decoder.c index e4bcff21c025..6764ec9de646 100644 --- a/drivers/media/rc/ir-sony-decoder.c +++ b/drivers/media/rc/ir-sony-decoder.c @@ -55,8 +55,8 @@ static int ir_sony_decode(struct rc_dev *dev, struct ir_raw_event ev) if 
(!geq_margin(ev.duration, SONY_UNIT, SONY_UNIT / 2)) goto out; - IR_dprintk(2, "Sony decode started at state %d (%uus %s)\n", - data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "Sony decode started at state %d (%uus %s)\n", + data->state, TO_US(ev.duration), TO_STR(ev.pulse)); switch (data->state) { @@ -148,19 +148,21 @@ static int ir_sony_decode(struct rc_dev *dev, struct ir_raw_event ev) protocol = RC_PROTO_SONY20; break; default: - IR_dprintk(1, "Sony invalid bitcount %u\n", data->count); + dev_dbg(&dev->dev, "Sony invalid bitcount %u\n", + data->count); goto out; } scancode = device << 16 | subdevice << 8 | function; - IR_dprintk(1, "Sony(%u) scancode 0x%05x\n", data->count, scancode); + dev_dbg(&dev->dev, "Sony(%u) scancode 0x%05x\n", data->count, + scancode); rc_keydown(dev, protocol, scancode, 0); goto finish_state_machine; } out: - IR_dprintk(1, "Sony decode failed at state %d (%uus %s)\n", - data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "Sony decode failed at state %d (%uus %s)\n", + data->state, TO_US(ev.duration), TO_STR(ev.pulse)); data->state = STATE_INACTIVE; return -EINVAL; diff --git a/drivers/media/rc/ir-xmp-decoder.c b/drivers/media/rc/ir-xmp-decoder.c index 712bc6d76e92..58b47af1a763 100644 --- a/drivers/media/rc/ir-xmp-decoder.c +++ b/drivers/media/rc/ir-xmp-decoder.c @@ -49,8 +49,8 @@ static int ir_xmp_decode(struct rc_dev *dev, struct ir_raw_event ev) return 0; } - IR_dprintk(2, "XMP decode started at state %d %d (%uus %s)\n", - data->state, data->count, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "XMP decode started at state %d %d (%uus %s)\n", + data->state, data->count, TO_US(ev.duration), TO_STR(ev.pulse)); switch (data->state) { @@ -85,7 +85,7 @@ static int ir_xmp_decode(struct rc_dev *dev, struct ir_raw_event ev) u32 scancode; if (data->count != 16) { - IR_dprintk(2, "received TRAILER period at index %d: %u\n", + dev_dbg(&dev->dev, "received TRAILER period at index %d: %u\n", data->count, ev.duration); data->state = STATE_INACTIVE; return -EINVAL; @@ -99,7 +99,8 @@ static int ir_xmp_decode(struct rc_dev *dev, struct ir_raw_event ev) */ divider = (n[3] - XMP_NIBBLE_PREFIX) / 15 - 2000; if (divider < 50) { - IR_dprintk(2, "divider to small %d.\n", divider); + dev_dbg(&dev->dev, "divider to small %d.\n", + divider); data->state = STATE_INACTIVE; return -EINVAL; } @@ -113,7 +114,7 @@ static int ir_xmp_decode(struct rc_dev *dev, struct ir_raw_event ev) n[12] + n[13] + n[14] + n[15]) % 16; if (sum1 != 15 || sum2 != 15) { - IR_dprintk(2, "checksum errors sum1=0x%X sum2=0x%X\n", + dev_dbg(&dev->dev, "checksum errors sum1=0x%X sum2=0x%X\n", sum1, sum2); data->state = STATE_INACTIVE; return -EINVAL; @@ -127,24 +128,24 @@ static int ir_xmp_decode(struct rc_dev *dev, struct ir_raw_event ev) obc1 = n[12] << 4 | n[13]; obc2 = n[14] << 4 | n[15]; if (subaddr != subaddr2) { - IR_dprintk(2, "subaddress nibbles mismatch 0x%02X != 0x%02X\n", + dev_dbg(&dev->dev, "subaddress nibbles mismatch 0x%02X != 0x%02X\n", subaddr, subaddr2); data->state = STATE_INACTIVE; return -EINVAL; } if (oem != 0x44) - IR_dprintk(1, "Warning: OEM nibbles 0x%02X. Expected 0x44\n", + dev_dbg(&dev->dev, "Warning: OEM nibbles 0x%02X. 
Expected 0x44\n", oem); scancode = addr << 24 | subaddr << 16 | obc1 << 8 | obc2; - IR_dprintk(1, "XMP scancode 0x%06x\n", scancode); + dev_dbg(&dev->dev, "XMP scancode 0x%06x\n", scancode); if (toggle == 0) { rc_keydown(dev, RC_PROTO_XMP, scancode, 0); } else { rc_repeat(dev); - IR_dprintk(1, "Repeat last key\n"); + dev_dbg(&dev->dev, "Repeat last key\n"); } data->state = STATE_INACTIVE; @@ -153,7 +154,7 @@ static int ir_xmp_decode(struct rc_dev *dev, struct ir_raw_event ev) } else if (geq_margin(ev.duration, XMP_HALFFRAME_SPACE, XMP_NIBBLE_PREFIX)) { /* Expect 8 or 16 nibble pulses. 16 in case of 'final' frame */ if (data->count == 16) { - IR_dprintk(2, "received half frame pulse at index %d. Probably a final frame key-up event: %u\n", + dev_dbg(&dev->dev, "received half frame pulse at index %d. Probably a final frame key-up event: %u\n", data->count, ev.duration); /* * TODO: for now go back to half frame position @@ -164,7 +165,7 @@ static int ir_xmp_decode(struct rc_dev *dev, struct ir_raw_event ev) } else if (data->count != 8) - IR_dprintk(2, "received half frame pulse at index %d: %u\n", + dev_dbg(&dev->dev, "received half frame pulse at index %d: %u\n", data->count, ev.duration); data->state = STATE_LEADER_PULSE; @@ -173,7 +174,7 @@ static int ir_xmp_decode(struct rc_dev *dev, struct ir_raw_event ev) } else if (geq_margin(ev.duration, XMP_NIBBLE_PREFIX, XMP_UNIT)) { /* store nibble raw data, decode after trailer */ if (data->count == 16) { - IR_dprintk(2, "to many pulses (%d) ignoring: %u\n", + dev_dbg(&dev->dev, "to many pulses (%d) ignoring: %u\n", data->count, ev.duration); data->state = STATE_INACTIVE; return -EINVAL; @@ -189,8 +190,8 @@ static int ir_xmp_decode(struct rc_dev *dev, struct ir_raw_event ev) break; } - IR_dprintk(1, "XMP decode failed at count %d state %d (%uus %s)\n", - data->count, data->state, TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "XMP decode failed at count %d state %d (%uus %s)\n", + data->count, data->state, TO_US(ev.duration), TO_STR(ev.pulse)); data->state = STATE_INACTIVE; return -EINVAL; } From 124927dd648052531d24e233523b1fa50d4d70be Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 12 Feb 2018 07:27:50 -0500 Subject: [PATCH 0594/1640] UPSTREAM: media: rc: remove IR_dprintk() from rc-core Use dev_dbg() rather than custom debug function. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 10 ++-- drivers/media/rc/rc-ir-raw.c | 6 +-- drivers/media/rc/rc-main.c | 91 ++++++++++++++++++------------------ include/media/rc-core.h | 7 --- 4 files changed, 53 insertions(+), 61 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 694846026be6..e59b67b74a77 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -60,12 +60,12 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) * space with the maximum time value. 
*/ sample = LIRC_SPACE(LIRC_VALUE_MASK); - IR_dprintk(2, "delivering reset sync space to lirc_dev\n"); + dev_dbg(&dev->dev, "delivering reset sync space to lirc_dev\n"); /* Carrier reports */ } else if (ev.carrier_report) { sample = LIRC_FREQUENCY(ev.carrier); - IR_dprintk(2, "carrier report (freq: %d)\n", sample); + dev_dbg(&dev->dev, "carrier report (freq: %d)\n", sample); /* Packet end */ } else if (ev.timeout) { @@ -77,7 +77,7 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) dev->gap_duration = ev.duration; sample = LIRC_TIMEOUT(ev.duration / 1000); - IR_dprintk(2, "timeout report (duration: %d)\n", sample); + dev_dbg(&dev->dev, "timeout report (duration: %d)\n", sample); /* Normal sample */ } else { @@ -100,8 +100,8 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) sample = ev.pulse ? LIRC_PULSE(ev.duration / 1000) : LIRC_SPACE(ev.duration / 1000); - IR_dprintk(2, "delivering %uus %s to lirc_dev\n", - TO_US(ev.duration), TO_STR(ev.pulse)); + dev_dbg(&dev->dev, "delivering %uus %s to lirc_dev\n", + TO_US(ev.duration), TO_STR(ev.pulse)); } spin_lock_irqsave(&dev->lirc_fh_lock, flags); diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 18504870b9f0..2790a0d268fd 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -65,8 +65,8 @@ int ir_raw_event_store(struct rc_dev *dev, struct ir_raw_event *ev) if (!dev->raw) return -EINVAL; - IR_dprintk(2, "sample: (%05dus %s)\n", - TO_US(ev->duration), TO_STR(ev->pulse)); + dev_dbg(&dev->dev, "sample: (%05dus %s)\n", + TO_US(ev->duration), TO_STR(ev->pulse)); if (!kfifo_put(&dev->raw->kfifo, *ev)) { dev_err(&dev->dev, "IR event FIFO is full!\n"); @@ -168,7 +168,7 @@ void ir_raw_event_set_idle(struct rc_dev *dev, bool idle) if (!dev->raw) return; - IR_dprintk(2, "%s idle mode\n", idle ? "enter" : "leave"); + dev_dbg(&dev->dev, "%s idle mode\n", idle ? 
"enter" : "leave"); if (idle) { dev->raw->this_ev.timeout = true; diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 1db8d38fed7c..4a952108ba1e 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -156,6 +156,7 @@ static struct rc_map_list empty_map = { /** * ir_create_table() - initializes a scancode table + * @dev: the rc_dev device * @rc_map: the rc_map to initialize * @name: name to assign to the table * @rc_proto: ir type to assign to the new table @@ -166,7 +167,7 @@ static struct rc_map_list empty_map = { * * return: zero on success or a negative error code */ -static int ir_create_table(struct rc_map *rc_map, +static int ir_create_table(struct rc_dev *dev, struct rc_map *rc_map, const char *name, u64 rc_proto, size_t size) { rc_map->name = kstrdup(name, GFP_KERNEL); @@ -182,8 +183,8 @@ static int ir_create_table(struct rc_map *rc_map, return -ENOMEM; } - IR_dprintk(1, "Allocated space for %u keycode entries (%u bytes)\n", - rc_map->size, rc_map->alloc); + dev_dbg(&dev->dev, "Allocated space for %u keycode entries (%u bytes)\n", + rc_map->size, rc_map->alloc); return 0; } @@ -205,6 +206,7 @@ static void ir_free_table(struct rc_map *rc_map) /** * ir_resize_table() - resizes a scancode table if necessary + * @dev: the rc_dev device * @rc_map: the rc_map to resize * @gfp_flags: gfp flags to use when allocating memory * @@ -213,7 +215,8 @@ static void ir_free_table(struct rc_map *rc_map) * * return: zero on success or a negative error code */ -static int ir_resize_table(struct rc_map *rc_map, gfp_t gfp_flags) +static int ir_resize_table(struct rc_dev *dev, struct rc_map *rc_map, + gfp_t gfp_flags) { unsigned int oldalloc = rc_map->alloc; unsigned int newalloc = oldalloc; @@ -226,23 +229,21 @@ static int ir_resize_table(struct rc_map *rc_map, gfp_t gfp_flags) return -ENOMEM; newalloc *= 2; - IR_dprintk(1, "Growing table to %u bytes\n", newalloc); + dev_dbg(&dev->dev, "Growing table to %u bytes\n", newalloc); } if ((rc_map->len * 3 < rc_map->size) && (oldalloc > IR_TAB_MIN_SIZE)) { /* Less than 1/3 of entries in use -> shrink keytable */ newalloc /= 2; - IR_dprintk(1, "Shrinking table to %u bytes\n", newalloc); + dev_dbg(&dev->dev, "Shrinking table to %u bytes\n", newalloc); } if (newalloc == oldalloc) return 0; newscan = kmalloc(newalloc, gfp_flags); - if (!newscan) { - IR_dprintk(1, "Failed to kmalloc %u bytes\n", newalloc); + if (!newscan) return -ENOMEM; - } memcpy(newscan, rc_map->scan, rc_map->len * sizeof(struct rc_map_table)); rc_map->scan = newscan; @@ -275,16 +276,16 @@ static unsigned int ir_update_mapping(struct rc_dev *dev, /* Did the user wish to remove the mapping? */ if (new_keycode == KEY_RESERVED || new_keycode == KEY_UNKNOWN) { - IR_dprintk(1, "#%d: Deleting scan 0x%04x\n", - index, rc_map->scan[index].scancode); + dev_dbg(&dev->dev, "#%d: Deleting scan 0x%04x\n", + index, rc_map->scan[index].scancode); rc_map->len--; memmove(&rc_map->scan[index], &rc_map->scan[index+ 1], (rc_map->len - index) * sizeof(struct rc_map_table)); } else { - IR_dprintk(1, "#%d: %s scan 0x%04x with key 0x%04x\n", - index, - old_keycode == KEY_RESERVED ? "New" : "Replacing", - rc_map->scan[index].scancode, new_keycode); + dev_dbg(&dev->dev, "#%d: %s scan 0x%04x with key 0x%04x\n", + index, + old_keycode == KEY_RESERVED ? 
"New" : "Replacing", + rc_map->scan[index].scancode, new_keycode); rc_map->scan[index].keycode = new_keycode; __set_bit(new_keycode, dev->input_dev->keybit); } @@ -301,7 +302,7 @@ static unsigned int ir_update_mapping(struct rc_dev *dev, } /* Possibly shrink the keytable, failure is not a problem */ - ir_resize_table(rc_map, GFP_ATOMIC); + ir_resize_table(dev, rc_map, GFP_ATOMIC); } return old_keycode; @@ -352,7 +353,7 @@ static unsigned int ir_establish_scancode(struct rc_dev *dev, /* No previous mapping found, we might need to grow the table */ if (rc_map->size == rc_map->len) { - if (!resize || ir_resize_table(rc_map, GFP_ATOMIC)) + if (!resize || ir_resize_table(dev, rc_map, GFP_ATOMIC)) return -1U; } @@ -431,8 +432,8 @@ static int ir_setkeytable(struct rc_dev *dev, unsigned int i, index; int rc; - rc = ir_create_table(rc_map, from->name, - from->rc_proto, from->size); + rc = ir_create_table(dev, rc_map, from->name, from->rc_proto, + from->size); if (rc) return rc; @@ -576,8 +577,8 @@ u32 rc_g_keycode_from_table(struct rc_dev *dev, u32 scancode) spin_unlock_irqrestore(&rc_map->lock, flags); if (keycode != KEY_RESERVED) - IR_dprintk(1, "%s: scancode 0x%04x keycode 0x%02x\n", - dev->device_name, scancode, keycode); + dev_dbg(&dev->dev, "%s: scancode 0x%04x keycode 0x%02x\n", + dev->device_name, scancode, keycode); return keycode; } @@ -596,7 +597,7 @@ static void ir_do_keyup(struct rc_dev *dev, bool sync) if (!dev->keypressed) return; - IR_dprintk(1, "keyup key 0x%04x\n", dev->last_keycode); + dev_dbg(&dev->dev, "keyup key 0x%04x\n", dev->last_keycode); del_timer(&dev->timer_repeat); input_report_key(dev->input_dev, dev->last_keycode, 0); led_trigger_event(led_feedback, LED_OFF); @@ -751,8 +752,8 @@ static void ir_do_keydown(struct rc_dev *dev, enum rc_proto protocol, /* Register a keypress */ dev->keypressed = true; - IR_dprintk(1, "%s: key down event, key 0x%04x, protocol 0x%04x, scancode 0x%08x\n", - dev->device_name, keycode, protocol, scancode); + dev_dbg(&dev->dev, "%s: key down event, key 0x%04x, protocol 0x%04x, scancode 0x%08x\n", + dev->device_name, keycode, protocol, scancode); input_report_key(dev->input_dev, keycode, 1); led_trigger_event(led_feedback, LED_FULL); @@ -1056,8 +1057,8 @@ static ssize_t show_protocols(struct device *device, mutex_unlock(&dev->lock); - IR_dprintk(1, "%s: allowed - 0x%llx, enabled - 0x%llx\n", - __func__, (long long)allowed, (long long)enabled); + dev_dbg(&dev->dev, "%s: allowed - 0x%llx, enabled - 0x%llx\n", + __func__, (long long)allowed, (long long)enabled); for (i = 0; i < ARRAY_SIZE(proto_names); i++) { if (allowed & enabled & proto_names[i].type) @@ -1083,6 +1084,7 @@ static ssize_t show_protocols(struct device *device, /** * parse_protocol_change() - parses a protocol change request + * @dev: rc_dev device * @protocols: pointer to the bitmask of current protocols * @buf: pointer to the buffer with a list of changes * @@ -1092,7 +1094,8 @@ static ssize_t show_protocols(struct device *device, * Writing "none" will disable all protocols. * Returns the number of changes performed or a negative error code. 
*/ -static int parse_protocol_change(u64 *protocols, const char *buf) +static int parse_protocol_change(struct rc_dev *dev, u64 *protocols, + const char *buf) { const char *tmp; unsigned count = 0; @@ -1128,7 +1131,8 @@ static int parse_protocol_change(u64 *protocols, const char *buf) if (!strcasecmp(tmp, "lirc")) mask = 0; else { - IR_dprintk(1, "Unknown protocol: '%s'\n", tmp); + dev_dbg(&dev->dev, "Unknown protocol: '%s'\n", + tmp); return -EINVAL; } } @@ -1144,7 +1148,7 @@ static int parse_protocol_change(u64 *protocols, const char *buf) } if (!count) { - IR_dprintk(1, "Protocol not specified\n"); + dev_dbg(&dev->dev, "Protocol not specified\n"); return -EINVAL; } @@ -1217,12 +1221,12 @@ static ssize_t store_protocols(struct device *device, u64 old_protocols, new_protocols; ssize_t rc; - IR_dprintk(1, "Normal protocol change requested\n"); + dev_dbg(&dev->dev, "Normal protocol change requested\n"); current_protocols = &dev->enabled_protocols; filter = &dev->scancode_filter; if (!dev->change_protocol) { - IR_dprintk(1, "Protocol switching not supported\n"); + dev_dbg(&dev->dev, "Protocol switching not supported\n"); return -EINVAL; } @@ -1230,14 +1234,14 @@ static ssize_t store_protocols(struct device *device, old_protocols = *current_protocols; new_protocols = old_protocols; - rc = parse_protocol_change(&new_protocols, buf); + rc = parse_protocol_change(dev, &new_protocols, buf); if (rc < 0) goto out; rc = dev->change_protocol(dev, &new_protocols); if (rc < 0) { - IR_dprintk(1, "Error setting protocols to 0x%llx\n", - (long long)new_protocols); + dev_dbg(&dev->dev, "Error setting protocols to 0x%llx\n", + (long long)new_protocols); goto out; } @@ -1246,8 +1250,8 @@ static ssize_t store_protocols(struct device *device, if (new_protocols != old_protocols) { *current_protocols = new_protocols; - IR_dprintk(1, "Protocols changed to 0x%llx\n", - (long long)new_protocols); + dev_dbg(&dev->dev, "Protocols changed to 0x%llx\n", + (long long)new_protocols); } /* @@ -1435,8 +1439,8 @@ static ssize_t show_wakeup_protocols(struct device *device, mutex_unlock(&dev->lock); - IR_dprintk(1, "%s: allowed - 0x%llx, enabled - %d\n", - __func__, (long long)allowed, enabled); + dev_dbg(&dev->dev, "%s: allowed - 0x%llx, enabled - %d\n", + __func__, (long long)allowed, enabled); for (i = 0; i < ARRAY_SIZE(protocols); i++) { if (allowed & (1ULL << i)) { @@ -1511,7 +1515,7 @@ static ssize_t store_wakeup_protocols(struct device *device, if (dev->wakeup_protocol != protocol) { dev->wakeup_protocol = protocol; - IR_dprintk(1, "Wakeup protocol changed to %d\n", protocol); + dev_dbg(&dev->dev, "Wakeup protocol changed to %d\n", protocol); if (protocol == RC_PROTO_RC6_MCE) dev->scancode_wakeup_filter.data = 0x800f0000; @@ -1874,9 +1878,8 @@ int rc_register_device(struct rc_dev *dev) dev->registered = true; - IR_dprintk(1, "Registered rc%u (driver: %s)\n", - dev->minor, - dev->driver_name ? dev->driver_name : "unknown"); + dev_dbg(&dev->dev, "Registered rc%u (driver: %s)\n", dev->minor, + dev->driver_name ? 
dev->driver_name : "unknown"); return 0; @@ -1994,9 +1997,5 @@ static void __exit rc_core_exit(void) subsys_initcall(rc_core_init); module_exit(rc_core_exit); -int rc_core_debug; /* ir_debug level (0,1,2) */ -EXPORT_SYMBOL_GPL(rc_core_debug); -module_param_named(debug, rc_core_debug, int, 0644); - MODULE_AUTHOR("Mauro Carvalho Chehab"); MODULE_LICENSE("GPL v2"); diff --git a/include/media/rc-core.h b/include/media/rc-core.h index aed4272d47f5..fc3a92668bab 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -23,13 +23,6 @@ #include #include -extern int rc_core_debug; -#define IR_dprintk(level, fmt, ...) \ -do { \ - if (rc_core_debug >= level) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -} while (0) - /** * enum rc_driver_type - type of the RC driver. * From 8dba454c3d0cc37449e57adabb1e3b5193d5f531 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 12 Feb 2018 08:58:01 -0500 Subject: [PATCH 0595/1640] UPSTREAM: media: rc: remove obsolete comment This comment is no longer relevant. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-core-priv.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index 458e9eb2d6a9..d09a06e1c17f 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -292,11 +292,4 @@ static inline int ir_lirc_register(struct rc_dev *dev) { return 0; } static inline void ir_lirc_unregister(struct rc_dev *dev) { } #endif -/* - * Decoder initialization code - * - * Those load logic are called during ir-core init, and automatically - * loads the compiled decoders for their usage with IR raw events - */ - #endif /* _RC_CORE_PRIV */ From faedaa5b46844a98851a05aae7635e9d1235a401 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 12 Feb 2018 08:59:00 -0500 Subject: [PATCH 0596/1640] UPSTREAM: media: rc: remove useless if statement ret is always 0, so remove if statement. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index e59b67b74a77..bdc91028ce0d 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -570,7 +570,7 @@ static long ir_lirc_ioctl(struct file *file, unsigned int cmd, ret = -EINVAL; else if (dev->s_timeout) ret = dev->s_timeout(dev, tmp); - else if (!ret) + else dev->timeout = tmp; } break; From 9d2575c50976ae4fa28e3fc2705a91de7f59f323 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 12 Feb 2018 09:00:28 -0500 Subject: [PATCH 0597/1640] UPSTREAM: media: rc: get start time just before calling driver tx The current code gets the start time before copying the IR from userspace (could cause page faults) and encoding IR. This means that the gap calculation could be off. 
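The skew is easy to reproduce outside the kernel. A sketch using clock_gettime(), where the usleep() calls are stand-ins for the copy/encode work and for the dev->tx_ir() call itself:

    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    static long elapsed_us(struct timespec a, struct timespec b)
    {
            return (b.tv_sec - a.tv_sec) * 1000000L +
                   (b.tv_nsec - a.tv_nsec) / 1000L;
    }

    int main(void)
    {
            struct timespec early, start, end;

            clock_gettime(CLOCK_MONOTONIC, &early); /* old placement */
            usleep(20000);  /* stand-in for copying and encoding the IR data */
            clock_gettime(CLOCK_MONOTONIC, &start); /* new placement */
            usleep(50000);  /* stand-in for dev->tx_ir() */
            clock_gettime(CLOCK_MONOTONIC, &end);

            printf("early start: %ld us\n", elapsed_us(early, end));
            printf("late start:  %ld us\n", elapsed_us(start, end));
            return 0;
    }

The early timestamp overstates the transmit time by however long the copy and encode took (page faults included), and that error fed straight into the post-transmit gap calculation.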
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/lirc_dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index bdc91028ce0d..84bfe1cf2a32 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -249,8 +249,6 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, goto out_unlock; } - start = ktime_get(); - if (!dev->tx_ir) { ret = -EINVAL; goto out_unlock; @@ -343,6 +341,8 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, duration += txbuf[i]; } + start = ktime_get(); + ret = dev->tx_ir(dev, txbuf, count); if (ret < 0) goto out_kfree; From 4b9e5fb9a7488cde2e1f99219ef27edd4d7cc54e Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 13 Feb 2018 06:11:35 -0500 Subject: [PATCH 0598/1640] UPSTREAM: media: rc: no need to announce major number Since commit a60d64b15c20 ("media: lirc: lirc interface should not be a raw decoder"), the message in the documentation is incorrect as the module name is rc_core, not lirc_dev. Since the message is not useful, just make the message debug and remove it from the documentation. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/rc/lirc-dev-intro.rst | 1 - drivers/media/rc/lirc_dev.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/media/uapi/rc/lirc-dev-intro.rst b/Documentation/media/uapi/rc/lirc-dev-intro.rst index 3a74fec66d69..698e4f80270e 100644 --- a/Documentation/media/uapi/rc/lirc-dev-intro.rst +++ b/Documentation/media/uapi/rc/lirc-dev-intro.rst @@ -18,7 +18,6 @@ Example dmesg output upon a driver registering w/LIRC: .. code-block:: none $ dmesg |grep lirc_dev - lirc_dev: IR Remote Control driver registered, major 248 rc rc0: lirc_dev: driver mceusb registered at minor = 0 What you should see for a chardev: diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 84bfe1cf2a32..a9cd96fe0a52 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -805,8 +805,8 @@ int __init lirc_dev_init(void) return retval; } - pr_info("IR Remote Control driver registered, major %d\n", - MAJOR(lirc_base_dev)); + pr_debug("IR Remote Control driver registered, major %d\n", + MAJOR(lirc_base_dev)); return 0; } From 4e0b07a12abafe07619434003c319fa770c863ec Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 14 Feb 2018 10:26:17 -0500 Subject: [PATCH 0599/1640] UPSTREAM: media: rc: fix race condition in ir_raw_event_store_edge() handling There is a possible race condition between the IR timeout being generated from the timer, and new IR arriving. This could result in the timeout being added to the kfifo after new IR arrives. On top of that, there is concurrent write access to the kfifo from ir_raw_event_store_edge() and the timer. 
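A user-space analogue of the fix (illustrative names throughout; a pthread mutex plays the role of the new edge_spinlock, and the two producers model the edge path and the timer callback):

    #include <pthread.h>
    #include <stdio.h>

    #define RING 64

    static int ring[RING];
    static unsigned int head;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static void store(int sample)
    {
            pthread_mutex_lock(&lock);      /* plays the role of edge_spinlock */
            ring[head % RING] = sample;
            head++;
            pthread_mutex_unlock(&lock);
    }

    static void *timer_cb(void *unused)
    {
            (void)unused;
            store(-1);                      /* the timeout event */
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, timer_cb, NULL);
            for (int i = 0; i < 16; i++)
                    store(i);               /* the edge/interrupt producer */
            pthread_join(t, NULL);
            printf("stored %u samples, no torn updates\n", head);
            return 0;
    }

Serialising both writers on a single lock (build the sketch with cc -pthread) is what guarantees a timeout record can no longer be queued behind IR that actually arrived before it; the kernel version additionally re-checks the elapsed time under the lock before deciding whether to emit the timeout at all.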
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-core-priv.h | 5 +++-- drivers/media/rc/rc-ir-raw.c | 24 +++++++++++++++++++++--- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index d09a06e1c17f..5e80b4273e2d 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -50,8 +50,9 @@ struct ir_raw_event_ctrl { DECLARE_KFIFO(kfifo, struct ir_raw_event, MAX_IR_EVENT_SIZE); ktime_t last_event; /* when last event occurred */ struct rc_dev *dev; /* pointer to the parent rc_dev */ - /* edge driver */ - struct timer_list edge_handle; + /* handle delayed ir_raw_event_store_edge processing */ + spinlock_t edge_spinlock; + struct timer_list edge_handle; /* raw decoder state follows */ struct ir_raw_event prev_ev; diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 2790a0d268fd..984bb82851f9 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -101,6 +101,7 @@ int ir_raw_event_store_edge(struct rc_dev *dev, bool pulse) ev.duration = ktime_to_ns(ktime_sub(now, dev->raw->last_event)); ev.pulse = !pulse; + spin_lock(&dev->raw->edge_spinlock); rc = ir_raw_event_store(dev, &ev); dev->raw->last_event = now; @@ -112,6 +113,7 @@ int ir_raw_event_store_edge(struct rc_dev *dev, bool pulse) mod_timer(&dev->raw->edge_handle, jiffies + msecs_to_jiffies(15)); } + spin_unlock(&dev->raw->edge_spinlock); return rc; } @@ -462,12 +464,26 @@ int ir_raw_encode_scancode(enum rc_proto protocol, u32 scancode, } EXPORT_SYMBOL(ir_raw_encode_scancode); -static void edge_handle(struct timer_list *t) +/** + * ir_raw_edge_handle() - Handle ir_raw_event_store_edge() processing + * + * @t: timer_list + * + * This callback is armed by ir_raw_event_store_edge(). It does two things: + * first of all, rather than calling ir_raw_event_handle() for each + * edge and waking up the rc thread, 15 ms after the first edge + * ir_raw_event_handle() is called. Secondly, generate a timeout event + * no more IR is received after the rc_dev timeout. + */ +static void ir_raw_edge_handle(struct timer_list *t) { struct ir_raw_event_ctrl *raw = from_timer(raw, t, edge_handle); struct rc_dev *dev = raw->dev; - ktime_t interval = ktime_sub(ktime_get(), dev->raw->last_event); + unsigned long flags; + ktime_t interval; + spin_lock_irqsave(&dev->raw->edge_spinlock, flags); + interval = ktime_sub(ktime_get(), dev->raw->last_event); if (ktime_to_ns(interval) >= dev->timeout) { DEFINE_IR_RAW_EVENT(ev); @@ -480,6 +496,7 @@ static void edge_handle(struct timer_list *t) jiffies + nsecs_to_jiffies(dev->timeout - ktime_to_ns(interval))); } + spin_unlock_irqrestore(&dev->raw->edge_spinlock, flags); ir_raw_event_handle(dev); } @@ -528,7 +545,8 @@ int ir_raw_event_prepare(struct rc_dev *dev) dev->raw->dev = dev; dev->change_protocol = change_protocol; - timer_setup(&dev->raw->edge_handle, edge_handle, 0); + spin_lock_init(&dev->raw->edge_spinlock); + timer_setup(&dev->raw->edge_handle, ir_raw_edge_handle, 0); INIT_KFIFO(dev->raw->kfifo); return 0; From da5e5a8f549ff54f8e33b34955a8369d49c8b5dd Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 21 Feb 2018 16:02:10 +0000 Subject: [PATCH 0600/1640] UPSTREAM: media: img-ir: Drop METAG dependency Now that arch/metag/ has been removed, remove the METAG dependency from the IMG IR device driver. The hardware is also present on MIPS SoCs so the driver still has value. 
Signed-off-by: James Hogan Acked-by: Sean Young Cc: Mauro Carvalho Chehab Cc: Mauro Carvalho Chehab Cc: linux-media@vger.kernel.org Cc: linux-metag@vger.kernel.org --- drivers/media/rc/img-ir/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/img-ir/Kconfig b/drivers/media/rc/img-ir/Kconfig index a896d3c83a1c..d2c6617d468e 100644 --- a/drivers/media/rc/img-ir/Kconfig +++ b/drivers/media/rc/img-ir/Kconfig @@ -1,7 +1,7 @@ config IR_IMG tristate "ImgTec IR Decoder" depends on RC_CORE - depends on METAG || MIPS || COMPILE_TEST + depends on MIPS || COMPILE_TEST select IR_IMG_HW if !IR_IMG_RAW help Say Y or M here if you want to use the ImgTec infrared decoder From 890baa2ccb7f38abd7e8501852fdacbef81398b7 Mon Sep 17 00:00:00 2001 From: Philipp Rossak Date: Tue, 13 Feb 2018 07:29:47 -0500 Subject: [PATCH 0601/1640] UPSTREAM: media: rc: update sunxi-ir driver to get base clock frequency from devicetree This patch updates the sunxi-ir driver to set the base clock frequency from devicetree. This is necessary since there are different IR receivers on the market that operate at different frequencies, so this value can be set whenever the attached IR receiver needs a base clock frequency other than the default 8 MHz. Signed-off-by: Philipp Rossak Reviewed-by: Andi Shyti Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/sunxi-cir.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/media/rc/sunxi-cir.c b/drivers/media/rc/sunxi-cir.c index 97f367b446c4..f500cea228a9 100644 --- a/drivers/media/rc/sunxi-cir.c +++ b/drivers/media/rc/sunxi-cir.c @@ -72,12 +72,8 @@ /* CIR_REG register idle threshold */ #define REG_CIR_ITHR(val) (((val) << 8) & (GENMASK(15, 8))) -/* Required frequency for IR0 or IR1 clock in CIR mode */ +/* Required frequency for IR0 or IR1 clock in CIR mode (default) */ #define SUNXI_IR_BASE_CLK 8000000 -/* Frequency after IR internal divider */ -#define SUNXI_IR_CLK (SUNXI_IR_BASE_CLK / 64) -/* Sample period in ns */ -#define SUNXI_IR_SAMPLE (1000000000ul / SUNXI_IR_CLK) /* Noise threshold in samples */ #define SUNXI_IR_RXNOISE 1 /* Idle Threshold in samples */ @@ -122,7 +118,8 @@ static irqreturn_t sunxi_ir_irq(int irqno, void *dev_id) /* for each bit in fifo */ dt = readb(ir->base + SUNXI_IR_RXFIFO_REG); rawir.pulse = (dt & 0x80) != 0; - rawir.duration = ((dt & 0x7f) + 1) * SUNXI_IR_SAMPLE; + rawir.duration = ((dt & 0x7f) + 1) * + ir->rc->rx_resolution; ir_raw_event_store_with_filter(ir->rc, &rawir); } } @@ -148,6 +145,7 @@ static int sunxi_ir_probe(struct platform_device *pdev) struct device_node *dn = dev->of_node; struct resource *res; struct sunxi_ir *ir; + u32 b_clk_freq = SUNXI_IR_BASE_CLK; ir = devm_kzalloc(dev, sizeof(struct sunxi_ir), GFP_KERNEL); if (!ir) @@ -172,6 +170,9 @@ static int sunxi_ir_probe(struct platform_device *pdev) return PTR_ERR(ir->clk); } + /* Base clock frequency (optional) */ + of_property_read_u32(dn, "clock-frequency", &b_clk_freq); + /* Reset (optional) */ ir->rst = devm_reset_control_get_optional_exclusive(dev, NULL); if (IS_ERR(ir->rst)) @@ -180,11 +181,12 @@ static int sunxi_ir_probe(struct platform_device *pdev) if (ret) return ret; - ret = clk_set_rate(ir->clk, SUNXI_IR_BASE_CLK); + ret = clk_set_rate(ir->clk, b_clk_freq); if (ret) { dev_err(dev, "set ir base clock failed!\n"); goto exit_reset_assert; } + dev_dbg(dev, "set base clock frequency to %d Hz.\n", b_clk_freq); if (clk_prepare_enable(ir->apb_clk)) { dev_err(dev, "try to
enable apb_ir_clk failed\n"); @@ -225,7 +227,8 @@ static int sunxi_ir_probe(struct platform_device *pdev) ir->rc->map_name = ir->map_name ?: RC_MAP_EMPTY; ir->rc->dev.parent = dev; ir->rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; - ir->rc->rx_resolution = SUNXI_IR_SAMPLE; + /* Frequency after IR internal divider with sample period in ns */ + ir->rc->rx_resolution = (1000000000ul / (b_clk_freq / 64)); ir->rc->timeout = MS_TO_NS(SUNXI_IR_TIMEOUT); ir->rc->driver_name = SUNXI_IR_DEV; From cdeeea3b712692127c3a9ad0ab91411a70c5fafc Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 6 Jan 2018 07:24:50 -0500 Subject: [PATCH 0602/1640] UPSTREAM: media: Revert "[media] staging: lirc_imon: port remaining usb ids to imon and remove" This code was ported without the necessary hardware to test. There are multiple problems which are more easily solved by writing a separate driver. This reverts commit f41003a23a02dc7299539300f74360c2a932714a. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/imon.c | 135 +++------------------------------------- 1 file changed, 7 insertions(+), 128 deletions(-) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index ac67bd64486f..188ab40c91a9 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -92,7 +92,6 @@ struct imon_usb_dev_descr { __u16 flags; #define IMON_NO_FLAGS 0 #define IMON_NEED_20MS_PKT_DELAY 1 -#define IMON_IR_RAW 2 struct imon_panel_key_table key_table[]; }; @@ -123,12 +122,6 @@ struct imon_context { unsigned char usb_tx_buf[8]; unsigned int send_packet_delay; - struct rx_data { - int count; /* length of 0 or 1 sequence */ - int prev_bit; /* logic level of sequence */ - int initial_space; /* initial space flag */ - } rx; - struct tx_t { unsigned char data_buf[35]; /* user data buffer */ struct completion finished; /* wait for write to finish */ @@ -331,10 +324,6 @@ static const struct imon_usb_dev_descr imon_DH102 = { } }; -static const struct imon_usb_dev_descr imon_ir_raw = { - .flags = IMON_IR_RAW, -}; - /* * USB Device ID for iMON USB Control Boards * @@ -418,18 +407,6 @@ static const struct usb_device_id imon_usb_id_table[] = { /* device specifics unknown */ { USB_DEVICE(0x15c2, 0x0046), .driver_info = (unsigned long)&imon_default_table}, - /* TriGem iMON (IR only) -- TG_iMON.inf */ - { USB_DEVICE(0x0aa8, 0x8001), - .driver_info = (unsigned long)&imon_ir_raw}, - /* SoundGraph iMON (IR only) -- sg_imon.inf */ - { USB_DEVICE(0x04e8, 0xff30), - .driver_info = (unsigned long)&imon_ir_raw}, - /* SoundGraph iMON VFD (IR & VFD) -- iMON_VFD.inf */ - { USB_DEVICE(0x0aa8, 0xffda), - .driver_info = (unsigned long)&imon_ir_raw}, - /* SoundGraph iMON SS (IR & VFD) -- iMON_SS.inf */ - { USB_DEVICE(0x15c2, 0xffda), - .driver_info = (unsigned long)&imon_ir_raw}, {} }; @@ -1569,91 +1546,8 @@ static int imon_parse_press_type(struct imon_context *ictx, /* * Process the incoming packet */ -/* - * Convert bit count to time duration (in us) and submit - * the value to lirc_dev. 
- */ -static void submit_data(struct imon_context *context) -{ - DEFINE_IR_RAW_EVENT(ev); - - ev.pulse = context->rx.prev_bit; - ev.duration = US_TO_NS(context->rx.count * BIT_DURATION); - ir_raw_event_store_with_filter(context->rdev, &ev); -} - -/* - * Process the incoming packet - */ -static void imon_incoming_ir_raw(struct imon_context *context, +static void imon_incoming_packet(struct imon_context *ictx, struct urb *urb, int intf) -{ - int len = urb->actual_length; - unsigned char *buf = urb->transfer_buffer; - struct device *dev = context->dev; - int octet, bit; - unsigned char mask; - - if (len != 8) { - dev_warn(dev, "imon %s: invalid incoming packet size (len = %d, intf%d)\n", - __func__, len, intf); - return; - } - - if (debug) - dev_info(dev, "raw packet: %*ph\n", len, buf); - /* - * Translate received data to pulse and space lengths. - * Received data is active low, i.e. pulses are 0 and - * spaces are 1. - * - * My original algorithm was essentially similar to - * Changwoo Ryu's with the exception that he switched - * the incoming bits to active high and also fed an - * initial space to LIRC at the start of a new sequence - * if the previous bit was a pulse. - * - * I've decided to adopt his algorithm. - */ - - if (buf[7] == 1 && context->rx.initial_space) { - /* LIRC requires a leading space */ - context->rx.prev_bit = 0; - context->rx.count = 4; - submit_data(context); - context->rx.count = 0; - } - - for (octet = 0; octet < 5; ++octet) { - mask = 0x80; - for (bit = 0; bit < 8; ++bit) { - int curr_bit = !(buf[octet] & mask); - - if (curr_bit != context->rx.prev_bit) { - if (context->rx.count) { - submit_data(context); - context->rx.count = 0; - } - context->rx.prev_bit = curr_bit; - } - ++context->rx.count; - mask >>= 1; - } - } - - if (buf[7] == 10) { - if (context->rx.count) { - submit_data(context); - context->rx.count = 0; - } - context->rx.initial_space = context->rx.prev_bit; - } - - ir_raw_event_handle(context->rdev); -} - -static void imon_incoming_scancode(struct imon_context *ictx, - struct urb *urb, int intf) { int len = urb->actual_length; unsigned char *buf = urb->transfer_buffer; @@ -1836,10 +1730,7 @@ static void usb_rx_callback_intf0(struct urb *urb) break; case 0: - if (ictx->rdev->driver_type == RC_DRIVER_IR_RAW) - imon_incoming_ir_raw(ictx, urb, intfnum); - else - imon_incoming_scancode(ictx, urb, intfnum); + imon_incoming_packet(ictx, urb, intfnum); break; default: @@ -1880,10 +1771,7 @@ static void usb_rx_callback_intf1(struct urb *urb) break; case 0: - if (ictx->rdev->driver_type == RC_DRIVER_IR_RAW) - imon_incoming_ir_raw(ictx, urb, intfnum); - else - imon_incoming_scancode(ictx, urb, intfnum); + imon_incoming_packet(ictx, urb, intfnum); break; default: @@ -1997,14 +1885,11 @@ static void imon_set_display_type(struct imon_context *ictx) case 0x0041: case 0x0042: case 0x0043: - case 0x8001: - case 0xff30: configured_display_type = IMON_DISPLAY_TYPE_NONE; ictx->display_supported = false; break; case 0x0036: case 0x0044: - case 0xffda: default: configured_display_type = IMON_DISPLAY_TYPE_VFD; break; @@ -2029,8 +1914,7 @@ static struct rc_dev *imon_init_rdev(struct imon_context *ictx) static const unsigned char fp_packet[] = { 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88 }; - rdev = rc_allocate_device(ictx->dev_descr->flags & IMON_IR_RAW ? 
- RC_DRIVER_IR_RAW : RC_DRIVER_SCANCODE); + rdev = rc_allocate_device(RC_DRIVER_SCANCODE); if (!rdev) { dev_err(ictx->dev, "remote control dev allocation failed\n"); goto out; } @@ -2048,12 +1932,8 @@ static struct rc_dev *imon_init_rdev(struct imon_context *ictx) rdev->dev.parent = ictx->dev; rdev->priv = ictx; - if (ictx->dev_descr->flags & IMON_IR_RAW) - rdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; - else - /* iMON PAD or MCE */ - rdev->allowed_protocols = RC_PROTO_BIT_OTHER | - RC_PROTO_BIT_RC6_MCE; + /* iMON PAD or MCE */ + rdev->allowed_protocols = RC_PROTO_BIT_OTHER | RC_PROTO_BIT_RC6_MCE; rdev->change_protocol = imon_ir_change_protocol; rdev->driver_name = MOD_NAME; @@ -2071,8 +1951,7 @@ static struct rc_dev *imon_init_rdev(struct imon_context *ictx) imon_set_display_type(ictx); - if (ictx->rc_proto == RC_PROTO_BIT_RC6_MCE || - ictx->dev_descr->flags & IMON_IR_RAW) + if (ictx->rc_proto == RC_PROTO_BIT_RC6_MCE) rdev->map_name = RC_MAP_IMON_MCE; else rdev->map_name = RC_MAP_IMON_PAD; From 0d420bc0b091a37134506ae698bc123608c2a4f4 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 5 Mar 2018 08:32:14 -0500 Subject: [PATCH 0603/1640] UPSTREAM: media: rc: add keymap for iMON RSC remote Note that the stick on the remote is not supported yet. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/keymaps/Makefile | 1 + drivers/media/rc/keymaps/rc-imon-rsc.c | 81 ++++++++++++++++++++++++++ include/media/rc-map.h | 1 + 3 files changed, 83 insertions(+) create mode 100644 drivers/media/rc/keymaps/rc-imon-rsc.c diff --git a/drivers/media/rc/keymaps/Makefile b/drivers/media/rc/keymaps/Makefile index 50b319355edf..d6b913a3032d 100644 --- a/drivers/media/rc/keymaps/Makefile +++ b/drivers/media/rc/keymaps/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_RC_MAP) += rc-adstech-dvb-t-pci.o \ rc-hisi-tv-demo.o \ rc-imon-mce.o \ rc-imon-pad.o \ + rc-imon-rsc.o \ rc-iodata-bctv7e.o \ rc-it913x-v1.o \ rc-it913x-v2.o \ diff --git a/drivers/media/rc/keymaps/rc-imon-rsc.c b/drivers/media/rc/keymaps/rc-imon-rsc.c new file mode 100644 index 000000000000..83e4564aaa22 --- /dev/null +++ b/drivers/media/rc/keymaps/rc-imon-rsc.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Copyright (C) 2018 Sean Young + +#include +#include + +// +// Note that this remote has a stick which has its own IR protocol, +// with 16 directions. This is not supported yet.
+// +static struct rc_map_table imon_rsc[] = { + { 0x801010, KEY_EXIT }, + { 0x80102f, KEY_POWER }, + { 0x80104a, KEY_SCREENSAVER }, /* Screensaver */ + { 0x801049, KEY_TIME }, /* Timer */ + { 0x801054, KEY_NUMERIC_1 }, + { 0x801055, KEY_NUMERIC_2 }, + { 0x801056, KEY_NUMERIC_3 }, + { 0x801057, KEY_NUMERIC_4 }, + { 0x801058, KEY_NUMERIC_5 }, + { 0x801059, KEY_NUMERIC_6 }, + { 0x80105a, KEY_NUMERIC_7 }, + { 0x80105b, KEY_NUMERIC_8 }, + { 0x80105c, KEY_NUMERIC_9 }, + { 0x801081, KEY_SCREEN }, /* Desktop */ + { 0x80105d, KEY_NUMERIC_0 }, + { 0x801082, KEY_MAX }, + { 0x801048, KEY_ESC }, + { 0x80104b, KEY_MEDIA }, /* Windows key */ + { 0x801083, KEY_MENU }, + { 0x801045, KEY_APPSELECT }, /* app launcher */ + { 0x801084, KEY_STOP }, + { 0x801046, KEY_CYCLEWINDOWS }, + { 0x801085, KEY_BACKSPACE }, + { 0x801086, KEY_KEYBOARD }, + { 0x801087, KEY_SPACE }, + { 0x80101e, KEY_RESERVED }, /* shift tab */ + { 0x801098, BTN_0 }, + { 0x80101f, KEY_TAB }, + { 0x80101b, BTN_LEFT }, + { 0x80101d, BTN_RIGHT }, + { 0x801016, BTN_MIDDLE }, /* drag and drop */ + { 0x801088, KEY_MUTE }, + { 0x80105e, KEY_VOLUMEDOWN }, + { 0x80105f, KEY_VOLUMEUP }, + { 0x80104c, KEY_PLAY }, + { 0x80104d, KEY_PAUSE }, + { 0x80104f, KEY_EJECTCD }, + { 0x801050, KEY_PREVIOUS }, + { 0x801051, KEY_NEXT }, + { 0x80104e, KEY_STOP }, + { 0x801052, KEY_REWIND }, + { 0x801053, KEY_FASTFORWARD }, + { 0x801089, KEY_ZOOM } /* full screen */ +}; + +static struct rc_map_list imon_rsc_map = { + .map = { + .scan = imon_rsc, + .size = ARRAY_SIZE(imon_rsc), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_IMON_RSC, + } +}; + +static int __init init_rc_map_imon_rsc(void) +{ + return rc_map_register(&imon_rsc_map); +} + +static void __exit exit_rc_map_imon_rsc(void) +{ + rc_map_unregister(&imon_rsc_map); +} + +module_init(init_rc_map_imon_rsc) +module_exit(exit_rc_map_imon_rsc) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sean Young "); diff --git a/include/media/rc-map.h b/include/media/rc-map.h index 7046734b3895..7fc84991bd12 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -211,6 +211,7 @@ struct rc_map *rc_map_get(const char *name); #define RC_MAP_HISI_TV_DEMO "rc-hisi-tv-demo" #define RC_MAP_IMON_MCE "rc-imon-mce" #define RC_MAP_IMON_PAD "rc-imon-pad" +#define RC_MAP_IMON_RSC "rc-imon-rsc" #define RC_MAP_IODATA_BCTV7E "rc-iodata-bctv7e" #define RC_MAP_IT913X_V1 "rc-it913x-v1" #define RC_MAP_IT913X_V2 "rc-it913x-v2" From 8f623c7416634ed8eeb7323a5d58d812af4a5539 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 5 Jan 2018 14:58:51 -0500 Subject: [PATCH 0604/1640] UPSTREAM: media: rc: new driver for early iMon device These devices were supported by the lirc_imon.c driver which was removed from staging in commit f41003a23a02 ("[media] staging: lirc_imon: port remaining usb ids to imon and remove"). 
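The sampling scheme the new driver decodes is simple enough to exercise on its own: each of the first five bytes in an 8-byte packet carries eight 250us samples, MSB first, with a clear bit meaning the receiver saw IR light (a pulse) and a set bit meaning silence (a space). Below is a minimal standalone sketch of that run-length decode, mirroring what imon_ir_data() does in the driver; the sample buffer is made up for illustration, and the bounds check is hoisted before the bit test for clarity.

/* Standalone sketch of the imon_raw run-length decode (illustrative only). */
#include <stdio.h>

#define BIT_DURATION 250 /* us per sample bit */

static int is_bit_set(const unsigned char *buf, int bit)
{
        return buf[bit / 8] & (0x80 >> (bit & 7));
}

int main(void)
{
        /* five sample bytes: 0-bits = IR seen (pulse), 1-bits = silence (space) */
        const unsigned char buf[5] = { 0x00, 0x0f, 0xff, 0x00, 0xff };
        int offset = 0, size = 5 * 8, bit;

        while (offset < size) {
                for (bit = offset; bit < size && !is_bit_set(buf, bit); bit++)
                        ; /* run of clear bits -> pulse */
                if (bit > offset)
                        printf("pulse %d us\n", (bit - offset) * BIT_DURATION);
                if (bit >= size)
                        break;
                for (offset = bit; bit < size && is_bit_set(buf, bit); bit++)
                        ; /* run of set bits -> space */
                printf("space %d us\n", (bit - offset) * BIT_DURATION);
                offset = bit;
        }
        return 0;
}

With the buffer above this prints pulse 3000 us, space 3000 us, pulse 2000 us, space 2000 us.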
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- MAINTAINERS | 7 ++ drivers/media/rc/Kconfig | 12 +++ drivers/media/rc/Makefile | 1 + drivers/media/rc/imon_raw.c | 199 ++++++++++++++++++++++++++++++++++++ 4 files changed, 219 insertions(+) create mode 100644 drivers/media/rc/imon_raw.c diff --git a/MAINTAINERS b/MAINTAINERS index bf8526bd9ed9..90fb5968c43c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6758,6 +6758,13 @@ M: James Hogan S: Maintained F: drivers/media/rc/img-ir/ +IMON SOUNDGRAPH USB IR RECEIVER +M: Sean Young +L: linux-media@vger.kernel.org +S: Maintained +F: drivers/media/rc/imon_raw.c +F: drivers/media/rc/imon.c + IMS TWINTURBO FRAMEBUFFER DRIVER L: linux-fbdev@vger.kernel.org S: Orphan diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 2a52dd4defd4..73de36430c57 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -175,6 +175,18 @@ config IR_IMON To compile this driver as a module, choose M here: the module will be called imon. +config IR_IMON_RAW + tristate "SoundGraph iMON Receiver (early raw IR models)" + depends on USB_ARCH_HAS_HCD + depends on RC_CORE + select USB + ---help--- + Say Y here if you want to use a SoundGraph iMON IR Receiver, + early raw models. + + To compile this driver as a module, choose M here: the + module will be called imon_raw. + config IR_MCEUSB tristate "Windows Media Center Ed. eHome Infrared Transceiver" depends on USB_ARCH_HAS_HCD diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile index 740abd46b1be..a60a11850672 100644 --- a/drivers/media/rc/Makefile +++ b/drivers/media/rc/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_IR_XMP_DECODER) += ir-xmp-decoder.o obj-$(CONFIG_RC_ATI_REMOTE) += ati_remote.o obj-$(CONFIG_IR_HIX5HD2) += ir-hix5hd2.o obj-$(CONFIG_IR_IMON) += imon.o +obj-$(CONFIG_IR_IMON_RAW) += imon_raw.o obj-$(CONFIG_IR_ITE_CIR) += ite-cir.o obj-$(CONFIG_IR_MCEUSB) += mceusb.o obj-$(CONFIG_IR_FINTEK) += fintek-cir.o diff --git a/drivers/media/rc/imon_raw.c b/drivers/media/rc/imon_raw.c new file mode 100644 index 000000000000..32709f96de14 --- /dev/null +++ b/drivers/media/rc/imon_raw.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Copyright (C) 2018 Sean Young + +#include +#include +#include +#include + +/* Each bit is 250us */ +#define BIT_DURATION 250000 + +struct imon { + struct device *dev; + struct urb *ir_urb; + struct rc_dev *rcdev; + u8 ir_buf[8]; + char phys[64]; +}; + +/* + * ffs/find_next_bit() searches in the wrong direction, so open-code our own. 
+ */ +static inline int is_bit_set(const u8 *buf, int bit) +{ + return buf[bit / 8] & (0x80 >> (bit & 7)); +} + +static void imon_ir_data(struct imon *imon) +{ + DEFINE_IR_RAW_EVENT(rawir); + int offset = 0, size = 5 * 8; + int bit; + + dev_dbg(imon->dev, "data: %*ph", 8, imon->ir_buf); + + while (offset < size) { + bit = offset; + while (!is_bit_set(imon->ir_buf, bit) && bit < size) + bit++; + dev_dbg(imon->dev, "pulse: %d bits", bit - offset); + if (bit > offset) { + rawir.pulse = true; + rawir.duration = (bit - offset) * BIT_DURATION; + ir_raw_event_store_with_filter(imon->rcdev, &rawir); + } + + if (bit >= size) + break; + + offset = bit; + while (is_bit_set(imon->ir_buf, bit) && bit < size) + bit++; + dev_dbg(imon->dev, "space: %d bits", bit - offset); + + rawir.pulse = false; + rawir.duration = (bit - offset) * BIT_DURATION; + ir_raw_event_store_with_filter(imon->rcdev, &rawir); + + offset = bit; + } + + if (imon->ir_buf[7] == 0x0a) { + ir_raw_event_set_idle(imon->rcdev, true); + ir_raw_event_handle(imon->rcdev); + } +} + +static void imon_ir_rx(struct urb *urb) +{ + struct imon *imon = urb->context; + int ret; + + switch (urb->status) { + case 0: + if (imon->ir_buf[7] != 0xff) + imon_ir_data(imon); + break; + case -ECONNRESET: + case -ENOENT: + case -ESHUTDOWN: + usb_unlink_urb(urb); + return; + case -EPIPE: + default: + dev_dbg(imon->dev, "error: urb status = %d", urb->status); + break; + } + + ret = usb_submit_urb(urb, GFP_ATOMIC); + if (ret && ret != -ENODEV) + dev_warn(imon->dev, "failed to resubmit urb: %d", ret); +} + +static int imon_probe(struct usb_interface *intf, + const struct usb_device_id *id) +{ + struct usb_endpoint_descriptor *ir_ep = NULL; + struct usb_host_interface *idesc; + struct usb_device *udev; + struct rc_dev *rcdev; + struct imon *imon; + int i, ret; + + udev = interface_to_usbdev(intf); + idesc = intf->cur_altsetting; + + for (i = 0; i < idesc->desc.bNumEndpoints; i++) { + struct usb_endpoint_descriptor *ep = &idesc->endpoint[i].desc; + + if (usb_endpoint_is_int_in(ep)) { + ir_ep = ep; + break; + } + } + + if (!ir_ep) { + dev_err(&intf->dev, "IR endpoint missing"); + return -ENODEV; + } + + imon = devm_kmalloc(&intf->dev, sizeof(*imon), GFP_KERNEL); + if (!imon) + return -ENOMEM; + + imon->ir_urb = usb_alloc_urb(0, GFP_KERNEL); + if (!imon->ir_urb) + return -ENOMEM; + + imon->dev = &intf->dev; + usb_fill_int_urb(imon->ir_urb, udev, + usb_rcvintpipe(udev, ir_ep->bEndpointAddress), + imon->ir_buf, sizeof(imon->ir_buf), + imon_ir_rx, imon, ir_ep->bInterval); + + rcdev = devm_rc_allocate_device(&intf->dev, RC_DRIVER_IR_RAW); + if (!rcdev) { + ret = -ENOMEM; + goto free_urb; + } + + usb_make_path(udev, imon->phys, sizeof(imon->phys)); + + rcdev->device_name = "iMON Station"; + rcdev->driver_name = KBUILD_MODNAME; + rcdev->input_phys = imon->phys; + usb_to_input_id(udev, &rcdev->input_id); + rcdev->dev.parent = &intf->dev; + rcdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; + rcdev->map_name = RC_MAP_IMON_RSC; + rcdev->rx_resolution = BIT_DURATION; + rcdev->priv = imon; + + ret = devm_rc_register_device(&intf->dev, rcdev); + if (ret) + goto free_urb; + + imon->rcdev = rcdev; + + ret = usb_submit_urb(imon->ir_urb, GFP_KERNEL); + if (ret) + goto free_urb; + + usb_set_intfdata(intf, imon); + + return 0; + +free_urb: + usb_free_urb(imon->ir_urb); + return ret; +} + +static void imon_disconnect(struct usb_interface *intf) +{ + struct imon *imon = usb_get_intfdata(intf); + + usb_kill_urb(imon->ir_urb); + usb_free_urb(imon->ir_urb); +} + +static const struct 
usb_device_id imon_table[] = { + /* SoundGraph iMON (IR only) -- sg_imon.inf */ + { USB_DEVICE(0x04e8, 0xff30) }, + {} +}; + +static struct usb_driver imon_driver = { + .name = KBUILD_MODNAME, + .probe = imon_probe, + .disconnect = imon_disconnect, + .id_table = imon_table +}; + +module_usb_driver(imon_driver); + +MODULE_DESCRIPTION("Early raw iMON IR devices"); +MODULE_AUTHOR("Sean Young "); +MODULE_LICENSE("GPL"); +MODULE_DEVICE_TABLE(usb, imon_table); From 77678c18d6a3e0923cd55b69fbfb62b684c1f249 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 6 Mar 2018 08:57:57 -0500 Subject: [PATCH 0605/1640] UPSTREAM: media: rc: oops in ir_timer_keyup after device unplug If there is IR in the raw kfifo when ir_raw_event_unregister() is called, then kthread_stop() causes ir_raw_event_thread to be scheduled, decode some scancodes and re-arm timer_keyup. The timer_keyup then fires when the rc device is long gone. Cc: stable@vger.kernel.org Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 4a952108ba1e..8621761a680f 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1932,12 +1932,12 @@ void rc_unregister_device(struct rc_dev *dev) if (!dev) return; - del_timer_sync(&dev->timer_keyup); - del_timer_sync(&dev->timer_repeat); - if (dev->driver_type == RC_DRIVER_IR_RAW) ir_raw_event_unregister(dev); + del_timer_sync(&dev->timer_keyup); + del_timer_sync(&dev->timer_repeat); + rc_free_rx_device(dev); mutex_lock(&dev->lock); From 288e9cb62741772754cedcb9ffb58df63dee75df Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 3 Dec 2017 11:06:54 -0500 Subject: [PATCH 0606/1640] UPSTREAM: media: rc: add new imon protocol decoder and encoder This makes it possible to use the various iMON remotes with any raw IR RC device. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 9 ++ drivers/media/rc/Makefile | 1 + drivers/media/rc/ir-imon-decoder.c | 193 +++++++++++++++++++++++++ drivers/media/rc/keymaps/rc-imon-pad.c | 3 +- drivers/media/rc/rc-core-priv.h | 6 + drivers/media/rc/rc-main.c | 3 + include/media/rc-map.h | 8 +- include/uapi/linux/lirc.h | 2 + 8 files changed, 220 insertions(+), 5 deletions(-) create mode 100644 drivers/media/rc/ir-imon-decoder.c diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 73de36430c57..149394676c4a 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -111,6 +111,15 @@ config IR_XMP_DECODER ---help--- Enable this option if you have IR with XMP protocol, and if the IR is decoded in software + +config IR_IMON_DECODER + tristate "Enable IR raw decoder for the iMON protocol" + depends on RC_CORE + ---help--- + Enable this option if you have iMON PAD or Antec Veris infrared + remote control and you would like to use it with a raw IR + receiver, or if you wish to use an encoder to transmit this IR. 
+ endif #RC_DECODERS menuconfig RC_DEVICES diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile index a60a11850672..81cb22d1cffd 100644 --- a/drivers/media/rc/Makefile +++ b/drivers/media/rc/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_IR_SANYO_DECODER) += ir-sanyo-decoder.o obj-$(CONFIG_IR_SHARP_DECODER) += ir-sharp-decoder.o obj-$(CONFIG_IR_MCE_KBD_DECODER) += ir-mce_kbd-decoder.o obj-$(CONFIG_IR_XMP_DECODER) += ir-xmp-decoder.o +obj-$(CONFIG_IR_IMON_DECODER) += ir-imon-decoder.o # stand-alone IR receivers/transmitters obj-$(CONFIG_RC_ATI_REMOTE) += ati_remote.o diff --git a/drivers/media/rc/ir-imon-decoder.c b/drivers/media/rc/ir-imon-decoder.c new file mode 100644 index 000000000000..a1ff06a26542 --- /dev/null +++ b/drivers/media/rc/ir-imon-decoder.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0+ +// ir-imon-decoder.c - handle iMon protocol +// +// Copyright (C) 2018 by Sean Young + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include "rc-core-priv.h" + +#define IMON_UNIT 415662 /* ns */ +#define IMON_BITS 30 +#define IMON_CHKBITS (BIT(30) | BIT(25) | BIT(24) | BIT(22) | \ + BIT(21) | BIT(20) | BIT(19) | BIT(18) | \ + BIT(17) | BIT(16) | BIT(14) | BIT(13) | \ + BIT(12) | BIT(11) | BIT(10) | BIT(9)) + +/* + * This protocol has 30 bits. The format is one IMON_UNIT header pulse, + * followed by 30 bits. Each bit is one IMON_UNIT check field, and then + * one IMON_UNIT field with the actual bit (1=space, 0=pulse). + * The check field is always space for some bits, for others it is pulse if + * both the preceding and current bit are zero, else space. IMON_CHKBITS + * defines which bits are of type check. + * + * There is no way to distinguish an incomplete message from one where + * the lower bits are all set, iow. the last pulse is for the lowest + * bit which is 0. 
+ */ +enum imon_state { + STATE_INACTIVE, + STATE_BIT_CHK, + STATE_BIT_START, + STATE_FINISHED +}; + +/** + * ir_imon_decode() - Decode one iMON pulse or space + * @dev: the struct rc_dev descriptor of the device + * @ev: the struct ir_raw_event descriptor of the pulse/space + * + * This function returns -EINVAL if the pulse violates the state machine + */ +static int ir_imon_decode(struct rc_dev *dev, struct ir_raw_event ev) +{ + struct imon_dec *data = &dev->raw->imon; + + if (!is_timing_event(ev)) { + if (ev.reset) + data->state = STATE_INACTIVE; + return 0; + } + + dev_dbg(&dev->dev, + "iMON decode started at state %d bitno %d (%uus %s)\n", + data->state, data->count, TO_US(ev.duration), + TO_STR(ev.pulse)); + + for (;;) { + if (!geq_margin(ev.duration, IMON_UNIT, IMON_UNIT / 2)) + return 0; + + decrease_duration(&ev, IMON_UNIT); + + switch (data->state) { + case STATE_INACTIVE: + if (ev.pulse) { + data->state = STATE_BIT_CHK; + data->bits = 0; + data->count = IMON_BITS; + } + break; + case STATE_BIT_CHK: + if (IMON_CHKBITS & BIT(data->count)) + data->last_chk = ev.pulse; + else if (ev.pulse) + goto err_out; + data->state = STATE_BIT_START; + break; + case STATE_BIT_START: + data->bits <<= 1; + if (!ev.pulse) + data->bits |= 1; + + if (IMON_CHKBITS & BIT(data->count)) { + if (data->last_chk != !(data->bits & 3)) + goto err_out; + } + + if (!data->count--) + data->state = STATE_FINISHED; + else + data->state = STATE_BIT_CHK; + break; + case STATE_FINISHED: + if (ev.pulse) + goto err_out; + rc_keydown(dev, RC_PROTO_IMON, data->bits, 0); + data->state = STATE_INACTIVE; + break; + } + } + +err_out: + dev_dbg(&dev->dev, + "iMON decode failed at state %d bitno %d (%uus %s)\n", + data->state, data->count, TO_US(ev.duration), + TO_STR(ev.pulse)); + + data->state = STATE_INACTIVE; + + return -EINVAL; +} + +/** + * ir_imon_encode() - Encode a scancode as a stream of raw events + * + * @protocol: protocol to encode + * @scancode: scancode to encode + * @events: array of raw ir events to write into + * @max: maximum size of @events + * + * Returns: The number of events written. + * -ENOBUFS if there isn't enough space in the array to fit the + * encoding. In this case all @max events will have been written. 
+ */ +static int ir_imon_encode(enum rc_proto protocol, u32 scancode, + struct ir_raw_event *events, unsigned int max) +{ + struct ir_raw_event *e = events; + int i, pulse; + + if (!max--) + return -ENOBUFS; + init_ir_raw_event_duration(e, 1, IMON_UNIT); + + for (i = IMON_BITS; i >= 0; i--) { + if (BIT(i) & IMON_CHKBITS) + pulse = !(scancode & (BIT(i) | BIT(i + 1))); + else + pulse = 0; + + if (pulse == e->pulse) { + e->duration += IMON_UNIT; + } else { + if (!max--) + return -ENOBUFS; + init_ir_raw_event_duration(++e, pulse, IMON_UNIT); + } + + pulse = !(scancode & BIT(i)); + + if (pulse == e->pulse) { + e->duration += IMON_UNIT; + } else { + if (!max--) + return -ENOBUFS; + init_ir_raw_event_duration(++e, pulse, IMON_UNIT); + } + } + + if (e->pulse) + e++; + + return e - events; +} + +static struct ir_raw_handler imon_handler = { + .protocols = RC_PROTO_BIT_IMON, + .decode = ir_imon_decode, + .encode = ir_imon_encode, + .carrier = 38000, +}; + +static int __init ir_imon_decode_init(void) +{ + ir_raw_handler_register(&imon_handler); + + pr_info("IR iMON protocol handler initialized\n"); + return 0; +} + +static void __exit ir_imon_decode_exit(void) +{ + ir_raw_handler_unregister(&imon_handler); +} + +module_init(ir_imon_decode_init); +module_exit(ir_imon_decode_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sean Young "); +MODULE_DESCRIPTION("iMON IR protocol decoder"); diff --git a/drivers/media/rc/keymaps/rc-imon-pad.c b/drivers/media/rc/keymaps/rc-imon-pad.c index a7296ffbf218..8501cf0a3253 100644 --- a/drivers/media/rc/keymaps/rc-imon-pad.c +++ b/drivers/media/rc/keymaps/rc-imon-pad.c @@ -134,8 +134,7 @@ static struct rc_map_list imon_pad_map = { .map = { .scan = imon_pad, .size = ARRAY_SIZE(imon_pad), - /* actual protocol details unknown, hardware decoder */ - .rc_proto = RC_PROTO_OTHER, + .rc_proto = RC_PROTO_IMON, .name = RC_MAP_IMON_PAD, } }; diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index 5e80b4273e2d..e0e6a17460f6 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -118,6 +118,12 @@ struct ir_raw_event_ctrl { unsigned count; u32 durations[16]; } xmp; + struct imon_dec { + int state; + int count; + int last_chk; + unsigned int bits; + } imon; }; /* macros for IR decoders */ diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 8621761a680f..b67be33bd62f 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -68,6 +68,8 @@ static const struct { .scancode_bits = 0x1fff, .repeat_period = 250 }, [RC_PROTO_XMP] = { .name = "xmp", .repeat_period = 250 }, [RC_PROTO_CEC] = { .name = "cec", .repeat_period = 550 }, + [RC_PROTO_IMON] = { .name = "imon", + .scancode_bits = 0x7fffffff, .repeat_period = 250 }, }; /* Used to keep track of known keymaps */ @@ -1004,6 +1006,7 @@ static const struct { RC_PROTO_BIT_MCIR2_MSE, "mce_kbd", "ir-mce_kbd-decoder" }, { RC_PROTO_BIT_XMP, "xmp", "ir-xmp-decoder" }, { RC_PROTO_BIT_CEC, "cec", NULL }, + { RC_PROTO_BIT_IMON, "imon", "ir-imon-decoder" }, }; /** diff --git a/include/media/rc-map.h b/include/media/rc-map.h index 7fc84991bd12..bfa3017cecba 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -36,6 +36,7 @@ #define RC_PROTO_BIT_SHARP BIT_ULL(RC_PROTO_SHARP) #define RC_PROTO_BIT_XMP BIT_ULL(RC_PROTO_XMP) #define RC_PROTO_BIT_CEC BIT_ULL(RC_PROTO_CEC) +#define RC_PROTO_BIT_IMON BIT_ULL(RC_PROTO_IMON) #define RC_PROTO_BIT_ALL \ (RC_PROTO_BIT_UNKNOWN | RC_PROTO_BIT_OTHER | \ @@ -49,7 +50,8 @@ RC_PROTO_BIT_RC6_0 | 
RC_PROTO_BIT_RC6_6A_20 | \ RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \ RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \ - RC_PROTO_BIT_XMP | RC_PROTO_BIT_CEC) + RC_PROTO_BIT_XMP | RC_PROTO_BIT_CEC | \ + RC_PROTO_BIT_IMON) /* All rc protocols for which we have decoders */ #define RC_PROTO_BIT_ALL_IR_DECODER \ (RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ @@ -62,7 +64,7 @@ RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \ RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \ RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \ - RC_PROTO_BIT_XMP) + RC_PROTO_BIT_XMP | RC_PROTO_BIT_IMON) #define RC_PROTO_BIT_ALL_IR_ENCODER \ (RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ @@ -75,7 +77,7 @@ RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \ RC_PROTO_BIT_RC6_6A_24 | \ RC_PROTO_BIT_RC6_6A_32 | RC_PROTO_BIT_RC6_MCE | \ - RC_PROTO_BIT_SHARP) + RC_PROTO_BIT_SHARP | RC_PROTO_BIT_IMON) #define RC_SCANCODE_UNKNOWN(x) (x) #define RC_SCANCODE_OTHER(x) (x) diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index 4fe580d36e41..948d9a491083 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -186,6 +186,7 @@ struct lirc_scancode { * @RC_PROTO_SHARP: Sharp protocol * @RC_PROTO_XMP: XMP protocol * @RC_PROTO_CEC: CEC protocol + * @RC_PROTO_IMON: iMon Pad protocol */ enum rc_proto { RC_PROTO_UNKNOWN = 0, @@ -211,6 +212,7 @@ enum rc_proto { RC_PROTO_SHARP = 20, RC_PROTO_XMP = 21, RC_PROTO_CEC = 22, + RC_PROTO_IMON = 23, }; #endif From a0c5d3c9e92ef6d3093755e21571a8b5d02a0ee7 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 7 Mar 2018 05:55:38 -0500 Subject: [PATCH 0607/1640] UPSTREAM: media: imon: rename protocol from other to imon This renames the protocol for the imon rc driver from other to imon, since it is now a known protocol. Although a different name will show up in the sysfs protocol file, loading a keymap using existing ir-keytable versions still works.
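As a worked illustration of the bit layout the ir-imon-decoder added above expects (a self-contained sketch, not kernel code): each scancode bit is sent as a check field followed by a data field, one time unit each, where a data 0 is a pulse and a data 1 is a space, and for positions listed in IMON_CHKBITS the check field is a pulse only when the current and the preceding scancode bits are both zero. Unlike ir_imon_encode(), the sketch prints the stream unit by unit and does not merge adjacent units of equal polarity.

/* Sketch: print the iMON pulse/space stream for one scancode (illustrative). */
#include <stdio.h>
#include <stdint.h>

#define BIT(n) (1UL << (n))
#define IMON_UNIT_US 416 /* one time unit, ~415.662us in the driver */
#define IMON_BITS 30
#define IMON_CHKBITS (BIT(30) | BIT(25) | BIT(24) | BIT(22) | \
                      BIT(21) | BIT(20) | BIT(19) | BIT(18) | \
                      BIT(17) | BIT(16) | BIT(14) | BIT(13) | \
                      BIT(12) | BIT(11) | BIT(10) | BIT(9))

static void emit(int pulse, int units)
{
        printf("%s %d us\n", pulse ? "pulse" : "space", units * IMON_UNIT_US);
}

int main(void)
{
        uint32_t scancode = 0x80102f; /* KEY_POWER in the rc-imon-rsc keymap */
        int i;

        emit(1, 1); /* header pulse */
        for (i = IMON_BITS; i >= 0; i--) {
                if (IMON_CHKBITS & BIT(i))
                        /* check: pulse iff this and the preceding bit are 0 */
                        emit(!(scancode & (BIT(i) | BIT(i + 1))), 1);
                else
                        emit(0, 1); /* non-check positions always carry a space */
                emit(!(scancode & BIT(i)), 1); /* data: 0 = pulse, 1 = space */
        }
        return 0;
}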
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/imon.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index 188ab40c91a9..864a0d36edab 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -1110,18 +1110,18 @@ static int imon_ir_change_protocol(struct rc_dev *rc, u64 *rc_proto) dev_dbg(dev, "Configuring IR receiver for MCE protocol\n"); ir_proto_packet[0] = 0x01; *rc_proto = RC_PROTO_BIT_RC6_MCE; - } else if (*rc_proto & RC_PROTO_BIT_OTHER) { + } else if (*rc_proto & RC_PROTO_BIT_IMON) { dev_dbg(dev, "Configuring IR receiver for iMON protocol\n"); if (!pad_stabilize) dev_dbg(dev, "PAD stabilize functionality disabled\n"); /* ir_proto_packet[0] = 0x00; // already the default */ - *rc_proto = RC_PROTO_BIT_OTHER; + *rc_proto = RC_PROTO_BIT_IMON; } else { dev_warn(dev, "Unsupported IR protocol specified, overriding to iMON IR protocol\n"); if (!pad_stabilize) dev_dbg(dev, "PAD stabilize functionality disabled\n"); /* ir_proto_packet[0] = 0x00; // already the default */ - *rc_proto = RC_PROTO_BIT_OTHER; + *rc_proto = RC_PROTO_BIT_IMON; } memcpy(ictx->usb_tx_buf, &ir_proto_packet, sizeof(ir_proto_packet)); @@ -1385,7 +1385,7 @@ static void imon_pad_to_keys(struct imon_context *ictx, unsigned char *buf) rel_x = buf[2]; rel_y = buf[3]; - if (ictx->rc_proto == RC_PROTO_BIT_OTHER && pad_stabilize) { + if (ictx->rc_proto == RC_PROTO_BIT_IMON && pad_stabilize) { if ((buf[1] == 0) && ((rel_x != 0) || (rel_y != 0))) { dir = stabilize((int)rel_x, (int)rel_y, timeout, threshold); @@ -1452,7 +1452,7 @@ static void imon_pad_to_keys(struct imon_context *ictx, unsigned char *buf) buf[0] = 0x01; buf[1] = buf[4] = buf[5] = buf[6] = buf[7] = 0; - if (ictx->rc_proto == RC_PROTO_BIT_OTHER && pad_stabilize) { + if (ictx->rc_proto == RC_PROTO_BIT_IMON && pad_stabilize) { dir = stabilize((int)rel_x, (int)rel_y, timeout, threshold); if (!dir) { @@ -1636,11 +1636,18 @@ static void imon_incoming_packet(struct imon_context *ictx, if (press_type == 0) rc_keyup(ictx->rdev); else { - if (ictx->rc_proto == RC_PROTO_BIT_RC6_MCE || - ictx->rc_proto == RC_PROTO_BIT_OTHER) - rc_keydown(ictx->rdev, - ictx->rc_proto == RC_PROTO_BIT_RC6_MCE ? RC_PROTO_RC6_MCE : RC_PROTO_OTHER, - ictx->rc_scancode, ictx->rc_toggle); + enum rc_proto proto; + + if (ictx->rc_proto == RC_PROTO_BIT_RC6_MCE) + proto = RC_PROTO_RC6_MCE; + else if (ictx->rc_proto == RC_PROTO_BIT_IMON) + proto = RC_PROTO_IMON; + else + return; + + rc_keydown(ictx->rdev, proto, ictx->rc_scancode, + ictx->rc_toggle); + spin_lock_irqsave(&ictx->kc_lock, flags); ictx->last_keycode = ictx->kc; spin_unlock_irqrestore(&ictx->kc_lock, flags); @@ -1797,7 +1804,7 @@ static void imon_get_ffdc_type(struct imon_context *ictx) { u8 ffdc_cfg_byte = ictx->usb_rx_buf[6]; u8 detected_display_type = IMON_DISPLAY_TYPE_NONE; - u64 allowed_protos = RC_PROTO_BIT_OTHER; + u64 allowed_protos = RC_PROTO_BIT_IMON; switch (ffdc_cfg_byte) { /* iMON Knob, no display, iMON IR + vol knob */ @@ -1845,8 +1852,10 @@ static void imon_get_ffdc_type(struct imon_context *ictx) default: dev_info(ictx->dev, "Unknown 0xffdc device, defaulting to VFD and iMON IR"); detected_display_type = IMON_DISPLAY_TYPE_VFD; - /* We don't know which one it is, allow user to set the - * RC6 one from userspace if OTHER wasn't correct. */ + /* + * We don't know which one it is, allow user to set the + * RC6 one from userspace if IMON wasn't correct. 
+ */ allowed_protos |= RC_PROTO_BIT_RC6_MCE; break; } @@ -1933,7 +1942,7 @@ static struct rc_dev *imon_init_rdev(struct imon_context *ictx) rdev->priv = ictx; /* iMON PAD or MCE */ - rdev->allowed_protocols = RC_PROTO_BIT_OTHER | RC_PROTO_BIT_RC6_MCE; + rdev->allowed_protocols = RC_PROTO_BIT_IMON | RC_PROTO_BIT_RC6_MCE; rdev->change_protocol = imon_ir_change_protocol; rdev->driver_name = MOD_NAME; From 923b66fa5cc6c99956ba6c15752ac1bfe287d9aa Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 8 Mar 2018 09:42:44 -0500 Subject: [PATCH 0608/1640] UPSTREAM: media: rc: meson-ir: add timeout on idle Meson doesn't seem to be able to generate timeout events in hardware. So install a software timer to generate the timeout events required by the decoders to prevent "ghost keypresses". Reported-by: Matthias Reichl Tested-by: Matthias Reichl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/meson-ir.c | 3 +-- drivers/media/rc/rc-ir-raw.c | 30 +++++++++++++++++++++++++++--- include/media/rc-core.h | 4 +++- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/drivers/media/rc/meson-ir.c b/drivers/media/rc/meson-ir.c index f2204eb77e2a..64b0aa4f4db7 100644 --- a/drivers/media/rc/meson-ir.c +++ b/drivers/media/rc/meson-ir.c @@ -97,8 +97,7 @@ static irqreturn_t meson_ir_irq(int irqno, void *dev_id) status = readl_relaxed(ir->reg + IR_DEC_STATUS); rawir.pulse = !!(status & STATUS_IR_DEC_IN); - ir_raw_event_store(ir->rc, &rawir); - ir_raw_event_handle(ir->rc); + ir_raw_event_store_with_timeout(ir->rc, &rawir); spin_unlock(&ir->lock); diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 984bb82851f9..374f83105a23 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -92,7 +92,6 @@ int ir_raw_event_store_edge(struct rc_dev *dev, bool pulse) { ktime_t now; DEFINE_IR_RAW_EVENT(ev); - int rc = 0; if (!dev->raw) return -EINVAL; @@ -101,8 +100,33 @@ int ir_raw_event_store_edge(struct rc_dev *dev, bool pulse) ev.duration = ktime_to_ns(ktime_sub(now, dev->raw->last_event)); ev.pulse = !pulse; + return ir_raw_event_store_with_timeout(dev, &ev); +} +EXPORT_SYMBOL_GPL(ir_raw_event_store_edge); + +/* + * ir_raw_event_store_with_timeout() - pass a pulse/space duration to the raw + * ir decoders, schedule decoding and + * timeout + * @dev: the struct rc_dev device descriptor + * @ev: the struct ir_raw_event descriptor of the pulse/space + * + * This routine (which may be called from an interrupt context) stores a + * pulse/space duration for the raw ir decoding state machines, schedules + * decoding and generates a timeout. 
+ */ +int ir_raw_event_store_with_timeout(struct rc_dev *dev, struct ir_raw_event *ev) +{ + ktime_t now; + int rc = 0; + + if (!dev->raw) + return -EINVAL; + + now = ktime_get(); + spin_lock(&dev->raw->edge_spinlock); - rc = ir_raw_event_store(dev, &ev); + rc = ir_raw_event_store(dev, ev); dev->raw->last_event = now; @@ -117,7 +141,7 @@ int ir_raw_event_store_edge(struct rc_dev *dev, bool pulse) return rc; } -EXPORT_SYMBOL_GPL(ir_raw_event_store_edge); +EXPORT_SYMBOL_GPL(ir_raw_event_store_with_timeout); /** * ir_raw_event_store_with_filter() - pass next pulse/space to decoders with some processing diff --git a/include/media/rc-core.h b/include/media/rc-core.h index fc3a92668bab..6742fd86ff65 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -334,7 +334,9 @@ void ir_raw_event_handle(struct rc_dev *dev); int ir_raw_event_store(struct rc_dev *dev, struct ir_raw_event *ev); int ir_raw_event_store_edge(struct rc_dev *dev, bool pulse); int ir_raw_event_store_with_filter(struct rc_dev *dev, - struct ir_raw_event *ev); + struct ir_raw_event *ev); +int ir_raw_event_store_with_timeout(struct rc_dev *dev, + struct ir_raw_event *ev); void ir_raw_event_set_idle(struct rc_dev *dev, bool idle); int ir_raw_encode_scancode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max); From 94b6e2dc4a010a05d19db94025b64bbdf0f527c3 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 12 Mar 2018 17:23:00 -0400 Subject: [PATCH 0609/1640] UPSTREAM: media: rc: meson-ir: lower timeout and make configurable A timeout of 200ms is much longer than necessary, and delays the decoding of a single scancode and the last scancode when a button is being held. This makes the remote seem sluggish. If the min_timeout and max_timeout values are set, the timeout is configurable via the LIRC_SET_REC_TIMEOUT ioctl. Tested-by: Matthias Reichl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/meson-ir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/meson-ir.c b/drivers/media/rc/meson-ir.c index 64b0aa4f4db7..f449b35d25e7 100644 --- a/drivers/media/rc/meson-ir.c +++ b/drivers/media/rc/meson-ir.c @@ -144,7 +144,9 @@ static int meson_ir_probe(struct platform_device *pdev) ir->rc->map_name = map_name ? map_name : RC_MAP_EMPTY; ir->rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; ir->rc->rx_resolution = US_TO_NS(MESON_TRATE); - ir->rc->timeout = MS_TO_NS(200); + ir->rc->min_timeout = 1; + ir->rc->timeout = IR_DEFAULT_TIMEOUT; + ir->rc->max_timeout = 10 * IR_DEFAULT_TIMEOUT; ir->rc->driver_name = DRIVER_NAME; spin_lock_init(&ir->lock); From eb84408e94650c2b4ef6b54ffb1328f4b57f2ecc Mon Sep 17 00:00:00 2001 From: A Sun Date: Fri, 16 Mar 2018 15:52:09 -0400 Subject: [PATCH 0610/1640] UPSTREAM: media: mceusb: add IR learning support features (IR carrier frequency measurement and wide-band/short-range receiver) Windows Media Center IR transceivers include two IR receivers: wide-band/short-range and narrow-band/long-range. The short-range (5cm distance) receiver is for IR learning and has IR carrier frequency measuring ability. Add mceusb driver support to select the short-range IR receiver and enable pass-through of its IR carrier frequency measurements. RC and LIRC already support these mceusb driver additions.
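The carrier measurement this enables reduces to cycles counted by the hardware divided by the total pulse-on time. A minimal sketch of the computation done in mceusb_handle_command() for the MCE_RSP_EQIRRXCFCNT reply, using made-up numbers and assuming the driver's 50us MCE_TIME_UNIT:

/* Sketch of the mceusb carrier computation (numbers are illustrative). */
#include <stdio.h>

#define MCE_TIME_UNIT 50 /* us per hardware time unit (assumed, per driver) */

int main(void)
{
        unsigned int carrier_cycles = 240; /* from the MCE_RSP_EQIRRXCFCNT reply */
        unsigned int pulse_tunit = 120;    /* cumulative IR "on" time, in units */
        unsigned int pulse_count = 6;      /* pulses seen in the interval */
        unsigned int cycles_fix = pulse_count; /* flags.rx2 == 2: one missed cycle per pulse */
        unsigned int carrier;

        /* 1000000 / MCE_TIME_UNIT turns cycles-per-time-unit into Hz */
        carrier = (1000000u / MCE_TIME_UNIT) *
                  (carrier_cycles + cycles_fix) / pulse_tunit;
        printf("carrier = %u Hz\n", carrier); /* 41000 Hz here */
        return 0;
}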
Test platform: Linux raspberrypi 4.9.59-v7+ #1047 SMP Sun Oct 29 12:19:23 GMT 2017 armv7l GNU/Linux mceusb 1-1.2:1.0: Registered Pinnacle Systems PCTV Remote USB with mce emulator interface version 1 mceusb 1-1.2:1.0: 2 tx ports (0x0 cabled) and 2 rx sensors (0x1 active) Sony TV remote control ir-ctl from v4l-utils pi@raspberrypi:~ $ ir-ctl -V IR raw version 1.12.3 pi@raspberrypi:~ $ ir-ctl -m -r ... pulse 650 space 550 pulse 650 space 600 pulse 600 space 600 pulse 1200 space 600 pulse 650 space 550 pulse 650 space 600 pulse 600 space 600 pulse 550 carrier 40004 space 16777215 ^C pi@raspberrypi:~ $ exit Signed-off-by: A Sun Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mceusb.c | 152 +++++++++++++++++++++++++++++++++++--- 1 file changed, 140 insertions(+), 12 deletions(-) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index a9187b0b46a1..f8c23d577493 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -42,7 +42,7 @@ #include #include -#define DRIVER_VERSION "1.93" +#define DRIVER_VERSION "1.94" #define DRIVER_AUTHOR "Jarod Wilson " #define DRIVER_DESC "Windows Media Center Ed. eHome Infrared Transceiver " \ "device driver" @@ -198,6 +198,13 @@ struct mceusb_model { u32 mce_gen3:1; u32 tx_mask_normal:1; u32 no_tx:1; + /* + * 2nd IR receiver (short-range, wideband) for learning mode: + * 0, absent 2nd receiver (rx2) + * 1, rx2 present + * 2, rx2 which under counts IR carrier cycles + */ + u32 rx2; int ir_intfnum; @@ -209,9 +216,11 @@ static const struct mceusb_model mceusb_model[] = { [MCE_GEN1] = { .mce_gen1 = 1, .tx_mask_normal = 1, + .rx2 = 2, }, [MCE_GEN2] = { .mce_gen2 = 1, + .rx2 = 2, }, [MCE_GEN2_NO_TX] = { .mce_gen2 = 1, @@ -220,10 +229,12 @@ static const struct mceusb_model mceusb_model[] = { [MCE_GEN2_TX_INV] = { .mce_gen2 = 1, .tx_mask_normal = 1, + .rx2 = 1, }, [MCE_GEN3] = { .mce_gen3 = 1, .tx_mask_normal = 1, + .rx2 = 2, }, [POLARIS_EVK] = { /* @@ -232,6 +243,7 @@ static const struct mceusb_model mceusb_model[] = { * to allow testing it */ .name = "Conexant Hybrid TV (cx231xx) MCE IR", + .rx2 = 2, }, [CX_HYBRID_TV] = { .no_tx = 1, /* tx isn't wired up at all */ @@ -244,10 +256,12 @@ static const struct mceusb_model mceusb_model[] = { [MULTIFUNCTION] = { .mce_gen2 = 1, .ir_intfnum = 2, + .rx2 = 2, }, [TIVO_KIT] = { .mce_gen2 = 1, .rc_map = RC_MAP_TIVO, + .rx2 = 2, }, [EVROMEDIA_FULL_HYBRID_FULLHD] = { .name = "Evromedia USB Full Hybrid Full HD", @@ -427,7 +441,8 @@ struct mceusb_dev { struct rc_dev *rc; /* optional features we can enable */ - bool learning_enabled; + bool carrier_report_enabled; + bool wideband_rx_enabled; /* aka learning mode, short-range rx */ /* core device bits */ struct device *dev; @@ -458,6 +473,7 @@ struct mceusb_dev { u32 tx_mask_normal:1; u32 microsoft_gen1:1; u32 no_tx:1; + u32 rx2; } flags; /* transmit support */ @@ -474,6 +490,11 @@ struct mceusb_dev { u8 num_rxports; /* number of receive sensors */ u8 txports_cabled; /* bitmask of transmitters with cable */ u8 rxports_active; /* bitmask of active receive sensors */ + bool learning_active; /* wideband rx is active */ + + /* receiver carrier frequency detection support */ + u32 pulse_tunit; /* IR pulse "on" cumulative time units */ + u32 pulse_count; /* pulse "on" count in measurement interval */ /* * support for async error handler mceusb_deferred_kevent() @@ -684,8 +705,8 @@ static void mceusb_dev_printdata(struct mceusb_dev *ir, u8 *buf, int buf_len, /* aka MCE_RSP_EQIRRXCFCNT */ if (out) dev_dbg(dev, "Get receive 
sensor"); - else if (ir->learning_enabled) - dev_dbg(dev, "RX pulse count: %d", + else + dev_dbg(dev, "RX carrier cycle count: %d", ((data[0] << 8) | data[1])); break; case MCE_RSP_EQIRNUMPORTS: @@ -955,6 +976,67 @@ static int mceusb_set_tx_carrier(struct rc_dev *dev, u32 carrier) return 0; } +/* + * Select or deselect the 2nd receiver port. + * Second receiver is learning mode, wide-band, short-range receiver. + * Only one receiver (long or short range) may be active at a time. + */ +static int mceusb_set_rx_wideband(struct rc_dev *dev, int enable) +{ + struct mceusb_dev *ir = dev->priv; + unsigned char cmdbuf[3] = { MCE_CMD_PORT_IR, + MCE_CMD_SETIRRXPORTEN, 0x00 }; + + dev_dbg(ir->dev, "select %s-range receive sensor", + enable ? "short" : "long"); + if (enable) { + ir->wideband_rx_enabled = true; + cmdbuf[2] = 2; /* port 2 is short range receiver */ + } else { + ir->wideband_rx_enabled = false; + cmdbuf[2] = 1; /* port 1 is long range receiver */ + } + mce_async_out(ir, cmdbuf, sizeof(cmdbuf)); + /* response from device sets ir->learning_active */ + + return 0; +} + +/* + * Enable/disable receiver carrier frequency pass through reporting. + * Only the short-range receiver has carrier frequency measuring capability. + * Implicitly select this receiver when enabling carrier frequency reporting. + */ +static int mceusb_set_rx_carrier_report(struct rc_dev *dev, int enable) +{ + struct mceusb_dev *ir = dev->priv; + unsigned char cmdbuf[3] = { MCE_CMD_PORT_IR, + MCE_CMD_SETIRRXPORTEN, 0x00 }; + + dev_dbg(ir->dev, "%s short-range receiver carrier reporting", + enable ? "enable" : "disable"); + if (enable) { + ir->carrier_report_enabled = true; + if (!ir->learning_active) { + cmdbuf[2] = 2; /* port 2 is short range receiver */ + mce_async_out(ir, cmdbuf, sizeof(cmdbuf)); + } + } else { + ir->carrier_report_enabled = false; + /* + * Revert to normal (long-range) receiver only if the + * wideband (short-range) receiver wasn't explicitly + * enabled. + */ + if (ir->learning_active && !ir->wideband_rx_enabled) { + cmdbuf[2] = 1; /* port 1 is long range receiver */ + mce_async_out(ir, cmdbuf, sizeof(cmdbuf)); + } + } + + return 0; +} + /* * We don't do anything but print debug spew for many of the command bits * we receive from the hardware, but some of them are useful information @@ -962,8 +1044,11 @@ static int mceusb_set_tx_carrier(struct rc_dev *dev, u32 carrier) */ static void mceusb_handle_command(struct mceusb_dev *ir, int index) { + DEFINE_IR_RAW_EVENT(rawir); u8 hi = ir->buf_in[index + 1] & 0xff; u8 lo = ir->buf_in[index + 2] & 0xff; + u32 carrier_cycles; + u32 cycles_fix; switch (ir->buf_in[index]) { /* the one and only 5-byte return value command */ @@ -980,6 +1065,33 @@ static void mceusb_handle_command(struct mceusb_dev *ir, int index) ir->num_txports = hi; ir->num_rxports = lo; break; + case MCE_RSP_EQIRRXCFCNT: + /* + * The carrier cycle counter can overflow and wrap around + * without notice from the device. So frequency measurement + * will be inaccurate with long duration IR. + * + * The long-range (non learning) receiver always reports + * zero count so we always ignore its report. + */ + if (ir->carrier_report_enabled && ir->learning_active && + ir->pulse_tunit > 0) { + carrier_cycles = (hi << 8 | lo); + /* + * Adjust carrier cycle count by adding + * 1 missed count per pulse "on" + */ + cycles_fix = ir->flags.rx2 == 2 ? 
ir->pulse_count : 0; + rawir.carrier_report = 1; + rawir.carrier = (1000000u / MCE_TIME_UNIT) * + (carrier_cycles + cycles_fix) / + ir->pulse_tunit; + dev_dbg(ir->dev, "RX carrier frequency %u Hz (pulse count = %u, cycles = %u, duration = %u, rx2 = %u)", + rawir.carrier, ir->pulse_count, carrier_cycles, + ir->pulse_tunit, ir->flags.rx2); + ir_raw_event_store(ir->rc, &rawir); + } + break; /* 1-byte return value commands */ case MCE_RSP_EQEMVER: @@ -989,8 +1101,12 @@ static void mceusb_handle_command(struct mceusb_dev *ir, int index) ir->tx_mask = hi; break; case MCE_RSP_EQIRRXPORTEN: - ir->learning_enabled = ((hi & 0x02) == 0x02); - ir->rxports_active = hi; + ir->learning_active = ((hi & 0x02) == 0x02); + if (ir->rxports_active != hi) { + dev_info(ir->dev, "%s-range (0x%x) receiver active", + ir->learning_active ? "short" : "long", hi); + ir->rxports_active = hi; + } break; case MCE_RSP_CMD_ILLEGAL: ir->need_reset = true; @@ -1027,12 +1143,16 @@ static void mceusb_process_ir_data(struct mceusb_dev *ir, int buf_len) ir->rem--; init_ir_raw_event(&rawir); rawir.pulse = ((ir->buf_in[i] & MCE_PULSE_BIT) != 0); - rawir.duration = (ir->buf_in[i] & MCE_PULSE_MASK) - * US_TO_NS(MCE_TIME_UNIT); + rawir.duration = (ir->buf_in[i] & MCE_PULSE_MASK); + if (rawir.pulse) { + ir->pulse_tunit += rawir.duration; + ir->pulse_count++; + } + rawir.duration *= US_TO_NS(MCE_TIME_UNIT); - dev_dbg(ir->dev, "Storing %s with duration %u", + dev_dbg(ir->dev, "Storing %s %u ns (%02x)", rawir.pulse ? "pulse" : "space", - rawir.duration); + rawir.duration, ir->buf_in[i]); if (ir_raw_event_store_with_filter(ir->rc, &rawir)) event = true; @@ -1053,10 +1173,13 @@ static void mceusb_process_ir_data(struct mceusb_dev *ir, int buf_len) ir->rem = (ir->cmd & MCE_PACKET_LENGTH_MASK); mceusb_dev_printdata(ir, ir->buf_in, buf_len, i, ir->rem + 1, false); - if (ir->rem) + if (ir->rem) { ir->parser_state = PARSE_IRDATA; - else + } else { ir_raw_event_reset(ir->rc); + ir->pulse_tunit = 0; + ir->pulse_count = 0; + } break; } @@ -1292,6 +1415,10 @@ static struct rc_dev *mceusb_init_rc_dev(struct mceusb_dev *ir) rc->s_tx_carrier = mceusb_set_tx_carrier; rc->tx_ir = mceusb_tx_ir; } + if (ir->flags.rx2 > 0) { + rc->s_learning_mode = mceusb_set_rx_wideband; + rc->s_carrier_report = mceusb_set_rx_carrier_report; + } rc->driver_name = DRIVER_NAME; switch (le16_to_cpu(udev->descriptor.idVendor)) { @@ -1406,6 +1533,7 @@ static int mceusb_dev_probe(struct usb_interface *intf, ir->flags.microsoft_gen1 = is_microsoft_gen1; ir->flags.tx_mask_normal = tx_mask_normal; ir->flags.no_tx = mceusb_model[model].no_tx; + ir->flags.rx2 = mceusb_model[model].rx2; ir->model = model; /* Saving usb interface data for use by the transmitter routine */ From 5f98652a247f038ccffbe31fa1b261fbabc93d3f Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 18 Mar 2018 06:46:02 -0400 Subject: [PATCH 0611/1640] UPSTREAM: media: rc: mceusb: pid 0x0609 vid 0x031d does not under report carrier cycles This mceusb does not need the carrier count quirk, with it set it reports the carrier higher than it is. 
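To see the effect being fixed here, reuse the numbers from the sketch above: a receiver that already reports full cycle counts would return carrier_cycles = 246, and applying the rx2 == 2 correction anyway would add pulse_count again, giving 20000 * (246 + 6) / 120 = 42000 Hz instead of 41000 Hz, i.e. roughly 2.4% high. Dropping the quirk for this device avoids that overshoot. (Numbers are illustrative, carried over from the earlier sketch.)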
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mceusb.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index f8c23d577493..69ba57372c05 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -182,6 +182,7 @@ enum mceusb_model_type { MCE_GEN1, MCE_GEN3, MCE_GEN2_TX_INV, + MCE_GEN2_TX_INV_RX_GOOD, POLARIS_EVK, CX_HYBRID_TV, MULTIFUNCTION, @@ -231,6 +232,11 @@ static const struct mceusb_model mceusb_model[] = { .tx_mask_normal = 1, .rx2 = 1, }, + [MCE_GEN2_TX_INV_RX_GOOD] = { + .mce_gen2 = 1, + .tx_mask_normal = 1, + .rx2 = 2, + }, [MCE_GEN3] = { .mce_gen3 = 1, .tx_mask_normal = 1, @@ -304,7 +310,7 @@ static const struct usb_device_id mceusb_dev_table[] = { .driver_info = MULTIFUNCTION }, /* SMK/Toshiba G83C0004D410 */ { USB_DEVICE(VENDOR_SMK, 0x031d), - .driver_info = MCE_GEN2_TX_INV }, + .driver_info = MCE_GEN2_TX_INV_RX_GOOD }, /* SMK eHome Infrared Transceiver (Sony VAIO) */ { USB_DEVICE(VENDOR_SMK, 0x0322), .driver_info = MCE_GEN2_TX_INV }, From 7c7496d352c7335256fcee336df222136be020be Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 16 Apr 2018 10:22:01 -0700 Subject: [PATCH 0612/1640] BACKPORT: ARM: OMAP2+: Drop unused pm-noop Looks like these functions don't do anything in the mainline kernel, so we can just drop them. Note that we must now also remove the ir-rx51 pdata as it relies on the dummy platform data that does not do anything. And ir-rx51 calls a pdata callback that doesn't do anything, without checking if it exists first. For configuring device-specific minimum latencies, the interface to use is pm_qos_add_request(). For an example, see what was done in commit 9834ffd1ecc3 ("ASoC: omap-mcbsp: Add PM QoS support for McBSP to prevent glitches"). I've added some comments to ir-rx51 so people using it can add pm_qos support and test it.
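For anyone following that suggestion, the pm_qos pattern being pointed at looks roughly like the sketch below; the request object, the 50us bound, and the start/stop placement are illustrative choices, not code from ir-rx51 or the McBSP commit.

/* Hypothetical sketch of the suggested pm_qos usage (not from this patch). */
#include <linux/pm_qos.h>

static struct pm_qos_request ir_qos_req;

static void ir_tx_start(void)
{
        /* bound CPU wakeup latency (in us) while bit-banging the IR signal */
        pm_qos_add_request(&ir_qos_req, PM_QOS_CPU_DMA_LATENCY, 50);
}

static void ir_tx_stop(void)
{
        pm_qos_remove_request(&ir_qos_req);
}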
Cc: Ivaylo Dimitrov Cc: Kevin Hilman Cc: Laurent Pinchart Cc: Tomi Valkeinen Acked-by: Mauro Carvalho Chehab Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/Makefile | 1 - arch/arm/mach-omap2/display.c | 7 - arch/arm/mach-omap2/hsmmc.c | 1 - arch/arm/mach-omap2/i2c.c | 1 - arch/arm/mach-omap2/io.c | 3 - arch/arm/mach-omap2/omap-pm-noop.c | 176 -------------------- arch/arm/mach-omap2/omap-pm.h | 161 ------------------ arch/arm/mach-omap2/pdata-quirks.c | 14 -- arch/arm/mach-omap2/pm-debug.c | 5 - arch/arm/mach-omap2/pm.c | 10 +- arch/arm/mach-omap2/timer.c | 1 - arch/arm/plat-omap/Kconfig | 10 -- drivers/media/rc/ir-rx51.c | 17 +- include/linux/platform_data/media/ir-rx51.h | 9 - 14 files changed, 4 insertions(+), 412 deletions(-) delete mode 100644 arch/arm/mach-omap2/omap-pm-noop.c delete mode 100644 arch/arm/mach-omap2/omap-pm.h delete mode 100644 include/linux/platform_data/media/ir-rx51.h diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile index 38f1748a4500..fdc8abb3d524 100644 --- a/arch/arm/mach-omap2/Makefile +++ b/arch/arm/mach-omap2/Makefile @@ -78,7 +78,6 @@ endif omap-4-5-pm-common = omap-mpuss-lowpower.o obj-$(CONFIG_ARCH_OMAP4) += $(omap-4-5-pm-common) obj-$(CONFIG_SOC_OMAP5) += $(omap-4-5-pm-common) -obj-$(CONFIG_OMAP_PM_NOOP) += omap-pm-noop.o ifeq ($(CONFIG_PM),y) obj-$(CONFIG_ARCH_OMAP2) += pm24xx.o diff --git a/arch/arm/mach-omap2/display.c b/arch/arm/mach-omap2/display.c index b01b7515b6cc..46cc20a72361 100644 --- a/arch/arm/mach-omap2/display.c +++ b/arch/arm/mach-omap2/display.c @@ -32,7 +32,6 @@ #include #include "omap_hwmod.h" #include "omap_device.h" -#include "omap-pm.h" #include "common.h" #include "soc.h" @@ -131,11 +130,6 @@ static void omap_dsi_disable_pads(int dsi_id, unsigned lane_mask) omap4_dsi_mux_pads(dsi_id, 0); } -static int omap_dss_set_min_bus_tput(struct device *dev, unsigned long tput) -{ - return omap_pm_set_min_bus_tput(dev, OCP_INITIATOR_AGENT, tput); -} - static enum omapdss_version __init omap_display_get_version(void) { if (cpu_is_omap24xx()) @@ -174,7 +168,6 @@ static int __init omapdss_init_fbdev(void) static struct omap_dss_board_info board_data = { .dsi_enable_pads = omap_dsi_enable_pads, .dsi_disable_pads = omap_dsi_disable_pads, - .set_min_bus_tput = omap_dss_set_min_bus_tput, }; struct device_node *node; int r; diff --git a/arch/arm/mach-omap2/hsmmc.c b/arch/arm/mach-omap2/hsmmc.c index 6d28aa20a7d3..eb3aece1f8f7 100644 --- a/arch/arm/mach-omap2/hsmmc.c +++ b/arch/arm/mach-omap2/hsmmc.c @@ -20,7 +20,6 @@ #include "soc.h" #include "omap_device.h" -#include "omap-pm.h" #include "hsmmc.h" #include "control.h" diff --git a/arch/arm/mach-omap2/i2c.c b/arch/arm/mach-omap2/i2c.c index 91a21c3923b2..37ff25ee3d89 100644 --- a/arch/arm/mach-omap2/i2c.c +++ b/arch/arm/mach-omap2/i2c.c @@ -22,7 +22,6 @@ #include "soc.h" #include "omap_hwmod.h" #include "omap_device.h" -#include "omap-pm.h" #include "prm.h" #include "common.h" diff --git a/arch/arm/mach-omap2/io.c b/arch/arm/mach-omap2/io.c index cb5d7314cf99..5f358930684a 100644 --- a/arch/arm/mach-omap2/io.c +++ b/arch/arm/mach-omap2/io.c @@ -37,7 +37,6 @@ #include "clock.h" #include "clock2xxx.h" #include "clock3xxx.h" -#include "omap-pm.h" #include "sdrc.h" #include "control.h" #include "serial.h" @@ -421,8 +420,6 @@ static void __init __maybe_unused omap_hwmod_init_postsetup(void) postsetup_state = _HWMOD_STATE_ENABLED; #endif omap_hwmod_for_each(_set_hwmod_postsetup_state, &postsetup_state); - - omap_pm_if_early_init(); } static void __init __maybe_unused 
omap_common_late_init(void) diff --git a/arch/arm/mach-omap2/omap-pm-noop.c b/arch/arm/mach-omap2/omap-pm-noop.c deleted file mode 100644 index 4ead077ea4e7..000000000000 --- a/arch/arm/mach-omap2/omap-pm-noop.c +++ /dev/null @@ -1,176 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * omap-pm-noop.c - OMAP power management interface - dummy version - * - * This code implements the OMAP power management interface to - * drivers, CPUIdle, CPUFreq, and DSP Bridge. It is strictly for - * debug/demonstration use, as it does nothing but printk() whenever a - * function is called (when DEBUG is defined, below) - * - * Copyright (C) 2008-2009 Texas Instruments, Inc. - * Copyright (C) 2008-2009 Nokia Corporation - * Paul Walmsley - * - * Interface developed by (in alphabetical order): - * Karthik Dasu, Tony Lindgren, Rajendra Nayak, Sakari Poussa, Veeramanikandan - * Raju, Anand Sawant, Igor Stoppa, Paul Walmsley, Richard Woodruff - */ - -#undef DEBUG - -#include -#include -#include -#include - -#include "omap_device.h" -#include "omap-pm.h" - -static bool off_mode_enabled; -static int dummy_context_loss_counter; - -/* - * Device-driver-originated constraints (via board-*.c files) - */ - -int omap_pm_set_max_mpu_wakeup_lat(struct device *dev, long t) -{ - if (!dev || t < -1) { - WARN(1, "OMAP PM: %s: invalid parameter(s)", __func__); - return -EINVAL; - } - - if (t == -1) - pr_debug("OMAP PM: remove max MPU wakeup latency constraint: dev %s\n", - dev_name(dev)); - else - pr_debug("OMAP PM: add max MPU wakeup latency constraint: dev %s, t = %ld usec\n", - dev_name(dev), t); - - /* - * For current Linux, this needs to map the MPU to a - * powerdomain, then go through the list of current max lat - * constraints on the MPU and find the smallest. If - * the latency constraint has changed, the code should - * recompute the state to enter for the next powerdomain - * state. - * - * TI CDP code can call constraint_set here. - */ - - return 0; -} - -int omap_pm_set_min_bus_tput(struct device *dev, u8 agent_id, unsigned long r) -{ - if (!dev || (agent_id != OCP_INITIATOR_AGENT && - agent_id != OCP_TARGET_AGENT)) { - WARN(1, "OMAP PM: %s: invalid parameter(s)", __func__); - return -EINVAL; - } - - if (r == 0) - pr_debug("OMAP PM: remove min bus tput constraint: dev %s for agent_id %d\n", - dev_name(dev), agent_id); - else - pr_debug("OMAP PM: add min bus tput constraint: dev %s for agent_id %d: rate %ld KiB\n", - dev_name(dev), agent_id, r); - - /* - * This code should model the interconnect and compute the - * required clock frequency, convert that to a VDD2 OPP ID, then - * set the VDD2 OPP appropriately. - * - * TI CDP code can call constraint_set here on the VDD2 OPP. - */ - - return 0; -} - -/* - * DSP Bridge-specific constraints - */ - - -/** - * omap_pm_enable_off_mode - notify OMAP PM that off-mode is enabled - * - * Intended for use only by OMAP PM core code to notify this layer - * that off mode has been enabled. - */ -void omap_pm_enable_off_mode(void) -{ - off_mode_enabled = true; -} - -/** - * omap_pm_disable_off_mode - notify OMAP PM that off-mode is disabled - * - * Intended for use only by OMAP PM core code to notify this layer - * that off mode has been disabled. 
- */ -void omap_pm_disable_off_mode(void) -{ - off_mode_enabled = false; -} - -/* - * Device context loss tracking - */ - -#ifdef CONFIG_ARCH_OMAP2PLUS - -int omap_pm_get_dev_context_loss_count(struct device *dev) -{ - struct platform_device *pdev = to_platform_device(dev); - int count; - - if (WARN_ON(!dev)) - return -ENODEV; - - if (dev->pm_domain == &omap_device_pm_domain) { - count = omap_device_get_context_loss_count(pdev); - } else { - WARN_ONCE(off_mode_enabled, "omap_pm: using dummy context loss counter; device %s should be converted to omap_device", - dev_name(dev)); - - count = dummy_context_loss_counter; - - if (off_mode_enabled) { - count++; - /* - * Context loss count has to be a non-negative value. - * Clear the sign bit to get a value range from 0 to - * INT_MAX. - */ - count &= INT_MAX; - dummy_context_loss_counter = count; - } - } - - pr_debug("OMAP PM: context loss count for dev %s = %d\n", - dev_name(dev), count); - - return count; -} - -#else - -int omap_pm_get_dev_context_loss_count(struct device *dev) -{ - return dummy_context_loss_counter; -} - -#endif - -/* Should be called before clk framework init */ -int __init omap_pm_if_early_init(void) -{ - return 0; -} - -/* Must be called after clock framework is initialized */ -int __init omap_pm_if_init(void) -{ - return 0; -} diff --git a/arch/arm/mach-omap2/omap-pm.h b/arch/arm/mach-omap2/omap-pm.h deleted file mode 100644 index 5ba5df47f91b..000000000000 --- a/arch/arm/mach-omap2/omap-pm.h +++ /dev/null @@ -1,161 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * omap-pm.h - OMAP power management interface - * - * Copyright (C) 2008-2010 Texas Instruments, Inc. - * Copyright (C) 2008-2010 Nokia Corporation - * Paul Walmsley - * - * Interface developed by (in alphabetical order): Karthik Dasu, Jouni - * Högander, Tony Lindgren, Rajendra Nayak, Sakari Poussa, - * Veeramanikandan Raju, Anand Sawant, Igor Stoppa, Paul Walmsley, - * Richard Woodruff - */ - -#ifndef ASM_ARM_ARCH_OMAP_OMAP_PM_H -#define ASM_ARM_ARCH_OMAP_OMAP_PM_H - -#include -#include -#include -#include - -/* - * agent_id values for use with omap_pm_set_min_bus_tput(): - * - * OCP_INITIATOR_AGENT is only valid for devices that can act as - * initiators -- it represents the device's L3 interconnect - * connection. OCP_TARGET_AGENT represents the device's L4 - * interconnect connection. - */ -#define OCP_TARGET_AGENT 1 -#define OCP_INITIATOR_AGENT 2 - -/** - * omap_pm_if_early_init - OMAP PM init code called before clock fw init - * @mpu_opp_table: array ptr to struct omap_opp for MPU - * @dsp_opp_table: array ptr to struct omap_opp for DSP - * @l3_opp_table : array ptr to struct omap_opp for CORE - * - * Initialize anything that must be configured before the clock - * framework starts. The "_if_" is to avoid name collisions with the - * PM idle-loop code. - */ -int __init omap_pm_if_early_init(void); - -/** - * omap_pm_if_init - OMAP PM init code called after clock fw init - * - * The main initialization code. OPP tables are passed in here. The - * "_if_" is to avoid name collisions with the PM idle-loop code. - */ -int __init omap_pm_if_init(void); - -/* - * Device-driver-originated constraints (via board-*.c files, platform_data) - */ - - -/** - * omap_pm_set_max_mpu_wakeup_lat - set the maximum MPU wakeup latency - * @dev: struct device * requesting the constraint - * @t: maximum MPU wakeup latency in microseconds - * - * Request that the maximum interrupt latency for the MPU to be no - * greater than @t microseconds. 
"Interrupt latency" in this case is - * defined as the elapsed time from the occurrence of a hardware or - * timer interrupt to the time when the device driver's interrupt - * service routine has been entered by the MPU. - * - * It is intended that underlying PM code will use this information to - * determine what power state to put the MPU powerdomain into, and - * possibly the CORE powerdomain as well, since interrupt handling - * code currently runs from SDRAM. Advanced PM or board*.c code may - * also configure interrupt controller priorities, OCP bus priorities, - * CPU speed(s), etc. - * - * This function will not affect device wakeup latency, e.g., time - * elapsed from when a device driver enables a hardware device with - * clk_enable(), to when the device is ready for register access or - * other use. To control this device wakeup latency, use - * omap_pm_set_max_dev_wakeup_lat() - * - * Multiple calls to omap_pm_set_max_mpu_wakeup_lat() will replace the - * previous t value. To remove the latency target for the MPU, call - * with t = -1. - * - * XXX This constraint will be deprecated soon in favor of the more - * general omap_pm_set_max_dev_wakeup_lat() - * - * Returns -EINVAL for an invalid argument, -ERANGE if the constraint - * is not satisfiable, or 0 upon success. - */ -int omap_pm_set_max_mpu_wakeup_lat(struct device *dev, long t); - - -/** - * omap_pm_set_min_bus_tput - set minimum bus throughput needed by device - * @dev: struct device * requesting the constraint - * @tbus_id: interconnect to operate on (OCP_{INITIATOR,TARGET}_AGENT) - * @r: minimum throughput (in KiB/s) - * - * Request that the minimum data throughput on the OCP interconnect - * attached to device @dev interconnect agent @tbus_id be no less - * than @r KiB/s. - * - * It is expected that the OMAP PM or bus code will use this - * information to set the interconnect clock to run at the lowest - * possible speed that satisfies all current system users. The PM or - * bus code will adjust the estimate based on its model of the bus, so - * device driver authors should attempt to specify an accurate - * quantity for their device use case, and let the PM or bus code - * overestimate the numbers as necessary to handle request/response - * latency, other competing users on the system, etc. On OMAP2/3, if - * a driver requests a minimum L4 interconnect speed constraint, the - * code will also need to add an minimum L3 interconnect speed - * constraint, - * - * Multiple calls to omap_pm_set_min_bus_tput() will replace the - * previous rate value for this device. To remove the interconnect - * throughput restriction for this device, call with r = 0. - * - * Returns -EINVAL for an invalid argument, -ERANGE if the constraint - * is not satisfiable, or 0 upon success. - */ -int omap_pm_set_min_bus_tput(struct device *dev, u8 agent_id, unsigned long r); - - -/* - * CPUFreq-originated constraint - * - * In the future, this should be handled by custom OPP clocktype - * functions. - */ - - -/* - * Device context loss tracking - */ - -/** - * omap_pm_get_dev_context_loss_count - return count of times dev has lost ctx - * @dev: struct device * - * - * This function returns the number of times that the device @dev has - * lost its internal context. This generally occurs on a powerdomain - * transition to OFF. Drivers use this as an optimization to avoid restoring - * context if the device hasn't lost it. To use, drivers should initially - * call this in their context save functions and store the result. 
Early in - * the driver's context restore function, the driver should call this function - * again, and compare the result to the stored counter. If they differ, the - * driver must restore device context. If the number of context losses - * exceeds the maximum positive integer, the function will wrap to 0 and - * continue counting. Returns the number of context losses for this device, - * or negative value upon error. - */ -int omap_pm_get_dev_context_loss_count(struct device *dev); - -void omap_pm_enable_off_mode(void); -void omap_pm_disable_off_mode(void); - -#endif diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c index 2477f6086de4..70d280c72b66 100644 --- a/arch/arm/mach-omap2/pdata-quirks.c +++ b/arch/arm/mach-omap2/pdata-quirks.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -33,7 +32,6 @@ #include "common-board-devices.h" #include "control.h" #include "omap_device.h" -#include "omap-pm.h" #include "omap-secure.h" #include "soc.h" #include "hsmmc.h" @@ -411,18 +409,6 @@ static struct pwm_omap_dmtimer_pdata pwm_dmtimer_pdata = { }; #endif -static struct ir_rx51_platform_data __maybe_unused rx51_ir_data = { - .set_max_mpu_wakeup_lat = omap_pm_set_max_mpu_wakeup_lat, -}; - -static struct platform_device __maybe_unused rx51_ir_device = { - .name = "ir_rx51", - .id = -1, - .dev = { - .platform_data = &rx51_ir_data, - }, -}; - #if IS_ENABLED(CONFIG_SND_OMAP_SOC_MCBSP) static struct omap_mcbsp_platform_data mcbsp_pdata; static void __init omap3_mcbsp_init(void) diff --git a/arch/arm/mach-omap2/pm-debug.c b/arch/arm/mach-omap2/pm-debug.c index 5c46ea6756d7..acb698d5780f 100644 --- a/arch/arm/mach-omap2/pm-debug.c +++ b/arch/arm/mach-omap2/pm-debug.c @@ -31,7 +31,6 @@ #include "clock.h" #include "powerdomain.h" #include "clockdomain.h" -#include "omap-pm.h" #include "soc.h" #include "cm2xxx_3xxx.h" @@ -240,10 +239,6 @@ static int option_set(void *data, u64 val) *option = val; if (option == &enable_off_mode) { - if (val) - omap_pm_enable_off_mode(); - else - omap_pm_disable_off_mode(); if (cpu_is_omap34xx()) omap3_pm_off_mode_enable(val); } diff --git a/arch/arm/mach-omap2/pm.c b/arch/arm/mach-omap2/pm.c index 6f68576e5695..b98c46d7f112 100644 --- a/arch/arm/mach-omap2/pm.c +++ b/arch/arm/mach-omap2/pm.c @@ -16,11 +16,11 @@ #include #include #include +#include #include #include -#include "omap-pm.h" #include "omap_device.h" #include "common.h" @@ -230,14 +230,6 @@ static void __init omap4_init_voltages(void) omap2_set_init_voltage("iva", "dpll_iva_m5x2_ck", "iva"); } -static int __init omap2_common_pm_init(void) -{ - omap_pm_if_init(); - - return 0; -} -omap_postcore_initcall(omap2_common_pm_init); - int __init omap2_common_pm_late_init(void) { /* Init the voltage layer */ diff --git a/arch/arm/mach-omap2/timer.c b/arch/arm/mach-omap2/timer.c index c421d12b3203..11032df74daa 100644 --- a/arch/arm/mach-omap2/timer.c +++ b/arch/arm/mach-omap2/timer.c @@ -50,7 +50,6 @@ #include "omap_device.h" #include #include -#include "omap-pm.h" #include "soc.h" #include "common.h" diff --git a/arch/arm/plat-omap/Kconfig b/arch/arm/plat-omap/Kconfig index 7276afee30b3..4609e1c8a1cd 100644 --- a/arch/arm/plat-omap/Kconfig +++ b/arch/arm/plat-omap/Kconfig @@ -121,16 +121,6 @@ config OMAP_SERIAL_WAKE to data on the serial RX line. This allows you to wake the system from serial console. 
-choice - prompt "OMAP PM layer selection" - depends on ARCH_OMAP - default OMAP_PM_NOOP - -config OMAP_PM_NOOP - bool "No-op/debug PM layer" - -endchoice - endmenu endif diff --git a/drivers/media/rc/ir-rx51.c b/drivers/media/rc/ir-rx51.c index 49265f02e772..8a93f7468622 100644 --- a/drivers/media/rc/ir-rx51.c +++ b/drivers/media/rc/ir-rx51.c @@ -22,7 +22,6 @@ #include #include -#include #define WBUF_LEN 256 @@ -31,7 +30,6 @@ struct ir_rx51 { struct pwm_device *pwm; struct hrtimer timer; struct device *dev; - struct ir_rx51_platform_data *pdata; wait_queue_head_t wqueue; unsigned int freq; /* carrier frequency */ @@ -130,10 +128,9 @@ static int ir_rx51_tx(struct rc_dev *dev, unsigned int *buffer, ir_rx51->wbuf[count] = -1; /* Insert termination mark */ /* - * Adjust latency requirements so the device doesn't go in too - * deep sleep states + * REVISIT: Adjust latency requirements so the device doesn't go in too + * deep sleep states with pm_qos_add_request(). */ - ir_rx51->pdata->set_max_mpu_wakeup_lat(ir_rx51->dev, 50); ir_rx51_on(ir_rx51); ir_rx51->wbuf_index = 1; @@ -146,8 +143,7 @@ static int ir_rx51_tx(struct rc_dev *dev, unsigned int *buffer, */ wait_event_interruptible(ir_rx51->wqueue, ir_rx51->wbuf_index < 0); - /* We can sleep again */ - ir_rx51->pdata->set_max_mpu_wakeup_lat(ir_rx51->dev, -1); + /* REVISIT: Remove pm_qos constraint, we can sleep again */ return count; } @@ -244,13 +240,6 @@ static int ir_rx51_probe(struct platform_device *dev) struct pwm_device *pwm; struct rc_dev *rcdev; - ir_rx51.pdata = dev->dev.platform_data; - - if (!ir_rx51.pdata) { - dev_err(&dev->dev, "Platform Data is missing\n"); - return -ENXIO; - } - pwm = pwm_get(&dev->dev, NULL); if (IS_ERR(pwm)) { int err = PTR_ERR(pwm); diff --git a/include/linux/platform_data/media/ir-rx51.h b/include/linux/platform_data/media/ir-rx51.h deleted file mode 100644 index 9d127aa648e7..000000000000 --- a/include/linux/platform_data/media/ir-rx51.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _IR_RX51_H -#define _IR_RX51_H - -struct ir_rx51_platform_data { - int(*set_max_mpu_wakeup_lat)(struct device *dev, long t); -}; - -#endif From f01499264a9c7321dff14bf6db9d2fdac53b3d4b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 12 Apr 2018 06:32:18 -0400 Subject: [PATCH 0613/1640] UPSTREAM: media: st_rc: Don't stay on an IRQ handler forever As warned by smatch: drivers/media/rc/st_rc.c:110 st_rc_rx_interrupt() warn: this loop depends on readl() succeeding If something goes wrong at readl(), the logic will stay there inside an IRQ code forever. This is not the nicest thing to do :-) So, add a timeout there, preventing staying inside the IRQ for more than 10ms. Acked-by: Patrice Chotard Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/st_rc.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/media/rc/st_rc.c b/drivers/media/rc/st_rc.c index d2efd7b2c3bc..c855b177103c 100644 --- a/drivers/media/rc/st_rc.c +++ b/drivers/media/rc/st_rc.c @@ -96,19 +96,24 @@ static void st_rc_send_lirc_timeout(struct rc_dev *rdev) static irqreturn_t st_rc_rx_interrupt(int irq, void *data) { + unsigned long timeout; unsigned int symbol, mark = 0; struct st_rc_device *dev = data; int last_symbol = 0; - u32 status; + u32 status, int_status; DEFINE_IR_RAW_EVENT(ev); if (dev->irq_wake) pm_wakeup_event(dev->dev, 0); - status = readl(dev->rx_base + IRB_RX_STATUS); + /* FIXME: is 10ms good enough ? 
*/ + timeout = jiffies + msecs_to_jiffies(10); + do { + status = readl(dev->rx_base + IRB_RX_STATUS); + if (!(status & (IRB_FIFO_NOT_EMPTY | IRB_OVERFLOW))) + break; - while (status & (IRB_FIFO_NOT_EMPTY | IRB_OVERFLOW)) { - u32 int_status = readl(dev->rx_base + IRB_RX_INT_STATUS); + int_status = readl(dev->rx_base + IRB_RX_INT_STATUS); if (unlikely(int_status & IRB_RX_OVERRUN_INT)) { /* discard the entire collection in case of errors! */ ir_raw_event_reset(dev->rdev); @@ -148,8 +153,7 @@ static irqreturn_t st_rc_rx_interrupt(int irq, void *data) } last_symbol = 0; - status = readl(dev->rx_base + IRB_RX_STATUS); - } + } while (time_is_after_jiffies(timeout)); writel(IRB_RX_INTS, dev->rx_base + IRB_RX_INT_CLEAR); From 0768f3ff0d886bbd67e13da7febb5b360b8bcc88 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 27 Mar 2018 11:24:05 -0400 Subject: [PATCH 0614/1640] UPSTREAM: media: rc: report receiver and transmitter type on device register On the raspberry pi, we might have two lirc devices; one for sending and one for receiving. This change makes it much more apparent which one is which. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- .../media/uapi/rc/lirc-dev-intro.rst | 2 +- drivers/media/rc/lirc_dev.c | 22 +++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/Documentation/media/uapi/rc/lirc-dev-intro.rst b/Documentation/media/uapi/rc/lirc-dev-intro.rst index 698e4f80270e..11516c8bff62 100644 --- a/Documentation/media/uapi/rc/lirc-dev-intro.rst +++ b/Documentation/media/uapi/rc/lirc-dev-intro.rst @@ -18,7 +18,7 @@ Example dmesg output upon a driver registering w/LIRC: .. code-block:: none $ dmesg |grep lirc_dev - rc rc0: lirc_dev: driver mceusb registered at minor = 0 + rc rc0: lirc_dev: driver mceusb registered at minor = 0, raw IR receiver, raw IR transmitter What you should see for a chardev: diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index a9cd96fe0a52..a539c6d20bf5 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -736,6 +736,7 @@ static void lirc_release_device(struct device *ld) int ir_lirc_register(struct rc_dev *dev) { + const char *rx_type, *tx_type; int err, minor; minor = ida_simple_get(&lirc_ida, 0, RC_DEV_MAX, GFP_KERNEL); @@ -760,8 +761,25 @@ int ir_lirc_register(struct rc_dev *dev) get_device(&dev->dev); - dev_info(&dev->dev, "lirc_dev: driver %s registered at minor = %d", - dev->driver_name, minor); + switch (dev->driver_type) { + case RC_DRIVER_SCANCODE: + rx_type = "scancode"; + break; + case RC_DRIVER_IR_RAW: + rx_type = "raw IR"; + break; + default: + rx_type = "no"; + break; + } + + if (dev->tx_ir) + tx_type = "raw IR"; + else + tx_type = "no"; + + dev_info(&dev->dev, "lirc_dev: driver %s registered at minor = %d, %s receiver, %s transmitter", + dev->driver_name, minor, rx_type, tx_type); return 0; From d9155a4937057357f3f694440aaceff1acf0205b Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 23 Mar 2018 16:47:37 -0400 Subject: [PATCH 0615/1640] UPSTREAM: media: rc: set timeout to smallest value required by enabled protocols The longer the IR timeout, the longer the rc device waits until delivering the trailing space. So, by reducing this timeout, we reduce the delay for the last scancode to be delivered. Note that the lirc daemon disables all protocols, in which case we revert back to the default value. 
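In outline, the new change_protocol() logic in rc-ir-raw.c (see the hunk further below) picks the largest min_timeout among the decoders for the enabled protocols, falls back to IR_DEFAULT_TIMEOUT when every protocol is disabled, and clamps the result to the hardware limits. A minimal C sketch of that selection, simplified for illustration (the real hunk also takes ir_raw_handler_lock, bails out when the driver has no max_timeout, and prefers a driver s_timeout callback when one exists):

    u32 timeout = 0;

    list_for_each_entry(handler, &ir_raw_handler_list, list)
            if (handler->protocols & *rc_proto)
                    timeout = max(timeout, handler->min_timeout);

    if (timeout == 0)
            timeout = IR_DEFAULT_TIMEOUT;   /* e.g. lirc disabled all protocols */
    else
            timeout += MS_TO_NS(10);        /* slack beyond the longest trailing space */

    dev->timeout = clamp(timeout, dev->min_timeout, dev->max_timeout);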
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-imon-decoder.c | 1 + drivers/media/rc/ir-jvc-decoder.c | 1 + drivers/media/rc/ir-mce_kbd-decoder.c | 1 + drivers/media/rc/ir-nec-decoder.c | 1 + drivers/media/rc/ir-rc5-decoder.c | 1 + drivers/media/rc/ir-rc6-decoder.c | 1 + drivers/media/rc/ir-sanyo-decoder.c | 1 + drivers/media/rc/ir-sharp-decoder.c | 1 + drivers/media/rc/ir-sony-decoder.c | 1 + drivers/media/rc/ir-xmp-decoder.c | 1 + drivers/media/rc/rc-core-priv.h | 1 + drivers/media/rc/rc-ir-raw.c | 31 ++++++++++++++++++++++++++- drivers/media/rc/rc-main.c | 12 +++++------ 13 files changed, 47 insertions(+), 7 deletions(-) diff --git a/drivers/media/rc/ir-imon-decoder.c b/drivers/media/rc/ir-imon-decoder.c index a1ff06a26542..52ea3b2fda74 100644 --- a/drivers/media/rc/ir-imon-decoder.c +++ b/drivers/media/rc/ir-imon-decoder.c @@ -170,6 +170,7 @@ static struct ir_raw_handler imon_handler = { .decode = ir_imon_decode, .encode = ir_imon_encode, .carrier = 38000, + .min_timeout = IMON_UNIT * IMON_BITS * 2, }; static int __init ir_imon_decode_init(void) diff --git a/drivers/media/rc/ir-jvc-decoder.c b/drivers/media/rc/ir-jvc-decoder.c index 8cb68ae43282..5706cfe60027 100644 --- a/drivers/media/rc/ir-jvc-decoder.c +++ b/drivers/media/rc/ir-jvc-decoder.c @@ -213,6 +213,7 @@ static struct ir_raw_handler jvc_handler = { .decode = ir_jvc_decode, .encode = ir_jvc_encode, .carrier = 38000, + .min_timeout = JVC_TRAILER_SPACE, }; static int __init ir_jvc_decode_init(void) diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index c110984ca671..05f2a36769c0 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -475,6 +475,7 @@ static struct ir_raw_handler mce_kbd_handler = { .raw_register = ir_mce_kbd_register, .raw_unregister = ir_mce_kbd_unregister, .carrier = 36000, + .min_timeout = MCIR2_MAX_LEN + MCIR2_UNIT / 2, }; static int __init ir_mce_kbd_decode_init(void) diff --git a/drivers/media/rc/ir-nec-decoder.c b/drivers/media/rc/ir-nec-decoder.c index 21647b809e6f..6a8973ae3684 100644 --- a/drivers/media/rc/ir-nec-decoder.c +++ b/drivers/media/rc/ir-nec-decoder.c @@ -253,6 +253,7 @@ static struct ir_raw_handler nec_handler = { .decode = ir_nec_decode, .encode = ir_nec_encode, .carrier = 38000, + .min_timeout = NEC_TRAILER_SPACE, }; static int __init ir_nec_decode_init(void) diff --git a/drivers/media/rc/ir-rc5-decoder.c b/drivers/media/rc/ir-rc5-decoder.c index 74d3b859c3a2..cbfaadbee8fa 100644 --- a/drivers/media/rc/ir-rc5-decoder.c +++ b/drivers/media/rc/ir-rc5-decoder.c @@ -274,6 +274,7 @@ static struct ir_raw_handler rc5_handler = { .decode = ir_rc5_decode, .encode = ir_rc5_encode, .carrier = 36000, + .min_timeout = RC5_TRAILER, }; static int __init ir_rc5_decode_init(void) diff --git a/drivers/media/rc/ir-rc6-decoder.c b/drivers/media/rc/ir-rc6-decoder.c index 8314da32453f..66e07109f6fc 100644 --- a/drivers/media/rc/ir-rc6-decoder.c +++ b/drivers/media/rc/ir-rc6-decoder.c @@ -394,6 +394,7 @@ static struct ir_raw_handler rc6_handler = { .decode = ir_rc6_decode, .encode = ir_rc6_encode, .carrier = 36000, + .min_timeout = RC6_SUFFIX_SPACE, }; static int __init ir_rc6_decode_init(void) diff --git a/drivers/media/rc/ir-sanyo-decoder.c b/drivers/media/rc/ir-sanyo-decoder.c index 4efe6db5376a..dd6ee1e339d6 100644 --- a/drivers/media/rc/ir-sanyo-decoder.c +++ b/drivers/media/rc/ir-sanyo-decoder.c @@ -210,6 +210,7 @@ static struct ir_raw_handler sanyo_handler = { .decode = 
ir_sanyo_decode, .encode = ir_sanyo_encode, .carrier = 38000, + .min_timeout = SANYO_TRAILER_SPACE, }; static int __init ir_sanyo_decode_init(void) diff --git a/drivers/media/rc/ir-sharp-decoder.c b/drivers/media/rc/ir-sharp-decoder.c index 6a38c50566a4..f96e0c992eed 100644 --- a/drivers/media/rc/ir-sharp-decoder.c +++ b/drivers/media/rc/ir-sharp-decoder.c @@ -226,6 +226,7 @@ static struct ir_raw_handler sharp_handler = { .decode = ir_sharp_decode, .encode = ir_sharp_encode, .carrier = 38000, + .min_timeout = SHARP_ECHO_SPACE + SHARP_ECHO_SPACE / 4, }; static int __init ir_sharp_decode_init(void) diff --git a/drivers/media/rc/ir-sony-decoder.c b/drivers/media/rc/ir-sony-decoder.c index 6764ec9de646..5065c081238d 100644 --- a/drivers/media/rc/ir-sony-decoder.c +++ b/drivers/media/rc/ir-sony-decoder.c @@ -224,6 +224,7 @@ static struct ir_raw_handler sony_handler = { .decode = ir_sony_decode, .encode = ir_sony_encode, .carrier = 40000, + .min_timeout = SONY_TRAILER_SPACE, }; static int __init ir_sony_decode_init(void) diff --git a/drivers/media/rc/ir-xmp-decoder.c b/drivers/media/rc/ir-xmp-decoder.c index 58b47af1a763..c965f51df1c1 100644 --- a/drivers/media/rc/ir-xmp-decoder.c +++ b/drivers/media/rc/ir-xmp-decoder.c @@ -199,6 +199,7 @@ static int ir_xmp_decode(struct rc_dev *dev, struct ir_raw_event ev) static struct ir_raw_handler xmp_handler = { .protocols = RC_PROTO_BIT_XMP, .decode = ir_xmp_decode, + .min_timeout = XMP_TRAILER_SPACE, }; static int __init ir_xmp_decode_init(void) diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index e0e6a17460f6..f78551344eca 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -37,6 +37,7 @@ struct ir_raw_handler { int (*encode)(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max); u32 carrier; + u32 min_timeout; /* These two should only be used by the mce kbd decoder */ int (*raw_register)(struct rc_dev *dev); diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 374f83105a23..22e44c8f16fd 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -233,7 +233,36 @@ ir_raw_get_allowed_protocols(void) static int change_protocol(struct rc_dev *dev, u64 *rc_proto) { - /* the caller will update dev->enabled_protocols */ + struct ir_raw_handler *handler; + u32 timeout = 0; + + if (!dev->max_timeout) + return 0; + + mutex_lock(&ir_raw_handler_lock); + list_for_each_entry(handler, &ir_raw_handler_list, list) { + if (handler->protocols & *rc_proto) { + if (timeout < handler->min_timeout) + timeout = handler->min_timeout; + } + } + mutex_unlock(&ir_raw_handler_lock); + + if (timeout == 0) + timeout = IR_DEFAULT_TIMEOUT; + else + timeout += MS_TO_NS(10); + + if (timeout < dev->min_timeout) + timeout = dev->min_timeout; + else if (timeout > dev->max_timeout) + timeout = dev->max_timeout; + + if (dev->s_timeout) + dev->s_timeout(dev, timeout); + else + dev->timeout = timeout; + return 0; } diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index b67be33bd62f..6a720e9c7aa8 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1241,6 +1241,9 @@ static ssize_t store_protocols(struct device *device, if (rc < 0) goto out; + if (dev->driver_type == RC_DRIVER_IR_RAW) + ir_raw_load_modules(&new_protocols); + rc = dev->change_protocol(dev, &new_protocols); if (rc < 0) { dev_dbg(&dev->dev, "Error setting protocols to 0x%llx\n", @@ -1248,9 +1251,6 @@ static ssize_t store_protocols(struct device 
*device, goto out; } - if (dev->driver_type == RC_DRIVER_IR_RAW) - ir_raw_load_modules(&new_protocols); - if (new_protocols != old_protocols) { *current_protocols = new_protocols; dev_dbg(&dev->dev, "Protocols changed to 0x%llx\n", @@ -1735,6 +1735,9 @@ static int rc_prepare_rx_device(struct rc_dev *dev) if (dev->driver_type == RC_DRIVER_SCANCODE && !dev->change_protocol) dev->enabled_protocols = dev->allowed_protocols; + if (dev->driver_type == RC_DRIVER_IR_RAW) + ir_raw_load_modules(&rc_proto); + if (dev->change_protocol) { rc = dev->change_protocol(dev, &rc_proto); if (rc < 0) @@ -1742,9 +1745,6 @@ static int rc_prepare_rx_device(struct rc_dev *dev) dev->enabled_protocols = rc_proto; } - if (dev->driver_type == RC_DRIVER_IR_RAW) - ir_raw_load_modules(&rc_proto); - set_bit(EV_KEY, dev->input_dev->evbit); set_bit(EV_REP, dev->input_dev->evbit); set_bit(EV_MSC, dev->input_dev->evbit); From 49b291c33a7c0525e798e47e0b810d8eeb4d7935 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 23 Mar 2018 16:59:52 -0400 Subject: [PATCH 0616/1640] UPSTREAM: media: rc: add ioctl to get the current timeout Since the kernel now modifies the timeout, make it possible to retrieve the current value. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/rc/lirc-func.rst | 1 + .../media/uapi/rc/lirc-set-rec-timeout.rst | 14 +++++++++----- drivers/media/rc/lirc_dev.c | 7 +++++++ include/uapi/linux/lirc.h | 6 ++++++ 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/Documentation/media/uapi/rc/lirc-func.rst b/Documentation/media/uapi/rc/lirc-func.rst index ddb4620de294..9656423a3f28 100644 --- a/Documentation/media/uapi/rc/lirc-func.rst +++ b/Documentation/media/uapi/rc/lirc-func.rst @@ -17,6 +17,7 @@ LIRC Function Reference lirc-get-rec-resolution lirc-set-send-duty-cycle lirc-get-timeout + lirc-get-rec-timeout lirc-set-rec-timeout lirc-set-rec-carrier lirc-set-rec-carrier-range diff --git a/Documentation/media/uapi/rc/lirc-set-rec-timeout.rst b/Documentation/media/uapi/rc/lirc-set-rec-timeout.rst index b3e16bbdbc90..a833a6a4c25a 100644 --- a/Documentation/media/uapi/rc/lirc-set-rec-timeout.rst +++ b/Documentation/media/uapi/rc/lirc-set-rec-timeout.rst @@ -1,19 +1,23 @@ .. -*- coding: utf-8; mode: rst -*- .. _lirc_set_rec_timeout: +.. _lirc_get_rec_timeout: -************************** -ioctl LIRC_SET_REC_TIMEOUT -************************** +*************************************************** +ioctl LIRC_GET_REC_TIMEOUT and LIRC_SET_REC_TIMEOUT +*************************************************** Name ==== -LIRC_SET_REC_TIMEOUT - sets the integer value for IR inactivity timeout. +LIRC_GET_REC_TIMEOUT/LIRC_SET_REC_TIMEOUT - Get/set the integer value for IR inactivity timeout. Synopsis ======== +.. c:function:: int ioctl( int fd, LIRC_GET_REC_TIMEOUT, __u32 *timeout ) + :name: LIRC_GET_REC_TIMEOUT + .. c:function:: int ioctl( int fd, LIRC_SET_REC_TIMEOUT, __u32 *timeout ) :name: LIRC_SET_REC_TIMEOUT @@ -30,7 +34,7 @@ Arguments Description =========== -Sets the integer value for IR inactivity timeout. +Get and set the integer value for IR inactivity timeout. If supported by the hardware, setting it to 0 disables all hardware timeouts and data should be reported as soon as possible. 
If the exact value diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index a539c6d20bf5..3f5ab383805d 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -575,6 +575,13 @@ static long ir_lirc_ioctl(struct file *file, unsigned int cmd, } break; + case LIRC_GET_REC_TIMEOUT: + if (!dev->timeout) + ret = -ENOTTY; + else + val = DIV_ROUND_UP(dev->timeout, 1000); + break; + case LIRC_SET_REC_TIMEOUT_REPORTS: if (!dev->timeout) ret = -ENOTTY; diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index 948d9a491083..7db6063fa6a2 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -134,6 +134,12 @@ #define LIRC_SET_WIDEBAND_RECEIVER _IOW('i', 0x00000023, __u32) +/* + * Return the recording timeout, which is either set by + * the ioctl LIRC_SET_REC_TIMEOUT or by the kernel after setting the protocols. + */ +#define LIRC_GET_REC_TIMEOUT _IOR('i', 0x00000024, __u32) + /* * struct lirc_scancode - decoded scancode with protocol for use with * LIRC_MODE_SCANCODE From 1638c8932f521432bc9fa63e8483eeaa4843a53e Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 24 Mar 2018 08:02:48 -0400 Subject: [PATCH 0617/1640] UPSTREAM: media: rc: per-protocol repeat period and minimum keyup timer Each IR protocol has its own repeat period. We can minimise the keyup timer to be the protocol period + IR timeout. This makes keys less "sticky" and makes IR more reactive and nicer to use. This feature was previously attempted in commit d57ea877af38 ("media: rc: per-protocol repeat period"), but that did not take the IR timeout into account, and had to be reverted. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-core.c | 2 +- drivers/media/rc/lirc_dev.c | 2 +- drivers/media/rc/rc-main.c | 56 +++++++++++++++++++----------------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/drivers/media/cec/cec-core.c b/drivers/media/cec/cec-core.c index fb59a2bb6b97..c0f8afaf3396 100644 --- a/drivers/media/cec/cec-core.c +++ b/drivers/media/cec/cec-core.c @@ -279,7 +279,7 @@ struct cec_adapter *cec_allocate_adapter(const struct cec_adap_ops *ops, adap->rc->allowed_protocols = RC_PROTO_BIT_CEC; adap->rc->priv = adap; adap->rc->map_name = RC_MAP_CEC; - adap->rc->timeout = MS_TO_NS(100); + adap->rc->timeout = MS_TO_NS(550); #endif return adap; } diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 3f5ab383805d..0dcf7444c970 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -583,7 +583,7 @@ static long ir_lirc_ioctl(struct file *file, unsigned int cmd, break; case LIRC_SET_REC_TIMEOUT_REPORTS: - if (!dev->timeout) + if (dev->driver_type != RC_DRIVER_IR_RAW) ret = -ENOTTY; else fh->send_timeout_reports = !!val; diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 6a720e9c7aa8..9f4df60f62e1 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -26,50 +26,50 @@ static const struct { unsigned int repeat_period; unsigned int scancode_bits; } protocols[] = { - [RC_PROTO_UNKNOWN] = { .name = "unknown", .repeat_period = 250 }, - [RC_PROTO_OTHER] = { .name = "other", .repeat_period = 250 }, + [RC_PROTO_UNKNOWN] = { .name = "unknown", .repeat_period = 125 }, + [RC_PROTO_OTHER] = { .name = "other", .repeat_period = 125 }, [RC_PROTO_RC5] = { .name = "rc-5", - .scancode_bits = 0x1f7f, .repeat_period = 250 }, + .scancode_bits = 0x1f7f, .repeat_period = 114 }, [RC_PROTO_RC5X_20] = { .name = "rc-5x-20", - .scancode_bits = 
0x1f7f3f, .repeat_period = 250 }, + .scancode_bits = 0x1f7f3f, .repeat_period = 114 }, [RC_PROTO_RC5_SZ] = { .name = "rc-5-sz", - .scancode_bits = 0x2fff, .repeat_period = 250 }, + .scancode_bits = 0x2fff, .repeat_period = 114 }, [RC_PROTO_JVC] = { .name = "jvc", - .scancode_bits = 0xffff, .repeat_period = 250 }, + .scancode_bits = 0xffff, .repeat_period = 125 }, [RC_PROTO_SONY12] = { .name = "sony-12", - .scancode_bits = 0x1f007f, .repeat_period = 250 }, + .scancode_bits = 0x1f007f, .repeat_period = 100 }, [RC_PROTO_SONY15] = { .name = "sony-15", - .scancode_bits = 0xff007f, .repeat_period = 250 }, + .scancode_bits = 0xff007f, .repeat_period = 100 }, [RC_PROTO_SONY20] = { .name = "sony-20", - .scancode_bits = 0x1fff7f, .repeat_period = 250 }, + .scancode_bits = 0x1fff7f, .repeat_period = 100 }, [RC_PROTO_NEC] = { .name = "nec", - .scancode_bits = 0xffff, .repeat_period = 250 }, + .scancode_bits = 0xffff, .repeat_period = 110 }, [RC_PROTO_NECX] = { .name = "nec-x", - .scancode_bits = 0xffffff, .repeat_period = 250 }, + .scancode_bits = 0xffffff, .repeat_period = 110 }, [RC_PROTO_NEC32] = { .name = "nec-32", - .scancode_bits = 0xffffffff, .repeat_period = 250 }, + .scancode_bits = 0xffffffff, .repeat_period = 110 }, [RC_PROTO_SANYO] = { .name = "sanyo", - .scancode_bits = 0x1fffff, .repeat_period = 250 }, + .scancode_bits = 0x1fffff, .repeat_period = 125 }, [RC_PROTO_MCIR2_KBD] = { .name = "mcir2-kbd", - .scancode_bits = 0xffff, .repeat_period = 250 }, + .scancode_bits = 0xffff, .repeat_period = 100 }, [RC_PROTO_MCIR2_MSE] = { .name = "mcir2-mse", - .scancode_bits = 0x1fffff, .repeat_period = 250 }, + .scancode_bits = 0x1fffff, .repeat_period = 100 }, [RC_PROTO_RC6_0] = { .name = "rc-6-0", - .scancode_bits = 0xffff, .repeat_period = 250 }, + .scancode_bits = 0xffff, .repeat_period = 114 }, [RC_PROTO_RC6_6A_20] = { .name = "rc-6-6a-20", - .scancode_bits = 0xfffff, .repeat_period = 250 }, + .scancode_bits = 0xfffff, .repeat_period = 114 }, [RC_PROTO_RC6_6A_24] = { .name = "rc-6-6a-24", - .scancode_bits = 0xffffff, .repeat_period = 250 }, + .scancode_bits = 0xffffff, .repeat_period = 114 }, [RC_PROTO_RC6_6A_32] = { .name = "rc-6-6a-32", - .scancode_bits = 0xffffffff, .repeat_period = 250 }, + .scancode_bits = 0xffffffff, .repeat_period = 114 }, [RC_PROTO_RC6_MCE] = { .name = "rc-6-mce", - .scancode_bits = 0xffff7fff, .repeat_period = 250 }, + .scancode_bits = 0xffff7fff, .repeat_period = 114 }, [RC_PROTO_SHARP] = { .name = "sharp", - .scancode_bits = 0x1fff, .repeat_period = 250 }, - [RC_PROTO_XMP] = { .name = "xmp", .repeat_period = 250 }, - [RC_PROTO_CEC] = { .name = "cec", .repeat_period = 550 }, + .scancode_bits = 0x1fff, .repeat_period = 125 }, + [RC_PROTO_XMP] = { .name = "xmp", .repeat_period = 125 }, + [RC_PROTO_CEC] = { .name = "cec", .repeat_period = 0 }, [RC_PROTO_IMON] = { .name = "imon", - .scancode_bits = 0x7fffffff, .repeat_period = 250 }, + .scancode_bits = 0x7fffffff, .repeat_period = 114 }, }; /* Used to keep track of known keymaps */ @@ -690,7 +690,8 @@ static void ir_timer_repeat(struct timer_list *t) void rc_repeat(struct rc_dev *dev) { unsigned long flags; - unsigned int timeout = protocols[dev->last_protocol].repeat_period; + unsigned int timeout = nsecs_to_jiffies(dev->timeout) + + msecs_to_jiffies(protocols[dev->last_protocol].repeat_period); struct lirc_scancode sc = { .scancode = dev->last_scancode, .rc_proto = dev->last_protocol, .keycode = dev->keypressed ? 
dev->last_keycode : KEY_RESERVED, @@ -706,7 +707,7 @@ void rc_repeat(struct rc_dev *dev) input_sync(dev->input_dev); if (dev->keypressed) { - dev->keyup_jiffies = jiffies + msecs_to_jiffies(timeout); + dev->keyup_jiffies = jiffies + timeout; mod_timer(&dev->timer_keyup, dev->keyup_jiffies); } @@ -801,7 +802,7 @@ void rc_keydown(struct rc_dev *dev, enum rc_proto protocol, u32 scancode, ir_do_keydown(dev, protocol, scancode, keycode, toggle); if (dev->keypressed) { - dev->keyup_jiffies = jiffies + + dev->keyup_jiffies = jiffies + nsecs_to_jiffies(dev->timeout) + msecs_to_jiffies(protocols[protocol].repeat_period); mod_timer(&dev->timer_keyup, dev->keyup_jiffies); } @@ -1647,6 +1648,7 @@ struct rc_dev *rc_allocate_device(enum rc_driver_type type) dev->input_dev->setkeycode = ir_setkeycode; input_set_drvdata(dev->input_dev, dev); + dev->timeout = IR_DEFAULT_TIMEOUT; timer_setup(&dev->timer_keyup, ir_timer_keyup, 0); timer_setup(&dev->timer_repeat, ir_timer_repeat, 0); From 41d61c209da87dca34e672bd49016af20cba3d4f Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 25 Mar 2018 11:45:40 -0400 Subject: [PATCH 0618/1640] UPSTREAM: media: rc: mce_kbd decoder: low timeout values cause double keydowns The mce keyboard repeats pressed keys every 100ms. If the IR timeout is set to less than that, we send key up events before the repeat arrives, so we have key up/key down for each IR repeat. The keyboard ends any sequence with a 0 scancode, in which case all keys are cleared so there is no need to run the timeout timer: it only exists for the case that the final 0 was not received. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-mce_kbd-decoder.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 05f2a36769c0..9d609dca6e2b 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -322,11 +322,13 @@ again: scancode = data->body & 0xffff; dev_dbg(&dev->dev, "keyboard data 0x%08x\n", data->body); - if (dev->timeout) - delay = usecs_to_jiffies(dev->timeout / 1000); - else - delay = msecs_to_jiffies(100); - mod_timer(&data->rx_timeout, jiffies + delay); + if (scancode) { + delay = nsecs_to_jiffies(dev->timeout) + + msecs_to_jiffies(100); + mod_timer(&data->rx_timeout, jiffies + delay); + } else { + del_timer(&data->rx_timeout); + } /* Pass data to keyboard buffer parser */ ir_mce_kbd_process_keyboard_data(dev, scancode); lsc.rc_proto = RC_PROTO_MCIR2_KBD; From 0f230fe1e50a9d169ba2f219e2f730c845d6690c Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 7 Apr 2018 17:41:17 -0400 Subject: [PATCH 0619/1640] UPSTREAM: media: rc: mce_kbd protocol encodes two scancodes If two keys are pressed, then both keys are encoded in the scancode. This makes the mce keyboard more responsive. 
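Concretely, the keyboard body now yields a 24-bit scancode: the low byte is the modifier mask and the next two bytes each carry an independent key. A minimal sketch of the unpacking done in the decoder hunk below (idev stands in for the decoder's input device; the modifier mask is handled separately):

    u8 shiftmask = scancode & 0xff;          /* modifier bit mask */
    u8 keydata1  = (scancode >> 8) & 0xff;   /* first pressed key */
    u8 keydata2  = (scancode >> 16) & 0xff;  /* second pressed key, 0 if none */

    if (keydata1)
            input_report_key(idev, kbd_keycodes[keydata1], 1);
    if (keydata2)
            input_report_key(idev, kbd_keycodes[keydata2], 1);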
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-mce_kbd-decoder.c | 21 ++++++++++++--------- drivers/media/rc/rc-main.c | 2 +- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 9d609dca6e2b..f94e89ebc724 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -147,13 +147,14 @@ static enum mce_kbd_mode mce_kbd_mode(struct mce_kbd_dec *data) static void ir_mce_kbd_process_keyboard_data(struct rc_dev *dev, u32 scancode) { struct mce_kbd_dec *data = &dev->raw->mce_kbd; - u8 keydata = (scancode >> 8) & 0xff; + u8 keydata1 = (scancode >> 8) & 0xff; + u8 keydata2 = (scancode >> 16) & 0xff; u8 shiftmask = scancode & 0xff; - unsigned char keycode, maskcode; + unsigned char maskcode; int i, keystate; - dev_dbg(&dev->dev, "keyboard: keydata = 0x%02x, shiftmask = 0x%02x\n", - keydata, shiftmask); + dev_dbg(&dev->dev, "keyboard: keydata2 = 0x%02x, keydata1 = 0x%02x, shiftmask = 0x%02x\n", + keydata2, keydata1, shiftmask); for (i = 0; i < 7; i++) { maskcode = kbd_keycodes[MCIR2_MASK_KEYS_START + i]; @@ -164,10 +165,12 @@ static void ir_mce_kbd_process_keyboard_data(struct rc_dev *dev, u32 scancode) input_report_key(data->idev, maskcode, keystate); } - if (keydata) { - keycode = kbd_keycodes[keydata]; - input_report_key(data->idev, keycode, 1); - } else { + if (keydata1) + input_report_key(data->idev, kbd_keycodes[keydata1], 1); + if (keydata2) + input_report_key(data->idev, kbd_keycodes[keydata2], 1); + + if (!keydata1 && !keydata2) { for (i = 0; i < MCIR2_MASK_KEYS_START; i++) input_report_key(data->idev, kbd_keycodes[i], 0); } @@ -319,7 +322,7 @@ again: switch (data->wanted_bits) { case MCIR2_KEYBOARD_NBITS: - scancode = data->body & 0xffff; + scancode = data->body & 0xffffff; dev_dbg(&dev->dev, "keyboard data 0x%08x\n", data->body); if (scancode) { diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 9f4df60f62e1..b7071bde670a 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -51,7 +51,7 @@ static const struct { [RC_PROTO_SANYO] = { .name = "sanyo", .scancode_bits = 0x1fffff, .repeat_period = 125 }, [RC_PROTO_MCIR2_KBD] = { .name = "mcir2-kbd", - .scancode_bits = 0xffff, .repeat_period = 100 }, + .scancode_bits = 0xffffff, .repeat_period = 100 }, [RC_PROTO_MCIR2_MSE] = { .name = "mcir2-mse", .scancode_bits = 0x1fffff, .repeat_period = 100 }, [RC_PROTO_RC6_0] = { .name = "rc-6-0", From fd6f0ff0d3519fcd7baf85aab8b101f3ee5c40a1 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 8 Apr 2018 06:36:40 -0400 Subject: [PATCH 0620/1640] UPSTREAM: media: rc: mce_kbd decoder: fix stuck keys The MCE Remote sends a 0 scancode when keys are released. If this is not received or decoded, then keys can get "stuck"; the keyup event is not sent since the input_sync() is missing from the timeout handler. 
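The underlying rule is the usual evdev contract: input_report_key() only queues events, and nothing reaches userspace until input_sync() flushes the batch. A simplified sketch of the corrected timeout handler:

    /* Timer callback: queue a keyup for every key, then flush the batch. */
    for (i = 0; i < MCIR2_MASK_KEYS_START; i++)
            input_report_key(raw->mce_kbd.idev, kbd_keycodes[i], 0);
    input_sync(raw->mce_kbd.idev);  /* without this flush, the keyups are never delivered */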
Cc: stable@vger.kernel.org Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-mce_kbd-decoder.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index f94e89ebc724..002b8323ae69 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -130,6 +130,8 @@ static void mce_kbd_rx_timeout(struct timer_list *t) for (i = 0; i < MCIR2_MASK_KEYS_START; i++) input_report_key(raw->mce_kbd.idev, kbd_keycodes[i], 0); + + input_sync(raw->mce_kbd.idev); } static enum mce_kbd_mode mce_kbd_mode(struct mce_kbd_dec *data) From 3acb2f04873cce562ef1029e66835d7aae6059e7 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 11 Apr 2018 11:02:16 -0400 Subject: [PATCH 0621/1640] UPSTREAM: media: rc: mce_kbd decoder: remove superfluous call to input_sync There is nothing to sync in this code path. Reported-by: Matthias Reichl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-mce_kbd-decoder.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 002b8323ae69..2fc78710a724 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -362,7 +362,6 @@ out: dev_dbg(&dev->dev, "failed at state %i (%uus %s)\n", data->state, TO_US(ev.duration), TO_STR(ev.pulse)); data->state = STATE_INACTIVE; - input_sync(data->idev); return -EINVAL; } From 851fdc3bfca715773e817de7a3efd2d7aa1028e9 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 12 Apr 2018 16:28:39 -0400 Subject: [PATCH 0622/1640] UPSTREAM: media: rc: mce_kbd decoder: fix race condition The MCE keyboard sends both key down and key up events. We have a timeout handler mce_kbd_rx_timeout() in case the keyup event is never received; however, this may race with new key down events. The race is that a key down scancode arrives and key down events are generated. The timeout handler races this and generates key up events straight afterwards. Since the keyboard generates scancodes every 100ms, most likely the keys will be repeated 100ms later, and now we have new key down events and the user sees duplicate key presses. 
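The fix below closes that window in two steps: the decoder and the timer callback now serialize on a new keylock spinlock, and the callback re-checks the timer expiry so that a callback which lost the race against mod_timer() releases nothing. In outline (simplified sketch; the keyup loop is elided):

    /* Timer callback (keyup path): */
    spin_lock_irqsave(&raw->mce_kbd.keylock, flags);
    if (time_is_before_eq_jiffies(raw->mce_kbd.rx_timeout.expires)) {
            /* timer was not re-armed by a newer scancode: release all keys */
    }
    spin_unlock_irqrestore(&raw->mce_kbd.keylock, flags);

    /* Decoder (keydown path) re-arms or kills the timer under the same lock: */
    spin_lock(&data->keylock);
    if (scancode)
            mod_timer(&data->rx_timeout, jiffies + delay);
    else
            del_timer(&data->rx_timeout);
    spin_unlock(&data->keylock);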
Reported-by: Matthias Reichl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-mce_kbd-decoder.c | 25 +++++++++++++++++-------- drivers/media/rc/rc-core-priv.h | 2 ++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 2fc78710a724..9574c3dd90f2 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -119,19 +119,25 @@ static void mce_kbd_rx_timeout(struct timer_list *t) { struct ir_raw_event_ctrl *raw = from_timer(raw, t, mce_kbd.rx_timeout); unsigned char maskcode; + unsigned long flags; int i; dev_dbg(&raw->dev->dev, "timer callback clearing all keys\n"); - for (i = 0; i < 7; i++) { - maskcode = kbd_keycodes[MCIR2_MASK_KEYS_START + i]; - input_report_key(raw->mce_kbd.idev, maskcode, 0); + spin_lock_irqsave(&raw->mce_kbd.keylock, flags); + + if (time_is_before_eq_jiffies(raw->mce_kbd.rx_timeout.expires)) { + for (i = 0; i < 7; i++) { + maskcode = kbd_keycodes[MCIR2_MASK_KEYS_START + i]; + input_report_key(raw->mce_kbd.idev, maskcode, 0); + } + + for (i = 0; i < MCIR2_MASK_KEYS_START; i++) + input_report_key(raw->mce_kbd.idev, kbd_keycodes[i], 0); + + input_sync(raw->mce_kbd.idev); } - - for (i = 0; i < MCIR2_MASK_KEYS_START; i++) - input_report_key(raw->mce_kbd.idev, kbd_keycodes[i], 0); - - input_sync(raw->mce_kbd.idev); + spin_unlock_irqrestore(&raw->mce_kbd.keylock, flags); } static enum mce_kbd_mode mce_kbd_mode(struct mce_kbd_dec *data) @@ -327,6 +333,7 @@ again: scancode = data->body & 0xffffff; dev_dbg(&dev->dev, "keyboard data 0x%08x\n", data->body); + spin_lock(&data->keylock); if (scancode) { delay = nsecs_to_jiffies(dev->timeout) + msecs_to_jiffies(100); @@ -336,6 +343,7 @@ again: } /* Pass data to keyboard buffer parser */ ir_mce_kbd_process_keyboard_data(dev, scancode); + spin_unlock(&data->keylock); lsc.rc_proto = RC_PROTO_MCIR2_KBD; break; case MCIR2_MOUSE_NBITS: @@ -400,6 +408,7 @@ static int ir_mce_kbd_register(struct rc_dev *dev) set_bit(MSC_SCAN, idev->mscbit); timer_setup(&mce_kbd->rx_timeout, mce_kbd_rx_timeout, 0); + spin_lock_init(&mce_kbd->keylock); input_set_drvdata(idev, mce_kbd); diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index f78551344eca..07ba77fe6a3b 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -105,6 +105,8 @@ struct ir_raw_event_ctrl { } sharp; struct mce_kbd_dec { struct input_dev *idev; + /* locks key up timer */ + spinlock_t keylock; struct timer_list rx_timeout; char name[64]; char phys[64]; From 94be097d9f1e73351e98ba43954e649f35fcad63 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 18 Apr 2018 05:36:25 -0400 Subject: [PATCH 0623/1640] UPSTREAM: media: rc: mceusb: IR of length 0 means IR timeout, not reset The last usb packet with IR data will end with 0x80 (MCE_IRDATA_TRAILER). If we reset the decoder state at this point, IR decoding can fail. 
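Rather than throwing the decoder state away, the trailer is now forwarded as an ordinary timeout event so that a scancode still in flight can finish decoding. A sketch of the replacement logic, mirroring the hunk below:

    DEFINE_IR_RAW_EVENT(rawir);

    rawir.timeout = 1;                 /* mark this as an IR-idle event...      */
    rawir.duration = ir->rc->timeout;  /* ...spanning the full receiver timeout */
    if (ir_raw_event_store_with_filter(ir->rc, &rawir))
            event = true;              /* something was queued; wake the decoders */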
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mceusb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 69ba57372c05..a1c21903b96c 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -1182,7 +1182,12 @@ static void mceusb_process_ir_data(struct mceusb_dev *ir, int buf_len) if (ir->rem) { ir->parser_state = PARSE_IRDATA; } else { - ir_raw_event_reset(ir->rc); + init_ir_raw_event(&rawir); + rawir.timeout = 1; + rawir.duration = ir->rc->timeout; + if (ir_raw_event_store_with_filter(ir->rc, + &rawir)) + event = true; ir->pulse_tunit = 0; ir->pulse_count = 0; } From b2e07cbe0c8ce87bf964ec0ab29098172a5c55b8 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 8 Apr 2018 11:06:49 -0400 Subject: [PATCH 0624/1640] UPSTREAM: media: rc: mceusb: allow the timeout to be configurable mceusb devices have a default timeout of 100ms, but this can be changed. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mceusb.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index a1c21903b96c..5c0bf61fae26 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -982,6 +982,25 @@ static int mceusb_set_tx_carrier(struct rc_dev *dev, u32 carrier) return 0; } +static int mceusb_set_timeout(struct rc_dev *dev, unsigned int timeout) +{ + u8 cmdbuf[4] = { MCE_CMD_PORT_IR, MCE_CMD_SETIRTIMEOUT, 0, 0 }; + struct mceusb_dev *ir = dev->priv; + unsigned int units; + + units = DIV_ROUND_CLOSEST(timeout, US_TO_NS(MCE_TIME_UNIT)); + + cmdbuf[2] = units >> 8; + cmdbuf[3] = units; + + mce_async_out(ir, cmdbuf, sizeof(cmdbuf)); + + /* get receiver timeout value */ + mce_async_out(ir, GET_RX_TIMEOUT, sizeof(GET_RX_TIMEOUT)); + + return 0; +} + /* * Select or deselect the 2nd receiver port. * Second receiver is learning mode, wide-band, short-range receiver. @@ -1420,7 +1439,10 @@ static struct rc_dev *mceusb_init_rc_dev(struct mceusb_dev *ir) rc->dev.parent = dev; rc->priv = ir; rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; + rc->min_timeout = US_TO_NS(MCE_TIME_UNIT); rc->timeout = MS_TO_NS(100); + rc->max_timeout = 10 * IR_DEFAULT_TIMEOUT; + rc->s_timeout = mceusb_set_timeout; if (!ir->flags.no_tx) { rc->s_tx_mask = mceusb_set_tx_mask; rc->s_tx_carrier = mceusb_set_tx_carrier; From 21f3cf048d1860011e6d0658959c06c00d6deda5 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Fri, 13 Apr 2018 09:48:29 -0400 Subject: [PATCH 0625/1640] UPSTREAM: media: rc: ir-spi: update Andi's e-mail Because I will be leaving Samsung soon, update my e-mail to the etezian.org mail. Cc: Andi Shyti Signed-off-by: Andi Shyti Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-spi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/rc/ir-spi.c b/drivers/media/rc/ir-spi.c index 7163d5ce2e64..66334e8d63ba 100644 --- a/drivers/media/rc/ir-spi.c +++ b/drivers/media/rc/ir-spi.c @@ -2,7 +2,7 @@ // SPI driven IR LED device driver // // Copyright (c) 2016 Samsung Electronics Co., Ltd. 
-// Copyright (c) Andi Shyti +// Copyright (c) Andi Shyti #include #include @@ -173,6 +173,6 @@ static struct spi_driver ir_spi_driver = { module_spi_driver(ir_spi_driver); -MODULE_AUTHOR("Andi Shyti "); +MODULE_AUTHOR("Andi Shyti "); MODULE_DESCRIPTION("SPI IR LED"); MODULE_LICENSE("GPL v2"); From 6e57b25eba383d27bb5754a0753585f3efa1a338 Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Sun, 15 Apr 2018 22:34:30 -0400 Subject: [PATCH 0626/1640] UPSTREAM: media: rc: mtk-cir: use of_device_get_match_data() The usage of of_device_get_match_data() reduces the code size a bit. Signed-off-by: Ryder Lee Acked-by: Sean Wang Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mtk-cir.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/media/rc/mtk-cir.c b/drivers/media/rc/mtk-cir.c index e88eb64e8e69..e42efd9d382e 100644 --- a/drivers/media/rc/mtk-cir.c +++ b/drivers/media/rc/mtk-cir.c @@ -299,8 +299,6 @@ static int mtk_ir_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct device_node *dn = dev->of_node; - const struct of_device_id *of_id = - of_match_device(mtk_ir_match, &pdev->dev); struct resource *res; struct mtk_ir *ir; u32 val; @@ -312,7 +310,7 @@ static int mtk_ir_probe(struct platform_device *pdev) return -ENOMEM; ir->dev = dev; - ir->data = of_id->data; + ir->data = of_device_get_match_data(dev); ir->clk = devm_clk_get(dev, "clk"); if (IS_ERR(ir->clk)) { From f81d8819d793b094917210b7e6624d61356b8dce Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 25 Apr 2018 05:34:48 -0400 Subject: [PATCH 0627/1640] BACKPORT: MAINTAINERS & files: Canonize the e-mails I use at files From now on, I'll start using my @kernel.org as my development e-mail. As such, let's remove the entries that point to the old mchehab@s-opensource.com at MAINTAINERS file. For the files written with a copyright with mchehab@s-opensource, let's keep Samsung on their names, using mchehab+samsung@kernel.org, in order to keep pointing to my employer, which sponsors the work. For the files written before I joined Samsung (on July, 4 2013), let's just use mchehab@kernel.org. For bug reports, we can simply point to just kernel.org, as this will reach my mchehab+samsung inbox anyway. 
[Linux4: Only keep RC related bits] Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Brian Warner Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/rc/keytable.c.rst | 2 +- MAINTAINERS | 17 ----------------- drivers/media/rc/keymaps/rc-avermedia-m135a.c | 2 +- drivers/media/rc/keymaps/rc-encore-enltv-fm53.c | 2 +- drivers/media/rc/keymaps/rc-encore-enltv2.c | 2 +- drivers/media/rc/keymaps/rc-kaiomy.c | 2 +- .../media/rc/keymaps/rc-kworld-plus-tv-analog.c | 2 +- drivers/media/rc/keymaps/rc-pixelview-new.c | 2 +- 8 files changed, 7 insertions(+), 24 deletions(-) diff --git a/Documentation/media/uapi/rc/keytable.c.rst b/Documentation/media/uapi/rc/keytable.c.rst index e6ce1e3f5a78..217237f93b37 100644 --- a/Documentation/media/uapi/rc/keytable.c.rst +++ b/Documentation/media/uapi/rc/keytable.c.rst @@ -7,7 +7,7 @@ file: uapi/v4l/keytable.c /* keytable.c - This program allows checking/replacing keys at IR - Copyright (C) 2006-2009 Mauro Carvalho Chehab + Copyright (C) 2006-2009 Mauro Carvalho Chehab This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/MAINTAINERS b/MAINTAINERS index 90fb5968c43c..324dc3cc5c1b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2498,7 +2498,6 @@ F: Documentation/devicetree/bindings/sound/axentia,* F: sound/soc/atmel/tse850-pcm5142.c AZ6007 DVB DRIVER -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org W: https://linuxtv.org @@ -3040,7 +3039,6 @@ F: include/linux/btrfs* F: include/uapi/linux/btrfs* BTTV VIDEO4LINUX DRIVER -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org W: https://linuxtv.org @@ -3763,7 +3761,6 @@ S: Maintained F: drivers/media/dvb-frontends/cx24120* CX88 VIDEO4LINUX DRIVER -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org W: https://linuxtv.org @@ -4910,7 +4907,6 @@ F: drivers/edac/thunderx_edac* EDAC-CORE M: Borislav Petkov -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-edac@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git for-next @@ -4939,7 +4935,6 @@ S: Maintained F: drivers/edac/fsl_ddr_edac.* EDAC-GHES -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-edac@vger.kernel.org S: Maintained @@ -4956,21 +4951,18 @@ S: Maintained F: drivers/edac/i5000_edac.c EDAC-I5400 -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-edac@vger.kernel.org S: Maintained F: drivers/edac/i5400_edac.c EDAC-I7300 -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-edac@vger.kernel.org S: Maintained F: drivers/edac/i7300_edac.c EDAC-I7CORE -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-edac@vger.kernel.org S: Maintained @@ -5020,7 +5012,6 @@ S: Maintained F: drivers/edac/r82600_edac.c EDAC-SBRIDGE -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-edac@vger.kernel.org S: Maintained @@ -5073,7 +5064,6 @@ S: Maintained F: drivers/net/ethernet/ibm/ehea/ EM28XX VIDEO4LINUX DRIVER -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org W: https://linuxtv.org @@ -8601,7 +8591,6 @@ S: Maintained F: drivers/media/dvb-frontends/stv6111* MEDIA INPUT INFRASTRUCTURE (V4L/DVB) -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab P: LinuxTV.org Project L: linux-media@vger.kernel.org @@ -11773,7 +11762,6 @@ S: Odd Fixes F: drivers/media/i2c/saa6588* SAA7134 VIDEO4LINUX DRIVER -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org W: 
https://linuxtv.org @@ -12261,7 +12249,6 @@ S: Maintained F: drivers/media/radio/si4713/radio-usb-si4713.c SIANO DVB DRIVER -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org W: https://linuxtv.org @@ -13133,7 +13120,6 @@ S: Maintained F: drivers/media/i2c/tda9840* TEA5761 TUNER DRIVER -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org W: https://linuxtv.org @@ -13142,7 +13128,6 @@ S: Odd fixes F: drivers/media/tuners/tea5761.* TEA5767 TUNER DRIVER -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org W: https://linuxtv.org @@ -13549,7 +13534,6 @@ F: Documentation/networking/tlan.txt F: drivers/net/ethernet/ti/tlan.* TM6000 VIDEO4LINUX DRIVER -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org W: https://linuxtv.org @@ -14715,7 +14699,6 @@ S: Maintained F: arch/x86/entry/vdso/ XC2028/3028 TUNER DRIVER -M: Mauro Carvalho Chehab M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org W: https://linuxtv.org diff --git a/drivers/media/rc/keymaps/rc-avermedia-m135a.c b/drivers/media/rc/keymaps/rc-avermedia-m135a.c index f6977df1a75b..d275d98d066a 100644 --- a/drivers/media/rc/keymaps/rc-avermedia-m135a.c +++ b/drivers/media/rc/keymaps/rc-avermedia-m135a.c @@ -12,7 +12,7 @@ * * On Avermedia M135A with IR model RM-JX, the same codes exist on both * Positivo (BR) and original IR, initial version and remote control codes - * added by Mauro Carvalho Chehab + * added by Mauro Carvalho Chehab * * Positivo also ships Avermedia M135A with model RM-K6, extra control * codes added by Herton Ronaldo Krzesinski diff --git a/drivers/media/rc/keymaps/rc-encore-enltv-fm53.c b/drivers/media/rc/keymaps/rc-encore-enltv-fm53.c index e4e78c1f4123..057c13b765ef 100644 --- a/drivers/media/rc/keymaps/rc-encore-enltv-fm53.c +++ b/drivers/media/rc/keymaps/rc-encore-enltv-fm53.c @@ -9,7 +9,7 @@ #include /* Encore ENLTV-FM v5.3 - Mauro Carvalho Chehab + Mauro Carvalho Chehab */ static struct rc_map_table encore_enltv_fm53[] = { diff --git a/drivers/media/rc/keymaps/rc-encore-enltv2.c b/drivers/media/rc/keymaps/rc-encore-enltv2.c index c3d4437a6fda..cd0555924456 100644 --- a/drivers/media/rc/keymaps/rc-encore-enltv2.c +++ b/drivers/media/rc/keymaps/rc-encore-enltv2.c @@ -9,7 +9,7 @@ #include /* Encore ENLTV2-FM - silver plastic - "Wand Media" written at the botton - Mauro Carvalho Chehab */ + Mauro Carvalho Chehab */ static struct rc_map_table encore_enltv2[] = { { 0x4c, KEY_POWER2 }, diff --git a/drivers/media/rc/keymaps/rc-kaiomy.c b/drivers/media/rc/keymaps/rc-kaiomy.c index f0f88df18606..a00051339842 100644 --- a/drivers/media/rc/keymaps/rc-kaiomy.c +++ b/drivers/media/rc/keymaps/rc-kaiomy.c @@ -9,7 +9,7 @@ #include /* Kaiomy TVnPC U2 - Mauro Carvalho Chehab + Mauro Carvalho Chehab */ static struct rc_map_table kaiomy[] = { diff --git a/drivers/media/rc/keymaps/rc-kworld-plus-tv-analog.c b/drivers/media/rc/keymaps/rc-kworld-plus-tv-analog.c index 453e04377de7..db5edde3eeb1 100644 --- a/drivers/media/rc/keymaps/rc-kworld-plus-tv-analog.c +++ b/drivers/media/rc/keymaps/rc-kworld-plus-tv-analog.c @@ -9,7 +9,7 @@ #include /* Kworld Plus TV Analog Lite PCI IR - Mauro Carvalho Chehab + Mauro Carvalho Chehab */ static struct rc_map_table kworld_plus_tv_analog[] = { diff --git a/drivers/media/rc/keymaps/rc-pixelview-new.c b/drivers/media/rc/keymaps/rc-pixelview-new.c index 791130f108ff..e4e34f2ccf74 100644 --- a/drivers/media/rc/keymaps/rc-pixelview-new.c +++ b/drivers/media/rc/keymaps/rc-pixelview-new.c @@ 
-9,7 +9,7 @@ #include /* - Mauro Carvalho Chehab + Mauro Carvalho Chehab present on PV MPEG 8000GT */ From 596044dfcf9651751b300c8a937629ce09823ed7 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 12 Nov 2017 16:34:59 -0500 Subject: [PATCH 0628/1640] UPSTREAM: media: rc: only register protocol for rc device if enabled The raw_register function exists to create input devices associated with that IR protocol. If the mce_kbd module is loaded, then every rc device will have mce_kbd input devices, even if the protocol is not enabled. Change this to call the register function only when the protocol is enabled. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-ir-raw.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 22e44c8f16fd..2ab8a2b7092a 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -236,6 +236,19 @@ static int change_protocol(struct rc_dev *dev, u64 *rc_proto) struct ir_raw_handler *handler; u32 timeout = 0; + mutex_lock(&ir_raw_handler_lock); + list_for_each_entry(handler, &ir_raw_handler_list, list) { + if (!(dev->enabled_protocols & handler->protocols) && + (*rc_proto & handler->protocols) && handler->raw_register) + handler->raw_register(dev); + + if ((dev->enabled_protocols & handler->protocols) && + !(*rc_proto & handler->protocols) && + handler->raw_unregister) + handler->raw_unregister(dev); + } + mutex_unlock(&ir_raw_handler_lock); + if (!dev->max_timeout) return 0; @@ -607,7 +620,6 @@ int ir_raw_event_prepare(struct rc_dev *dev) int ir_raw_event_register(struct rc_dev *dev) { - struct ir_raw_handler *handler; struct task_struct *thread; thread = kthread_run(ir_raw_event_thread, dev->raw, "rc%u", dev->minor); @@ -618,9 +630,6 @@ int ir_raw_event_register(struct rc_dev *dev) mutex_lock(&ir_raw_handler_lock); list_add_tail(&dev->raw->list, &ir_raw_client_list); - list_for_each_entry(handler, &ir_raw_handler_list, list) - if (handler->raw_register) - handler->raw_register(dev); mutex_unlock(&ir_raw_handler_lock); return 0; @@ -648,7 +657,8 @@ void ir_raw_event_unregister(struct rc_dev *dev) mutex_lock(&ir_raw_handler_lock); list_del(&dev->raw->list); list_for_each_entry(handler, &ir_raw_handler_list, list) - if (handler->raw_unregister) + if (handler->raw_unregister && + (handler->protocols & dev->enabled_protocols)) handler->raw_unregister(dev); mutex_unlock(&ir_raw_handler_lock); @@ -661,13 +671,8 @@ void ir_raw_event_unregister(struct rc_dev *dev) int ir_raw_handler_register(struct ir_raw_handler *ir_raw_handler) { - struct ir_raw_event_ctrl *raw; - mutex_lock(&ir_raw_handler_lock); list_add_tail(&ir_raw_handler->list, &ir_raw_handler_list); - if (ir_raw_handler->raw_register) - list_for_each_entry(raw, &ir_raw_client_list, list) - ir_raw_handler->raw_register(raw->dev); atomic64_or(ir_raw_handler->protocols, &available_protocols); mutex_unlock(&ir_raw_handler_lock); @@ -683,9 +688,10 @@ void ir_raw_handler_unregister(struct ir_raw_handler *ir_raw_handler) mutex_lock(&ir_raw_handler_lock); list_del(&ir_raw_handler->list); list_for_each_entry(raw, &ir_raw_client_list, list) { - ir_raw_disable_protocols(raw->dev, protocols); - if (ir_raw_handler->raw_unregister) + if (ir_raw_handler->raw_unregister && + (raw->dev->enabled_protocols & protocols)) ir_raw_handler->raw_unregister(raw->dev); + ir_raw_disable_protocols(raw->dev, protocols); } atomic64_andnot(protocols, &available_protocols); 
mutex_unlock(&ir_raw_handler_lock); From f56df6e6cf83a01cc7c82a2b6523336d7cd15eb8 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 18 Apr 2018 16:44:58 -0400 Subject: [PATCH 0629/1640] UPSTREAM: media: rc: imon decoder: support the stick The iMON PAD controller has an analog stick, which can be switched to keyboard mode (cursor keys) or work as a crappy mouse. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-imon-decoder.c | 135 ++++++++++++++++++++++++++++- drivers/media/rc/rc-core-priv.h | 3 + 2 files changed, 135 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/ir-imon-decoder.c b/drivers/media/rc/ir-imon-decoder.c index 52ea3b2fda74..67c1b0c15aae 100644 --- a/drivers/media/rc/ir-imon-decoder.c +++ b/drivers/media/rc/ir-imon-decoder.c @@ -31,9 +31,69 @@ enum imon_state { STATE_INACTIVE, STATE_BIT_CHK, STATE_BIT_START, - STATE_FINISHED + STATE_FINISHED, + STATE_ERROR, }; +static void ir_imon_decode_scancode(struct rc_dev *dev) +{ + struct imon_dec *imon = &dev->raw->imon; + + /* Keyboard/Mouse toggle */ + if (imon->bits == 0x299115b7) + imon->stick_keyboard = !imon->stick_keyboard; + + if ((imon->bits & 0xfc0000ff) == 0x680000b7) { + int rel_x, rel_y; + u8 buf; + + buf = imon->bits >> 16; + rel_x = (buf & 0x08) | (buf & 0x10) >> 2 | + (buf & 0x20) >> 4 | (buf & 0x40) >> 6; + if (imon->bits & 0x02000000) + rel_x |= ~0x0f; + buf = imon->bits >> 8; + rel_y = (buf & 0x08) | (buf & 0x10) >> 2 | + (buf & 0x20) >> 4 | (buf & 0x40) >> 6; + if (imon->bits & 0x01000000) + rel_y |= ~0x0f; + + if (rel_x && rel_y && imon->stick_keyboard) { + if (abs(rel_y) > abs(rel_x)) + imon->bits = rel_y > 0 ? + 0x289515b7 : /* KEY_DOWN */ + 0x2aa515b7; /* KEY_UP */ + else + imon->bits = rel_x > 0 ? + 0x2ba515b7 : /* KEY_RIGHT */ + 0x29a515b7; /* KEY_LEFT */ + } + + if (!imon->stick_keyboard) { + struct lirc_scancode lsc = { + .scancode = imon->bits, + .rc_proto = RC_PROTO_IMON, + }; + + ir_lirc_scancode_event(dev, &lsc); + + input_event(imon->idev, EV_MSC, MSC_SCAN, imon->bits); + + input_report_rel(imon->idev, REL_X, rel_x); + input_report_rel(imon->idev, REL_Y, rel_y); + + input_report_key(imon->idev, BTN_LEFT, + (imon->bits & 0x00010000) != 0); + input_report_key(imon->idev, BTN_RIGHT, + (imon->bits & 0x00040000) != 0); + input_sync(imon->idev); + return; + } + } + + rc_keydown(dev, RC_PROTO_IMON, imon->bits, 0); +} + /** * ir_imon_decode() - Decode one iMON pulse or space * @dev: the struct rc_dev descriptor of the device @@ -56,6 +116,22 @@ static int ir_imon_decode(struct rc_dev *dev, struct ir_raw_event ev) data->state, data->count, TO_US(ev.duration), TO_STR(ev.pulse)); + /* + * Since iMON protocol is a series of bits, if at any point + * we encounter an error, make sure that any remaining bits + * aren't parsed as a scancode made up of less bits. + * + * Note that if the stick is held, then the remote repeats + * the scancode with about 12ms between them. So, make sure + * we have at least 10ms of space after an error. That way, + * we're at a new scancode.
+ */ + if (data->state == STATE_ERROR) { + if (!ev.pulse && ev.duration > MS_TO_NS(10)) + data->state = STATE_INACTIVE; + return 0; + } + for (;;) { if (!geq_margin(ev.duration, IMON_UNIT, IMON_UNIT / 2)) return 0; @@ -95,7 +171,7 @@ static int ir_imon_decode(struct rc_dev *dev, struct ir_raw_event ev) case STATE_FINISHED: if (ev.pulse) goto err_out; - rc_keydown(dev, RC_PROTO_IMON, data->bits, 0); + ir_imon_decode_scancode(dev); data->state = STATE_INACTIVE; break; } @@ -107,7 +183,7 @@ err_out: data->state, data->count, TO_US(ev.duration), TO_STR(ev.pulse)); - data->state = STATE_INACTIVE; + data->state = STATE_ERROR; return -EINVAL; } @@ -165,11 +241,64 @@ static int ir_imon_encode(enum rc_proto protocol, u32 scancode, return e - events; } +static int ir_imon_register(struct rc_dev *dev) +{ + struct input_dev *idev; + struct imon_dec *imon = &dev->raw->imon; + int ret; + + idev = input_allocate_device(); + if (!idev) + return -ENOMEM; + + snprintf(imon->name, sizeof(imon->name), + "iMON PAD Stick (%s)", dev->device_name); + idev->name = imon->name; + idev->phys = dev->input_phys; + + /* Mouse bits */ + set_bit(EV_REL, idev->evbit); + set_bit(EV_KEY, idev->evbit); + set_bit(REL_X, idev->relbit); + set_bit(REL_Y, idev->relbit); + set_bit(BTN_LEFT, idev->keybit); + set_bit(BTN_RIGHT, idev->keybit); + + /* Report scancodes too */ + set_bit(EV_MSC, idev->evbit); + set_bit(MSC_SCAN, idev->mscbit); + + input_set_drvdata(idev, imon); + + ret = input_register_device(idev); + if (ret < 0) { + input_free_device(idev); + return -EIO; + } + + imon->idev = idev; + imon->stick_keyboard = false; + + return 0; +} + +static int ir_imon_unregister(struct rc_dev *dev) +{ + struct imon_dec *imon = &dev->raw->imon; + + input_unregister_device(imon->idev); + imon->idev = NULL; + + return 0; +} + static struct ir_raw_handler imon_handler = { .protocols = RC_PROTO_BIT_IMON, .decode = ir_imon_decode, .encode = ir_imon_encode, .carrier = 38000, + .raw_register = ir_imon_register, + .raw_unregister = ir_imon_unregister, .min_timeout = IMON_UNIT * IMON_BITS * 2, }; diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index 07ba77fe6a3b..bbb9a7eb6b63 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -126,6 +126,9 @@ struct ir_raw_event_ctrl { int count; int last_chk; unsigned int bits; + bool stick_keyboard; + struct input_dev *idev; + char name[64]; } imon; }; From 3d5d72beeb6d47672edeefdc608306ecdb265616 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 20 Apr 2018 13:42:49 -0400 Subject: [PATCH 0630/1640] UPSTREAM: media: rc: allow build pnp-dependent drivers with COMPILE_TEST The pnp header already provides enough stubs to build those drivers with COMPILE_TEST on non-x86 archs. Signed-off-by: Mauro Carvalho Chehab Acked-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 149394676c4a..eec3377e4cd3 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -149,7 +149,7 @@ config RC_ATI_REMOTE config IR_ENE tristate "ENE eHome Receiver/Transceiver (pnp id: ENE0100/ENE02xxx)" - depends on PNP + depends on PNP || COMPILE_TEST depends on RC_CORE ---help--- Say Y here to enable support for integrated infrared receiver @@ -210,7 +210,7 @@ config IR_MCEUSB config IR_ITE_CIR tristate "ITE Tech Inc.
IT8712/IT8512 Consumer Infrared Transceiver" - depends on PNP + depends on PNP || COMPILE_TEST depends on RC_CORE ---help--- Say Y here to enable support for integrated infrared receivers @@ -223,7 +223,7 @@ config IR_ITE_CIR config IR_FINTEK tristate "Fintek Consumer Infrared Transceiver" - depends on PNP + depends on PNP || COMPILE_TEST depends on RC_CORE ---help--- Say Y here to enable support for integrated infrared receiver @@ -257,7 +257,7 @@ config IR_MTK config IR_NUVOTON tristate "Nuvoton w836x7hg Consumer Infrared Transceiver" - depends on PNP + depends on PNP || COMPILE_TEST depends on RC_CORE ---help--- Say Y here to enable support for integrated infrared receiver @@ -305,7 +305,7 @@ config IR_STREAMZAP config IR_WINBOND_CIR tristate "Winbond IR remote control" - depends on X86 && PNP + depends on (X86 && PNP) || COMPILE_TEST depends on RC_CORE select NEW_LEDS select LEDS_CLASS From ceb98e1eaba4ef26542a81e660e0f191c5931ab2 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 9 May 2018 06:11:28 -0400 Subject: [PATCH 0631/1640] UPSTREAM: media: mceusb: MCE_CMD_SETIRTIMEOUT cause strange behaviour on device If the IR timeout is set on vid 1784 pid 0011, the device starts behaving strangely. Reported-by: Matthias Reichl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mceusb.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 5c0bf61fae26..1619b748469b 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -181,6 +181,7 @@ enum mceusb_model_type { MCE_GEN2 = 0, /* Most boards */ MCE_GEN1, MCE_GEN3, + MCE_GEN3_BROKEN_IRTIMEOUT, MCE_GEN2_TX_INV, MCE_GEN2_TX_INV_RX_GOOD, POLARIS_EVK, @@ -199,6 +200,7 @@ struct mceusb_model { u32 mce_gen3:1; u32 tx_mask_normal:1; u32 no_tx:1; + u32 broken_irtimeout:1; /* * 2nd IR receiver (short-range, wideband) for learning mode: * 0, absent 2nd receiver (rx2) @@ -242,6 +244,12 @@ static const struct mceusb_model mceusb_model[] = { .tx_mask_normal = 1, .rx2 = 2, }, + [MCE_GEN3_BROKEN_IRTIMEOUT] = { + .mce_gen3 = 1, + .tx_mask_normal = 1, + .rx2 = 2, + .broken_irtimeout = 1 + }, [POLARIS_EVK] = { /* * In fact, the EVK is shipped without @@ -352,7 +360,7 @@ static const struct usb_device_id mceusb_dev_table[] = { .driver_info = MCE_GEN2_TX_INV }, /* Topseed eHome Infrared Transceiver */ { USB_DEVICE(VENDOR_TOPSEED, 0x0011), - .driver_info = MCE_GEN3 }, + .driver_info = MCE_GEN3_BROKEN_IRTIMEOUT }, /* Ricavision internal Infrared Transceiver */ { USB_DEVICE(VENDOR_RICAVISION, 0x0010) }, /* Itron ione Libra Q-11 */ @@ -1441,8 +1449,16 @@ static struct rc_dev *mceusb_init_rc_dev(struct mceusb_dev *ir) rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rc->min_timeout = US_TO_NS(MCE_TIME_UNIT); rc->timeout = MS_TO_NS(100); - rc->max_timeout = 10 * IR_DEFAULT_TIMEOUT; - rc->s_timeout = mceusb_set_timeout; + if (!mceusb_model[ir->model].broken_irtimeout) { + rc->s_timeout = mceusb_set_timeout; + rc->max_timeout = 10 * IR_DEFAULT_TIMEOUT; + } else { + /* + * If we can't set the timeout using CMD_SETIRTIMEOUT, we can + * rely on software timeouts for timeouts < 100ms. 
+ */ + rc->max_timeout = rc->timeout; + } if (!ir->flags.no_tx) { rc->s_tx_mask = mceusb_set_tx_mask; rc->s_tx_carrier = mceusb_set_tx_carrier; From e9135bb4a4cee0b17e96a06dda1f5f5b43348580 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 10 May 2018 07:37:51 -0400 Subject: [PATCH 0632/1640] UPSTREAM: media: mceusb: filter out bogus timing irdata of duration 0 A mceusb device has been observed producing invalid irdata. Proactively guard against this. Suggested-by: Matthias Reichl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mceusb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 1619b748469b..1ca49491abc8 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -1177,6 +1177,11 @@ static void mceusb_process_ir_data(struct mceusb_dev *ir, int buf_len) init_ir_raw_event(&rawir); rawir.pulse = ((ir->buf_in[i] & MCE_PULSE_BIT) != 0); rawir.duration = (ir->buf_in[i] & MCE_PULSE_MASK); + if (unlikely(!rawir.duration)) { + dev_warn(ir->dev, "nonsensical irdata %02x with duration 0", + ir->buf_in[i]); + break; + } if (rawir.pulse) { ir->pulse_tunit += rawir.duration; ir->pulse_count++; From 7dc51a54ceacf3d278065ccdb891c595a2ebf3ab Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 10 May 2018 07:49:49 -0400 Subject: [PATCH 0633/1640] UPSTREAM: media: mceusb: add missing break Fallthrough is not intended here. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mceusb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 1ca49491abc8..4c0c8008872a 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -572,6 +572,7 @@ static int mceusb_cmd_datasize(u8 cmd, u8 subcmd) datasize = 1; break; } + break; case MCE_CMD_PORT_IR: switch (subcmd) { case MCE_CMD_UNKNOWN: From 369a4f64dd733316cf56a0373cd13f3ce8840997 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 10 May 2018 16:41:15 -0400 Subject: [PATCH 0634/1640] UPSTREAM: media: rc: default to idle on at startup or after reset Any space events received after a reset or startup should be discarded, so ensure the rc device is in idle mode. This also makes it much easier to detect incorrect raw events, as we will do in a following commit.
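The idle rule here boils down to: while the receiver is idle, a space carries no information and can be thrown away; the first pulse ends the idle period. A minimal sketch of that rule in C (illustrative only, not the actual rc-core store path in rc-ir-raw.c):

	struct ir_event {
		bool pulse;		/* true = carrier seen, false = space */
		unsigned int duration;	/* in ns */
	};

	/* Return true if the event should be queued for the decoders. */
	static bool accept_event(bool *idle, struct ir_event ev)
	{
		if (*idle && !ev.pulse)
			return false;	/* leading space: discard */
		*idle = false;		/* a pulse starts a new signal */
		return true;
	}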
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-ir-raw.c | 1 + include/media/rc-core.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 2ab8a2b7092a..2e50104ae138 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -611,6 +611,7 @@ int ir_raw_event_prepare(struct rc_dev *dev) dev->raw->dev = dev; dev->change_protocol = change_protocol; + dev->idle = true; spin_lock_init(&dev->raw->edge_spinlock); timer_setup(&dev->raw->edge_handle, ir_raw_edge_handle, 0); INIT_KFIFO(dev->raw->kfifo); diff --git a/include/media/rc-core.h b/include/media/rc-core.h index 6742fd86ff65..61571773a98d 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -347,6 +347,7 @@ static inline void ir_raw_event_reset(struct rc_dev *dev) struct ir_raw_event ev = { .reset = true }; ir_raw_event_store(dev, &ev); + dev->idle = true; ir_raw_event_handle(dev); } From 427645bc013332164a884ed9e6dd57bf3ee669e1 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 10 May 2018 06:11:47 -0400 Subject: [PATCH 0635/1640] UPSTREAM: media: rc: drivers should produce alternate pulse and space timing events Report an error if this is not the case, or if there is any other problem with the generated raw events. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-ir-raw.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 2e50104ae138..49c56da9bc67 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -22,16 +22,27 @@ static int ir_raw_event_thread(void *data) { struct ir_raw_event ev; struct ir_raw_handler *handler; - struct ir_raw_event_ctrl *raw = (struct ir_raw_event_ctrl *)data; + struct ir_raw_event_ctrl *raw = data; + struct rc_dev *dev = raw->dev; while (1) { mutex_lock(&ir_raw_handler_lock); while (kfifo_out(&raw->kfifo, &ev, 1)) { + if (is_timing_event(ev)) { + if (ev.duration == 0) + dev_err(&dev->dev, "nonsensical timing event of duration 0"); + if (is_timing_event(raw->prev_ev) && + !is_transition(&ev, &raw->prev_ev)) + dev_err(&dev->dev, "two consecutive events of type %s", + TO_STR(ev.pulse)); + if (raw->prev_ev.reset && ev.pulse == 0) + dev_err(&dev->dev, "timing event after reset should be pulse"); + } list_for_each_entry(handler, &ir_raw_handler_list, list) - if (raw->dev->enabled_protocols & + if (dev->enabled_protocols & handler->protocols || !handler->protocols) - handler->decode(raw->dev, ev); - ir_lirc_raw_event(raw->dev, ev); + handler->decode(dev, ev); + ir_lirc_raw_event(dev, ev); raw->prev_ev = ev; } mutex_unlock(&ir_raw_handler_lock); From 05e35e22620f7dc3eb28554b99b7375fadcf397f Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 10 May 2018 06:14:38 -0400 Subject: [PATCH 0636/1640] UPSTREAM: media: rc: decoders do not need to check for transitions Drivers should never produce consecutive pulse or space raw events. Should that occur, we would have bigger problems than this code is trying to guard against. Note that we already log an error should a driver misbehave.
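The invariant being checked is that timing events must alternate between pulse and space. The is_transition() test the decoders used to perform is essentially a comparison of the pulse flags, along these lines (sketch matching the rc-core-priv.h helper):

	static inline bool is_transition(struct ir_raw_event *x,
					 struct ir_raw_event *y)
	{
		return x->pulse != y->pulse;
	}

Since ir_raw_event_thread() now logs an error on a missed transition, repeating the test in every decoder state machine added no protection, only noise.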
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-mce_kbd-decoder.c | 6 ------ drivers/media/rc/ir-rc5-decoder.c | 3 --- drivers/media/rc/ir-rc6-decoder.c | 10 ---------- 3 files changed, 19 deletions(-) diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 9574c3dd90f2..64ea42927669 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -274,9 +274,6 @@ again: return 0; case STATE_HEADER_BIT_END: - if (!is_transition(&ev, &dev->raw->prev_ev)) - break; - decrease_duration(&ev, MCIR2_BIT_END); if (data->count != MCIR2_HEADER_NBITS) { @@ -313,9 +310,6 @@ again: return 0; case STATE_BODY_BIT_END: - if (!is_transition(&ev, &dev->raw->prev_ev)) - break; - if (data->count == data->wanted_bits) data->state = STATE_FINISHED; else diff --git a/drivers/media/rc/ir-rc5-decoder.c b/drivers/media/rc/ir-rc5-decoder.c index cbfaadbee8fa..63624654a71e 100644 --- a/drivers/media/rc/ir-rc5-decoder.c +++ b/drivers/media/rc/ir-rc5-decoder.c @@ -88,9 +88,6 @@ again: return 0; case STATE_BIT_END: - if (!is_transition(&ev, &dev->raw->prev_ev)) - break; - if (data->count == CHECK_RC5X_NBITS) data->state = STATE_CHECK_RC5X; else diff --git a/drivers/media/rc/ir-rc6-decoder.c b/drivers/media/rc/ir-rc6-decoder.c index 66e07109f6fc..68487ce9f79b 100644 --- a/drivers/media/rc/ir-rc6-decoder.c +++ b/drivers/media/rc/ir-rc6-decoder.c @@ -145,9 +145,6 @@ again: return 0; case STATE_HEADER_BIT_END: - if (!is_transition(&ev, &dev->raw->prev_ev)) - break; - if (data->count == RC6_HEADER_NBITS) data->state = STATE_TOGGLE_START; else @@ -165,10 +162,6 @@ again: return 0; case STATE_TOGGLE_END: - if (!is_transition(&ev, &dev->raw->prev_ev) || - !geq_margin(ev.duration, RC6_TOGGLE_END, RC6_UNIT / 2)) - break; - if (!(data->header & RC6_STARTBIT_MASK)) { dev_dbg(&dev->dev, "RC6 invalid start bit\n"); break; @@ -210,9 +203,6 @@ again: break; case STATE_BODY_BIT_END: - if (!is_transition(&ev, &dev->raw->prev_ev)) - break; - if (data->count == data->wanted_bits) data->state = STATE_FINISHED; else From dc18779a6323c8574b4858fa1a921adae4089157 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 11 May 2018 05:36:26 -0400 Subject: [PATCH 0637/1640] UPSTREAM: media: rc: winbond: do not send reset and timeout raw events on startup ir_raw_event_set_idle() sends a timeout event which is not needed, and on startup no reset event is needed either. 
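For context, a raw event carries either plain timing or one of two markers, roughly as follows (simplified sketch of struct ir_raw_event; see include/media/rc-core.h for the real layout):

	struct ir_raw_event_sketch {
		u32 duration;		/* pulse/space length in ns */
		unsigned pulse:1;	/* 1 = pulse, 0 = space */
		unsigned reset:1;	/* flush partial decoder state */
		unsigned timeout:1;	/* receiver went idle */
	};

Entering idle via ir_raw_event_set_idle() queues a timeout marker, which is why calling it from the init path injected an event nobody asked for; wbcir_idle_rx() idles the hardware without doing so.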
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/winbond-cir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/rc/winbond-cir.c b/drivers/media/rc/winbond-cir.c index 0adf0991f5ab..851acba9b436 100644 --- a/drivers/media/rc/winbond-cir.c +++ b/drivers/media/rc/winbond-cir.c @@ -989,8 +989,7 @@ wbcir_init_hw(struct wbcir_data *data) /* Clear RX state */ data->rxstate = WBCIR_RXSTATE_INACTIVE; - ir_raw_event_reset(data->dev); - ir_raw_event_set_idle(data->dev, true); + wbcir_idle_rx(data->dev, true); /* Clear TX state */ if (data->txstate == WBCIR_TXSTATE_ACTIVE) { @@ -1009,6 +1008,7 @@ wbcir_resume(struct pnp_dev *device) struct wbcir_data *data = pnp_get_drvdata(device); wbcir_init_hw(data); + ir_raw_event_reset(data->dev); enable_irq(data->irq); led_classdev_resume(&data->led); From 38ad05ea0a12d96da5dccfa07a6e9665e36750af Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Sun, 13 May 2018 07:24:31 -0400 Subject: [PATCH 0638/1640] UPSTREAM: media: rc: ite-cir: lower timeout and extend allowed timeout range The minimum possible timeout of ite-cir is 8 samples, which is typically about 70us. The driver however changes the FIFO trigger level from the hardware's default of 1 byte to 17 bytes, so the minimum usable timeout value is 17 * 8 samples, which is typically about 1.2ms. Tests showed that using timeouts down to 1.2ms actually work fine. The current default timeout of 200ms is much longer than necessary and the maximum timeout of 1s seems to have been chosen a bit arbitrarily. So change the minimum timeout to the driver's limit of 17 * 8 samples and bring timeout and maximum timeout in line with the settings of many other receivers. Signed-off-by: Matthias Reichl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ite-cir.c | 8 +++++--- drivers/media/rc/ite-cir.h | 7 ------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/media/rc/ite-cir.c b/drivers/media/rc/ite-cir.c index 65e104c7ddfc..de77d22c30a7 100644 --- a/drivers/media/rc/ite-cir.c +++ b/drivers/media/rc/ite-cir.c @@ -1561,9 +1561,11 @@ static int ite_probe(struct pnp_dev *pdev, const struct pnp_device_id rdev->close = ite_close; rdev->s_idle = ite_s_idle; rdev->s_rx_carrier_range = ite_set_rx_carrier_range; - rdev->min_timeout = ITE_MIN_IDLE_TIMEOUT; - rdev->max_timeout = ITE_MAX_IDLE_TIMEOUT; - rdev->timeout = ITE_IDLE_TIMEOUT; + /* FIFO threshold is 17 bytes, so 17 * 8 samples minimum */ + rdev->min_timeout = 17 * 8 * ITE_BAUDRATE_DIVISOR * + itdev->params.sample_period; + rdev->timeout = IR_DEFAULT_TIMEOUT; + rdev->max_timeout = 10 * IR_DEFAULT_TIMEOUT; rdev->rx_resolution = ITE_BAUDRATE_DIVISOR * itdev->params.sample_period; rdev->tx_resolution = ITE_BAUDRATE_DIVISOR * diff --git a/drivers/media/rc/ite-cir.h b/drivers/media/rc/ite-cir.h index 0e8ebc880d1f..9cb24ac01350 100644 --- a/drivers/media/rc/ite-cir.h +++ b/drivers/media/rc/ite-cir.h @@ -154,13 +154,6 @@ struct ite_dev { /* default carrier freq for when demodulator is off (Hz) */ #define ITE_DEFAULT_CARRIER_FREQ 38000 -/* default idling timeout in ns (0.2 seconds) */ -#define ITE_IDLE_TIMEOUT 200000000UL - -/* limit timeout values */ -#define ITE_MIN_IDLE_TIMEOUT 100000000UL -#define ITE_MAX_IDLE_TIMEOUT 1000000000UL - /* convert bits to us */ #define ITE_BITS_TO_NS(bits, sample_period) \ ((u32) ((bits) * ITE_BAUDRATE_DIVISOR * sample_period)) From 49b89511741af7229a7abc90fffc03efc9600abe Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Micha=C5=82=20Winiarski?= Date: Mon, 21 May 2018 10:38:01 -0400 Subject: [PATCH 0639/1640] UPSTREAM: media: rc: nuvoton: Tweak the interrupt enabling dance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It appears that we need to enable the CIR device before attempting to touch some of the registers. Previously, this was not a big issue, since we were rarely seeing nvt_close() getting called. Unfortunately, since commit cb84343fced1 ("media: lirc: do not call close() or open() on unregistered devices") the initial open() during probe from rc_setup_rx_device() is no longer successful, which means that userspace clients will actually end up calling nvt_open()/nvt_close(). Since nvt_open() is broken, the device doesn't seem to work as expected. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199597 Signed-off-by: Michał Winiarski Cc: Jarod Wilson Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/nuvoton-cir.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/nuvoton-cir.c b/drivers/media/rc/nuvoton-cir.c index 5e1d866a61a5..ce8949b6549d 100644 --- a/drivers/media/rc/nuvoton-cir.c +++ b/drivers/media/rc/nuvoton-cir.c @@ -922,6 +922,9 @@ static int nvt_open(struct rc_dev *dev) struct nvt_dev *nvt = dev->priv; unsigned long flags; + /* enable the CIR logical device */ + nvt_enable_logical_dev(nvt, LOGICAL_DEV_CIR); + spin_lock_irqsave(&nvt->lock, flags); /* set function enable flags */ @@ -937,9 +940,6 @@ static int nvt_open(struct rc_dev *dev) spin_unlock_irqrestore(&nvt->lock, flags); - /* enable the CIR logical device */ - nvt_enable_logical_dev(nvt, LOGICAL_DEV_CIR); - return 0; } From ee3c7a977a3bcc0c1e85328311bbf02e5089b6a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Winiarski?= Date: Mon, 21 May 2018 10:38:02 -0400 Subject: [PATCH 0640/1640] UPSTREAM: media: rc: nuvoton: Keep track of users on CIR enable/disable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Core rc keeps track of the users - let's use it to tweak the code and use the common code path on suspend/resume.
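The resulting pattern on both paths looks like this (names as in the diff that follows; rc-core increments rdev->users in rc_open() and drops it in rc_close(), under rdev->lock):

	mutex_lock(&nvt->rdev->lock);
	if (nvt->rdev->users)		/* somebody holds the rc device open */
		nvt_disable_cir(nvt);	/* nvt_enable_cir() on the resume path */
	mutex_unlock(&nvt->rdev->lock);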
Signed-off-by: Michał Winiarski Cc: Jarod Wilson Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/nuvoton-cir.c | 82 +++++++++++++++------------------- 1 file changed, 36 insertions(+), 46 deletions(-) diff --git a/drivers/media/rc/nuvoton-cir.c b/drivers/media/rc/nuvoton-cir.c index ce8949b6549d..eebd6fef5602 100644 --- a/drivers/media/rc/nuvoton-cir.c +++ b/drivers/media/rc/nuvoton-cir.c @@ -543,27 +543,9 @@ static void nvt_cir_regs_init(struct nvt_dev *nvt) nvt_cir_reg_write(nvt, CIR_FIFOCON_TX_TRIGGER_LEV | CIR_FIFOCON_RX_TRIGGER_LEV, CIR_FIFOCON); - /* - * Enable TX and RX, specify carrier on = low, off = high, and set - * sample period (currently 50us) - */ - nvt_cir_reg_write(nvt, - CIR_IRCON_TXEN | CIR_IRCON_RXEN | - CIR_IRCON_RXINV | CIR_IRCON_SAMPLE_PERIOD_SEL, - CIR_IRCON); - /* clear hardware rx and tx fifos */ nvt_clear_cir_fifo(nvt); nvt_clear_tx_fifo(nvt); - - /* clear any and all stray interrupts */ - nvt_cir_reg_write(nvt, 0xff, CIR_IRSTS); - - /* and finally, enable interrupts */ - nvt_set_cir_iren(nvt); - - /* enable the CIR logical device */ - nvt_enable_logical_dev(nvt, LOGICAL_DEV_CIR); } static void nvt_cir_wake_regs_init(struct nvt_dev *nvt) @@ -892,6 +874,32 @@ static irqreturn_t nvt_cir_isr(int irq, void *data) return IRQ_HANDLED; } +static void nvt_enable_cir(struct nvt_dev *nvt) +{ + unsigned long flags; + + /* enable the CIR logical device */ + nvt_enable_logical_dev(nvt, LOGICAL_DEV_CIR); + + spin_lock_irqsave(&nvt->lock, flags); + + /* + * Enable TX and RX, specify carrier on = low, off = high, and set + * sample period (currently 50us) + */ + nvt_cir_reg_write(nvt, CIR_IRCON_TXEN | CIR_IRCON_RXEN | + CIR_IRCON_RXINV | CIR_IRCON_SAMPLE_PERIOD_SEL, + CIR_IRCON); + + /* clear all pending interrupts */ + nvt_cir_reg_write(nvt, 0xff, CIR_IRSTS); + + /* enable interrupts */ + nvt_set_cir_iren(nvt); + + spin_unlock_irqrestore(&nvt->lock, flags); +} + static void nvt_disable_cir(struct nvt_dev *nvt) { unsigned long flags; @@ -920,25 +928,8 @@ static void nvt_disable_cir(struct nvt_dev *nvt) static int nvt_open(struct rc_dev *dev) { struct nvt_dev *nvt = dev->priv; - unsigned long flags; - /* enable the CIR logical device */ - nvt_enable_logical_dev(nvt, LOGICAL_DEV_CIR); - - spin_lock_irqsave(&nvt->lock, flags); - - /* set function enable flags */ - nvt_cir_reg_write(nvt, CIR_IRCON_TXEN | CIR_IRCON_RXEN | - CIR_IRCON_RXINV | CIR_IRCON_SAMPLE_PERIOD_SEL, - CIR_IRCON); - - /* clear all pending interrupts */ - nvt_cir_reg_write(nvt, 0xff, CIR_IRSTS); - - /* enable interrupts */ - nvt_set_cir_iren(nvt); - - spin_unlock_irqrestore(&nvt->lock, flags); + nvt_enable_cir(nvt); return 0; } @@ -1093,19 +1084,13 @@ static void nvt_remove(struct pnp_dev *pdev) static int nvt_suspend(struct pnp_dev *pdev, pm_message_t state) { struct nvt_dev *nvt = pnp_get_drvdata(pdev); - unsigned long flags; nvt_dbg("%s called", __func__); - spin_lock_irqsave(&nvt->lock, flags); - - /* disable all CIR interrupts */ - nvt_cir_reg_write(nvt, 0, CIR_IREN); - - spin_unlock_irqrestore(&nvt->lock, flags); - - /* disable cir logical dev */ - nvt_disable_logical_dev(nvt, LOGICAL_DEV_CIR); + mutex_lock(&nvt->rdev->lock); + if (nvt->rdev->users) + nvt_disable_cir(nvt); + mutex_unlock(&nvt->rdev->lock); /* make sure wake is enabled */ nvt_enable_wake(nvt); @@ -1122,6 +1107,11 @@ static int nvt_resume(struct pnp_dev *pdev) nvt_cir_regs_init(nvt); nvt_cir_wake_regs_init(nvt); + mutex_lock(&nvt->rdev->lock); + if (nvt->rdev->users) + nvt_enable_cir(nvt); + 
mutex_unlock(&nvt->rdev->lock); + + return 0; } From d9780358c2d535ab77448970c6f9d72a820c882e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Winiarski?= Date: Fri, 25 May 2018 10:28:17 -0400 Subject: [PATCH 0641/1640] UPSTREAM: media: rc: nuvoton: Keep device enabled during reg init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Doing writes when the device is disabled seems to be a NOOP. For the CIR device, we should enable it, initialize it, and then disable it until it's opened. CIR_WAKE should always be enabled. Signed-off-by: Michał Winiarski Cc: Jarod Wilson Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/nuvoton-cir.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/nuvoton-cir.c b/drivers/media/rc/nuvoton-cir.c index eebd6fef5602..b8299c9a9744 100644 --- a/drivers/media/rc/nuvoton-cir.c +++ b/drivers/media/rc/nuvoton-cir.c @@ -535,6 +535,8 @@ static void nvt_set_cir_iren(struct nvt_dev *nvt) static void nvt_cir_regs_init(struct nvt_dev *nvt) { + nvt_enable_logical_dev(nvt, LOGICAL_DEV_CIR); + /* set sample limit count (PE interrupt raised when reached) */ nvt_cir_reg_write(nvt, CIR_RX_LIMIT_COUNT >> 8, CIR_SLCH); nvt_cir_reg_write(nvt, CIR_RX_LIMIT_COUNT & 0xff, CIR_SLCL); @@ -546,10 +548,14 @@ static void nvt_cir_regs_init(struct nvt_dev *nvt) /* clear hardware rx and tx fifos */ nvt_clear_cir_fifo(nvt); nvt_clear_tx_fifo(nvt); + + nvt_disable_logical_dev(nvt, LOGICAL_DEV_CIR); } static void nvt_cir_wake_regs_init(struct nvt_dev *nvt) { + nvt_enable_logical_dev(nvt, LOGICAL_DEV_CIR_WAKE); + /* * Disable RX, set specific carrier on = low, off = high, * and sample period (currently 50us) @@ -561,9 +567,6 @@ static void nvt_cir_wake_regs_init(struct nvt_dev *nvt) /* clear any and all stray interrupts */ nvt_cir_wake_reg_write(nvt, 0xff, CIR_WAKE_IRSTS); - - /* enable the CIR WAKE logical device */ - nvt_enable_logical_dev(nvt, LOGICAL_DEV_CIR_WAKE); } static void nvt_enable_wake(struct nvt_dev *nvt) From feeb468beaec0c537729760d57e372fbef9bd117 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 24 May 2018 05:47:17 -0400 Subject: [PATCH 0642/1640] UPSTREAM: media: rc: ensure input/lirc device can be opened after register Since commit cb84343fced1 ("media: lirc: do not call close() or open() on unregistered devices") rc_open() will return -ENODEV if rcdev->registered is false. Ensure this is set before we register the input device and the lirc device, else we have a short window where neither the lirc nor the input device can be opened. Fixes: cb84343fced1 ("media: lirc: do not call close() or open() on unregistered devices") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index b7071bde670a..2e222d9ee01f 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1862,6 +1862,8 @@ int rc_register_device(struct rc_dev *dev) dev->device_name ?: "Unspecified device", path ?: "N/A"); kfree(path); + dev->registered = true; + if (dev->driver_type != RC_DRIVER_IR_RAW_TX) { rc = rc_setup_rx_device(dev); if (rc) @@ -1881,8 +1883,6 @@ int rc_register_device(struct rc_dev *dev) goto out_lirc; } - dev->registered = true; - dev_dbg(&dev->dev, "Registered rc%u (driver: %s)\n", dev->minor, dev->driver_name ?
dev->driver_name : "unknown"); From 633363af99854405d47163e125599762805b4875 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 27 May 2018 12:24:09 +0100 Subject: [PATCH 0643/1640] BACKPORT: media: rc: introduce BPF_PROG_LIRC_MODE2 Add support for BPF_PROG_LIRC_MODE2. This type of BPF program can call rc_keydown() to report decoded IR scancodes, or rc_repeat() to report that the last key should be repeated. The bpf program can be attached using the bpf(BPF_PROG_ATTACH) syscall; the target_fd must be the /dev/lircN device. Acked-by: Yonghong Song Signed-off-by: Sean Young Signed-off-by: Daniel Borkmann --- drivers/media/rc/Kconfig | 13 ++ drivers/media/rc/Makefile | 1 + drivers/media/rc/bpf-lirc.c | 313 ++++++++++++++++++++++++++++++++ drivers/media/rc/lirc_dev.c | 30 +++ drivers/media/rc/rc-core-priv.h | 21 +++ drivers/media/rc/rc-ir-raw.c | 12 +- include/linux/bpf_lirc.h | 29 +++ include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 49 +++++ kernel/bpf/syscall.c | 7 + 10 files changed, 476 insertions(+), 2 deletions(-) create mode 100644 drivers/media/rc/bpf-lirc.c create mode 100644 include/linux/bpf_lirc.h diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index eec3377e4cd3..30f05cbedebd 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -25,6 +25,19 @@ config LIRC passes raw IR to and from userspace, which is needed for IR transmitting (aka "blasting") and for the lirc daemon. +config BPF_LIRC_MODE2 + bool "Support for eBPF programs attached to lirc devices" + depends on BPF_SYSCALL + depends on RC_CORE=y + depends on LIRC + help + Allow attaching eBPF programs to a lirc device using the bpf(2) + syscall command BPF_PROG_ATTACH. This is supported for raw IR + receivers. + + These eBPF programs can be used to decode IR into scancodes, for + IR protocols not supported by the kernel decoders. + menuconfig RC_DECODERS bool "Remote controller decoders" depends on RC_CORE diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile index 81cb22d1cffd..2e4045d366a5 100644 --- a/drivers/media/rc/Makefile +++ b/drivers/media/rc/Makefile @@ -5,6 +5,7 @@ obj-y += keymaps/ obj-$(CONFIG_RC_CORE) += rc-core.o rc-core-y := rc-main.o rc-ir-raw.o rc-core-$(CONFIG_LIRC) += lirc_dev.o +rc-core-$(CONFIG_BPF_LIRC_MODE2) += bpf-lirc.o obj-$(CONFIG_IR_NEC_DECODER) += ir-nec-decoder.o obj-$(CONFIG_IR_RC5_DECODER) += ir-rc5-decoder.o obj-$(CONFIG_IR_RC6_DECODER) += ir-rc6-decoder.o diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c new file mode 100644 index 000000000000..40826bba06b6 --- /dev/null +++ b/drivers/media/rc/bpf-lirc.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0 +// bpf-lirc.c - handles bpf +// +// Copyright (C) 2018 Sean Young + +#include +#include +#include +#include "rc-core-priv.h" + +/* + * BPF interface for raw IR + */ +const struct bpf_prog_ops lirc_mode2_prog_ops = { +}; + +BPF_CALL_1(bpf_rc_repeat, u32*, sample) +{ + struct ir_raw_event_ctrl *ctrl; + + ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample); + + rc_repeat(ctrl->dev); + + return 0; +} + +static const struct bpf_func_proto rc_repeat_proto = { + .func = bpf_rc_repeat, + .gpl_only = true, /* rc_repeat is EXPORT_SYMBOL_GPL */ + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +/* + * Currently rc-core does not support 64-bit scancodes, but there are many + * known protocols with more than 32 bits. So, define the interface as u64 + * as a future-proof.
+ */ +BPF_CALL_4(bpf_rc_keydown, u32*, sample, u32, protocol, u64, scancode, + u32, toggle) +{ + struct ir_raw_event_ctrl *ctrl; + + ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample); + + rc_keydown(ctrl->dev, protocol, scancode, toggle != 0); + + return 0; +} + +static const struct bpf_func_proto rc_keydown_proto = { + .func = bpf_rc_keydown, + .gpl_only = true, /* rc_keydown is EXPORT_SYMBOL_GPL */ + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_rc_repeat: + return &rc_repeat_proto; + case BPF_FUNC_rc_keydown: + return &rc_keydown_proto; + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_trace_printk: + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); + /* fall through */ + default: + return NULL; + } +} + +static bool lirc_mode2_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + /* We have one field of u32 */ + return type == BPF_READ && off == 0 && size == sizeof(u32); +} + +const struct bpf_verifier_ops lirc_mode2_verifier_ops = { + .get_func_proto = lirc_mode2_func_proto, + .is_valid_access = lirc_mode2_is_valid_access +}; + +#define BPF_MAX_PROGS 64 + +static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + struct ir_raw_event_ctrl *raw; + int ret; + + if (rcdev->driver_type != RC_DRIVER_IR_RAW) + return -EINVAL; + + ret = mutex_lock_interruptible(&ir_raw_handler_lock); + if (ret) + return ret; + + raw = rcdev->raw; + if (!raw) { + ret = -ENODEV; + goto unlock; + } + + if (raw->progs && bpf_prog_array_length(raw->progs) >= BPF_MAX_PROGS) { + ret = -E2BIG; + goto unlock; + } + + old_array = raw->progs; + ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + if (ret < 0) + goto unlock; + + rcu_assign_pointer(raw->progs, new_array); + bpf_prog_array_free(old_array); + +unlock: + mutex_unlock(&ir_raw_handler_lock); + return ret; +} + +static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + struct ir_raw_event_ctrl *raw; + int ret; + + if (rcdev->driver_type != RC_DRIVER_IR_RAW) + return -EINVAL; + + ret = mutex_lock_interruptible(&ir_raw_handler_lock); + if (ret) + return ret; + + raw = rcdev->raw; + if (!raw) { + ret = -ENODEV; + goto unlock; + } + + old_array = raw->progs; + ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array); + /* + * Do not use bpf_prog_array_delete_safe() as we would end up + * with a dummy entry in the array, and the we would free the + * dummy in lirc_bpf_free() + */ + if (ret) + goto unlock; + + rcu_assign_pointer(raw->progs, new_array); + bpf_prog_array_free(old_array); +unlock: + mutex_unlock(&ir_raw_handler_lock); + return ret; +} + +void lirc_bpf_run(struct rc_dev *rcdev, u32 sample) +{ + struct ir_raw_event_ctrl 
*raw = rcdev->raw; + + raw->bpf_sample = sample; + + if (raw->progs) + BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, BPF_PROG_RUN); +} + +/* + * This should be called once the rc thread has been stopped, so there can be + * no concurrent bpf execution. + */ +void lirc_bpf_free(struct rc_dev *rcdev) +{ + struct bpf_prog **progs; + + if (!rcdev->raw->progs) + return; + + progs = rcu_dereference(rcdev->raw->progs)->progs; + while (*progs) + bpf_prog_put(*progs++); + + bpf_prog_array_free(rcdev->raw->progs); +} + +int lirc_prog_attach(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct rc_dev *rcdev; + int ret; + + if (attr->attach_flags) + return -EINVAL; + + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_LIRC_MODE2); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + rcdev = rc_dev_get_from_fd(attr->target_fd); + if (IS_ERR(rcdev)) { + bpf_prog_put(prog); + return PTR_ERR(rcdev); + } + + ret = lirc_bpf_attach(rcdev, prog); + if (ret) + bpf_prog_put(prog); + + put_device(&rcdev->dev); + + return ret; +} + +int lirc_prog_detach(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct rc_dev *rcdev; + int ret; + + if (attr->attach_flags) + return -EINVAL; + + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_LIRC_MODE2); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + rcdev = rc_dev_get_from_fd(attr->target_fd); + if (IS_ERR(rcdev)) { + bpf_prog_put(prog); + return PTR_ERR(rcdev); + } + + ret = lirc_bpf_detach(rcdev, prog); + + bpf_prog_put(prog); + put_device(&rcdev->dev); + + return ret; +} + +int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + struct bpf_prog_array __rcu *progs; + struct rc_dev *rcdev; + u32 cnt, flags = 0; + int ret; + + if (attr->query.query_flags) + return -EINVAL; + + rcdev = rc_dev_get_from_fd(attr->query.target_fd); + if (IS_ERR(rcdev)) + return PTR_ERR(rcdev); + + if (rcdev->driver_type != RC_DRIVER_IR_RAW) { + ret = -EINVAL; + goto put; + } + + ret = mutex_lock_interruptible(&ir_raw_handler_lock); + if (ret) + goto put; + + progs = rcdev->raw->progs; + cnt = progs ? 
bpf_prog_array_length(progs) : 0; + + if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) { + ret = -EFAULT; + goto unlock; + } + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) { + ret = -EFAULT; + goto unlock; + } + + if (attr->query.prog_cnt != 0 && prog_ids && cnt) + ret = bpf_prog_array_copy_to_user(progs, prog_ids, cnt); + +unlock: + mutex_unlock(&ir_raw_handler_lock); +put: + put_device(&rcdev->dev); + + return ret; +} diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 0dcf7444c970..44c796e01b8e 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,12 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) TO_US(ev.duration), TO_STR(ev.pulse)); } + /* + * bpf does not care about the gap generated above; that exists + * for backwards compatibility + */ + lirc_bpf_run(dev, sample); + spin_lock_irqsave(&dev->lirc_fh_lock, flags); list_for_each_entry(fh, &dev->lirc_fh, list) { if (LIRC_IS_TIMEOUT(sample) && !fh->send_timeout_reports) @@ -842,4 +849,27 @@ void __exit lirc_dev_exit(void) unregister_chrdev_region(lirc_base_dev, RC_DEV_MAX); } +struct rc_dev *rc_dev_get_from_fd(int fd) +{ + struct fd f = fdget(fd); + struct lirc_fh *fh; + struct rc_dev *dev; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &lirc_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + fh = f.file->private_data; + dev = fh->rc; + + get_device(&dev->dev); + fdput(f); + + return dev; +} + MODULE_ALIAS("lirc_dev"); diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index bbb9a7eb6b63..e847bdad5c51 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -13,6 +13,7 @@ #define MAX_IR_EVENT_SIZE 512 #include +#include #include /** @@ -58,6 +59,11 @@ struct ir_raw_event_ctrl { /* raw decoder state follows */ struct ir_raw_event prev_ev; struct ir_raw_event this_ev; + +#ifdef CONFIG_BPF_LIRC_MODE2 + u32 bpf_sample; + struct bpf_prog_array __rcu *progs; +#endif struct nec_dec { int state; unsigned count; @@ -132,6 +138,9 @@ struct ir_raw_event_ctrl { } imon; }; +/* Mutex for locking raw IR processing and handler change */ +extern struct mutex ir_raw_handler_lock; + /* macros for IR decoders */ static inline bool geq_margin(unsigned d1, unsigned d2, unsigned margin) { @@ -294,6 +303,7 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev); void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc); int ir_lirc_register(struct rc_dev *dev); void ir_lirc_unregister(struct rc_dev *dev); +struct rc_dev *rc_dev_get_from_fd(int fd); #else static inline int lirc_dev_init(void) { return 0; } static inline void lirc_dev_exit(void) {} @@ -305,4 +315,15 @@ static inline int ir_lirc_register(struct rc_dev *dev) { return 0; } static inline void ir_lirc_unregister(struct rc_dev *dev) { } #endif +/* + * bpf interface + */ +#ifdef CONFIG_BPF_LIRC_MODE2 +void lirc_bpf_free(struct rc_dev *dev); +void lirc_bpf_run(struct rc_dev *dev, u32 sample); +#else +static inline void lirc_bpf_free(struct rc_dev *dev) { } +static inline void lirc_bpf_run(struct rc_dev *dev, u32 sample) { } +#endif + #endif /* _RC_CORE_PRIV */ diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 49c56da9bc67..2e0066b1a31c 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -14,7 +14,7 @@ static 
LIST_HEAD(ir_raw_client_list); /* Used to handle IR raw handler extensions */ -static DEFINE_MUTEX(ir_raw_handler_lock); +DEFINE_MUTEX(ir_raw_handler_lock); static LIST_HEAD(ir_raw_handler_list); static atomic64_t available_protocols = ATOMIC64_INIT(0); @@ -672,9 +672,17 @@ void ir_raw_event_unregister(struct rc_dev *dev) if (handler->raw_unregister && (handler->protocols & dev->enabled_protocols)) handler->raw_unregister(dev); - mutex_unlock(&ir_raw_handler_lock); + + lirc_bpf_free(dev); ir_raw_event_free(dev); + + /* + * A user can be calling bpf(BPF_PROG_{QUERY|ATTACH|DETACH}), so + * ensure that the raw member is null on unlock; this is how + * "device gone" is checked. + */ + mutex_unlock(&ir_raw_handler_lock); } /* diff --git a/include/linux/bpf_lirc.h b/include/linux/bpf_lirc.h new file mode 100644 index 000000000000..5f8a4283092d --- /dev/null +++ b/include/linux/bpf_lirc.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_LIRC_H +#define _BPF_LIRC_H + +#include + +#ifdef CONFIG_BPF_LIRC_MODE2 +int lirc_prog_attach(const union bpf_attr *attr); +int lirc_prog_detach(const union bpf_attr *attr); +int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +#else +static inline int lirc_prog_attach(const union bpf_attr *attr) +{ + return -EINVAL; +} + +static inline int lirc_prog_detach(const union bpf_attr *attr) +{ + return -EINVAL; +} + +static inline int lirc_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} +#endif + +#endif /* _BPF_LIRC_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b161e506dcfc..c5700c2d5549 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -26,6 +26,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #endif +#ifdef CONFIG_BPF_LIRC_MODE2 +BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0d3fcdc0454e..45acd440e119 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,6 +143,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, + BPF_PROG_TYPE_LIRC_MODE2, }; enum bpf_attach_type { @@ -162,6 +163,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_POST_BIND, BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SENDMSG, + BPF_LIRC_MODE2, __MAX_BPF_ATTACH_TYPE }; @@ -2005,6 +2007,53 @@ union bpf_attr { * direct packet access. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown** () again with the same values, or calling + * **bpf_rc_repeat** (). + * + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. + * + * The *ctx* should point to the lirc sample as passed into + * the program. 
+ * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Return + * 0 + * + * int bpf_rc_repeat(void *ctx) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Return + * 0 */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 119b414f2e7e..92355fd45d63 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include #include @@ -1605,6 +1606,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); + case BPF_LIRC_MODE2: + return lirc_prog_attach(attr); default: return -EINVAL; } @@ -1677,6 +1680,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); + case BPF_LIRC_MODE2: + return lirc_prog_detach(attr); default: return -EINVAL; } @@ -1726,6 +1731,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; + case BPF_LIRC_MODE2: + return lirc_prog_query(attr, uattr); default: return -EINVAL; } From ea37a648ac640046e2a1b688d714d05cc1062991 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 26 Jun 2018 11:03:18 -0400 Subject: [PATCH 0644/1640] UPSTREAM: media: rc: be less noisy when driver misbehaves Since commit 48231f289e52 ("media: rc: drivers should produce alternate pulse and space timing events"), on meson-ir we are regularly producing errors. Reduce to warning level and only warn once to avoid flooding the log. A proper fix for meson-ir is going to be too large for v4.18. 
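One detail worth noting: dev_warn_once() suppresses per call site, not per device, so after the first report from any receiver the message is never printed again. The mechanism is roughly the following (simplified sketch of the kernel's printk_once machinery):

	#define my_warn_once(dev, fmt, ...)			\
	do {							\
		static bool __warned;				\
		if (!__warned) {				\
			__warned = true;			\
			dev_warn(dev, fmt, ##__VA_ARGS__);	\
		}						\
	} while (0)

That trade-off is acceptable here, since the only goal is to keep a misbehaving driver from flooding the log.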
Signed-off-by: Sean Young Cc: stable@vger.kernel.org # 4.17+ Tested-by: Jerome Brunet Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-ir-raw.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 2e0066b1a31c..e7948908e78c 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -30,13 +30,13 @@ static int ir_raw_event_thread(void *data) while (kfifo_out(&raw->kfifo, &ev, 1)) { if (is_timing_event(ev)) { if (ev.duration == 0) - dev_err(&dev->dev, "nonsensical timing event of duration 0"); + dev_warn_once(&dev->dev, "nonsensical timing event of duration 0"); if (is_timing_event(raw->prev_ev) && !is_transition(&ev, &raw->prev_ev)) - dev_err(&dev->dev, "two consecutive events of type %s", - TO_STR(ev.pulse)); + dev_warn_once(&dev->dev, "two consecutive events of type %s", + TO_STR(ev.pulse)); if (raw->prev_ev.reset && ev.pulse == 0) - dev_err(&dev->dev, "timing event after reset should be pulse"); + dev_warn_once(&dev->dev, "timing event after reset should be pulse"); } list_for_each_entry(handler, &ir_raw_handler_list, list) if (dev->enabled_protocols & From 262bb442dbe77a453e03f7cd838ab05365b38e39 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 4 Jul 2018 10:57:58 -0400 Subject: [PATCH 0645/1640] UPSTREAM: media: bpf: ensure bpf program is freed on detach Currently we are leaking bpf programs when they are detached from the lirc device; the refcount never reaches zero. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/bpf-lirc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 40826bba06b6..55400317ec53 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -174,6 +174,7 @@ static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog) rcu_assign_pointer(raw->progs, new_array); bpf_prog_array_free(old_array); + bpf_prog_put(prog); unlock: mutex_unlock(&ir_raw_handler_lock); return ret; From daafae2743514c73e0ccbf2008af960631b7e841 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 28 Jul 2018 05:11:15 -0400 Subject: [PATCH 0646/1640] UPSTREAM: media: rc: read out of bounds if bpf reports high protocol number The repeat period is read from a static array. If a keydown event is reported from bpf with a high protocol number, we read out of bounds. This is unlikely to end up with a reasonable repeat period at the best of times, in which case no timely key up event is generated. 
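To see how an out-of-range protocol can reach rc_keydown() at all, consider a lirc_mode2 program along these lines (hypothetical sketch in libbpf style; SEC() and the bpf_rc_keydown() declaration are assumed to come from bpf_helpers.h, and the protocol value is deliberately bogus):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	SEC("lirc_mode2")
	int decode(unsigned int *sample)
	{
		/* 1000 is not a valid enum rc_proto value */
		bpf_rc_keydown(sample, 1000, 0x1234, 0);
		return 0;
	}

	char _license[] SEC("license") = "GPL";

Nothing on the BPF side validates the protocol argument, so rc-main.c has to clamp it before indexing the protocols[] array, as the repeat_period() helper added below does.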
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 2e222d9ee01f..ca68e1d2b2f9 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -679,6 +679,14 @@ static void ir_timer_repeat(struct timer_list *t) spin_unlock_irqrestore(&dev->keylock, flags); } +static unsigned int repeat_period(int protocol) +{ + if (protocol >= ARRAY_SIZE(protocols)) + return 100; + + return protocols[protocol].repeat_period; +} + /** * rc_repeat() - signals that a key is still pressed * @dev: the struct rc_dev descriptor of the device @@ -691,7 +699,7 @@ void rc_repeat(struct rc_dev *dev) { unsigned long flags; unsigned int timeout = nsecs_to_jiffies(dev->timeout) + - msecs_to_jiffies(protocols[dev->last_protocol].repeat_period); + msecs_to_jiffies(repeat_period(dev->last_protocol)); struct lirc_scancode sc = { .scancode = dev->last_scancode, .rc_proto = dev->last_protocol, .keycode = dev->keypressed ? dev->last_keycode : KEY_RESERVED, @@ -803,7 +811,7 @@ void rc_keydown(struct rc_dev *dev, enum rc_proto protocol, u32 scancode, if (dev->keypressed) { dev->keyup_jiffies = jiffies + nsecs_to_jiffies(dev->timeout) + - msecs_to_jiffies(protocols[protocol].repeat_period); + msecs_to_jiffies(repeat_period(protocol)); mod_timer(&dev->timer_keyup, dev->keyup_jiffies); } spin_unlock_irqrestore(&dev->keylock, flags); From e50a74a3cefcce19fe572d27c85639153a5a7c5b Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 22 Oct 2018 05:01:50 -0400 Subject: [PATCH 0647/1640] UPSTREAM: media: rc: cec devices do not have a lirc chardev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e5bb9d3d755f128956ed467ae50b41d22bb680c6 upstream. This fixes an oops in ir_lirc_scancode_event(). BUG: unable to handle kernel NULL pointer dereference at 0000000000000038 PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 9 PID: 27687 Comm: kworker/9:2 Tainted: P           OE 4.18.12-200.fc28.x86_64 #1 Hardware name: Supermicro C7X99-OCE-F/C7X99-OCE-F, BIOS 2.1a 06/15/2018 Workqueue: events pulse8_irq_work_handler [pulse8_cec] RIP: 0010:ir_lirc_scancode_event+0x3d/0xb0 [rc_core] Code: 8d ae b4 07 00 00 49 81 c6 b8 07 00 00 53 e8 4a df c3 d5 48 89 ef 49 89 45 00 e8 4e 84 41 d6 49 8b 1e 49 89 c4 4c 39 f3 74 58 <8b> 43 38 8b 53 40 89 c1 2b 4b 3c 39 ca 72 41 21 d0 49 8b 7d 00 49 RSP: 0018:ffffaa10e3c07d58 EFLAGS: 00010017 RAX: 0000000000000002 RBX: 0000000000000000 RCX: 0000000000000018 RDX: 0000000000000001 RSI: 00316245397fa93c RDI: ffff966d31c8d7b4 RBP: ffff966d31c8d7b4 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000003 R11: ffffaa10e3c07e28 R12: 0000000000000002 R13: ffffaa10e3c07d88 R14: ffff966d31c8d7b8 R15: 0000000000000073 FS:  0000000000000000(0000) GS:ffff966d3f440000(0000) knlGS:0000000000000000 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000038 CR3: 00000009d820a003 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace:  ir_do_keydown+0x75/0x260 [rc_core]  rc_keydown+0x54/0xc0 [rc_core]  cec_received_msg_ts+0xaa8/0xaf0 [cec]  process_one_work+0x1a1/0x350  worker_thread+0x30/0x380  ? pwq_unbound_release_workfn+0xd0/0xd0  kthread+0x112/0x130  ? 
kthread_create_worker_on_cpu+0x70/0x70  ret_from_fork+0x35/0x40 Modules linked in: rc_tt_1500 dvb_usb_dvbsky dvb_usb_v2 uas usb_storage fuse vhost_net vhost tap xt_CHECKSUM iptable_mangle ip6t_REJECT nf_reject_ipv6 tun 8021q garp mrp xt_nat macvlan xfs devlink ebta  si2157 si2168 cx25840 cx23885 kvm altera_ci tda18271 joydev ir_rc6_decoder rc_rc6_mce crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate intel_uncore altera_stapl m88ds3103 tveeprom cx2341  mxm_wmi igb crc32c_intel megaraid_sas dca i2c_algo_bit wmi vfio_pci irqbypass vfio_virqfd vfio_iommu_type1 vfio i2c_dev CR2: 0000000000000038 Cc: # v4.16+ Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/rc-main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index ca68e1d2b2f9..8b2c16dd58bd 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -707,7 +707,8 @@ void rc_repeat(struct rc_dev *dev) (dev->last_toggle ? LIRC_SCANCODE_FLAG_TOGGLE : 0) }; - ir_lirc_scancode_event(dev, &sc); + if (dev->allowed_protocols != RC_PROTO_BIT_CEC) + ir_lirc_scancode_event(dev, &sc); spin_lock_irqsave(&dev->keylock, flags); @@ -747,7 +748,8 @@ static void ir_do_keydown(struct rc_dev *dev, enum rc_proto protocol, .keycode = keycode }; - ir_lirc_scancode_event(dev, &sc); + if (dev->allowed_protocols != RC_PROTO_BIT_CEC) + ir_lirc_scancode_event(dev, &sc); if (new_event && dev->keypressed) ir_do_keyup(dev, false); From 44a1cb873fdc0230bcbb5e68c4ba4a4dc403ea56 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 4 Nov 2018 05:12:09 -0500 Subject: [PATCH 0648/1640] UPSTREAM: media: rc: ensure close() is called on rc_unregister_device [ Upstream commit 8e782fcf78275f505194e767c515202d4fd274bc ] If userspace has an open file descriptor on the rc input device or lirc device when rc_unregister_device() is called, then the rc close() is never called. This ensures that the receiver is turned off on the nuvoton-cir driver during shutdown. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/rc-main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 8b2c16dd58bd..0f218afdadaa 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1956,6 +1956,8 @@ void rc_unregister_device(struct rc_dev *dev) rc_free_rx_device(dev); mutex_lock(&dev->lock); + if (dev->users && dev->close) + dev->close(dev); dev->registered = false; mutex_unlock(&dev->lock); From f03cb103456337385628d5f7b79cc81894c5a419 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 5 Mar 2019 00:40:26 -0500 Subject: [PATCH 0649/1640] UPSTREAM: media: serial_ir: Fix use-after-free in serial_ir_init_module commit 56cd26b618855c9af48c8301aa6754ced8dd0beb upstream. Syzkaller report this: BUG: KASAN: use-after-free in sysfs_remove_file_ns+0x5f/0x70 fs/sysfs/file.c:468 Read of size 8 at addr ffff8881dc7ae030 by task syz-executor.0/6249 CPU: 1 PID: 6249 Comm: syz-executor.0 Not tainted 5.0.0-rc8+ #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xfa/0x1ce lib/dump_stack.c:113 print_address_description+0x65/0x270 mm/kasan/report.c:187 kasan_report+0x149/0x18d mm/kasan/report.c:317 ? 
0xffffffffc1728000 sysfs_remove_file_ns+0x5f/0x70 fs/sysfs/file.c:468 sysfs_remove_file include/linux/sysfs.h:519 [inline] driver_remove_file+0x40/0x50 drivers/base/driver.c:122 remove_bind_files drivers/base/bus.c:585 [inline] bus_remove_driver+0x186/0x220 drivers/base/bus.c:725 driver_unregister+0x6c/0xa0 drivers/base/driver.c:197 serial_ir_init_module+0x169/0x1000 [serial_ir] do_one_initcall+0xfa/0x5ca init/main.c:887 do_init_module+0x204/0x5f6 kernel/module.c:3460 load_module+0x66b2/0x8570 kernel/module.c:3808 __do_sys_finit_module+0x238/0x2a0 kernel/module.c:3902 do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x462e99 Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f9450132c58 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000462e99 RDX: 0000000000000000 RSI: 0000000020000100 RDI: 0000000000000003 RBP: 00007f9450132c70 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f94501336bc R13: 00000000004bcefa R14: 00000000006f6fb0 R15: 0000000000000004 Allocated by task 6249: set_track mm/kasan/common.c:85 [inline] __kasan_kmalloc.constprop.3+0xa0/0xd0 mm/kasan/common.c:495 kmalloc include/linux/slab.h:545 [inline] kzalloc include/linux/slab.h:740 [inline] bus_add_driver+0xc0/0x610 drivers/base/bus.c:651 driver_register+0x1bb/0x3f0 drivers/base/driver.c:170 serial_ir_init_module+0xe8/0x1000 [serial_ir] do_one_initcall+0xfa/0x5ca init/main.c:887 do_init_module+0x204/0x5f6 kernel/module.c:3460 load_module+0x66b2/0x8570 kernel/module.c:3808 __do_sys_finit_module+0x238/0x2a0 kernel/module.c:3902 do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 6249: set_track mm/kasan/common.c:85 [inline] __kasan_slab_free+0x130/0x180 mm/kasan/common.c:457 slab_free_hook mm/slub.c:1430 [inline] slab_free_freelist_hook mm/slub.c:1457 [inline] slab_free mm/slub.c:3005 [inline] kfree+0xe1/0x270 mm/slub.c:3957 kobject_cleanup lib/kobject.c:662 [inline] kobject_release lib/kobject.c:691 [inline] kref_put include/linux/kref.h:67 [inline] kobject_put+0x146/0x240 lib/kobject.c:708 bus_remove_driver+0x10e/0x220 drivers/base/bus.c:732 driver_unregister+0x6c/0xa0 drivers/base/driver.c:197 serial_ir_init_module+0x14c/0x1000 [serial_ir] do_one_initcall+0xfa/0x5ca init/main.c:887 do_init_module+0x204/0x5f6 kernel/module.c:3460 load_module+0x66b2/0x8570 kernel/module.c:3808 __do_sys_finit_module+0x238/0x2a0 kernel/module.c:3902 do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff8881dc7ae000 which belongs to the cache kmalloc-256 of size 256 The buggy address is located 48 bytes inside of 256-byte region [ffff8881dc7ae000, ffff8881dc7ae100) The buggy address belongs to the page: page:ffffea000771eb80 count:1 mapcount:0 mapping:ffff8881f6c02e00 index:0x0 flags: 0x2fffc0000000200(slab) raw: 02fffc0000000200 ffffea0007d14800 0000000400000002 ffff8881f6c02e00 raw: 0000000000000000 00000000800c000c 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8881dc7adf00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff8881dc7adf80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff8881dc7ae000: 
fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8881dc7ae080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8881dc7ae100: fc fc fc fc fc fc fc fc 00 00 00 00 00 00 00 00 There is already cleanup handling in the serial_ir_init error path, so there is no need to call serial_ir_exit again in serial_ir_init_module; doing so triggers a use-after-free. Fixes: fa5dc29c1fcc ("[media] lirc_serial: move out of staging and rename to serial_ir") Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/serial_ir.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/media/rc/serial_ir.c b/drivers/media/rc/serial_ir.c index 8bf5637b3a69..e613c0175591 100644 --- a/drivers/media/rc/serial_ir.c +++ b/drivers/media/rc/serial_ir.c @@ -773,8 +773,6 @@ static void serial_ir_exit(void) static int __init serial_ir_init_module(void) { - int result; - switch (type) { case IR_HOMEBREW: case IR_IRDEO: @@ -802,12 +800,7 @@ static int __init serial_ir_init_module(void) if (sense != -1) sense = !!sense; - result = serial_ir_init(); - if (!result) - return 0; - - serial_ir_exit(); - return result; + return serial_ir_init(); } static void __exit serial_ir_exit_module(void) From de40c97aef7ca8040204dd46b667c12c6f4c1c3d Mon Sep 17 00:00:00 2001 From: Daniel Gomez Date: Mon, 22 Apr 2019 15:10:20 -0400 Subject: [PATCH 0650/1640] UPSTREAM: media: spi: IR LED: add missing of table registration [ Upstream commit 24e4cf770371df6ad49ed873f21618d9878f64c8 ] MODULE_DEVICE_TABLE(of, ...) should be called to complete the DT OF matching mechanism and register it. Before this patch: modinfo drivers/media/rc/ir-spi.ko | grep alias After this patch: modinfo drivers/media/rc/ir-spi.ko | grep alias alias: of:N*T*Cir-spi-ledC* alias: of:N*T*Cir-spi-led Reported-by: Javier Martinez Canillas Signed-off-by: Daniel Gomez Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/ir-spi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/rc/ir-spi.c b/drivers/media/rc/ir-spi.c index 66334e8d63ba..c58f2d38a458 100644 --- a/drivers/media/rc/ir-spi.c +++ b/drivers/media/rc/ir-spi.c @@ -161,6 +161,7 @@ static const struct of_device_id ir_spi_of_match[] = { { .compatible = "ir-spi-led" }, {}, }; +MODULE_DEVICE_TABLE(of, ir_spi_of_match); static struct spi_driver ir_spi_driver = { .probe = ir_spi_probe, From 10d2726379f7258b697670957f64a6f84285d09e Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 12 Jul 2019 18:47:00 -0400 Subject: [PATCH 0651/1640] UPSTREAM: media: mtk-cir: lower de-glitch counter for rc-mm protocol [ Upstream commit 5dd4b89dc098bf22cd13e82a308f42a02c102b2b ] The rc-mm protocol can't be decoded by the mtk-cir since the de-glitch filter removes pulses/spaces shorter than 294 microseconds. Tested on a BananaPi R2.
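For illustration, the register write this patch adds can be read in isolation as the sketch below (MTK_IRTHD, MTK_DG_CNT and mtk_w32_mask() are the driver's own register offset, field macro and masked-write helper from the diff that follows). The de-glitch counter occupies bits 12:8 of the IR threshold register; writing the minimum value of 1 shortens the filter enough that rc-mm's short pulses reach the decoder.

	/* Sketch: set the de-glitch counter (bits 12:8 of MTK_IRTHD) to its
	 * minimum so pulses/spaces shorter than 294us are no longer
	 * discarded before decoding. */
	#define MTK_IRTHD	0x14
	#define MTK_DG_CNT_MASK	(GENMASK(12, 8))
	#define MTK_DG_CNT(x)	((x) << 8)

	mtk_w32_mask(ir, MTK_DG_CNT(1), MTK_DG_CNT_MASK, MTK_IRTHD);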
Signed-off-by: Sean Young Acked-by: Sean Wang Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/mtk-cir.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/media/rc/mtk-cir.c b/drivers/media/rc/mtk-cir.c index e42efd9d382e..d37b85d2bc75 100644 --- a/drivers/media/rc/mtk-cir.c +++ b/drivers/media/rc/mtk-cir.c @@ -44,6 +44,11 @@ /* Fields containing pulse width data */ #define MTK_WIDTH_MASK (GENMASK(7, 0)) +/* IR threshold */ +#define MTK_IRTHD 0x14 +#define MTK_DG_CNT_MASK (GENMASK(12, 8)) +#define MTK_DG_CNT(x) ((x) << 8) + /* Bit to enable interrupt */ #define MTK_IRINT_EN BIT(0) @@ -409,6 +414,9 @@ static int mtk_ir_probe(struct platform_device *pdev) mtk_w32_mask(ir, val, ir->data->fields[MTK_HW_PERIOD].mask, ir->data->fields[MTK_HW_PERIOD].reg); + /* Set de-glitch counter */ + mtk_w32_mask(ir, MTK_DG_CNT(1), MTK_DG_CNT_MASK, MTK_IRTHD); + /* Enable IR and PWM */ val = mtk_r32(ir, MTK_CONFIG_HIGH_REG); val |= MTK_OK_COUNT(ir->data->ok_count) | MTK_PWM_EN | MTK_IR_EN; From 68be40300906c4d4a298677eb538acee161dffd4 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 30 Jul 2019 05:50:44 -0300 Subject: [PATCH 0652/1640] UPSTREAM: media: iguanair: add sanity checks [ Upstream commit ab1cbdf159beba7395a13ab70bc71180929ca064 ] The driver needs to check the endpoint types, too, as opposed to the number of endpoints. This also requires moving the check earlier. Reported-by: syzbot+01a77b82edaa374068e1@syzkaller.appspotmail.com Signed-off-by: Oliver Neukum Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/iguanair.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/media/rc/iguanair.c b/drivers/media/rc/iguanair.c index 7daac8bab83b..6f3030b2054d 100644 --- a/drivers/media/rc/iguanair.c +++ b/drivers/media/rc/iguanair.c @@ -424,6 +424,10 @@ static int iguanair_probe(struct usb_interface *intf, int ret, pipein, pipeout; struct usb_host_interface *idesc; + idesc = intf->altsetting; + if (idesc->desc.bNumEndpoints < 2) + return -ENODEV; + ir = kzalloc(sizeof(*ir), GFP_KERNEL); rc = rc_allocate_device(RC_DRIVER_IR_RAW); if (!ir || !rc) { @@ -438,18 +442,13 @@ static int iguanair_probe(struct usb_interface *intf, ir->urb_in = usb_alloc_urb(0, GFP_KERNEL); ir->urb_out = usb_alloc_urb(0, GFP_KERNEL); - if (!ir->buf_in || !ir->packet || !ir->urb_in || !ir->urb_out) { + if (!ir->buf_in || !ir->packet || !ir->urb_in || !ir->urb_out || + !usb_endpoint_is_int_in(&idesc->endpoint[0].desc) || + !usb_endpoint_is_int_out(&idesc->endpoint[1].desc)) { ret = -ENOMEM; goto out; } - idesc = intf->altsetting; - - if (idesc->desc.bNumEndpoints < 2) { - ret = -ENODEV; - goto out; - } - ir->rc = rc; ir->dev = &intf->dev; ir->udev = udev; From 7f8e36f0e6ee9c3d103fc1dc9f9463c66461e55c Mon Sep 17 00:00:00 2001 From: Darius Rad Date: Tue, 23 Jul 2019 13:37:46 -0300 Subject: [PATCH 0653/1640] UPSTREAM: media: rc: imon: Allow iMON RC protocol for ffdc 7e device [ Upstream commit b20a6e298bcb8cb8ae18de26baaf462a6418515b ] Allow selecting the IR protocol, MCE or iMON, for a device that identifies as follows (with config id 0x7e): 15c2:ffdc SoundGraph Inc. iMON PAD Remote Controller As the driver is structured to default to iMON when both RC protocols are supported, existing users of this device (using MCE protocol) will need to manually switch to MCE (RC-6) protocol from userspace (with ir-keytable, sysfs). 
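The key detail in the hunk below is the use of |= rather than = : imon_get_ffdc_type() starts out with allowed_protos holding the iMON protocol bit, so OR-ing in RC6 (MCE) for config id 0x7e leaves both protocols selectable, whereas the plain assignment used for the other VFD ids (0x46, 0x9e) restricts those devices to MCE only. A condensed sketch of that dispatch (constants as in the driver; variable names paraphrased from the upstream function):

	u64 allowed_protos = RC_PROTO_BIT_IMON;	/* driver default */

	switch (ffdc_cfg_byte) {
	case 0x46:
	case 0x9e:	/* iMON VFD, MCE IR only */
		allowed_protos = RC_PROTO_BIT_RC6_MCE;
		break;
	case 0x7e:	/* iMON VFD, iMON or MCE IR */
		allowed_protos |= RC_PROTO_BIT_RC6_MCE;
		break;
	}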
Signed-off-by: Darius Rad Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/imon.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index 864a0d36edab..0658529c0fa1 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -1832,12 +1832,17 @@ static void imon_get_ffdc_type(struct imon_context *ictx) break; /* iMON VFD, MCE IR */ case 0x46: - case 0x7e: case 0x9e: dev_info(ictx->dev, "0xffdc iMON VFD, MCE IR"); detected_display_type = IMON_DISPLAY_TYPE_VFD; allowed_protos = RC_PROTO_BIT_RC6_MCE; break; + /* iMON VFD, iMON or MCE IR */ + case 0x7e: + dev_info(ictx->dev, "0xffdc iMON VFD, iMON or MCE IR"); + detected_display_type = IMON_DISPLAY_TYPE_VFD; + allowed_protos |= RC_PROTO_BIT_RC6_MCE; + break; /* iMON LCD, MCE IR */ case 0x9f: dev_info(ictx->dev, "0xffdc iMON LCD, MCE IR"); From 754dcf425a814ccb276f7e2fb8eb5afc60213d2e Mon Sep 17 00:00:00 2001 From: A Sun Date: Thu, 15 Aug 2019 13:41:19 -0300 Subject: [PATCH 0654/1640] UPSTREAM: media: mceusb: fix (eliminate) TX IR signal length limit [ Upstream commit 9fc3ce31f5bde660197f35135e90a1cced58aa2c ] Fix and eliminate mceusb's IR length limit for IR signals transmitted to the MCE IR blaster ports. An IR signal TX exceeding 306 pulse/space samples presently causes -EINVAL return error. There's no such limitation nor error with the MCE device hardware. And valid IR signals exist with more than 400 pulse/space for the control of certain appliances (eg Panasonic ACXA75C00600 air conditioner). The scope of this patch is limited to the mceusb driver. There are still IR signal TX length and time constraints that related modules of rc core (eg LIRC) impose, further up the driver stack. Changes for mceusb_tx_ir(): Converts and sends LIRC IR pulse/space sequence to MCE device IR pulse/space format. Break long length LIRC sequence into multiple (unlimited number of) parts for sending to the MCE device. Reduce kernel stack IR buffer size: 128 (was 384) Increase MCE IR data packet size: 31 (was 5) Zero time LIRC pulse/space no longer copied to MCE IR data. Eliminate overwriting the source/input LIRC IR data in txbuf[]. Eliminate -EINVAL return; return number of IR samples sent (>0) or MCE write error code (<0). New mce_write() and mce_write_callback(): Implements synchronous blocking I/O, with timeout, for writing/sending data to the MCE device. An unlimited multipart IR signal sent to the MCE device faster than real time requires flow control absent with the original mce_request_packet() and mce_async_callback() asynchronous I/O implementation. Also absent is TX error feedback. mce_write() combines and replaces mce_request_packet() and mce_async_callback() with conversion to synchronous I/O. mce_write() returns bytes sent (>0) or MCE device write error (<0). Debug hex dump TX data before processing. Rename mce_async_out() -> mce_command_out(): The original name is misleading with underlying synchronous I/O implementation. Function renamed to mce_command_out(). Changes in mceusb_handle_command(): Add support for MCE device error case MCE_RSP_TX_TIMEOUT "IR TX timeout (TX buffer underrun)" Changes in mceusb_dev_printdata(): Changes support test and debug of multipart TX IR. Add buffer boundary information (offset and buffer size) to TX hex dump. 
Correct TX trace bug "Raw IR data, 0 pulse/space samples" Add trace for MCE_RSP_TX_TIMEOUT "IR TX timeout (TX buffer underrun)" Other changes: The driver's write to USB device architecture change (async to sync I/O) is significant so we bump DRIVER_VERSION to "1.95" (from "1.94"). Tests: $ cat -n irdata1 | head -3 1 carrier 36000 2 pulse 6350 3 space 6350 $ cat -n irdata1 | tail -3 76 pulse 6350 77 space 6350 78 pulse 6350 $ ir-ctl -s irdata1 [1549021.073612] mceusb 1-1.3:1.0: requesting 36000 HZ carrier [1549021.073635] mceusb 1-1.3:1.0: tx data[0]: 9f 06 01 45 (len=4 sz=4) [1549021.073649] mceusb 1-1.3:1.0: Request carrier of 35714 Hz (period 28us) [1549021.073848] mceusb 1-1.3:1.0: tx done status = 4 (wait = 100, expire = 100 (1000ms), urb->actual_length = 4, urb->status = 0) [1549021.074689] mceusb 1-1.3:1.0: rx data[0]: 9f 06 01 45 (len=4 sz=4) [1549021.074701] mceusb 1-1.3:1.0: Got carrier of 35714 Hz (period 28us) [1549021.102023] mceusb 1-1.3:1.0: tx data[0]: 9f 08 03 (len=3 sz=3) [1549021.102036] mceusb 1-1.3:1.0: Request transmit blaster mask of 0x03 [1549021.102219] mceusb 1-1.3:1.0: tx done status = 3 (wait = 100, expire = 100 (1000ms), urb->actual_length = 3, urb->status = 0) [1549021.131979] mceusb 1-1.3:1.0: tx data[0]: 9e ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f 9e ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f ff 7f 91 ff (len=81 sz=81) [1549021.131992] mceusb 1-1.3:1.0: Raw IR data, 30 pulse/space samples [1549021.133592] mceusb 1-1.3:1.0: tx done status = 81 (wait = 100, expire = 100 (1000ms), urb->actual_length = 81, urb->status = 0) Hex dumps limited to 64 bytes. 0xff is MCE maximum time pulse, 0x7f is MCE maximum time space. $ cat -n irdata2 | head -3 1 carrier 36000 2 pulse 50 3 space 50 $ cat -n irdata2 | tail -3 254 pulse 50 255 space 50 256 pulse 50 $ ir-ctl -s irdata2 [1549306.586998] mceusb 1-1.3:1.0: tx data[0]: 9f 08 03 (len=3 sz=3) [1549306.587015] mceusb 1-1.3:1.0: Request transmit blaster mask of 0x03 [1549306.587252] mceusb 1-1.3:1.0: tx done status = 3 (wait = 100, expire = 100 (1000ms), urb->actual_length = 3, urb->status = 0) [1549306.613275] mceusb 1-1.3:1.0: tx data[0]: 9e 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 9e 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 9e 81 (len=128 sz=128) [1549306.613291] mceusb 1-1.3:1.0: Raw IR data, 30 pulse/space samples [1549306.614837] mceusb 1-1.3:1.0: tx done status = 128 (wait = 100, expire = 100 (1000ms), urb->actual_length = 128, urb->status = 0) [1549306.614861] mceusb 1-1.3:1.0: tx data[0]: 9e 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 9e 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 01 81 9e 01 (len=128 sz=128) [1549306.614869] mceusb 1-1.3:1.0: Raw IR data, 30 pulse/space samples [1549306.620199] mceusb 1-1.3:1.0: tx done status = 128 (wait = 100, expire = 100 (1000ms), urb->actual_length = 128, urb->status = 0) [1549306.620212] mceusb 1-1.3:1.0: tx data[0]: 89 81 01 81 01 81 01 81 01 81 80 (len=11 sz=11) [1549306.620221] mceusb 1-1.3:1.0: Raw IR data, 9 pulse/space samples [1549306.633294] mceusb 1-1.3:1.0: tx done status = 11 (wait = 98, expire = 100 (1000ms), urb->actual_length = 11, urb->status = 0) Hex dumps limited to 64 bytes. 0x81 is MCE minimum time pulse, 0x01 is MCE minimum time space. 
TX IR part 3 sz=11 shows 20msec I/O blocking delay (100expire - 98wait = 2jiffies) Signed-off-by: A Sun Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/mceusb.c | 330 ++++++++++++++++++++++---------------- 1 file changed, 194 insertions(+), 136 deletions(-) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 4c0c8008872a..f1dfb8409432 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -42,21 +42,22 @@ #include #include -#define DRIVER_VERSION "1.94" +#define DRIVER_VERSION "1.95" #define DRIVER_AUTHOR "Jarod Wilson " #define DRIVER_DESC "Windows Media Center Ed. eHome Infrared Transceiver " \ "device driver" #define DRIVER_NAME "mceusb" +#define USB_TX_TIMEOUT 1000 /* in milliseconds */ #define USB_CTRL_MSG_SZ 2 /* Size of usb ctrl msg on gen1 hw */ #define MCE_G1_INIT_MSGS 40 /* Init messages on gen1 hw to throw out */ /* MCE constants */ -#define MCE_CMDBUF_SIZE 384 /* MCE Command buffer length */ +#define MCE_IRBUF_SIZE 128 /* TX IR buffer length */ #define MCE_TIME_UNIT 50 /* Approx 50us resolution */ -#define MCE_CODE_LENGTH 5 /* Normal length of packet (with header) */ -#define MCE_PACKET_SIZE 4 /* Normal length of packet (without header) */ -#define MCE_IRDATA_HEADER 0x84 /* Actual header format is 0x80 + num_bytes */ +#define MCE_PACKET_SIZE 31 /* Max length of packet (with header) */ +#define MCE_IRDATA_HEADER (0x80 + MCE_PACKET_SIZE - 1) + /* Actual format is 0x80 + num_bytes */ #define MCE_IRDATA_TRAILER 0x80 /* End of IR data */ #define MCE_MAX_CHANNELS 2 /* Two transmitters, hardware dependent? */ #define MCE_DEFAULT_TX_MASK 0x03 /* Vals: TX1=0x01, TX2=0x02, ALL=0x03 */ @@ -609,9 +610,9 @@ static void mceusb_dev_printdata(struct mceusb_dev *ir, u8 *buf, int buf_len, if (len <= skip) return; - dev_dbg(dev, "%cx data: %*ph (length=%d)", - (out ? 't' : 'r'), - min(len, buf_len - offset), buf + offset, len); + dev_dbg(dev, "%cx data[%d]: %*ph (len=%d sz=%d)", + (out ? 't' : 'r'), offset, + min(len, buf_len - offset), buf + offset, len, buf_len); inout = out ? "Request" : "Got"; @@ -733,6 +734,9 @@ static void mceusb_dev_printdata(struct mceusb_dev *ir, u8 *buf, int buf_len, case MCE_RSP_CMD_ILLEGAL: dev_dbg(dev, "Illegal PORT_IR command"); break; + case MCE_RSP_TX_TIMEOUT: + dev_dbg(dev, "IR TX timeout (TX buffer underrun)"); + break; default: dev_dbg(dev, "Unknown command 0x%02x 0x%02x", cmd, subcmd); @@ -747,13 +751,14 @@ static void mceusb_dev_printdata(struct mceusb_dev *ir, u8 *buf, int buf_len, dev_dbg(dev, "End of raw IR data"); else if ((cmd != MCE_CMD_PORT_IR) && ((cmd & MCE_PORT_MASK) == MCE_COMMAND_IRDATA)) - dev_dbg(dev, "Raw IR data, %d pulse/space samples", ir->rem); + dev_dbg(dev, "Raw IR data, %d pulse/space samples", + cmd & MCE_PACKET_LENGTH_MASK); #endif } /* * Schedule work that can't be done in interrupt handlers - * (mceusb_dev_recv() and mce_async_callback()) nor tasklets. + * (mceusb_dev_recv() and mce_write_callback()) nor tasklets. * Invokes mceusb_deferred_kevent() for recovering from * error events specified by the kevent bit field. */ @@ -766,23 +771,80 @@ static void mceusb_defer_kevent(struct mceusb_dev *ir, int kevent) dev_dbg(ir->dev, "kevent %d scheduled", kevent); } -static void mce_async_callback(struct urb *urb) +static void mce_write_callback(struct urb *urb) { - struct mceusb_dev *ir; - int len; - if (!urb) return; - ir = urb->context; + complete(urb->context); +} + +/* + * Write (TX/send) data to MCE device USB endpoint out. 
+ * Used for IR blaster TX and MCE device commands. + * + * Return: The number of bytes written (> 0) or errno (< 0). + */ +static int mce_write(struct mceusb_dev *ir, u8 *data, int size) +{ + int ret; + struct urb *urb; + struct device *dev = ir->dev; + unsigned char *buf_out; + struct completion tx_done; + unsigned long expire; + unsigned long ret_wait; + + mceusb_dev_printdata(ir, data, size, 0, size, true); + + urb = usb_alloc_urb(0, GFP_KERNEL); + if (unlikely(!urb)) { + dev_err(dev, "Error: mce write couldn't allocate urb"); + return -ENOMEM; + } + + buf_out = kmalloc(size, GFP_KERNEL); + if (!buf_out) { + usb_free_urb(urb); + return -ENOMEM; + } + + init_completion(&tx_done); + + /* outbound data */ + if (usb_endpoint_xfer_int(ir->usb_ep_out)) + usb_fill_int_urb(urb, ir->usbdev, ir->pipe_out, + buf_out, size, mce_write_callback, &tx_done, + ir->usb_ep_out->bInterval); + else + usb_fill_bulk_urb(urb, ir->usbdev, ir->pipe_out, + buf_out, size, mce_write_callback, &tx_done); + memcpy(buf_out, data, size); + + ret = usb_submit_urb(urb, GFP_KERNEL); + if (ret) { + dev_err(dev, "Error: mce write submit urb error = %d", ret); + kfree(buf_out); + usb_free_urb(urb); + return ret; + } + + expire = msecs_to_jiffies(USB_TX_TIMEOUT); + ret_wait = wait_for_completion_timeout(&tx_done, expire); + if (!ret_wait) { + dev_err(dev, "Error: mce write timed out (expire = %lu (%dms))", + expire, USB_TX_TIMEOUT); + usb_kill_urb(urb); + ret = (urb->status == -ENOENT ? -ETIMEDOUT : urb->status); + } else { + ret = urb->status; + } + if (ret >= 0) + ret = urb->actual_length; /* bytes written */ switch (urb->status) { /* success */ case 0: - len = urb->actual_length; - - mceusb_dev_printdata(ir, urb->transfer_buffer, len, - 0, len, true); break; case -ECONNRESET: @@ -792,140 +854,135 @@ static void mce_async_callback(struct urb *urb) break; case -EPIPE: - dev_err(ir->dev, "Error: request urb status = %d (TX HALT)", + dev_err(ir->dev, "Error: mce write urb status = %d (TX HALT)", urb->status); mceusb_defer_kevent(ir, EVENT_TX_HALT); break; default: - dev_err(ir->dev, "Error: request urb status = %d", urb->status); + dev_err(ir->dev, "Error: mce write urb status = %d", + urb->status); break; } - /* the transfer buffer and urb were allocated in mce_request_packet */ - kfree(urb->transfer_buffer); + dev_dbg(dev, "tx done status = %d (wait = %lu, expire = %lu (%dms), urb->actual_length = %d, urb->status = %d)", + ret, ret_wait, expire, USB_TX_TIMEOUT, + urb->actual_length, urb->status); + + kfree(buf_out); usb_free_urb(urb); + + return ret; } -/* request outgoing (send) usb packet - used to initialize remote */ -static void mce_request_packet(struct mceusb_dev *ir, unsigned char *data, - int size) -{ - int res; - struct urb *async_urb; - struct device *dev = ir->dev; - unsigned char *async_buf; - - async_urb = usb_alloc_urb(0, GFP_KERNEL); - if (unlikely(!async_urb)) { - dev_err(dev, "Error, couldn't allocate urb!"); - return; - } - - async_buf = kmalloc(size, GFP_KERNEL); - if (!async_buf) { - usb_free_urb(async_urb); - return; - } - - /* outbound data */ - if (usb_endpoint_xfer_int(ir->usb_ep_out)) - usb_fill_int_urb(async_urb, ir->usbdev, ir->pipe_out, - async_buf, size, mce_async_callback, ir, - ir->usb_ep_out->bInterval); - else - usb_fill_bulk_urb(async_urb, ir->usbdev, ir->pipe_out, - async_buf, size, mce_async_callback, ir); - - memcpy(async_buf, data, size); - - dev_dbg(dev, "send request called (size=%#x)", size); - - res = usb_submit_urb(async_urb, GFP_ATOMIC); - if (res) { - dev_err(dev, "send request 
FAILED! (res=%d)", res); - kfree(async_buf); - usb_free_urb(async_urb); - return; - } - dev_dbg(dev, "send request complete (res=%d)", res); -} - -static void mce_async_out(struct mceusb_dev *ir, unsigned char *data, int size) +static void mce_command_out(struct mceusb_dev *ir, u8 *data, int size) { int rsize = sizeof(DEVICE_RESUME); if (ir->need_reset) { ir->need_reset = false; - mce_request_packet(ir, DEVICE_RESUME, rsize); + mce_write(ir, DEVICE_RESUME, rsize); msleep(10); } - mce_request_packet(ir, data, size); + mce_write(ir, data, size); msleep(10); } -/* Send data out the IR blaster port(s) */ +/* + * Transmit IR out the MCE device IR blaster port(s). + * + * Convert IR pulse/space sequence from LIRC to MCE format. + * Break up a long IR sequence into multiple parts (MCE IR data packets). + * + * u32 txbuf[] consists of IR pulse, space, ..., and pulse times in usec. + * Pulses and spaces are implicit by their position. + * The first IR sample, txbuf[0], is always a pulse. + * + * u8 irbuf[] consists of multiple IR data packets for the MCE device. + * A packet is 1 u8 MCE_IRDATA_HEADER and up to 30 u8 IR samples. + * An IR sample is 1-bit pulse/space flag with 7-bit time + * in MCE time units (50usec). + * + * Return: The number of IR samples sent (> 0) or errno (< 0). + */ static int mceusb_tx_ir(struct rc_dev *dev, unsigned *txbuf, unsigned count) { struct mceusb_dev *ir = dev->priv; - int i, length, ret = 0; - int cmdcount = 0; - unsigned char cmdbuf[MCE_CMDBUF_SIZE]; - - /* MCE tx init header */ - cmdbuf[cmdcount++] = MCE_CMD_PORT_IR; - cmdbuf[cmdcount++] = MCE_CMD_SETIRTXPORTS; - cmdbuf[cmdcount++] = ir->tx_mask; + u8 cmdbuf[3] = { MCE_CMD_PORT_IR, MCE_CMD_SETIRTXPORTS, 0x00 }; + u8 irbuf[MCE_IRBUF_SIZE]; + int ircount = 0; + unsigned int irsample; + int i, length, ret; /* Send the set TX ports command */ - mce_async_out(ir, cmdbuf, cmdcount); - cmdcount = 0; + cmdbuf[2] = ir->tx_mask; + mce_command_out(ir, cmdbuf, sizeof(cmdbuf)); - /* Generate mce packet data */ - for (i = 0; (i < count) && (cmdcount < MCE_CMDBUF_SIZE); i++) { - txbuf[i] = txbuf[i] / MCE_TIME_UNIT; + /* Generate mce IR data packet */ + for (i = 0; i < count; i++) { + irsample = txbuf[i] / MCE_TIME_UNIT; - do { /* loop to support long pulses/spaces > 127*50us=6.35ms */ - - /* Insert mce packet header every 4th entry */ - if ((cmdcount < MCE_CMDBUF_SIZE) && - (cmdcount % MCE_CODE_LENGTH) == 0) - cmdbuf[cmdcount++] = MCE_IRDATA_HEADER; - - /* Insert mce packet data */ - if (cmdcount < MCE_CMDBUF_SIZE) - cmdbuf[cmdcount++] = - (txbuf[i] < MCE_PULSE_BIT ? - txbuf[i] : MCE_MAX_PULSE_LENGTH) | - (i & 1 ? 0x00 : MCE_PULSE_BIT); - else { - ret = -EINVAL; - goto out; + /* loop to support long pulses/spaces > 6350us (127*50us) */ + while (irsample > 0) { + /* Insert IR header every 30th entry */ + if (ircount % MCE_PACKET_SIZE == 0) { + /* Room for IR header and one IR sample? */ + if (ircount >= MCE_IRBUF_SIZE - 1) { + /* Send near full buffer */ + ret = mce_write(ir, irbuf, ircount); + if (ret < 0) + return ret; + ircount = 0; + } + irbuf[ircount++] = MCE_IRDATA_HEADER; } - } while ((txbuf[i] > MCE_MAX_PULSE_LENGTH) && - (txbuf[i] -= MCE_MAX_PULSE_LENGTH)); - } + /* Insert IR sample */ + if (irsample <= MCE_MAX_PULSE_LENGTH) { + irbuf[ircount] = irsample; + irsample = 0; + } else { + irbuf[ircount] = MCE_MAX_PULSE_LENGTH; + irsample -= MCE_MAX_PULSE_LENGTH; + } + /* + * Even i = IR pulse + * Odd i = IR space + */ + irbuf[ircount] |= (i & 1 ? 
0 : MCE_PULSE_BIT); + ircount++; - /* Check if we have room for the empty packet at the end */ - if (cmdcount >= MCE_CMDBUF_SIZE) { - ret = -EINVAL; - goto out; - } + /* IR buffer full? */ + if (ircount >= MCE_IRBUF_SIZE) { + /* Fix packet length in last header */ + length = ircount % MCE_PACKET_SIZE; + if (length > 0) + irbuf[ircount - length] -= + MCE_PACKET_SIZE - length; + /* Send full buffer */ + ret = mce_write(ir, irbuf, ircount); + if (ret < 0) + return ret; + ircount = 0; + } + } + } /* after for loop, 0 <= ircount < MCE_IRBUF_SIZE */ /* Fix packet length in last header */ - length = cmdcount % MCE_CODE_LENGTH; - cmdbuf[cmdcount - length] -= MCE_CODE_LENGTH - length; + length = ircount % MCE_PACKET_SIZE; + if (length > 0) + irbuf[ircount - length] -= MCE_PACKET_SIZE - length; - /* All mce commands end with an empty packet (0x80) */ - cmdbuf[cmdcount++] = MCE_IRDATA_TRAILER; + /* Append IR trailer (0x80) to final partial (or empty) IR buffer */ + irbuf[ircount++] = MCE_IRDATA_TRAILER; - /* Transmit the command to the mce device */ - mce_async_out(ir, cmdbuf, cmdcount); + /* Send final buffer */ + ret = mce_write(ir, irbuf, ircount); + if (ret < 0) + return ret; -out: - return ret ? ret : count; + return count; } /* Sets active IR outputs -- mce devices typically have two */ @@ -965,7 +1022,7 @@ static int mceusb_set_tx_carrier(struct rc_dev *dev, u32 carrier) cmdbuf[2] = MCE_CMD_SIG_END; cmdbuf[3] = MCE_IRDATA_TRAILER; dev_dbg(ir->dev, "disabling carrier modulation"); - mce_async_out(ir, cmdbuf, sizeof(cmdbuf)); + mce_command_out(ir, cmdbuf, sizeof(cmdbuf)); return 0; } @@ -979,7 +1036,7 @@ static int mceusb_set_tx_carrier(struct rc_dev *dev, u32 carrier) carrier); /* Transmit new carrier to mce device */ - mce_async_out(ir, cmdbuf, sizeof(cmdbuf)); + mce_command_out(ir, cmdbuf, sizeof(cmdbuf)); return 0; } } @@ -1002,10 +1059,10 @@ static int mceusb_set_timeout(struct rc_dev *dev, unsigned int timeout) cmdbuf[2] = units >> 8; cmdbuf[3] = units; - mce_async_out(ir, cmdbuf, sizeof(cmdbuf)); + mce_command_out(ir, cmdbuf, sizeof(cmdbuf)); /* get receiver timeout value */ - mce_async_out(ir, GET_RX_TIMEOUT, sizeof(GET_RX_TIMEOUT)); + mce_command_out(ir, GET_RX_TIMEOUT, sizeof(GET_RX_TIMEOUT)); return 0; } @@ -1030,7 +1087,7 @@ static int mceusb_set_rx_wideband(struct rc_dev *dev, int enable) ir->wideband_rx_enabled = false; cmdbuf[2] = 1; /* port 1 is long range receiver */ } - mce_async_out(ir, cmdbuf, sizeof(cmdbuf)); + mce_command_out(ir, cmdbuf, sizeof(cmdbuf)); /* response from device sets ir->learning_active */ return 0; @@ -1053,7 +1110,7 @@ static int mceusb_set_rx_carrier_report(struct rc_dev *dev, int enable) ir->carrier_report_enabled = true; if (!ir->learning_active) { cmdbuf[2] = 2; /* port 2 is short range receiver */ - mce_async_out(ir, cmdbuf, sizeof(cmdbuf)); + mce_command_out(ir, cmdbuf, sizeof(cmdbuf)); } } else { ir->carrier_report_enabled = false; @@ -1064,7 +1121,7 @@ static int mceusb_set_rx_carrier_report(struct rc_dev *dev, int enable) */ if (ir->learning_active && !ir->wideband_rx_enabled) { cmdbuf[2] = 1; /* port 1 is long range receiver */ - mce_async_out(ir, cmdbuf, sizeof(cmdbuf)); + mce_command_out(ir, cmdbuf, sizeof(cmdbuf)); } } @@ -1143,6 +1200,7 @@ static void mceusb_handle_command(struct mceusb_dev *ir, int index) } break; case MCE_RSP_CMD_ILLEGAL: + case MCE_RSP_TX_TIMEOUT: ir->need_reset = true; break; default: @@ -1280,7 +1338,7 @@ static void mceusb_get_emulator_version(struct mceusb_dev *ir) { /* If we get no reply or an illegal command reply, 
its ver 1, says MS */ ir->emver = 1; - mce_async_out(ir, GET_EMVER, sizeof(GET_EMVER)); + mce_command_out(ir, GET_EMVER, sizeof(GET_EMVER)); } static void mceusb_gen1_init(struct mceusb_dev *ir) @@ -1326,10 +1384,10 @@ static void mceusb_gen1_init(struct mceusb_dev *ir) dev_dbg(dev, "set handshake - retC = %d", ret); /* device resume */ - mce_async_out(ir, DEVICE_RESUME, sizeof(DEVICE_RESUME)); + mce_command_out(ir, DEVICE_RESUME, sizeof(DEVICE_RESUME)); /* get hw/sw revision? */ - mce_async_out(ir, GET_REVISION, sizeof(GET_REVISION)); + mce_command_out(ir, GET_REVISION, sizeof(GET_REVISION)); kfree(data); } @@ -1337,13 +1395,13 @@ static void mceusb_gen1_init(struct mceusb_dev *ir) static void mceusb_gen2_init(struct mceusb_dev *ir) { /* device resume */ - mce_async_out(ir, DEVICE_RESUME, sizeof(DEVICE_RESUME)); + mce_command_out(ir, DEVICE_RESUME, sizeof(DEVICE_RESUME)); /* get wake version (protocol, key, address) */ - mce_async_out(ir, GET_WAKEVERSION, sizeof(GET_WAKEVERSION)); + mce_command_out(ir, GET_WAKEVERSION, sizeof(GET_WAKEVERSION)); /* unknown what this one actually returns... */ - mce_async_out(ir, GET_UNKNOWN2, sizeof(GET_UNKNOWN2)); + mce_command_out(ir, GET_UNKNOWN2, sizeof(GET_UNKNOWN2)); } static void mceusb_get_parameters(struct mceusb_dev *ir) @@ -1357,24 +1415,24 @@ static void mceusb_get_parameters(struct mceusb_dev *ir) ir->num_rxports = 2; /* get number of tx and rx ports */ - mce_async_out(ir, GET_NUM_PORTS, sizeof(GET_NUM_PORTS)); + mce_command_out(ir, GET_NUM_PORTS, sizeof(GET_NUM_PORTS)); /* get the carrier and frequency */ - mce_async_out(ir, GET_CARRIER_FREQ, sizeof(GET_CARRIER_FREQ)); + mce_command_out(ir, GET_CARRIER_FREQ, sizeof(GET_CARRIER_FREQ)); if (ir->num_txports && !ir->flags.no_tx) /* get the transmitter bitmask */ - mce_async_out(ir, GET_TX_BITMASK, sizeof(GET_TX_BITMASK)); + mce_command_out(ir, GET_TX_BITMASK, sizeof(GET_TX_BITMASK)); /* get receiver timeout value */ - mce_async_out(ir, GET_RX_TIMEOUT, sizeof(GET_RX_TIMEOUT)); + mce_command_out(ir, GET_RX_TIMEOUT, sizeof(GET_RX_TIMEOUT)); /* get receiver sensor setting */ - mce_async_out(ir, GET_RX_SENSOR, sizeof(GET_RX_SENSOR)); + mce_command_out(ir, GET_RX_SENSOR, sizeof(GET_RX_SENSOR)); for (i = 0; i < ir->num_txports; i++) { cmdbuf[2] = i; - mce_async_out(ir, cmdbuf, sizeof(cmdbuf)); + mce_command_out(ir, cmdbuf, sizeof(cmdbuf)); } } @@ -1383,7 +1441,7 @@ static void mceusb_flash_led(struct mceusb_dev *ir) if (ir->emver < 2) return; - mce_async_out(ir, FLASH_LED, sizeof(FLASH_LED)); + mce_command_out(ir, FLASH_LED, sizeof(FLASH_LED)); } /* From 84b3f2b2d8df10e3ecf38e7d4df264d8060dc536 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Tue, 28 Aug 2018 09:49:42 -0400 Subject: [PATCH 0655/1640] UPSTREAM: media: rc: ir-rc6-decoder: enable toggle bit for Kathrein RCU-676 remote [ Upstream commit 85e4af0a7ae2f146769b7475ae531bf8a3f3afb4 ] The Kathrein RCU-676 remote uses the 32-bit rc6 protocol and toggles bit 15 (0x8000) on repeated button presses, like MCE remotes. Add its customer code 0x80460000 to the 32-bit rc6 toggle handling code to get proper scancodes and toggle reports.
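The resulting dispatch on the RC6-6A-32 long customer code can be summarised by the sketch below (a paraphrase of the hunk that follows; the helper name is invented for illustration). When it returns true, the decoder reports RC_PROTO_RC6_MCE, reads the toggle from bit 15 (RC6_6A_MCE_TOGGLE_MASK) and clears that bit from the scancode.

	/* RC6-6A-32: only known customer codes carry an MCE-style toggle bit */
	static bool rc6_32_has_mce_toggle(u32 scancode)
	{
		switch (scancode & RC6_6A_LCC_MASK) {
		case RC6_6A_MCE_CC:		/* 0x800f0000: MCE remotes */
		case RC6_6A_KATHREIN_CC:	/* 0x80460000: Kathrein RCU-676 */
			return true;
		default:			/* plain RC6-6A-32, no toggle */
			return false;
		}
	}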
Signed-off-by: Matthias Reichl Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/ir-rc6-decoder.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/media/rc/ir-rc6-decoder.c b/drivers/media/rc/ir-rc6-decoder.c index 68487ce9f79b..d96aed1343e4 100644 --- a/drivers/media/rc/ir-rc6-decoder.c +++ b/drivers/media/rc/ir-rc6-decoder.c @@ -40,6 +40,7 @@ #define RC6_6A_MCE_TOGGLE_MASK 0x8000 /* for the body bits */ #define RC6_6A_LCC_MASK 0xffff0000 /* RC6-6A-32 long customer code mask */ #define RC6_6A_MCE_CC 0x800f0000 /* MCE customer code */ +#define RC6_6A_KATHREIN_CC 0x80460000 /* Kathrein RCU-676 customer code */ #ifndef CHAR_BIT #define CHAR_BIT 8 /* Normally in */ #endif @@ -242,13 +243,17 @@ again: toggle = 0; break; case 32: - if ((scancode & RC6_6A_LCC_MASK) == RC6_6A_MCE_CC) { + switch (scancode & RC6_6A_LCC_MASK) { + case RC6_6A_MCE_CC: + case RC6_6A_KATHREIN_CC: protocol = RC_PROTO_RC6_MCE; toggle = !!(scancode & RC6_6A_MCE_TOGGLE_MASK); scancode &= ~RC6_6A_MCE_TOGGLE_MASK; - } else { + break; + default: protocol = RC_PROTO_RC6_6A_32; toggle = 0; + break; } break; default: From 25c6540a47ce17ca2b059706c9a056ce469c7c03 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 16 Oct 2019 14:19:15 -0300 Subject: [PATCH 0656/1640] UPSTREAM: media: imon: invalid dereference in imon_touch_event commit f3f5ba42c58d56d50f539854d8cc188944e96087 upstream. The touch timer is set up in intf1. If the second interface does not exist, the timer and touch input device are not setup and we get the following error, when touch events are reported via intf0. kernel BUG at kernel/time/timer.c:956! invalid opcode: 0000 [#1] SMP KASAN CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.4.0-rc1+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__mod_timer kernel/time/timer.c:956 [inline] RIP: 0010:__mod_timer kernel/time/timer.c:949 [inline] RIP: 0010:mod_timer+0x5a2/0xb50 kernel/time/timer.c:1100 Code: 45 10 c7 44 24 14 ff ff ff ff 48 89 44 24 08 48 8d 45 20 48 c7 44 24 18 00 00 00 00 48 89 04 24 e9 5a fc ff ff e8 ae ce 0e 00 <0f> 0b e8 a7 ce 0e 00 4c 89 74 24 20 e9 37 fe ff ff e8 98 ce 0e 00 RSP: 0018:ffff8881db209930 EFLAGS: 00010006 RAX: ffffffff86c2b200 RBX: 00000000ffffa688 RCX: ffffffff83efc583 RDX: 0000000000000100 RSI: ffffffff812f4d82 RDI: ffff8881d2356200 RBP: ffff8881d23561e8 R08: ffffffff86c2b200 R09: ffffed103a46abeb R10: ffffed103a46abea R11: ffff8881d2355f53 R12: dffffc0000000000 R13: 1ffff1103b64132d R14: ffff8881d2355f50 R15: 0000000000000006 FS: 0000000000000000(0000) GS:ffff8881db200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f75e2799000 CR3: 00000001d3b07000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: imon_touch_event drivers/media/rc/imon.c:1348 [inline] imon_incoming_packet.isra.0+0x2546/0x2f10 drivers/media/rc/imon.c:1603 usb_rx_callback_intf0+0x151/0x1e0 drivers/media/rc/imon.c:1734 __usb_hcd_giveback_urb+0x1f2/0x470 drivers/usb/core/hcd.c:1654 usb_hcd_giveback_urb+0x368/0x420 drivers/usb/core/hcd.c:1719 dummy_timer+0x120f/0x2fa2 drivers/usb/gadget/udc/dummy_hcd.c:1965 call_timer_fn+0x179/0x650 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 [inline] __run_timers kernel/time/timer.c:1740 [inline] 
run_timer_softirq+0x5e3/0x1490 kernel/time/timer.c:1786 __do_softirq+0x221/0x912 kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0x178/0x1a0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] smp_apic_timer_interrupt+0x12f/0x500 arch/x86/kernel/apic/apic.c:1137 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 RIP: 0010:default_idle+0x28/0x2e0 arch/x86/kernel/process.c:581 Code: 90 90 41 56 41 55 65 44 8b 2d 44 3a 8f 7a 41 54 55 53 0f 1f 44 00 00 e8 36 ee d0 fb e9 07 00 00 00 0f 00 2d fa dd 4f 00 fb f4 <65> 44 8b 2d 20 3a 8f 7a 0f 1f 44 00 00 5b 5d 41 5c 41 5d 41 5e c3 RSP: 0018:ffffffff86c07da8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 RAX: 0000000000000007 RBX: ffffffff86c2b200 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000006 RDI: ffffffff86c2ba4c RBP: fffffbfff0d85640 R08: ffffffff86c2b200 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x3b6/0x500 kernel/sched/idle.c:263 cpu_startup_entry+0x14/0x20 kernel/sched/idle.c:355 start_kernel+0x82a/0x864 init/main.c:784 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241 Modules linked in: Reported-by: syzbot+f49d12d34f2321cf4df2@syzkaller.appspotmail.com Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/imon.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index 0658529c0fa1..b453d65aa9e5 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -1604,8 +1604,7 @@ static void imon_incoming_packet(struct imon_context *ictx, spin_unlock_irqrestore(&ictx->kc_lock, flags); /* send touchscreen events through input subsystem if touchpad data */ - if (ictx->display_type == IMON_DISPLAY_TYPE_VGA && len == 8 && - buf[7] == 0x86) { + if (ictx->touch && len == 8 && buf[7] == 0x86) { imon_touch_event(ictx, buf); return; From a69a76006f6712673efd2266c312d3fcabca2733 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 3 Jan 2020 17:35:13 +0100 Subject: [PATCH 0657/1640] UPSTREAM: media: iguanair: fix endpoint sanity check [ Upstream commit 1b257870a78b0a9ce98fdfb052c58542022ffb5b ] Make sure to use the current alternate setting, which need not be the first one by index, when verifying the endpoint descriptors and initialising the URBs. Failing to do so could cause the driver to misbehave or trigger a WARN() in usb_submit_urb() that kernels with panic_on_warn set would choke on. 
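Together with the earlier "add sanity checks" patch in this series, the probe-time validation now amounts to the following sketch (in the real driver the endpoint-type test sits in the allocation error path; it is hoisted here for readability):

	struct usb_host_interface *idesc = intf->cur_altsetting;

	/* two endpoints, interrupt-in then interrupt-out, on the *current*
	 * altsetting -- not altsetting[0], which need not be the active one */
	if (idesc->desc.bNumEndpoints < 2 ||
	    !usb_endpoint_is_int_in(&idesc->endpoint[0].desc) ||
	    !usb_endpoint_is_int_out(&idesc->endpoint[1].desc))
		return -ENODEV;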
Fixes: 26ff63137c45 ("[media] Add support for the IguanaWorks USB IR Transceiver") Fixes: ab1cbdf159be ("media: iguanair: add sanity checks") Cc: stable # 3.6 Cc: Oliver Neukum Signed-off-by: Johan Hovold Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/iguanair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/iguanair.c b/drivers/media/rc/iguanair.c index 6f3030b2054d..1df9522c30fa 100644 --- a/drivers/media/rc/iguanair.c +++ b/drivers/media/rc/iguanair.c @@ -424,7 +424,7 @@ static int iguanair_probe(struct usb_interface *intf, int ret, pipein, pipeout; struct usb_host_interface *idesc; - idesc = intf->altsetting; + idesc = intf->cur_altsetting; if (idesc->desc.bNumEndpoints < 2) return -ENODEV; From d9d9e2556134cb31ebe7bc4344db05acbc44bae2 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 21 Nov 2019 11:10:47 +0100 Subject: [PATCH 0658/1640] UPSTREAM: media: rc: ensure lirc is initialized before registering input device commit 080d89f522e2baddb4fbbd1af4b67b5f92537ef8 upstream. Once rc_open is called on the input device, lirc events can be delivered. Ensure lirc is ready to do so else we might get this: Registered IR keymap rc-hauppauge rc rc0: Hauppauge WinTV PVR-350 as /devices/pci0000:00/0000:00:1e.0/0000:04:00.0/i2c-0/0-0018/rc/rc0 input: Hauppauge WinTV PVR-350 as /devices/pci0000:00/0000:00:1e.0/0000:04:00.0/i2c-0/0-0018/rc/rc0/input9 BUG: kernel NULL pointer dereference, address: 0000000000000038 PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 1 PID: 17 Comm: kworker/1:0 Not tainted 5.3.11-300.fc31.x86_64 #1 Hardware name: /DG43NB, BIOS NBG4310H.86A.0096.2009.0903.1845 09/03/2009 Workqueue: events ir_work [ir_kbd_i2c] RIP: 0010:ir_lirc_scancode_event+0x3d/0xb0 Code: a6 b4 07 00 00 49 81 c6 b8 07 00 00 55 53 e8 ba a7 9d ff 4c 89 e7 49 89 45 00 e8 5e 7a 25 00 49 8b 1e 48 89 c5 4c 39 f3 74 58 <8b> 43 38 8b 53 40 89 c1 2b 4b 3c 39 ca 72 41 21 d0 49 8b 7d 00 49 RSP: 0018:ffffaae2000b3d88 EFLAGS: 00010017 RAX: 0000000000000002 RBX: 0000000000000000 RCX: 0000000000000019 RDX: 0000000000000001 RSI: 006e801b1f26ce6a RDI: ffff9e39797c37b4 RBP: 0000000000000002 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000001 R12: ffff9e39797c37b4 R13: ffffaae2000b3db8 R14: ffff9e39797c37b8 R15: ffff9e39797c33d8 FS: 0000000000000000(0000) GS:ffff9e397b680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000038 CR3: 0000000035844000 CR4: 00000000000006e0 Call Trace: ir_do_keydown+0x8e/0x2b0 rc_keydown+0x52/0xc0 ir_work+0xb8/0x130 [ir_kbd_i2c] process_one_work+0x19d/0x340 worker_thread+0x50/0x3b0 kthread+0xfb/0x130 ? process_one_work+0x340/0x340 ? 
kthread_park+0x80/0x80 ret_from_fork+0x35/0x40 Modules linked in: rc_hauppauge tuner msp3400 saa7127 saa7115 ivtv(+) tveeprom cx2341x v4l2_common videodev mc i2c_algo_bit ir_kbd_i2c ip_tables firewire_ohci e1000e serio_raw firewire_core ata_generic crc_itu_t pata_acpi pata_jmicron fuse CR2: 0000000000000038 ---[ end trace c67c2697a99fa74b ]--- RIP: 0010:ir_lirc_scancode_event+0x3d/0xb0 Code: a6 b4 07 00 00 49 81 c6 b8 07 00 00 55 53 e8 ba a7 9d ff 4c 89 e7 49 89 45 00 e8 5e 7a 25 00 49 8b 1e 48 89 c5 4c 39 f3 74 58 <8b> 43 38 8b 53 40 89 c1 2b 4b 3c 39 ca 72 41 21 d0 49 8b 7d 00 49 RSP: 0018:ffffaae2000b3d88 EFLAGS: 00010017 RAX: 0000000000000002 RBX: 0000000000000000 RCX: 0000000000000019 RDX: 0000000000000001 RSI: 006e801b1f26ce6a RDI: ffff9e39797c37b4 RBP: 0000000000000002 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000001 R12: ffff9e39797c37b4 R13: ffffaae2000b3db8 R14: ffff9e39797c37b8 R15: ffff9e39797c33d8 FS: 0000000000000000(0000) GS:ffff9e397b680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000038 CR3: 0000000035844000 CR4: 00000000000006e0 rc rc0: lirc_dev: driver ir_kbd_i2c registered at minor = 0, scancode receiver, no transmitter tuner-simple 0-0061: creating new instance tuner-simple 0-0061: type set to 2 (Philips NTSC (FI1236,FM1236 and compatibles)) ivtv0: Registered device video0 for encoder MPG (4096 kB) ivtv0: Registered device video32 for encoder YUV (2048 kB) ivtv0: Registered device vbi0 for encoder VBI (1024 kB) ivtv0: Registered device video24 for encoder PCM (320 kB) ivtv0: Registered device radio0 for encoder radio ivtv0: Registered device video16 for decoder MPG (1024 kB) ivtv0: Registered device vbi8 for decoder VBI (64 kB) ivtv0: Registered device vbi16 for decoder VOUT ivtv0: Registered device video48 for decoder YUV (1024 kB) Cc: stable@vger.kernel.org Tested-by: Nick French Reported-by: Nick French Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/rc-main.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 0f218afdadaa..c30affbd43a9 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1874,23 +1874,28 @@ int rc_register_device(struct rc_dev *dev) dev->registered = true; - if (dev->driver_type != RC_DRIVER_IR_RAW_TX) { - rc = rc_setup_rx_device(dev); - if (rc) - goto out_dev; - } - - /* Ensure that the lirc kfifo is setup before we start the thread */ + /* + * once the the input device is registered in rc_setup_rx_device, + * userspace can open the input device and rc_open() will be called + * as a result. This results in driver code being allowed to submit + * keycodes with rc_keydown, so lirc must be registered first. 
+ */ if (dev->allowed_protocols != RC_PROTO_BIT_CEC) { rc = ir_lirc_register(dev); if (rc < 0) - goto out_rx; + goto out_dev; + } + + if (dev->driver_type != RC_DRIVER_IR_RAW_TX) { + rc = rc_setup_rx_device(dev); + if (rc) + goto out_lirc; } if (dev->driver_type == RC_DRIVER_IR_RAW) { rc = ir_raw_event_register(dev); if (rc < 0) - goto out_lirc; + goto out_rx; } dev_dbg(&dev->dev, "Registered rc%u (driver: %s)\n", dev->minor, @@ -1898,11 +1903,11 @@ int rc_register_device(struct rc_dev *dev) return 0; +out_rx: + rc_free_rx_device(dev); out_lirc: if (dev->allowed_protocols != RC_PROTO_BIT_CEC) ir_lirc_unregister(dev); -out_rx: - rc_free_rx_device(dev); out_dev: device_del(&dev->dev); out_rx_free: From 21e5fe10ec25d134808360337c3e0754a72ab084 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 13 Jun 2019 04:49:26 -0400 Subject: [PATCH 0659/1640] UPSTREAM: media: rc: IR signal for Panasonic air conditioner too long commit 5c4c8b4a999019f19e770cb55cbacb89c95897bd upstream. The IR signal to control the Panasonic ACXA75C00600 air conditioner has 439 pulse/spaces. Increase the limit to make it possible to transmit the signal. Reported-by: Takashi Kanamaru Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/lirc_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 44c796e01b8e..2801eeb2f09e 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -29,7 +29,7 @@ #include "rc-core-priv.h" #include -#define LIRCBUF_SIZE 256 +#define LIRCBUF_SIZE 1024 static dev_t lirc_base_dev; From c2ef8c3969c308f65a95159c2f7adbc039e736f8 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 2 May 2020 14:50:52 +0200 Subject: [PATCH 0660/1640] UPSTREAM: media: gpio-ir-tx: improve precision of transmitted signal due to scheduling [ Upstream commit ea8912b788f8144e7d32ee61e5ccba45424bef83 ] usleep_range() may take longer than the max argument due to scheduling, especially under load. This is causing random errors in the transmitted IR. Remove the usleep_range() in favour of busy-looping with udelay(). Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/gpio-ir-tx.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/media/rc/gpio-ir-tx.c b/drivers/media/rc/gpio-ir-tx.c index cd476cab9782..4e70b67ccd18 100644 --- a/drivers/media/rc/gpio-ir-tx.c +++ b/drivers/media/rc/gpio-ir-tx.c @@ -87,13 +87,8 @@ static int gpio_ir_tx(struct rc_dev *dev, unsigned int *txbuf, // space edge = ktime_add_us(edge, txbuf[i]); delta = ktime_us_delta(edge, ktime_get()); - if (delta > 10) { - spin_unlock_irqrestore(&gpio_ir->lock, flags); - usleep_range(delta, delta + 10); - spin_lock_irqsave(&gpio_ir->lock, flags); - } else if (delta > 0) { + if (delta > 0) udelay(delta); - } } else { // pulse ktime_t last = ktime_add_us(edge, txbuf[i]); From 96658c7fef2b6cc0f16ea3d7337e884812e1371b Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 8 Aug 2020 13:38:02 +0200 Subject: [PATCH 0661/1640] UPSTREAM: media: rc: do not access device via sysfs after rc_unregister_device() commit a2e2d73fa28136598e84db9d021091f1b98cbb1a upstream. Device drivers do not expect change_protocol or wakeup re-programming to be accessed after rc_unregister_device(). This can cause the device driver to access deallocated resources.
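Each of the sysfs store handlers touched below gains the same guard; as a pattern (a sketch matching the hunks that follow), the handler bails out under dev->lock once the device is gone:

	mutex_lock(&dev->lock);
	if (!dev->registered) {
		mutex_unlock(&dev->lock);
		return -ENODEV;
	}
	/* handler body runs only while the device is still registered */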
Cc: # 4.16+ Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/rc-main.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index c30affbd43a9..c4e7aa23aac7 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1245,6 +1245,10 @@ static ssize_t store_protocols(struct device *device, } mutex_lock(&dev->lock); + if (!dev->registered) { + mutex_unlock(&dev->lock); + return -ENODEV; + } old_protocols = *current_protocols; new_protocols = old_protocols; @@ -1383,6 +1387,10 @@ static ssize_t store_filter(struct device *device, return -EINVAL; mutex_lock(&dev->lock); + if (!dev->registered) { + mutex_unlock(&dev->lock); + return -ENODEV; + } new_filter = *filter; if (fattr->mask) @@ -1497,6 +1505,10 @@ static ssize_t store_wakeup_protocols(struct device *device, int i; mutex_lock(&dev->lock); + if (!dev->registered) { + mutex_unlock(&dev->lock); + return -ENODEV; + } allowed = dev->allowed_wakeup_protocols; From 4fa6928eb975fa3f0cd5080241bd8b0f107d8e67 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 8 Aug 2020 13:19:12 +0200 Subject: [PATCH 0662/1640] UPSTREAM: media: rc: uevent sysfs file races with rc_unregister_device() commit 4f0835d6677dc69263f90f976524cb92b257d9f4 upstream. Only report uevent file contents if device still registered, else we might read freed memory. Reported-by: syzbot+ceef16277388d6f24898@syzkaller.appspotmail.com Cc: Hillf Danton Cc: # 4.16+ Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/rc-main.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index c4e7aa23aac7..cf3df733d960 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1568,25 +1568,25 @@ static void rc_dev_release(struct device *device) kfree(dev); } -#define ADD_HOTPLUG_VAR(fmt, val...) \ - do { \ - int err = add_uevent_var(env, fmt, val); \ - if (err) \ - return err; \ - } while (0) - static int rc_dev_uevent(struct device *device, struct kobj_uevent_env *env) { struct rc_dev *dev = to_rc_dev(device); + int ret = 0; - if (dev->rc_map.name) - ADD_HOTPLUG_VAR("NAME=%s", dev->rc_map.name); - if (dev->driver_name) - ADD_HOTPLUG_VAR("DRV_NAME=%s", dev->driver_name); - if (dev->device_name) - ADD_HOTPLUG_VAR("DEV_NAME=%s", dev->device_name); + mutex_lock(&dev->lock); - return 0; + if (!dev->registered) + ret = -ENODEV; + if (ret == 0 && dev->rc_map.name) + ret = add_uevent_var(env, "NAME=%s", dev->rc_map.name); + if (ret == 0 && dev->driver_name) + ret = add_uevent_var(env, "DRV_NAME=%s", dev->driver_name); + if (ret == 0 && dev->device_name) + ret = add_uevent_var(env, "DEV_NAME=%s", dev->device_name); + + mutex_unlock(&dev->lock); + + return ret; } /* @@ -1970,14 +1970,14 @@ void rc_unregister_device(struct rc_dev *dev) del_timer_sync(&dev->timer_keyup); del_timer_sync(&dev->timer_repeat); - rc_free_rx_device(dev); - mutex_lock(&dev->lock); if (dev->users && dev->close) dev->close(dev); dev->registered = false; mutex_unlock(&dev->lock); + rc_free_rx_device(dev); + /* * lirc device should be freed with dev->registered = false, so * that userspace polling will get notified. 
From 58366a899260523c88d1d0a3b23f03efdf203b79 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 16 Sep 2020 15:50:51 +0200 Subject: [PATCH 0663/1640] UPSTREAM: media: ati_remote: sanity check for both endpoints [ Upstream commit a8be80053ea74bd9c3f9a3810e93b802236d6498 ] If you do sanity checks, you should do them for both endpoints. Hence introduce checking for endpoint type for the output endpoint, too. Reported-by: syzbot+998261c2ae5932458f6c@syzkaller.appspotmail.com Signed-off-by: Oliver Neukum Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/ati_remote.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/media/rc/ati_remote.c b/drivers/media/rc/ati_remote.c index 8e82610ffaad..01c82da8e9aa 100644 --- a/drivers/media/rc/ati_remote.c +++ b/drivers/media/rc/ati_remote.c @@ -845,6 +845,10 @@ static int ati_remote_probe(struct usb_interface *interface, err("%s: endpoint_in message size==0? \n", __func__); return -ENODEV; } + if (!usb_endpoint_is_int_out(endpoint_out)) { + err("%s: Unexpected endpoint_out\n", __func__); + return -ENODEV; + } ati_remote = kzalloc(sizeof (struct ati_remote), GFP_KERNEL); rc_dev = rc_allocate_device(RC_DRIVER_SCANCODE); From 38c5fb452d31ac537e8d56a95647bf20547313f5 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 9 Nov 2020 23:16:52 +0100 Subject: [PATCH 0664/1640] UPSTREAM: media: sunxi-cir: ensure IR is handled when it is continuous commit 3f56df4c8ffeb120ed41906d3aae71799b7e726a upstream. If a user holds a button down on a remote, then no ir idle interrupt will be generated until the user releases the button, depending on how quickly the remote repeats. No IR is processed until that point, which means that holding down a button may not do anything. This also resolves an issue on a Cubieboard 1 where the IR receiver is picking up ambient infrared as IR and spews out endless "rc rc0: IR event FIFO is full!" messages unless you choose to live in the dark. Cc: stable@vger.kernel.org Tested-by: Hans Verkuil Acked-by: Maxime Ripard Reported-by: Hans Verkuil Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/sunxi-cir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/rc/sunxi-cir.c b/drivers/media/rc/sunxi-cir.c index f500cea228a9..0114e81fa6fa 100644 --- a/drivers/media/rc/sunxi-cir.c +++ b/drivers/media/rc/sunxi-cir.c @@ -129,6 +129,8 @@ static irqreturn_t sunxi_ir_irq(int irqno, void *dev_id) } else if (status & REG_RXINT_RPEI_EN) { ir_raw_event_set_idle(ir->rc, true); ir_raw_event_handle(ir->rc); + } else { + ir_raw_event_handle(ir->rc); } spin_unlock(&ir->ir_lock); From ac3cb04fe964eac0f99367c1c749c491569154e1 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 20 Dec 2020 13:29:54 +0100 Subject: [PATCH 0665/1640] UPSTREAM: media: rc: ensure that uevent can be read directly after rc device register commit 896111dc4bcf887b835b3ef54f48b450d4692a1d upstream. There is a race condition where if the /sys/class/rc0/uevent file is read before rc_dev->registered is set to true, -ENODEV will be returned. 
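The fix is purely an ordering change inside rc_register_device(): dev->registered must be set before device_add() makes the uevent file visible, because the earlier patch in this series taught rc_dev_uevent() to return -ENODEV for unregistered devices. Sketched (mirroring the hunks below):

	dev->registered = true;	/* before userspace can read uevent */

	rc = device_add(&dev->dev);
	if (rc)
		goto out_rx_free;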
Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1901089 Cc: stable@vger.kernel.org Fixes: a2e2d73fa281 ("media: rc: do not access device via sysfs after rc_unregister_device()") Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/rc-main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index cf3df733d960..6ea6038b9d36 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -1875,6 +1875,8 @@ int rc_register_device(struct rc_dev *dev) goto out_raw; } + dev->registered = true; + rc = device_add(&dev->dev); if (rc) goto out_rx_free; @@ -1884,8 +1886,6 @@ int rc_register_device(struct rc_dev *dev) dev->device_name ?: "Unspecified device", path ?: "N/A"); kfree(path); - dev->registered = true; - /* * once the the input device is registered in rc_setup_rx_device, * userspace can open the input device and rc_open() will be called From 3a2fd8a0b569ffce82421a4ab37cd26aa3778408 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 19 Jan 2021 14:53:50 +0100 Subject: [PATCH 0666/1640] UPSTREAM: media: mceusb: sanity check for prescaler value commit 9dec0f48a75e0dadca498002d25ef4e143e60194 upstream. prescaler larger than 8 would mean the carrier is at most 152Hz, which does not make sense for IR carriers. Reported-by: syzbot+6d31bf169a8265204b8d@syzkaller.appspotmail.com Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/mceusb.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index f1dfb8409432..845583e2af4d 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -685,11 +685,18 @@ static void mceusb_dev_printdata(struct mceusb_dev *ir, u8 *buf, int buf_len, data[0], data[1]); break; case MCE_RSP_EQIRCFS: + if (!data[0] && !data[1]) { + dev_dbg(dev, "%s: no carrier", inout); + break; + } + // prescaler should make sense + if (data[0] > 8) + break; period = DIV_ROUND_CLOSEST((1U << data[0] * 2) * (data[1] + 1), 10); if (!period) break; - carrier = (1000 * 1000) / period; + carrier = USEC_PER_SEC / period; dev_dbg(dev, "%s carrier of %u Hz (period %uus)", inout, carrier, period); break; From 0db77255d54efd75a25d274e7d260e0fe49ab6b4 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 22 Feb 2021 09:08:35 +0100 Subject: [PATCH 0667/1640] UPSTREAM: media: ite-cir: check for receive overflow [ Upstream commit 28c7afb07ccfc0a939bb06ac1e7afe669901c65a ] It's best if this condition is reported. 
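Concretely, the interrupt handler starts reporting the overrun and resetting the raw-decoder state, as sketched below from the hunk that follows. Note the follow-up later in this series ("IR receiver stop working after receive overflow"), which additionally keeps draining the FIFO when the overrun flag is set, since the hardware only clears the condition once the FIFO is read.

	if (iflags & ITE_IRQ_RX_FIFO_OVERRUN) {
		dev_warn(&dev->rdev->dev, "receive overflow\n");
		ir_raw_event_reset(dev->rdev);	/* drop half-decoded state */
	}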
Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/ite-cir.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/ite-cir.c b/drivers/media/rc/ite-cir.c index de77d22c30a7..18f3718315a8 100644 --- a/drivers/media/rc/ite-cir.c +++ b/drivers/media/rc/ite-cir.c @@ -285,8 +285,14 @@ static irqreturn_t ite_cir_isr(int irq, void *data) /* read the interrupt flags */ iflags = dev->params.get_irq_causes(dev); + /* Check for RX overflow */ + if (iflags & ITE_IRQ_RX_FIFO_OVERRUN) { + dev_warn(&dev->rdev->dev, "receive overflow\n"); + ir_raw_event_reset(dev->rdev); + } + /* check for the receive interrupt */ - if (iflags & (ITE_IRQ_RX_FIFO | ITE_IRQ_RX_FIFO_OVERRUN)) { + if (iflags & ITE_IRQ_RX_FIFO) { /* read the FIFO bytes */ rx_bytes = dev->params.get_rx_bytes(dev, rx_buf, From 37399695172b93023d260fa9d40a37b69479e234 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Wed, 23 Jun 2021 22:37:54 +0100 Subject: [PATCH 0668/1640] UPSTREAM: media, bpf: Do not copy more entries than user space requested [ Upstream commit 647d446d66e493d23ca1047fa8492b0269674530 ] The syscall bpf(BPF_PROG_QUERY, &attr) should use the prog_cnt field to see how many entries user space provided and return ENOSPC if there are more programs than that. Before this patch, this is not checked and ENOSPC is never returned. Note that one lirc device is limited to 64 bpf programs, and user space I'm aware of -- ir-keytable -- always gives enough space for 64 entries already. However, we should not copy more program ids than are requested. Signed-off-by: Sean Young Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210623213754.632-1-sean@mess.org Signed-off-by: Sasha Levin --- drivers/media/rc/bpf-lirc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 55400317ec53..0f88b4f779c8 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -303,7 +303,8 @@ int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) } if (attr->query.prog_cnt != 0 && prog_ids && cnt) - ret = bpf_prog_array_copy_to_user(progs, prog_ids, cnt); + ret = bpf_prog_array_copy_to_user(progs, prog_ids, + attr->query.prog_cnt); unlock: mutex_unlock(&ir_raw_handler_lock); From de4d231da9c32de7cae66a50056ac03eddc75ea6 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 3 Jul 2021 15:37:17 +0200 Subject: [PATCH 0669/1640] UPSTREAM: media: rc-loopback: return number of emitters rather than error commit 6b7f554be8c92319d7e6df92fd247ebb9beb4a45 upstream. The LIRC_SET_TRANSMITTER_MASK ioctl should return the number of emitters if an invalid list was set.
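With that behaviour, user space can probe the emitter count by deliberately passing an invalid mask; a minimal sketch (error handling elided; the device path and the choice of an all-ones mask as "invalid for most hardware" are assumptions):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/lirc.h>

int count_emitters(void)
{
	unsigned int mask = ~0u;	/* invalid on any device with fewer than 32 emitters */
	int fd = open("/dev/lirc0", O_RDWR);
	int ret = ioctl(fd, LIRC_SET_TRANSMITTER_MASK, &mask);

	/* 0 means the mask was accepted; a positive return value is
	 * the number of available emitters */
	close(fd);
	return ret;
}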
Cc: stable@vger.kernel.org Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/rc-loopback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/rc-loopback.c b/drivers/media/rc/rc-loopback.c index 3822d9ebcb46..5abbde7e5d5b 100644 --- a/drivers/media/rc/rc-loopback.c +++ b/drivers/media/rc/rc-loopback.c @@ -52,7 +52,7 @@ static int loop_set_tx_mask(struct rc_dev *dev, u32 mask) if ((mask & (RXMASK_REGULAR | RXMASK_LEARNING)) != mask) { dprintk("invalid tx mask: %u\n", mask); - return -EINVAL; + return 2; } dprintk("setting tx mask: %u\n", mask); From bfb64b994d2046324d0a1cdce52c6ddaa25bafc2 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 17 Oct 2021 13:01:15 +0100 Subject: [PATCH 0670/1640] UPSTREAM: media: ite-cir: IR receiver stop working after receive overflow commit fdc881783099c6343921ff017450831c8766d12a upstream. On an Intel NUC6iSYK, no IR is reported after a receive overflow. When a receiver overflow occurs, this condition is only cleared by reading the fifo. Make sure we read anything in the fifo. Fixes: 28c7afb07ccf ("media: ite-cir: check for receive overflow") Suggested-by: Bryan Pass Tested-by: Bryan Pass Cc: stable@vger.kernel.org Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/ite-cir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/ite-cir.c b/drivers/media/rc/ite-cir.c index 18f3718315a8..679dd78382d7 100644 --- a/drivers/media/rc/ite-cir.c +++ b/drivers/media/rc/ite-cir.c @@ -292,7 +292,7 @@ static irqreturn_t ite_cir_isr(int irq, void *data) } /* check for the receive interrupt */ - if (iflags & ITE_IRQ_RX_FIFO) { + if (iflags & (ITE_IRQ_RX_FIFO | ITE_IRQ_RX_FIFO_OVERRUN)) { /* read the FIFO bytes */ rx_bytes = dev->params.get_rx_bytes(dev, rx_buf, From 58776dbfb9e8e3dc4bf502de0803683c94a6df91 Mon Sep 17 00:00:00 2001 From: Rajat Asthana Date: Wed, 18 Aug 2021 22:31:10 +0200 Subject: [PATCH 0671/1640] UPSTREAM: media: mceusb: return without resubmitting URB in case of -EPROTO error. [ Upstream commit 476db72e521983ecb847e4013b263072bb1110fc ] Syzkaller reported a warning called "rcu detected stall in dummy_timer". The problem appears to be in mceusb_dev_recv(): in the case of an -EPROTO error, the routine immediately resubmits the URB. Instead, it should return without resubmitting the URB. Reported-by: syzbot+4d3749e9612c2cfab956@syzkaller.appspotmail.com Signed-off-by: Rajat Asthana Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/mceusb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 845583e2af4d..cf4bcf7c62f2 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -1323,6 +1323,7 @@ static void mceusb_dev_recv(struct urb *urb) case -ECONNRESET: case -ENOENT: case -EILSEQ: + case -EPROTO: case -ESHUTDOWN: usb_unlink_urb(urb); return; From fad05ced9a52687acde74b4eb514c39b342cb55f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 13:16:34 +0100 Subject: [PATCH 0672/1640] UPSTREAM: media: mceusb: fix control-message timeouts commit 16394e998cbb050730536bdf7e89f5a70efbd974 upstream. USB control-message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ.
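To make the failure mode concrete (illustrative arithmetic): usb_control_msg() takes its timeout in milliseconds, so an HZ-based constant silently scales with the kernel configuration:

/* intended timeout: 3 seconds, i.e. 3000 ms
 *   CONFIG_HZ=1000: HZ * 3 == 3000 -> 3 s (accidentally correct)
 *   CONFIG_HZ=250:  HZ * 3 ==  750 -> 0.75 s
 *   CONFIG_HZ=100:  HZ * 3 ==  300 -> 0.3 s
 * hence the HZ-independent constant 3000 in the patch below. */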
Fixes: 66e89522aff7 ("V4L/DVB: IR: add mceusb IR receiver driver") Cc: stable@vger.kernel.org # 2.6.36 Signed-off-by: Johan Hovold Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/mceusb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index cf4bcf7c62f2..0b619a2c146e 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -1367,7 +1367,7 @@ static void mceusb_gen1_init(struct mceusb_dev *ir) */ ret = usb_control_msg(ir->usbdev, usb_rcvctrlpipe(ir->usbdev, 0), USB_REQ_SET_ADDRESS, USB_TYPE_VENDOR, 0, 0, - data, USB_CTRL_MSG_SZ, HZ * 3); + data, USB_CTRL_MSG_SZ, 3000); dev_dbg(dev, "set address - ret = %d", ret); dev_dbg(dev, "set address - data[0] = %d, data[1] = %d", data[0], data[1]); @@ -1375,20 +1375,20 @@ static void mceusb_gen1_init(struct mceusb_dev *ir) /* set feature: bit rate 38400 bps */ ret = usb_control_msg(ir->usbdev, usb_sndctrlpipe(ir->usbdev, 0), USB_REQ_SET_FEATURE, USB_TYPE_VENDOR, - 0xc04e, 0x0000, NULL, 0, HZ * 3); + 0xc04e, 0x0000, NULL, 0, 3000); dev_dbg(dev, "set feature - ret = %d", ret); /* bRequest 4: set char length to 8 bits */ ret = usb_control_msg(ir->usbdev, usb_sndctrlpipe(ir->usbdev, 0), 4, USB_TYPE_VENDOR, - 0x0808, 0x0000, NULL, 0, HZ * 3); + 0x0808, 0x0000, NULL, 0, 3000); dev_dbg(dev, "set char length - retB = %d", ret); /* bRequest 2: set handshaking to use DTR/DSR */ ret = usb_control_msg(ir->usbdev, usb_sndctrlpipe(ir->usbdev, 0), 2, USB_TYPE_VENDOR, - 0x0000, 0x0100, NULL, 0, HZ * 3); + 0x0000, 0x0100, NULL, 0, 3000); dev_dbg(dev, "set handshake - retC = %d", ret); /* device resume */ From 20cbc39955e6e78535bee9354c367bdb0564de6f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Oct 2021 13:16:35 +0100 Subject: [PATCH 0673/1640] UPSTREAM: media: redrat3: fix control-message timeouts commit 2adc965c8bfa224e11ecccf9c92fd458c4236428 upstream. USB control-message timeouts are specified in milliseconds and should specifically not vary with CONFIG_HZ. 
Fixes: 2154be651b90 ("[media] redrat3: new rc-core IR transceiver device driver") Cc: stable@vger.kernel.org # 3.0 Signed-off-by: Johan Hovold Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/redrat3.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/media/rc/redrat3.c b/drivers/media/rc/redrat3.c index 6bfc24885b5c..14be14b0b0b0 100644 --- a/drivers/media/rc/redrat3.c +++ b/drivers/media/rc/redrat3.c @@ -415,7 +415,7 @@ static int redrat3_send_cmd(int cmd, struct redrat3_dev *rr3) udev = rr3->udev; res = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), cmd, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, - 0x0000, 0x0000, data, sizeof(u8), HZ * 10); + 0x0000, 0x0000, data, sizeof(u8), 10000); if (res < 0) { dev_err(rr3->dev, "%s: Error sending rr3 cmd res %d, data %d", @@ -491,7 +491,7 @@ static u32 redrat3_get_timeout(struct redrat3_dev *rr3) pipe = usb_rcvctrlpipe(rr3->udev, 0); ret = usb_control_msg(rr3->udev, pipe, RR3_GET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, - RR3_IR_IO_SIG_TIMEOUT, 0, tmp, len, HZ * 5); + RR3_IR_IO_SIG_TIMEOUT, 0, tmp, len, 5000); if (ret != len) dev_warn(rr3->dev, "Failed to read timeout from hardware\n"); else { @@ -521,7 +521,7 @@ static int redrat3_set_timeout(struct rc_dev *rc_dev, unsigned int timeoutns) ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, RR3_IR_IO_SIG_TIMEOUT, 0, timeout, sizeof(*timeout), - HZ * 25); + 25000); dev_dbg(dev, "set ir parm timeout %d ret 0x%02x\n", be32_to_cpu(*timeout), ret); @@ -553,32 +553,32 @@ static void redrat3_reset(struct redrat3_dev *rr3) *val = 0x01; rc = usb_control_msg(udev, rxpipe, RR3_RESET, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, - RR3_CPUCS_REG_ADDR, 0, val, len, HZ * 25); + RR3_CPUCS_REG_ADDR, 0, val, len, 25000); dev_dbg(dev, "reset returned 0x%02x\n", rc); *val = length_fuzz; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, - RR3_IR_IO_LENGTH_FUZZ, 0, val, len, HZ * 25); + RR3_IR_IO_LENGTH_FUZZ, 0, val, len, 25000); dev_dbg(dev, "set ir parm len fuzz %d rc 0x%02x\n", *val, rc); *val = (65536 - (minimum_pause * 2000)) / 256; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, - RR3_IR_IO_MIN_PAUSE, 0, val, len, HZ * 25); + RR3_IR_IO_MIN_PAUSE, 0, val, len, 25000); dev_dbg(dev, "set ir parm min pause %d rc 0x%02x\n", *val, rc); *val = periods_measure_carrier; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, - RR3_IR_IO_PERIODS_MF, 0, val, len, HZ * 25); + RR3_IR_IO_PERIODS_MF, 0, val, len, 25000); dev_dbg(dev, "set ir parm periods measure carrier %d rc 0x%02x", *val, rc); *val = RR3_DRIVER_MAXLENS; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, - RR3_IR_IO_MAX_LENGTHS, 0, val, len, HZ * 25); + RR3_IR_IO_MAX_LENGTHS, 0, val, len, 25000); dev_dbg(dev, "set ir parm max lens %d rc 0x%02x\n", *val, rc); kfree(val); @@ -596,7 +596,7 @@ static void redrat3_get_firmware_rev(struct redrat3_dev *rr3) rc = usb_control_msg(rr3->udev, usb_rcvctrlpipe(rr3->udev, 0), RR3_FW_VERSION, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, - 0, 0, buffer, RR3_FW_VERSION_LEN, HZ * 5); + 0, 0, buffer, RR3_FW_VERSION_LEN, 5000); if (rc >= 0) dev_info(rr3->dev, "Firmware rev: %s", buffer); @@ -836,14 +836,14 
@@ static int redrat3_transmit_ir(struct rc_dev *rcdev, unsigned *txbuf, pipe = usb_sndbulkpipe(rr3->udev, rr3->ep_out->bEndpointAddress); ret = usb_bulk_msg(rr3->udev, pipe, irdata, - sendbuf_len, &ret_len, 10 * HZ); + sendbuf_len, &ret_len, 10000); dev_dbg(dev, "sent %d bytes, (ret %d)\n", ret_len, ret); /* now tell the hardware to transmit what we sent it */ pipe = usb_rcvctrlpipe(rr3->udev, 0); ret = usb_control_msg(rr3->udev, pipe, RR3_TX_SEND_SIGNAL, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, - 0, 0, irdata, 2, HZ * 10); + 0, 0, irdata, 2, 10000); if (ret < 0) dev_err(dev, "Error: control msg send failed, rc %d\n", ret); From e850dc09fd4e312e91e84ee6de8a5e2d42b75a29 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 30 Nov 2021 23:58:19 +0100 Subject: [PATCH 0674/1640] UPSTREAM: media: igorplugusb: receiver overflow should be reported [ Upstream commit 8fede658e7ddb605bbd68ed38067ddb0af033db4 ] Without this, some IR will be missing mid-stream and we might decode something which never really occurred. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/igorplugusb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/igorplugusb.c b/drivers/media/rc/igorplugusb.c index f563ddd7f739..98a13532a596 100644 --- a/drivers/media/rc/igorplugusb.c +++ b/drivers/media/rc/igorplugusb.c @@ -73,9 +73,11 @@ static void igorplugusb_irdata(struct igorplugusb *ir, unsigned len) if (start >= len) { dev_err(ir->dev, "receive overflow invalid: %u", overflow); } else { - if (overflow > 0) + if (overflow > 0) { dev_warn(ir->dev, "receive overflow, at least %u lost", overflow); + ir_raw_event_reset(ir->rc); + } do { rawir.duration = ir->buf_in[i] * 85333; From 4a135cbe7403bb38989823c0fe84cdd854aea252 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Wed, 19 Oct 2022 06:02:14 +0100 Subject: [PATCH 0675/1640] UPSTREAM: media: imon: fix a race condition in send_packet() [ Upstream commit 813ceef062b53d68f296aa3cb944b21a091fabdb ] The function send_packet() has a race condition as follows: func send_packet() { // do work call usb_submit_urb() mutex_unlock() wait_for_event_interruptible() <-- lock gone mutex_lock() } func vfd_write() { mutex_lock() call send_packet() <- prev call is not completed mutex_unlock() } When the mutex is unlocked and the function send_packet() waits for the call to complete, vfd_write() can start another call, which leads to the "URB submitted while active" warning in usb_submit_urb(). Fix this by removing the mutex_unlock() call in send_packet() and using mutex_lock_interruptible(). 
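After the fix, the flow is roughly as follows (same pseudocode style as above):

func send_packet() {
    // caller holds the mutex for the whole transmission
    call usb_submit_urb()
    wait_for_completion_interruptible() <-- lock still held
}

func vfd_write() {
    mutex_lock_interruptible() <-- interruptible, since it may now
                                   block until a prior tx finishes
    call send_packet()          <-- previous call has completed
    mutex_unlock()
}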
Link: https://syzkaller.appspot.com/bug?id=e378e6a51fbe6c5cc43e34f131cc9a315ef0337e Fixes: 21677cfc562a ("V4L/DVB: ir-core: add imon driver") Reported-by: syzbot+0c3cb6dc05fbbdc3ad66@syzkaller.appspotmail.com Signed-off-by: Gautam Menghani Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/imon.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index b453d65aa9e5..c78e1a4a10ec 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -613,15 +613,14 @@ static int send_packet(struct imon_context *ictx) pr_err_ratelimited("error submitting urb(%d)\n", retval); } else { /* Wait for transmission to complete (or abort) */ - mutex_unlock(&ictx->lock); retval = wait_for_completion_interruptible( &ictx->tx.finished); if (retval) { usb_kill_urb(ictx->tx_urb); pr_err_ratelimited("task interrupted\n"); } - mutex_lock(&ictx->lock); + ictx->tx.busy = false; retval = ictx->tx.status; if (retval) pr_err_ratelimited("packet tx failed (%d)\n", retval); @@ -928,7 +927,8 @@ static ssize_t vfd_write(struct file *file, const char __user *buf, return -ENODEV; } - mutex_lock(&ictx->lock); + if (mutex_lock_interruptible(&ictx->lock)) + return -ERESTARTSYS; if (!ictx->dev_present_intf0) { pr_err_ratelimited("no iMON device present\n"); From b02ca6b65af8e953730cc7536aa5d33f628a3447 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Tue, 24 Jan 2023 08:55:33 +0100 Subject: [PATCH 0676/1640] UPSTREAM: media: rc: Fix use-after-free bugs caused by ene_tx_irqsim() [ Upstream commit 29b0589a865b6f66d141d79b2dd1373e4e50fe17 ] When the ene device is detaching, function ene_remove() will be called. But since nothing cancels tx_sim_timer in ene_remove(), the timer handler ene_tx_irqsim() can race with ene_remove(). As a result, use-after-free bugs can occur; the sequence is shown below. (cleanup routine) | (timer routine) | mod_timer(&dev->tx_sim_timer, ..) ene_remove() | (wait a time) | ene_tx_irqsim() | dev->hw_lock //USE | ene_tx_sample(dev) //USE Fix by adding del_timer_sync(&dev->tx_sim_timer) in ene_remove(), so that tx_sim_timer is stopped before the ene device is deallocated. What's more, rc_unregister_device() and del_timer_sync() should be called first in ene_remove(), and the deallocation functions such as free_irq() and release_region() should be called after them, because rc_unregister_device() is well synchronized; otherwise, race conditions may happen. The situations that may lead to race conditions are shown below. Firstly, the rx receiver is disabled with ene_rx_disable() before rc_unregister_device() in ene_remove(), which means it can be enabled again if a process opens /dev/lirc0 between ene_rx_disable() and rc_unregister_device(). Secondly, the irqaction descriptor is freed by free_irq() before the rc device is unregistered, which means the irqaction descriptor may be accessed again after it is deallocated. Thirdly, the timer can call ene_tx_sample(), which can write to the io ports, meaning the io ports could be accessed again after they are deallocated by release_region(). Therefore, rc_unregister_device() and del_timer_sync() should be called first in ene_remove().
Suggested-by: Sean Young Fixes: 9ea53b74df9c ("V4L/DVB: STAGING: remove lirc_ene0100 driver") Signed-off-by: Duoming Zhou Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/rc/ene_ir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/ene_ir.c b/drivers/media/rc/ene_ir.c index 71b8c9bbf6c4..8cf2a5c0575a 100644 --- a/drivers/media/rc/ene_ir.c +++ b/drivers/media/rc/ene_ir.c @@ -1116,6 +1116,8 @@ static void ene_remove(struct pnp_dev *pnp_dev) struct ene_device *dev = pnp_get_drvdata(pnp_dev); unsigned long flags; + rc_unregister_device(dev->rdev); + del_timer_sync(&dev->tx_sim_timer); spin_lock_irqsave(&dev->hw_lock, flags); ene_rx_disable(dev); ene_rx_restore_hw_buffer(dev); @@ -1123,7 +1125,6 @@ static void ene_remove(struct pnp_dev *pnp_dev) free_irq(dev->irq, dev); release_region(dev->hw_io, ENE_IO_SIZE); - rc_unregister_device(dev->rdev); kfree(dev); } From 41ad1bf1595073e3b179184f1cc7dcbf66767cb8 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 24 Mar 2023 13:38:33 -0700 Subject: [PATCH 0677/1640] UPSTREAM: media: rc: gpio-ir-recv: Fix support for wake-up [ Upstream commit 9c592f8ab114875fdb3b2040f01818e53de44991 ] The driver was intended from the start to be a wake-up source for the system; however, due to the absence of a suitable call to device_set_wakeup_capable(), the device_may_wakeup() call used to decide whether to enable the GPIO interrupt as a wake-up source would never happen. Look up the DT standard "wakeup-source" property and call device_init_wakeup() to ensure the device is flagged as being wakeup capable. Reported-by: Matthew Lear Fixes: fd0f6851eb46 ("[media] rc: Add support for GPIO based IR Receiver driver") Signed-off-by: Florian Fainelli Signed-off-by: Sean Young Signed-off-by: Hans Verkuil Signed-off-by: Sasha Levin --- drivers/media/rc/gpio-ir-recv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index 3d99b51384ac..ed5cfde4d9e7 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -91,6 +91,8 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) rcdev->map_name = RC_MAP_EMPTY; gpio_dev->rcdev = rcdev; + if (of_property_read_bool(np, "wakeup-source")) + device_init_wakeup(dev, true); rc = devm_rc_register_device(dev, rcdev); if (rc < 0) { From a2e0c670d01a1ff6d4e8b39f0d726a1c08484029 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 6 Oct 2023 22:31:52 +0100 Subject: [PATCH 0678/1640] UPSTREAM: media: lirc: drop trailing space from scancode transmit commit c8a489f820179fb12251e262b50303c29de991ac upstream. When transmitting, infrared drivers expect an odd number of samples; in other words, without a trailing space. No problems have been observed so far, so this is just belt and braces.
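To illustrate with made-up durations (in microseconds):

/* odd count:  { 9000, 4500, 560 }        pulse, space, pulse -> OK
 * even count: { 9000, 4500, 560, 1690 }  ends on a space; trailing
 *             silence is indistinguishable from the idle line, so
 *             the last entry is dropped before transmission. */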
Fixes: 9b6192589be7 ("media: lirc: implement scancode sending") Cc: stable@vger.kernel.org Signed-off-by: Sean Young Signed-off-by: Hans Verkuil Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/lirc_dev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 2801eeb2f09e..87311ce13da7 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -302,7 +302,11 @@ static ssize_t ir_lirc_transmit_ir(struct file *file, const char __user *buf, if (ret < 0) goto out_kfree_raw; - count = ret; + /* drop trailing space */ + if (!(ret % 2)) + count = ret - 1; + else + count = ret; txbuf = kmalloc_array(count, sizeof(unsigned int), GFP_KERNEL); if (!txbuf) { From a4244dc221de271802a30ee46bc13515bbddf010 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 6 Oct 2023 12:54:25 +0100 Subject: [PATCH 0679/1640] UPSTREAM: media: sharp: fix sharp encoding commit 4f7efc71891462ab7606da7039f480d7c1584a13 upstream. The Sharp protocol[1] encoding has incorrect timings for bit space. [1] https://www.sbprojects.net/knowledge/ir/sharp.php Fixes: d35afc5fe097 ("[media] rc: ir-sharp-decoder: Add encode capability") Cc: stable@vger.kernel.org Reported-by: Joe Ferner Closes: https://sourceforge.net/p/lirc/mailman/message/38604507/ Signed-off-by: Sean Young Signed-off-by: Hans Verkuil Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/ir-sharp-decoder.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/ir-sharp-decoder.c b/drivers/media/rc/ir-sharp-decoder.c index f96e0c992eed..dbddf987df97 100644 --- a/drivers/media/rc/ir-sharp-decoder.c +++ b/drivers/media/rc/ir-sharp-decoder.c @@ -23,7 +23,9 @@ #define SHARP_UNIT 40000 /* ns */ #define SHARP_BIT_PULSE (8 * SHARP_UNIT) /* 320us */ #define SHARP_BIT_0_PERIOD (25 * SHARP_UNIT) /* 1ms (680us space) */ -#define SHARP_BIT_1_PERIOD (50 * SHARP_UNIT) /* 2ms (1680ms space) */ +#define SHARP_BIT_1_PERIOD (50 * SHARP_UNIT) /* 2ms (1680us space) */ +#define SHARP_BIT_0_SPACE (17 * SHARP_UNIT) /* 680us space */ +#define SHARP_BIT_1_SPACE (42 * SHARP_UNIT) /* 1680us space */ #define SHARP_ECHO_SPACE (1000 * SHARP_UNIT) /* 40 ms */ #define SHARP_TRAILER_SPACE (125 * SHARP_UNIT) /* 5 ms (even longer) */ @@ -176,8 +178,8 @@ static const struct ir_raw_timings_pd ir_sharp_timings = { .header_pulse = 0, .header_space = 0, .bit_pulse = SHARP_BIT_PULSE, - .bit_space[0] = SHARP_BIT_0_PERIOD, - .bit_space[1] = SHARP_BIT_1_PERIOD, + .bit_space[0] = SHARP_BIT_0_SPACE, + .bit_space[1] = SHARP_BIT_1_SPACE, .trailer_pulse = SHARP_BIT_PULSE, .trailer_space = SHARP_ECHO_SPACE, .msb_first = 1, From e8c290fbe8824f18ade3afb117ecdb544c90cb5c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 30 May 2018 16:09:16 +0100 Subject: [PATCH 0680/1640] UPSTREAM: bpf: devmap: remove redundant assignment of dev = dev The assignment dev = dev is redundant and should be removed. 
Detected by CoverityScan, CID#1469486 ("Evaluation order violation") Signed-off-by: Colin Ian King Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 6721f3404bbf..9580d918f1a2 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -355,7 +355,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); - struct net_device *dev = dev = obj ? obj->dev : NULL; + struct net_device *dev = obj ? obj->dev : NULL; return dev ? &dev->ifindex : NULL; } From 60cad6ae66427f6add16fb62f37e57cd81b8cc21 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 30 May 2018 12:24:17 -0700 Subject: [PATCH 0681/1640] UPSTREAM: bpf: Change bpf_fib_lookup to return -EAFNOSUPPORT for unsupported address families Update bpf_fib_lookup to return -EAFNOSUPPORT for unsupported address families. This allows userspace to probe for support as more are added (e.g., AF_MPLS). Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 58f420df1d70..027e12550e8a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4292,7 +4292,7 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, flags, true); #endif } - return 0; + return -EAFNOSUPPORT; } static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { @@ -4309,7 +4309,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { struct net *net = dev_net(skb->dev); - int index = 0; + int index = -EAFNOSUPPORT; if (plen < sizeof(*params)) return -EINVAL; From 20b06eb130018bcb5346e044916a545cd5ce94ab Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 05:21:59 +0200 Subject: [PATCH 0682/1640] BACKPORT: bpf: fix uapi hole for 32 bit compat applications In 64 bit, we have a 4 byte hole between ifindex and netns_dev in the case of struct bpf_map_info but also struct bpf_prog_info. Net-next commit b85fab0e67b ("bpf: Add gpl_compatible flag to struct bpf_prog_info") added a bitfield to it to expose some flags related to programs. Thus, add an unnamed __u32 bitfield for both so that the alignment stays the same in both 32 and 64 bit cases, and can be naturally extended from there as in b85fab0e67b.
Before: # file test.o test.o: ELF 32-bit LSB relocatable, Intel 80386, version 1 (SYSV), not stripped # pahole test.o struct bpf_map_info { __u32 type; /* 0 4 */ __u32 id; /* 4 4 */ __u32 key_size; /* 8 4 */ __u32 value_size; /* 12 4 */ __u32 max_entries; /* 16 4 */ __u32 map_flags; /* 20 4 */ char name[16]; /* 24 16 */ __u32 ifindex; /* 40 4 */ __u64 netns_dev; /* 44 8 */ __u64 netns_ino; /* 52 8 */ /* size: 64, cachelines: 1, members: 10 */ /* padding: 4 */ }; After (same as on 64 bit): # file test.o test.o: ELF 32-bit LSB relocatable, Intel 80386, version 1 (SYSV), not stripped # pahole test.o struct bpf_map_info { __u32 type; /* 0 4 */ __u32 id; /* 4 4 */ __u32 key_size; /* 8 4 */ __u32 value_size; /* 12 4 */ __u32 max_entries; /* 16 4 */ __u32 map_flags; /* 20 4 */ char name[16]; /* 24 16 */ __u32 ifindex; /* 40 4 */ /* XXX 4 bytes hole, try to pack */ __u64 netns_dev; /* 48 8 */ __u64 netns_ino; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ /* size: 64, cachelines: 1, members: 10 */ /* sum members: 60, holes: 1, sum holes: 4 */ }; Reported-by: Dmitry V. Levin Reported-by: Eugene Syromiatnikov Fixes: 52775b33bb507 ("bpf: offload: report device information about offloaded maps") Fixes: 675fc275a3a2d ("bpf: offload: report device information for offloaded programs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 45acd440e119..203589687761 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2414,6 +2414,7 @@ struct bpf_prog_info { char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; __u32 gpl_compatible:1; + __u32 :31; __u64 netns_dev; __u64 netns_ino; __u32 nr_jited_ksyms; @@ -2431,6 +2432,7 @@ struct bpf_map_info { __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; + __u32 :32; __u64 netns_dev; __u64 netns_ino; __u32 btf_id; From 9143f77b259078f014a65dfa3c6ad0f6a3712706 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 2 Jun 2018 09:06:50 -0700 Subject: [PATCH 0683/1640] BACKPORT: bpf: btf: Check array t->size This patch ensures array's t->size is 0. The array size is decided by its individual elem's size and the number of elements. Hence, t->size is not used and it must be 0. A test case is added to test_btf.c Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3d20aa1f4b54..84ad532f2854 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1342,6 +1342,11 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (t->size) { + btf_verifier_log_type(env, t, "size != 0"); + return -EINVAL; + } + /* Array elem type and index type cannot be in type void, * so !array->type and !array->index_type are not allowed. */ From f04cc9f44bf2c3167c553544e9dd4b3f18e459cb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 2 Jun 2018 09:06:51 -0700 Subject: [PATCH 0684/1640] BACKPORT: bpf: btf: Ensure t->type == 0 for BTF_KIND_FWD The t->type in BTF_KIND_FWD is not used. It must be 0. 
This patch ensures that, and also adds a test case in test_btf.c. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 84ad532f2854..8653ab004c73 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1286,8 +1286,27 @@ static struct btf_kind_operations ptr_ops = { .seq_show = btf_ptr_seq_show, }; +static s32 btf_fwd_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (t->type) { + btf_verifier_log_type(env, t, "type != 0"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + static struct btf_kind_operations fwd_ops = { - .check_meta = btf_ref_type_check_meta, + .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, .log_details = btf_ref_type_log, From a039785fe0d4f4bdcdd78a91ef8cc65593feb935 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:33 +0200 Subject: [PATCH 0685/1640] UPSTREAM: bpf: fixup error message from gpl helpers on license mismatch Stating 'proprietary program' in the error is just silly since it can also be a different open source license that is simply not compatible. Reference: https://twitter.com/majek04/status/998531268039102465 Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 260cb7ff9356..b82fee1b417f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2444,7 +2444,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { - verbose(env, "cannot call GPL only function from proprietary program\n"); + verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n"); return -EINVAL; } From 7fb8bd93ee752577ca651411d9cbc0399c91f7ed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:34 +0200 Subject: [PATCH 0686/1640] UPSTREAM: bpf: show prog and map id in fdinfo It's trivial and straightforward to expose it for scripts that can then use it along with bpftool in order to inspect an individual application's used maps and progs. Right now we dump some basic information in the fdinfo file, but with the help of the map/prog id full introspection becomes possible.
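With the change, the fdinfo of a map fd looks roughly as follows (illustrative values; the fields match the seq_printf format in the patch below):

# cat /proc/$(pidof some_bpf_app)/fdinfo/4
pos:	0
flags:	02000002
map_type:	1
key_size:	4
value_size:	8
max_entries:	256
map_flags:	0x0
memlock:	4096
map_id:	57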
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 92355fd45d63..93fcb9c9d300 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -328,13 +328,15 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" - "memlock:\t%llu\n", + "memlock:\t%llu\n" + "map_id:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, - map->pages * 1ULL << PAGE_SHIFT); + map->pages * 1ULL << PAGE_SHIFT, + map->id); if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", @@ -1087,11 +1089,13 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" - "memlock:\t%llu\n", + "memlock:\t%llu\n" + "prog_id:\t%u\n", prog->type, prog->jited, prog_tag, - prog->pages * 1ULL << PAGE_SHIFT); + prog->pages * 1ULL << PAGE_SHIFT, + prog->aux->id); } #endif From 3ea1e65f3fc561a3be8fdeb7216fdc6084708977 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:35 +0200 Subject: [PATCH 0687/1640] UPSTREAM: bpf: avoid retpoline for lookup/update/delete calls on maps While some of the BPF map lookup helpers provide a ->map_gen_lookup() callback for inlining the map lookup altogether it is not available for every map, so the remaining ones have to call bpf_map_lookup_elem() helper which does a dispatch to map->ops->map_lookup_elem(). In times of retpolines, this will control and trap speculative execution rather than letting it do its work for the indirect call and will therefore cause a slowdown. Likewise, bpf_map_update_elem() and bpf_map_delete_elem() do not have an inlined version and need to call into their map->ops->map_update_elem() resp. map->ops->map_delete_elem() handlers. Before: # bpftool prog dump xlated id 1 0: (bf) r2 = r10 1: (07) r2 += -8 2: (7a) *(u64 *)(r2 +0) = 0 3: (18) r1 = map[id:1] 5: (85) call __htab_map_lookup_elem#232656 6: (15) if r0 == 0x0 goto pc+4 7: (71) r1 = *(u8 *)(r0 +35) 8: (55) if r1 != 0x0 goto pc+1 9: (72) *(u8 *)(r0 +35) = 1 10: (07) r0 += 56 11: (15) if r0 == 0x0 goto pc+4 12: (bf) r2 = r0 13: (18) r1 = map[id:1] 15: (85) call bpf_map_delete_elem#215008 <-- indirect call via 16: (95) exit helper After: # bpftool prog dump xlated id 1 0: (bf) r2 = r10 1: (07) r2 += -8 2: (7a) *(u64 *)(r2 +0) = 0 3: (18) r1 = map[id:1] 5: (85) call __htab_map_lookup_elem#233328 6: (15) if r0 == 0x0 goto pc+4 7: (71) r1 = *(u8 *)(r0 +35) 8: (55) if r1 != 0x0 goto pc+1 9: (72) *(u8 *)(r0 +35) = 1 10: (07) r0 += 56 11: (15) if r0 == 0x0 goto pc+4 12: (bf) r2 = r0 13: (18) r1 = map[id:1] 15: (85) call htab_lru_map_delete_elem#238240 <-- direct call 16: (95) exit In all three lookup/update/delete cases however we can use the actual address of the map callback directly if we find that there's only a single path with a map pointer leading to the helper call, meaning when the map pointer has not been poisoned from verifier side. Example code can be seen above for the delete case. 
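The underlying mechanism, roughly: the interpreter and the JITs resolve a BPF_CALL target as __bpf_call_base + insn->imm, so patching imm with the concrete handler's offset turns the retpolined indirect dispatch into a direct call:

/* generic path (indirect call through map->ops, retpolined):
 *   bpf_map_delete_elem(map, key)
 *     -> map->ops->map_delete_elem(map, key)
 *
 * patched path (direct call):
 *   insn->imm = BPF_CAST_CALL(ops->map_delete_elem) - __bpf_call_base;
 *   so the call lands in e.g. htab_lru_map_delete_elem() directly.
 */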
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 ++ kernel/bpf/hashtab.c | 12 ++++++-- kernel/bpf/verifier.c | 68 ++++++++++++++++++++++++++++++------------ 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index b9f9efaf86f8..67b81f72c75f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -311,6 +311,9 @@ struct xdp_buff; /* Function call */ +#define BPF_CAST_CALL(x) \ + ((u64 (*)(u64, u64, u64, u64, u64))(x)) + #define BPF_EMIT_CALL(FUNC) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 10658cdff4ad..36b383f2781d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -507,7 +507,9 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; - *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, offsetof(struct htab_elem, key) + @@ -546,7 +548,9 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map, const int ret = BPF_REG_0; const int ref_reg = BPF_REG_1; - *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, offsetof(struct htab_elem, lru_node) + @@ -1390,7 +1394,9 @@ static u32 htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; - *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, offsetof(struct htab_elem, key) + diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b82fee1b417f..f1d4ea910397 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2402,8 +2402,11 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; if (func_id != BPF_FUNC_tail_call && - func_id != BPF_FUNC_map_lookup_elem) + func_id != BPF_FUNC_map_lookup_elem && + func_id != BPF_FUNC_map_update_elem && + func_id != BPF_FUNC_map_delete_elem) return 0; + if (meta->map_ptr == NULL) { verbose(env, "kernel subsystem misconfigured verifier\n"); return -EINVAL; @@ -5870,6 +5873,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; + const struct bpf_map_ops *ops; struct bpf_insn_aux_data *aux; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; @@ -6053,35 +6057,61 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) } /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup - * handlers are currently limited to 64 bit only. 
+ * and other inlining handlers are currently limited to 64 bit + * only. */ if (prog->jit_requested && BITS_PER_LONG == 64 && - insn->imm == BPF_FUNC_map_lookup_elem) { + (insn->imm == BPF_FUNC_map_lookup_elem || + insn->imm == BPF_FUNC_map_update_elem || + insn->imm == BPF_FUNC_map_delete_elem)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; map_ptr = BPF_MAP_PTR(aux->map_state); - if (!map_ptr->ops->map_gen_lookup) - goto patch_call_imm; + ops = map_ptr->ops; + if (insn->imm == BPF_FUNC_map_lookup_elem && + ops->map_gen_lookup) { + cnt = ops->map_gen_lookup(map_ptr, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } - cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + new_prog = bpf_patch_insn_data(env, i + delta, + insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; } - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, - cnt); - if (!new_prog) - return -ENOMEM; + BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_delete_elem, + (int (*)(struct bpf_map *map, void *key))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_update_elem, + (int (*)(struct bpf_map *map, void *key, void *value, + u64 flags))NULL)); + switch (insn->imm) { + case BPF_FUNC_map_lookup_elem: + insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - + __bpf_call_base; + continue; + case BPF_FUNC_map_update_elem: + insn->imm = BPF_CAST_CALL(ops->map_update_elem) - + __bpf_call_base; + continue; + case BPF_FUNC_map_delete_elem: + insn->imm = BPF_CAST_CALL(ops->map_delete_elem) - + __bpf_call_base; + continue; + } - delta += cnt - 1; - - /* keep walking new program and skip insns we just inserted */ - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - continue; + goto patch_call_imm; } if (insn->imm == BPF_FUNC_redirect_map) { From 92ab4ce161e865379955e7de574104a5e23996d9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:36 +0200 Subject: [PATCH 0688/1640] BACKPORT: bpf: add bpf_skb_cgroup_id helper Add a new bpf_skb_cgroup_id() helper that allows to retrieve the cgroup id from the skb's socket. This is useful in particular to enable bpf_get_cgroup_classid()-like behavior for cgroup v1 in cgroup v2 by allowing ID based matching on egress. This can in particular be used in combination with applying policy e.g. from map lookups, and also complements the older bpf_skb_under_cgroup() interface. In user space the cgroup id for a given path can be retrieved through the f_handle as demonstrated in [0] recently. [0] https://lkml.org/lkml/2018/5/22/1190 Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ net/core/filter.c | 29 +++++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 203589687761..2957cb670a5c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2054,6 +2054,22 @@ union bpf_attr { * * Return * 0 + * + * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb) + * Description + * Return the cgroup v2 id of the socket associated with the *skb*. 
+ * This is roughly similar to the **bpf_get_cgroup_classid**\ () + * helper for cgroup v1 by providing a tag resp. identifier that + * can be matched on or used for map lookups e.g. to implement + * policy. The cgroup v2 id of a given path in the hierarchy is + * exposed in user space through the f_handle API in order to get + * to the same 64-bit id. + * + * This helper can be used on TC egress path, but not on ingress, + * and is available only if the kernel was compiled with the + * **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/net/core/filter.c b/net/core/filter.c index 027e12550e8a..d5e10e16963b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3663,6 +3663,27 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .arg3_type = ARG_ANYTHING, }; +#ifdef CONFIG_SOCK_CGROUP_DATA +BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) +{ + struct sock *sk = skb_to_full_sk(skb); + struct cgroup *cgrp; + + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgrp->kn->id.id; +} + +static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { + .func = bpf_skb_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; +#endif + static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, unsigned long off, unsigned long len) { @@ -4758,12 +4779,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; + case BPF_FUNC_fib_lookup: + return &bpf_skb_fib_lookup_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif - case BPF_FUNC_fib_lookup: - return &bpf_skb_fib_lookup_proto; +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_cgroup_id: + return &bpf_skb_cgroup_id_proto; +#endif default: return bpf_base_func_proto(func_id); } From ca5a495410d26571861361a0e55bf69ed46b252d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:37 +0200 Subject: [PATCH 0689/1640] UPSTREAM: bpf: make sure to clear unused fields in tunnel/xfrm state fetch Since the remaining bits are not filled in struct bpf_tunnel_key resp. struct bpf_xfrm_state and originate from uninitialized stack space, we should make sure to clear them before handing control back to the program. Also add a padding element to struct bpf_xfrm_state for future use similar as we have in struct bpf_tunnel_key and clear it as well. struct bpf_xfrm_state { __u32 reqid; /* 0 4 */ __u32 spi; /* 4 4 */ __u16 family; /* 8 2 */ /* XXX 2 bytes hole, try to pack */ union { __u32 remote_ipv4; /* 4 */ __u32 remote_ipv6[4]; /* 16 */ }; /* 12 16 */ /* size: 28, cachelines: 1, members: 4 */ /* sum members: 26, holes: 1, sum holes: 2 */ /* last cacheline: 28 bytes */ }; Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2957cb670a5c..5e19b47355e9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2314,7 +2314,7 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; - __u16 tunnel_ext; + __u16 tunnel_ext; /* Padding, future use. 
*/ __u32 tunnel_label; }; @@ -2325,6 +2325,7 @@ struct bpf_xfrm_state { __u32 reqid; __u32 spi; /* Stored in network byte order */ __u16 family; + __u16 ext; /* Padding, future use. */ union { __u32 remote_ipv4; /* Stored in network byte order */ __u32 remote_ipv6[4]; /* Stored in network byte order */ diff --git a/net/core/filter.c b/net/core/filter.c index d5e10e16963b..b24e6df68ecf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3447,6 +3447,7 @@ set_compat: to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; + to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, @@ -3454,6 +3455,8 @@ set_compat: to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + to->tunnel_label = 0; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) @@ -4054,11 +4057,14 @@ BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, to->reqid = x->props.reqid; to->spi = x->id.spi; to->family = x->props.family; + to->ext = 0; + if (to->family == AF_INET6) { memcpy(to->remote_ipv6, x->props.saddr.a6, sizeof(to->remote_ipv6)); } else { to->remote_ipv4 = x->props.saddr.a4; + memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); } return 0; From 02dc4f9f34b25412caa2d65e2210abee027d5caa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:39 +0200 Subject: [PATCH 0690/1640] UPSTREAM: bpf: fix context access in tracing progs on 32 bit archs Wang reported that all the testcases for BPF_PROG_TYPE_PERF_EVENT program type in test_verifier report the following errors on x86_32: 172/p unpriv: spill/fill of different pointers ldx FAIL Unexpected error message! 0: (bf) r6 = r10 1: (07) r6 += -8 2: (15) if r1 == 0x0 goto pc+3 R1=ctx(id=0,off=0,imm=0) R6=fp-8,call_-1 R10=fp0,call_-1 3: (bf) r2 = r10 4: (07) r2 += -76 5: (7b) *(u64 *)(r6 +0) = r2 6: (55) if r1 != 0x0 goto pc+1 R1=ctx(id=0,off=0,imm=0) R2=fp-76,call_-1 R6=fp-8,call_-1 R10=fp0,call_-1 fp-8=fp 7: (7b) *(u64 *)(r6 +0) = r1 8: (79) r1 = *(u64 *)(r6 +0) 9: (79) r1 = *(u64 *)(r1 +68) invalid bpf_context access off=68 size=8 378/p check bpf_perf_event_data->sample_period byte load permitted FAIL Failed to load prog 'Permission denied'! 0: (b7) r0 = 0 1: (71) r0 = *(u8 *)(r1 +68) invalid bpf_context access off=68 size=1 379/p check bpf_perf_event_data->sample_period half load permitted FAIL Failed to load prog 'Permission denied'! 0: (b7) r0 = 0 1: (69) r0 = *(u16 *)(r1 +68) invalid bpf_context access off=68 size=2 380/p check bpf_perf_event_data->sample_period word load permitted FAIL Failed to load prog 'Permission denied'! 0: (b7) r0 = 0 1: (61) r0 = *(u32 *)(r1 +68) invalid bpf_context access off=68 size=4 381/p check bpf_perf_event_data->sample_period dword load permitted FAIL Failed to load prog 'Permission denied'! 0: (b7) r0 = 0 1: (79) r0 = *(u64 *)(r1 +68) invalid bpf_context access off=68 size=8 Reason is that struct pt_regs on x86_32 doesn't fully align to 8 byte boundary due to its size of 68 bytes. Therefore, bpf_ctx_narrow_access_ok() will then bail out saying that off & (size_default - 1) which is 68 & 7 doesn't cleanly align in the case of sample_period access from struct bpf_perf_event_data, hence verifier wrongly thinks we might be doing an unaligned access here though underlying arch can handle it just fine. 
Therefore adjust this down to machine size and check and rewrite the offset for narrow access on that basis. We also need to fix corresponding pe_prog_is_valid_access(), since we hit the check for off % size != 0 (e.g. 68 % 8 -> 4) in the first and last test. With that in place, progs for tracing work on x86_32. Reported-by: Wang YanQing Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Tested-by: Wang YanQing Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 30 ++++++++++++++++++++++++------ kernel/bpf/verifier.c | 3 ++- kernel/trace/bpf_trace.c | 10 ++++++++-- 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 67b81f72c75f..8ffd0511a0ca 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -706,16 +706,34 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) return prog->type == BPF_PROG_TYPE_UNSPEC; } -static inline bool -bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default) +static inline u32 bpf_ctx_off_adjust_machine(u32 size) { - bool off_ok; + const u32 size_machine = sizeof(unsigned long); + + if (size > size_machine && size % size_machine == 0) + size = size_machine; + + return size; +} + +static inline bool bpf_ctx_narrow_align_ok(u32 off, u32 size_access, + u32 size_default) +{ + size_default = bpf_ctx_off_adjust_machine(size_default); + size_access = bpf_ctx_off_adjust_machine(size_access); + #ifdef __LITTLE_ENDIAN - off_ok = (off & (size_default - 1)) == 0; + return (off & (size_default - 1)) == 0; #else - off_ok = (off & (size_default - 1)) + size == size_default; + return (off & (size_default - 1)) + size_access == size_default; #endif - return off_ok && size <= size_default && (size & (size - 1)) == 0; +} + +static inline bool +bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) +{ + return bpf_ctx_narrow_align_ok(off, size, size_default) && + size <= size_default && (size & (size - 1)) == 0; } #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f1d4ea910397..f6fe08e9bdbd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5637,6 +5637,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) */ is_narrower_load = size < ctx_field_size; if (is_narrower_load) { + u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size); u32 off = insn->off; u8 size_code; @@ -5651,7 +5652,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) else if (ctx_field_size == 8) size_code = BPF_DW; - insn->off = off & ~(ctx_field_size - 1); + insn->off = off & ~(size_default - 1); insn->code = BPF_LDX | BPF_MEM | size_code; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f35de073d764..58890e62a7d5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -888,8 +888,14 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type return false; if (type != BPF_READ) return false; - if (off % size != 0) - return false; + if (off % size != 0) { + if (sizeof(unsigned long) != 4) + return false; + if (size != 8) + return false; + if (off % size != 4) + return false; + } switch (off) { case bpf_ctx_range(struct bpf_perf_event_data, sample_period): From 12ba113360f19b8702baa16953919b6fe7d8a843 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 31 May 2018 10:59:47 +0200 Subject: [PATCH 0691/1640] BACKPORT: xdp: add flags argument to ndo_xdp_xmit API This patch 
only changes the API and rejects any use of flags. This is an intermediate step that allows us to implement the flush flag operation later, for each individual driver in a separate patch. The plan is to implement the flush operation via the XDP_XMIT_FLUSH flag and then remove XDP_XMIT_FLAGS_NONE when done. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 7 ++++--- include/net/xdp.h | 5 +++++ kernel/bpf/devmap.c | 2 +- net/core/filter.c | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a04da0b9b684..82b8729a076c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1188,13 +1188,13 @@ struct macsec_ops { * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp); + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, + * u32 flags); * This function is used to submit @n XDP packets for transmit on a * netdevice. Returns number of frames successfully transmitted, frames * that got dropped are freed/returned via xdp_return_frame(). * Returns negative number, means general error invoking ndo, meaning * no frames were xmit'ed and core-caller will free all frames. - * TODO: Consider add flag to allow sending flush operation. * void (*ndo_xdp_flush)(struct net_device *dev); * This function is used to inform the driver to flush a particular * xdp tx queue. Must be called on same CPU as xdp_xmit. @@ -1382,7 +1382,8 @@ struct net_device_ops { int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, int n, - struct xdp_frame **xdp); + struct xdp_frame **xdp, + u32 flags); void (*ndo_xdp_flush)(struct net_device *dev); }; diff --git a/include/net/xdp.h b/include/net/xdp.h index 7ad779237ae8..0c45f0f943ed 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -40,6 +40,11 @@ enum xdp_mem_type { MEM_TYPE_MAX, }; +/* XDP flags for ndo_xdp_xmit */ +#define XDP_XMIT_FLAGS_NONE 0U +#define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ +#define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH + struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 9580d918f1a2..5d960d99df5c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -235,7 +235,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, prefetch(xdpf); } - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q); + sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0); if (sent < 0) { err = sent; sent = 0; diff --git a/net/core/filter.c b/net/core/filter.c index b24e6df68ecf..404e5b282286 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3058,7 +3058,7 @@ static int __bpf_tx_xdp(struct net_device *dev, if (unlikely(!xdpf)) return -EOVERFLOW; - sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf); + sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, 0); if (sent <= 0) return sent; dev->netdev_ops->ndo_xdp_flush(dev); From 85a040ec82bc1f420acff4eb2b55c96307dbfe15 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 31 May 2018 11:00:18 +0200 Subject: [PATCH 0692/1640] UPSTREAM: bpf/xdp: non-map redirect can avoid calling ndo_xdp_flush This is the first real user of the XDP_XMIT_FLUSH flag.
As pointed out many times, XDP_REDIRECT without using BPF maps is significantly slower than the map variant. This is primarily due to the lack of bulking, as the ndo_xdp_flush operation is required after each frame (to avoid frames hanging on the egress device). It is still possible to optimize this case. Instead of invoking two NDO indirect calls, which are very expensive with CONFIG_RETPOLINE, instruct ndo_xdp_xmit to flush via the XDP_XMIT_FLUSH flag. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 404e5b282286..c10efbb00a19 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3058,10 +3058,9 @@ static int __bpf_tx_xdp(struct net_device *dev, if (unlikely(!xdpf)) return -EOVERFLOW; - sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, 0); + sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH); if (sent <= 0) return sent; - dev->netdev_ops->ndo_xdp_flush(dev); return 0; } From fa1795a8935b484498e1ca306f4c11f7cd0bb054 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 31 May 2018 11:00:23 +0200 Subject: [PATCH 0693/1640] UPSTREAM: bpf/xdp: devmap can avoid calling ndo_xdp_flush The XDP_REDIRECT map devmap can avoid using ndo_xdp_flush, by instead instructing ndo_xdp_xmit to flush via XDP_XMIT_FLUSH flag in appropriate places. Notice after this patch it is possible to remove ndo_xdp_flush completely, as this is the last user of ndo_xdp_flush. This is left for later patches, to keep driver changes separate. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 5d960d99df5c..d89b62200247 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -220,7 +220,7 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) } static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq) + struct xdp_bulk_queue *bq, u32 flags) { struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; @@ -235,7 +235,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, prefetch(xdpf); } - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0); + sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); if (sent < 0) { err = sent; sent = 0; @@ -279,7 +279,6 @@ void __dev_map_flush(struct bpf_map *map) for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); struct xdp_bulk_queue *bq; - struct net_device *netdev; /* This is possible if the dev entry is removed by user space * between xdp redirect and flush op.
@@ -290,10 +289,7 @@ void __dev_map_flush(struct bpf_map *map) __clear_bit(bit, bitmap); bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq); - netdev = dev->dev; - if (likely(netdev->netdev_ops->ndo_xdp_flush)) - netdev->netdev_ops->ndo_xdp_flush(netdev); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); } } @@ -323,7 +319,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq); + bq_xmit_all(obj, bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -362,8 +358,7 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { - if (dev->dev->netdev_ops->ndo_xdp_flush) { - struct net_device *fl = dev->dev; + if (dev->dev->netdev_ops->ndo_xdp_xmit) { struct xdp_bulk_queue *bq; unsigned long *bitmap; @@ -374,9 +369,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) __clear_bit(dev->bit, bitmap); bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq); - - fl->netdev_ops->ndo_xdp_flush(dev->dev); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); } } } From b14604aea06674aec6a679a308b6e431aded0758 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 3 Jun 2018 15:59:41 -0700 Subject: [PATCH 0694/1640] BACKPORT: bpf: implement bpf_get_current_cgroup_id() helper bpf has been used extensively for tracing. For example, bcc contains an almost full set of bpf-based tools to trace kernel and user functions/events. Most tracing tools currently either filter by pid or run system-wide. Containers have been used quite extensively in industry, and cgroups are often used alongside them to provide resource isolation and protection. Several processes may run inside the same container. It is often desirable to get container-level tracing results as well, e.g. syscall count, function count, I/O activity, etc. This patch implements a new helper, bpf_get_current_cgroup_id(), which returns the id of the cgroup within which the current task is running. A later patch will provide an example to show that userspace can get the same cgroup id, so that it can configure a filter or policy in the bpf program based on the task's cgroup id. The helper is currently implemented for tracing. It can be added to other program types as well when needed. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 15 +++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ 5 files changed, 24 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 59bc670e7c65..fd6787e3314b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -763,6 +763,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; +extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5e19b47355e9..c8c45005885c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2070,6 +2070,11 @@ union bpf_attr { * **CONFIG_SOCK_CGROUP_DATA** configuration option.
* Return * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_get_current_cgroup_id(void) + * Return + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 63b2f4d8f52b..d21d799840c9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1856,6 +1856,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; +const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index aac170ea6c7e..3c347cfed616 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -191,3 +191,18 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE, }; + +#ifdef CONFIG_CGROUPS +BPF_CALL_0(bpf_get_current_cgroup_id) +{ + struct cgroup *cgrp = task_dfl_cgroup(current); + + return cgrp->kn->id.id; +} + +const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { + .func = bpf_get_current_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; +#endif diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 58890e62a7d5..54ed70fd96ab 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -571,6 +571,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_prandom_u32_proto; case BPF_FUNC_probe_read_str: return &bpf_probe_read_str_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; default: return NULL; } From e3f1e2297d055abcb1a4f5891abe95f5617e4c36 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 3 Jun 2018 08:15:19 -0700 Subject: [PATCH 0695/1640] BACKPORT: bpf: flowlabel in bpf_fib_lookup should be flowinfo As Michal noted the flow struct takes both the flow label and priority. Update the bpf_fib_lookup API to note that it is flowinfo and not just the flow label. 
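To make the distinction concrete: the IPv6 flow information word carries the priority/traffic-class bits on top of the 20-bit flow label. A sketch of the packing (the masks mirror the kernel's IPV6_FLOWINFO_MASK and IPV6_FLOWLABEL_MASK; the helper name is illustrative only):

	/* flowinfo = tclass (8 bits) + flow label (20 bits), i.e. the
	 * first IPv6 header word minus the version nibble. */
	#define EX_FLOWINFO_MASK	htonl(0x0FFFFFFFU)
	#define EX_FLOWLABEL_MASK	htonl(0x000FFFFFU)

	static __be32 ex_make_flowinfo(u8 tclass, __be32 label)
	{
		return (htonl((u32)tclass << 20) & EX_FLOWINFO_MASK) |
		       (label & EX_FLOWLABEL_MASK);
	}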
Cc: Michal Kubecek Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8c45005885c..8fb5a01c1776 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2676,7 +2676,7 @@ struct bpf_fib_lookup { union { /* inputs to lookup */ __u8 tos; /* AF_INET */ - __be32 flowlabel; /* AF_INET6 */ + __be32 flowinfo; /* AF_INET6, flow_label + priority */ /* output: metric of fib result (IPv4/IPv6 only) */ __u32 rt_metric; diff --git a/net/core/filter.c b/net/core/filter.c index c10efbb00a19..5056a8400451 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4228,7 +4228,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl6.flowi6_oif = 0; strict = RT6_LOOKUP_F_HAS_SADDR; } - fl6.flowlabel = params->flowlabel; + fl6.flowlabel = params->flowinfo; fl6.flowi6_scope = 0; fl6.flowi6_flags = 0; fl6.mp_hash = 0; From f8c78b58d83b664106e75ace388da042fe00dc79 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 8 Jun 2018 18:10:34 +0200 Subject: [PATCH 0696/1640] UPSTREAM: bpf: implement dummy fops for bpf objects syzkaller was able to trigger the following warning in do_dentry_open(): WARNING: CPU: 1 PID: 4508 at fs/open.c:778 do_dentry_open+0x4ad/0xe40 fs/open.c:778 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 4508 Comm: syz-executor867 Not tainted 4.17.0+ #90 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: [...] vfs_open+0x139/0x230 fs/open.c:908 do_last fs/namei.c:3370 [inline] path_openat+0x1717/0x4dc0 fs/namei.c:3511 do_filp_open+0x249/0x350 fs/namei.c:3545 do_sys_open+0x56f/0x740 fs/open.c:1101 __do_sys_openat fs/open.c:1128 [inline] __se_sys_openat fs/open.c:1122 [inline] __x64_sys_openat+0x9d/0x100 fs/open.c:1122 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe The problem was that prog and map inodes in bpf fs did not implement a dummy file open operation that would return an error. The patch in do_dentry_open() checks whether f_ops are present and if not bails out with an error. While this may be fine, we really shouldn't be throwing a warning. Thus, follow a model similar to bad_file_ops and reject the request unconditionally with -EIO.
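A minimal userspace sketch of the now-sane failure mode (the pin path below is purely an example):

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>

	int main(void)
	{
		/* Opening a pinned BPF object directly, rather than via
		 * bpf(BPF_OBJ_GET), now fails cleanly instead of WARNing. */
		int fd = open("/sys/fs/bpf/some_prog", O_RDONLY);

		if (fd < 0)
			perror("open"); /* expected: Input/output error (EIO) */
		return 0;
	}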
Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") Reported-by: syzbot+2e7fcab0f56fdbb330b8@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/inode.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 1e70912c9b01..d50c7d7dbba2 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -295,6 +295,15 @@ static const struct file_operations bpffs_map_fops = { .release = bpffs_map_release, }; +static int bpffs_obj_open(struct inode *inode, struct file *file) +{ + return -EIO; +} + +static const struct file_operations bpffs_obj_fops = { + .open = bpffs_obj_open, +}; + static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, const struct inode_operations *iops, const struct file_operations *fops) @@ -314,7 +323,8 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL); + return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, + &bpffs_obj_fops); } static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) @@ -322,7 +332,7 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) struct bpf_map *map = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, - map->btf ? &bpffs_map_fops : NULL); + map->btf ? &bpffs_map_fops : &bpffs_obj_fops); } static struct dentry * From 9a84470a88b197ebd7196ffd1c2b1210be731e33 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jun 2018 13:55:00 -0700 Subject: [PATCH 0697/1640] BACKPORT: treewide: kmalloc() -> kmalloc_array() The kmalloc() function has a 2-factor argument form, kmalloc_array(). This patch replaces cases of: kmalloc(a * b, gfp) with: kmalloc_array(a, b, gfp) as well as handling cases of: kmalloc(a * b * c, gfp) with: kmalloc(array3_size(a, b, c), gfp) as it's slightly less ugly than: kmalloc_array(array_size(a, b), c, gfp) This does, however, attempt to ignore constant size factors like: kmalloc(4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The tools/ directory was manually excluded, since it has its own implementation of kmalloc(). The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( kmalloc( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | kmalloc( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( kmalloc( - sizeof(u8) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(__u8) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(char) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(unsigned char) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(u8) * COUNT + COUNT , ...) | kmalloc( - sizeof(__u8) * COUNT + COUNT , ...) | kmalloc( - sizeof(char) * COUNT + COUNT , ...) | kmalloc( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( - kmalloc + kmalloc_array ( - sizeof(TYPE) * (COUNT_ID) + COUNT_ID, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * COUNT_ID + COUNT_ID, sizeof(TYPE) , ...)
| - kmalloc + kmalloc_array ( - sizeof(TYPE) * (COUNT_CONST) + COUNT_CONST, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * COUNT_CONST + COUNT_CONST, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * (COUNT_ID) + COUNT_ID, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * COUNT_ID + COUNT_ID, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * (COUNT_CONST) + COUNT_CONST, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * COUNT_CONST + COUNT_CONST, sizeof(THING) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ - kmalloc + kmalloc_array ( - SIZE * COUNT + COUNT, SIZE , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( kmalloc( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kmalloc( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kmalloc( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kmalloc( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( kmalloc( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kmalloc( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kmalloc( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kmalloc( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | kmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( kmalloc( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products, // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( kmalloc(C1 * C2 * C3, ...) | kmalloc( - (E1) * E2 * E3 + array3_size(E1, E2, E3) , ...) | kmalloc( - (E1) * (E2) * E3 + array3_size(E1, E2, E3) , ...) 
| kmalloc( - (E1) * (E2) * (E3) + array3_size(E1, E2, E3) , ...) | kmalloc( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants, // keeping sizeof() as the second factor argument. @@ expression THING, E1, E2; type TYPE; constant C1, C2, C3; @@ ( kmalloc(sizeof(THING) * C2, ...) | kmalloc(sizeof(TYPE) * C2, ...) | kmalloc(C1 * C2 * C3, ...) | kmalloc(C1 * C2, ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * (E2) + E2, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * E2 + E2, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * (E2) + E2, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * E2 + E2, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - (E1) * E2 + E1, E2 , ...) | - kmalloc + kmalloc_array ( - (E1) * (E2) + E1, E2 , ...) | - kmalloc + kmalloc_array ( - E1 * E2 + E1, E2 , ...) ) [Linux4: Only keep bpf related bits] Signed-off-by: Kees Cook --- kernel/bpf/lpm_trie.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index af67820ea804..c86e165e1fcb 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -626,8 +626,9 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), - GFP_ATOMIC | __GFP_NOWARN); + node_stack = kmalloc_array(trie->max_prefixlen, + sizeof(struct lpm_trie_node *), + GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) return -ENOMEM; From 5f23a281768d8d8f60550a0545dbcbf56968833b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jun 2018 14:03:40 -0700 Subject: [PATCH 0698/1640] BACKPORT: treewide: kzalloc() -> kcalloc() The kzalloc() function has a 2-factor argument form, kcalloc(). This patch replaces cases of: kzalloc(a * b, gfp) with: kcalloc(a, b, gfp) as well as handling cases of: kzalloc(a * b * c, gfp) with: kzalloc(array3_size(a, b, c), gfp) as it's slightly less ugly than: kcalloc(array_size(a, b), c, gfp) This does, however, attempt to ignore constant size factors like: kzalloc(4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( kzalloc( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | kzalloc( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( kzalloc( - sizeof(u8) * (COUNT) + COUNT , ...) | kzalloc( - sizeof(__u8) * (COUNT) + COUNT , ...) | kzalloc( - sizeof(char) * (COUNT) + COUNT , ...) | kzalloc( - sizeof(unsigned char) * (COUNT) + COUNT , ...) | kzalloc( - sizeof(u8) * COUNT + COUNT , ...) | kzalloc( - sizeof(__u8) * COUNT + COUNT , ...) | kzalloc( - sizeof(char) * COUNT + COUNT , ...) | kzalloc( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( - kzalloc + kcalloc ( - sizeof(TYPE) * (COUNT_ID) + COUNT_ID, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * COUNT_ID + COUNT_ID, sizeof(TYPE) , ...)
| - kzalloc + kcalloc ( - sizeof(TYPE) * (COUNT_CONST) + COUNT_CONST, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * COUNT_CONST + COUNT_CONST, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * (COUNT_ID) + COUNT_ID, sizeof(THING) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * COUNT_ID + COUNT_ID, sizeof(THING) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * (COUNT_CONST) + COUNT_CONST, sizeof(THING) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * COUNT_CONST + COUNT_CONST, sizeof(THING) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ - kzalloc + kcalloc ( - SIZE * COUNT + COUNT, SIZE , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( kzalloc( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kzalloc( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kzalloc( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kzalloc( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kzalloc( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kzalloc( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kzalloc( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kzalloc( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( kzalloc( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kzalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kzalloc( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kzalloc( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kzalloc( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | kzalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( kzalloc( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kzalloc( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products, // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( kzalloc(C1 * C2 * C3, ...) | kzalloc( - (E1) * E2 * E3 + array3_size(E1, E2, E3) , ...) | kzalloc( - (E1) * (E2) * E3 + array3_size(E1, E2, E3) , ...) | kzalloc( - (E1) * (E2) * (E3) + array3_size(E1, E2, E3) , ...) 
| kzalloc( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants, // keeping sizeof() as the second factor argument. @@ expression THING, E1, E2; type TYPE; constant C1, C2, C3; @@ ( kzalloc(sizeof(THING) * C2, ...) | kzalloc(sizeof(TYPE) * C2, ...) | kzalloc(C1 * C2 * C3, ...) | kzalloc(C1 * C2, ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * (E2) + E2, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(TYPE) * E2 + E2, sizeof(TYPE) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * (E2) + E2, sizeof(THING) , ...) | - kzalloc + kcalloc ( - sizeof(THING) * E2 + E2, sizeof(THING) , ...) | - kzalloc + kcalloc ( - (E1) * E2 + E1, E2 , ...) | - kzalloc + kcalloc ( - (E1) * (E2) + E1, E2 , ...) | - kzalloc + kcalloc ( - E1 * E2 + E1, E2 , ...) ) [Linux4: Only keep bpf related bits] Signed-off-by: Kees Cook --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6fe08e9bdbd..fe1a1e7c0460 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5721,7 +5721,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->imm = 1; } - func = kzalloc(sizeof(prog) * env->subprog_cnt, GFP_KERNEL); + func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); if (!func) return -ENOMEM; From c35bb7a703fb664fb0538f0c56b55f19424d8333 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 11 Jun 2018 14:35:55 -0700 Subject: [PATCH 0699/1640] UPSTREAM: mm: Introduce kvcalloc() The kv*alloc()-family was missing kvcalloc(). Adding this allows for 2-argument multiplication conversions of kvzalloc(a * b, ...) into kvcalloc(a, b, ...). Signed-off-by: Kees Cook --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1a15242092ef..6fb4692fd009 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -630,6 +630,11 @@ static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) return kvmalloc(n * size, flags); } +static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) +{ + return kvmalloc_array(n, size, flags | __GFP_ZERO); +} + extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); From f2c7025ff5672341d1b50047aedfd909537cea6a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jun 2018 14:04:48 -0700 Subject: [PATCH 0700/1640] BACKPORT: treewide: kvzalloc() -> kvcalloc() The kvzalloc() function has a 2-factor argument form, kvcalloc(). This patch replaces cases of: kvzalloc(a * b, gfp) with: kvcalloc(a, b, gfp) as well as handling cases of: kvzalloc(a * b * c, gfp) with: kvzalloc(array3_size(a, b, c), gfp) as it's slightly less ugly than: kvcalloc(array_size(a, b), c, gfp) This does, however, attempt to ignore constant size factors like: kvzalloc(4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( kvzalloc( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | kvzalloc( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( kvzalloc( - sizeof(u8) * (COUNT) + COUNT , ...) | kvzalloc( - sizeof(__u8) * (COUNT) + COUNT , ...)
| kvzalloc( - sizeof(char) * (COUNT) + COUNT , ...) | kvzalloc( - sizeof(unsigned char) * (COUNT) + COUNT , ...) | kvzalloc( - sizeof(u8) * COUNT + COUNT , ...) | kvzalloc( - sizeof(__u8) * COUNT + COUNT , ...) | kvzalloc( - sizeof(char) * COUNT + COUNT , ...) | kvzalloc( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( - kvzalloc + kvcalloc ( - sizeof(TYPE) * (COUNT_ID) + COUNT_ID, sizeof(TYPE) , ...) | - kvzalloc + kvcalloc ( - sizeof(TYPE) * COUNT_ID + COUNT_ID, sizeof(TYPE) , ...) | - kvzalloc + kvcalloc ( - sizeof(TYPE) * (COUNT_CONST) + COUNT_CONST, sizeof(TYPE) , ...) | - kvzalloc + kvcalloc ( - sizeof(TYPE) * COUNT_CONST + COUNT_CONST, sizeof(TYPE) , ...) | - kvzalloc + kvcalloc ( - sizeof(THING) * (COUNT_ID) + COUNT_ID, sizeof(THING) , ...) | - kvzalloc + kvcalloc ( - sizeof(THING) * COUNT_ID + COUNT_ID, sizeof(THING) , ...) | - kvzalloc + kvcalloc ( - sizeof(THING) * (COUNT_CONST) + COUNT_CONST, sizeof(THING) , ...) | - kvzalloc + kvcalloc ( - sizeof(THING) * COUNT_CONST + COUNT_CONST, sizeof(THING) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ - kvzalloc + kvcalloc ( - SIZE * COUNT + COUNT, SIZE , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( kvzalloc( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kvzalloc( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kvzalloc( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kvzalloc( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kvzalloc( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kvzalloc( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kvzalloc( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kvzalloc( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( kvzalloc( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kvzalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kvzalloc( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kvzalloc( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kvzalloc( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | kvzalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( kvzalloc( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) 
| kvzalloc( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kvzalloc( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products, // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( kvzalloc(C1 * C2 * C3, ...) | kvzalloc( - (E1) * E2 * E3 + array3_size(E1, E2, E3) , ...) | kvzalloc( - (E1) * (E2) * E3 + array3_size(E1, E2, E3) , ...) | kvzalloc( - (E1) * (E2) * (E3) + array3_size(E1, E2, E3) , ...) | kvzalloc( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants, // keeping sizeof() as the second factor argument. @@ expression THING, E1, E2; type TYPE; constant C1, C2, C3; @@ ( kvzalloc(sizeof(THING) * C2, ...) | kvzalloc(sizeof(TYPE) * C2, ...) | kvzalloc(C1 * C2 * C3, ...) | kvzalloc(C1 * C2, ...) | - kvzalloc + kvcalloc ( - sizeof(TYPE) * (E2) + E2, sizeof(TYPE) , ...) | - kvzalloc + kvcalloc ( - sizeof(TYPE) * E2 + E2, sizeof(TYPE) , ...) | - kvzalloc + kvcalloc ( - sizeof(THING) * (E2) + E2, sizeof(THING) , ...) | - kvzalloc + kvcalloc ( - sizeof(THING) * E2 + E2, sizeof(THING) , ...) | - kvzalloc + kvcalloc ( - (E1) * E2 + E1, E2 , ...) | - kvzalloc + kvcalloc ( - (E1) * (E2) + E1, E2 , ...) | - kvzalloc + kvcalloc ( - E1 * E2 + E1, E2 , ...) ) [Linux4: Only keep bpf related bits] Signed-off-by: Kees Cook --- kernel/bpf/btf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8653ab004c73..2d49d18b793a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -608,7 +608,7 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) new_size = min_t(u32, BTF_MAX_TYPE, btf->types_size + expand_by); - new_types = kvzalloc(new_size * sizeof(*new_types), + new_types = kvcalloc(new_size, sizeof(*new_types), GFP_KERNEL | __GFP_NOWARN); if (!new_types) return -ENOMEM; @@ -698,17 +698,17 @@ static int env_resolve_init(struct btf_verifier_env *env) u8 *visit_states = NULL; /* +1 for btf_void */ - resolved_sizes = kvzalloc((nr_types + 1) * sizeof(*resolved_sizes), + resolved_sizes = kvcalloc(nr_types + 1, sizeof(*resolved_sizes), GFP_KERNEL | __GFP_NOWARN); if (!resolved_sizes) goto nomem; - resolved_ids = kvzalloc((nr_types + 1) * sizeof(*resolved_ids), + resolved_ids = kvcalloc(nr_types + 1, sizeof(*resolved_ids), GFP_KERNEL | __GFP_NOWARN); if (!resolved_ids) goto nomem; - visit_states = kvzalloc((nr_types + 1) * sizeof(*visit_states), + visit_states = kvcalloc(nr_types + 1, sizeof(*visit_states), GFP_KERNEL | __GFP_NOWARN); if (!visit_states) goto nomem; From af6b25179d811f6e4b294b625af33a0f788aa9b6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 8 May 2018 12:55:26 -0700 Subject: [PATCH 0701/1640] UPSTREAM: mm: Use overflow helpers in kvmalloc() Instead of open-coded multiplication and bounds checking, use the new overflow helper. Additionally prepare for vmalloc() users to add array_size()-family helpers in the future. 
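For reference, the helper's contract: check_mul_overflow(a, b, &d) stores a * b in d and evaluates true when the multiplication wrapped. A sketch of the resulting pattern (example_alloc_array() is illustrative, not part of this patch):

	static inline void *example_alloc_array(size_t n, size_t size, gfp_t flags)
	{
		size_t bytes;

		if (unlikely(check_mul_overflow(n, size, &bytes)))
			return NULL;	/* n * size would exceed SIZE_MAX */
		return kvmalloc(bytes, flags);
	}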
Signed-off-by: Kees Cook --- include/linux/mm.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6fb4692fd009..0d7c2168e13c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -25,6 +25,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -624,10 +625,12 @@ static inline void *kvzalloc(size_t size, gfp_t flags) static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) { - if (size != 0 && n > SIZE_MAX / size) + size_t bytes; + + if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - return kvmalloc(n * size, flags); + return kvmalloc(bytes, flags); } static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) From 84c5a35935f559c26d9cfee36949d604ff5438d4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jun 2018 14:27:37 -0700 Subject: [PATCH 0702/1640] BACKPORT: treewide: Use array_size() in vzalloc() The vzalloc() function has no 2-factor argument form, so multiplication factors need to be wrapped in array_size(). This patch replaces cases of: vzalloc(a * b) with: vzalloc(array_size(a, b)) as well as handling cases of: vzalloc(a * b * c) with: vzalloc(array3_size(a, b, c)) This does, however, attempt to ignore constant size factors like: vzalloc(4 * 1024) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( vzalloc( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | vzalloc( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( vzalloc( - sizeof(u8) * (COUNT) + COUNT , ...) | vzalloc( - sizeof(__u8) * (COUNT) + COUNT , ...) | vzalloc( - sizeof(char) * (COUNT) + COUNT , ...) | vzalloc( - sizeof(unsigned char) * (COUNT) + COUNT , ...) | vzalloc( - sizeof(u8) * COUNT + COUNT , ...) | vzalloc( - sizeof(__u8) * COUNT + COUNT , ...) | vzalloc( - sizeof(char) * COUNT + COUNT , ...) | vzalloc( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( vzalloc( - sizeof(TYPE) * (COUNT_ID) + array_size(COUNT_ID, sizeof(TYPE)) , ...) | vzalloc( - sizeof(TYPE) * COUNT_ID + array_size(COUNT_ID, sizeof(TYPE)) , ...) | vzalloc( - sizeof(TYPE) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | vzalloc( - sizeof(TYPE) * COUNT_CONST + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | vzalloc( - sizeof(THING) * (COUNT_ID) + array_size(COUNT_ID, sizeof(THING)) , ...) | vzalloc( - sizeof(THING) * COUNT_ID + array_size(COUNT_ID, sizeof(THING)) , ...) | vzalloc( - sizeof(THING) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(THING)) , ...) | vzalloc( - sizeof(THING) * COUNT_CONST + array_size(COUNT_CONST, sizeof(THING)) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ vzalloc( - SIZE * COUNT + array_size(COUNT, SIZE) , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( vzalloc( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) 
| vzalloc( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | vzalloc( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | vzalloc( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | vzalloc( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | vzalloc( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | vzalloc( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | vzalloc( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( vzalloc( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | vzalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | vzalloc( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | vzalloc( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | vzalloc( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | vzalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( vzalloc( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | vzalloc( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | vzalloc( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | vzalloc( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | vzalloc( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | vzalloc( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | vzalloc( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | vzalloc( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( vzalloc(C1 * C2 * C3, ...) | vzalloc( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants. @@ expression E1, E2; constant C1, C2; @@ ( vzalloc(C1 * C2, ...) | vzalloc( - E1 * E2 + array_size(E1, E2) , ...) 
) [Linux4: Only keep bpf related bits] Signed-off-by: Kees Cook --- kernel/bpf/verifier.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fe1a1e7c0460..0979671e624d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5480,7 +5480,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, if (cnt == 1) return 0; - new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len); + new_data = vzalloc(array_size(prog_len, + sizeof(struct bpf_insn_aux_data))); if (!new_data) return -ENOMEM; memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); @@ -6194,8 +6195,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) return -ENOMEM; log = &env->log; - env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * - (*prog)->len); + env->insn_aux_data = + vzalloc(array_size(sizeof(struct bpf_insn_aux_data), + (*prog)->len)); ret = -ENOMEM; if (!env->insn_aux_data) goto err_free_env; From d54df5ed11db9594e1e2cd2369b0ddde1148a3af Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 Jun 2018 02:30:47 +0200 Subject: [PATCH 0703/1640] UPSTREAM: bpf: fix panic in prog load calls cleanup While testing I found that when hitting the error path in bpf_prog_load() where we jump to free_used_maps and the prog contained BPF-to-BPF calls that were JITed earlier, we never clean up the bpf_prog_kallsyms_add() done under jit_subprogs(). Add a proper API to make BPF kallsyms deletion clearer and fix that. Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +++ kernel/bpf/core.c | 14 ++++++++++++++ kernel/bpf/syscall.c | 8 ++------ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 8ffd0511a0ca..e8eed86e1c3b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1032,6 +1032,9 @@ static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) } #endif /* CONFIG_BPF_JIT */ +void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp); +void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); + #define BPF_ANC BIT(15) static inline bool bpf_needs_clear_a(const struct sock_filter *first) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d21d799840c9..fac39ca49ce2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -357,6 +357,20 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, return prog_adj; } +void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) +{ + int i; + + for (i = 0; i < fp->aux->func_cnt; i++) + bpf_prog_kallsyms_del(fp->aux->func[i]); +} + +void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) +{ + bpf_prog_kallsyms_del_subprogs(fp); + bpf_prog_kallsyms_del(fp); +} + #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here.
*/ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 93fcb9c9d300..e8b6f4d8d358 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1051,14 +1051,9 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { - int i; - /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); - - for (i = 0; i < prog->aux->func_cnt; i++) - bpf_prog_kallsyms_del(prog->aux->func[i]); - bpf_prog_kallsyms_del(prog); + bpf_prog_kallsyms_del_all(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -1407,6 +1402,7 @@ static int bpf_prog_load(union bpf_attr *attr) return err; free_used_maps: + bpf_prog_kallsyms_del_subprogs(prog); free_used_maps(prog->aux); free_prog: bpf_prog_uncharge_memlock(prog); From 026b59d2197beedbfaa3f64d6cb0605ccb1706e7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 Jun 2018 02:30:48 +0200 Subject: [PATCH 0704/1640] BACKPORT: bpf: reject any prog that failed read-only lock We currently lock any JITed image as read-only via bpf_jit_binary_lock_ro(), as well as the BPF image as read-only through bpf_prog_lock_ro(). In case any of these fail, we throw a WARN_ON_ONCE() in order to yell loudly to the log. Perhaps, to some extent, this may be comparable to an allocation where __GFP_NOWARN is explicitly not set. Added via 65869a47f348 ("bpf: improve read-only handling"), this behavior is slightly different compared to any of the other in-kernel set_memory_ro() users, who do not check the return code of set_memory_ro() and friends /at all/ (e.g. in the case of module_enable_ro() / module_disable_ro()). Given that in BPF this is a mandatory hardening step, we want to know whether there are any issues that would leave either BPF image writable. As it happens, syzkaller enabled fault injection and triggered a memory allocation failure deep inside x86's change_page_attr_set_clr(), which was reached from set_memory_ro(). Now, there are two options: i) leave everything as is, or ii) rework the image locking code to have a final checkpoint at the end of the central bpf_prog_select_runtime() which probes whether any of the calls during prog setup were unsuccessful, and then bail out with an error. Option ii) is the better approach, since this additional paranoia avoids altogether leaving any potential W+X pages from the BPF side in the system. Therefore, let's be strict about it and reject programs in such an unlikely event. While testing I also noticed that one bpf_prog_lock_ro() call was missing on the outer dummy prog in the case of calls: in the destructor we call bpf_prog_free_deferred() on the main prog, where we try to bpf_prog_unlock_free() the program, and since we now go via bpf_prog_select_runtime(), lock it there as well.
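Folding the new checkpoint out of bpf_prog_select_runtime() for readability, the resulting load-time flow is roughly the following (a simplified sketch; example_finalize() is not a real function, and the checkpoint actually lives inside bpf_prog_select_runtime() in the diff below):

	static int example_finalize(struct bpf_prog *fp)
	{
		int err;

		fp = bpf_prog_select_runtime(fp, &err);
		if (err)
			return err;
		/* Reject the program (-ENOLCK) if any set_memory_ro()
		 * silently failed, so no W+X BPF pages are left behind. */
		return bpf_prog_check_pages_ro_locked(fp);
	}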
Reported-by: syzbot+3b889862e65a98317058@syzkaller.appspotmail.com Reported-by: syzbot+9e762b52dd17e616a7a5@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 60 ++++++++++++++++++++++++++---------------- kernel/bpf/core.c | 53 ++++++++++++++++++++++++++++++++----- kernel/bpf/syscall.c | 4 +-- 3 files changed, 86 insertions(+), 31 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index e8eed86e1c3b..72ce0edd4ba2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -484,7 +484,8 @@ struct bpf_binary_header { #ifdef CONFIG_CFI_CLANG u32 magic; #endif - unsigned int pages; + u16 pages; + u16 locked:1; u8 image[]; }; @@ -738,15 +739,18 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) -#ifdef CONFIG_ARCH_HAS_SET_MEMORY static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { +#ifdef CONFIG_ARCH_HAS_SET_MEMORY fp->locked = 1; - WARN_ON_ONCE(set_memory_ro((unsigned long)fp, fp->pages)); + if (set_memory_ro((unsigned long)fp, fp->pages)) + fp->locked = 0; +#endif } static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) { +#ifdef CONFIG_ARCH_HAS_SET_MEMORY if (fp->locked) { WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages)); /* In case set_memory_rw() fails, we want to be the first @@ -754,34 +758,30 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) */ fp->locked = 0; } +#endif } static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { - WARN_ON_ONCE(set_memory_ro((unsigned long)hdr, hdr->pages)); +#ifdef CONFIG_ARCH_HAS_SET_MEMORY + hdr->locked = 1; + if (set_memory_ro((unsigned long)hdr, hdr->pages)) + hdr->locked = 0; +#endif } static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) { - WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); +#ifdef CONFIG_ARCH_HAS_SET_MEMORY + if (hdr->locked) { + WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); + /* In case set_memory_rw() fails, we want to be the first + * to crash here instead of some random place later on. 
+ */ + hdr->locked = 0; + } +#endif } -#else -static inline void bpf_prog_lock_ro(struct bpf_prog *fp) -{ -} - -static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) -{ -} - -static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) -{ -} - -static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) -{ -} -#endif /* CONFIG_ARCH_HAS_SET_MEMORY */ static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) @@ -792,6 +792,22 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp) return (void *)addr; } +#ifdef CONFIG_ARCH_HAS_SET_MEMORY +static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp) +{ + if (!fp->locked) + return -ENOLCK; + if (fp->jited) { + const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); + + if (!hdr->locked) + return -ENOLCK; + } + + return 0; +} +#endif + int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index fac39ca49ce2..f9ccff971a73 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -665,6 +665,8 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_jit_set_header_magic(hdr); hdr->pages = pages; + hdr->locked = 0; + hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -1538,6 +1540,33 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return 0; } +static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp) +{ +#ifdef CONFIG_ARCH_HAS_SET_MEMORY + int i, err; + + for (i = 0; i < fp->aux->func_cnt; i++) { + err = bpf_prog_check_pages_ro_single(fp->aux->func[i]); + if (err) + return err; + } + + return bpf_prog_check_pages_ro_single(fp); +#endif + return 0; +} + +static void bpf_prog_select_func(struct bpf_prog *fp) +{ +#ifndef CONFIG_BPF_JIT_ALWAYS_ON + u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + + fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0_warn; +#endif +} + /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with internal BPF program @@ -1548,13 +1577,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + /* In case of BPF to BPF calls, verifier did all the prep + * work with regards to JITing, etc. + */ + if (fp->bpf_func) + goto finalize; - fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; -#else - fp->bpf_func = __bpf_prog_ret0_warn; -#endif + bpf_prog_select_func(fp); /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1575,6 +1604,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) if (*err) return fp; } + +finalize: bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -1583,7 +1614,17 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); + if (*err) + return fp; + /* Checkpoint: at this point onwards any cBPF -> eBPF or + * native eBPF program is read-only. If we failed to change + * the page attributes (e.g. 
allocation failure from + * splitting large pages), then reject the whole program + * in order to guarantee not ending up with any W+X pages + * from BPF side in kernel. + */ + *err = bpf_prog_check_pages_ro_locked(fp); return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e8b6f4d8d358..c5280fad1ba2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1370,9 +1370,7 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; - /* eBPF program is ready to be JITed */ - if (!prog->bpf_func) - prog = bpf_prog_select_runtime(prog, &err); + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; From 8c99af07a32c7f0e029b840f3821bfb3df5e8522 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 14 Jun 2018 11:07:42 +0900 Subject: [PATCH 0705/1640] BACKPORT: xdp: Fix handling of devmap in generic XDP Commit 67f29e07e131 ("bpf: devmap introduce dev_map_enqueue") changed the return value type of __devmap_lookup_elem() from struct net_device * to struct bpf_dtab_netdev * but forgot to modify generic XDP code accordingly. Thus generic XDP incorrectly used struct bpf_dtab_netdev where struct net_device is expected, and skb->dev was set to an invalid value. v2: - Fix compiler warning without CONFIG_BPF_SYSCALL. Fixes: 67f29e07e131 ("bpf: devmap introduce dev_map_enqueue") Signed-off-by: Toshiaki Makita Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 12 ++++++++++++ include/linux/filter.h | 16 ++++++++++++++++ kernel/bpf/devmap.c | 14 ++++++++++++++ net/core/filter.c | 21 ++++----------------- 4 files changed, 46 insertions(+), 17 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fd6787e3314b..5108fd83f56a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -493,12 +493,15 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); /* Map specifics */ struct xdp_buff; +struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); +int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, + struct bpf_prog *xdp_prog); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); @@ -596,6 +599,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return 0; } +struct sk_buff; + +static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, + struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + return 0; +} + static inline struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { diff --git a/include/linux/filter.h b/include/linux/filter.h index 72ce0edd4ba2..5b187afa8261 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -869,6 +870,21 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, + struct net_device *fwd) +{ + unsigned int len; + + if (unlikely(!(fwd->flags & IFF_UP))) + return -ENETDOWN; + + len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; +
if (skb->len > len) + return -EMSGSIZE; + + return 0; +} + /* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d89b62200247..448f4a2070b0 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -348,6 +348,20 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return bq_enqueue(dst, xdpf, dev_rx); } +int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + int err; + + err = __xdp_generic_ok_fwd_dev(skb, dst->dev); + if (unlikely(err)) + return err; + skb->dev = dst->dev; + generic_xdp_tx(skb, xdp_prog); + + return 0; +} + static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); diff --git a/net/core/filter.c b/net/core/filter.c index 5056a8400451..de41b5ef0ccb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3216,20 +3216,6 @@ err: } EXPORT_SYMBOL_GPL(xdp_do_redirect); -static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) -{ - unsigned int len; - - if (unlikely(!(fwd->flags & IFF_UP))) - return -ENETDOWN; - - len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) - return -EMSGSIZE; - - return 0; -} - static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, @@ -3258,10 +3244,11 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { - if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + struct bpf_dtab_netdev *dst = fwd; + + err = dev_map_generic_redirect(dst, skb, xdp_prog); + if (unlikely(err)) goto err; - skb->dev = fwd; - generic_xdp_tx(skb, xdp_prog); } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { struct xdp_sock *xs = fwd; From 481c26e010b9b34ebf39ffd094ff0d33ecd8064d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 21 Jun 2018 13:13:04 +0100 Subject: [PATCH 0706/1640] BACKPORT: atomics/treewide: Rename __atomic_add_unless() => atomic_fetch_add_unless() While __atomic_add_unless() was originally intended as a building-block for atomic_add_unless(), it's now used in a number of places around the kernel. It's the only common atomic operation named __atomic*(), rather than atomic_*(), and for consistency it would be better named atomic_fetch_add_unless(). This lack of consistency is slightly confusing, and gets in the way of scripting atomics. Given that, let's clean things up and promote it to an official part of the atomics API, in the form of atomic_fetch_add_unless(). This patch converts definitions and invocations over to the new name, including the instrumented version, using the following script: ---- git grep -w __atomic_add_unless | while read line; do sed -i '{s/\<__atomic_add_unless\>/atomic_fetch_add_unless/}' "${line%%:*}"; done git grep -w __arch_atomic_add_unless | while read line; do sed -i '{s/\<__arch_atomic_add_unless\>/arch_atomic_fetch_add_unless/}' "${line%%:*}"; done ---- Note that we do not have atomic{64,_long}_fetch_add_unless(), which will be introduced by later patches. There should be no functional change as a result of this patch. 
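For readers unfamiliar with the primitive: atomic_fetch_add_unless(v, a, u) returns the value observed before the (conditional) add, and only performs the add when that value was not u. A usage sketch (example_get_unless_zero() is a hypothetical illustration):

	/* Take a reference only if the count has not already hit zero. */
	static bool example_get_unless_zero(atomic_t *refcnt)
	{
		return atomic_fetch_add_unless(refcnt, 1, 0) != 0;
	}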
Signed-off-by: Mark Rutland Reviewed-by: Will Deacon Acked-by: Geert Uytterhoeven Acked-by: Peter Zijlstra (Intel) Acked-by: Palmer Dabbelt Cc: Boqun Feng Cc: Linus Torvalds Cc: Thomas Gleixner Link: https://lore.kernel.org/lkml/20180621121321.4761-2-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- arch/alpha/include/asm/atomic.h | 4 ++-- arch/arc/include/asm/atomic.h | 4 ++-- arch/arm/include/asm/atomic.h | 4 ++-- arch/arm64/include/asm/atomic.h | 2 +- arch/frv/include/asm/atomic.h | 2 +- arch/h8300/include/asm/atomic.h | 2 +- arch/hexagon/include/asm/atomic.h | 4 ++-- arch/ia64/include/asm/atomic.h | 2 +- arch/m32r/include/asm/atomic.h | 4 ++-- arch/m68k/include/asm/atomic.h | 2 +- arch/metag/include/asm/atomic_lnkget.h | 2 +- arch/metag/include/asm/atomic_lock1.h | 2 +- arch/mips/include/asm/atomic.h | 4 ++-- arch/mn10300/include/asm/atomic.h | 2 +- arch/openrisc/include/asm/atomic.h | 4 ++-- arch/parisc/include/asm/atomic.h | 4 ++-- arch/powerpc/include/asm/atomic.h | 8 ++++---- arch/s390/include/asm/atomic.h | 2 +- arch/sh/include/asm/atomic.h | 4 ++-- arch/sparc/include/asm/atomic_32.h | 2 +- arch/sparc/include/asm/atomic_64.h | 2 +- arch/sparc/lib/atomic32.c | 4 ++-- arch/tile/include/asm/atomic_32.h | 4 ++-- arch/tile/include/asm/atomic_64.h | 2 +- arch/x86/include/asm/atomic.h | 4 ++-- arch/xtensa/include/asm/atomic.h | 4 ++-- drivers/block/rbd.c | 2 +- drivers/infiniband/core/rdma_core.c | 2 +- fs/afs/rxrpc.c | 2 +- include/asm-generic/atomic-instrumented.h | 4 ++-- include/asm-generic/atomic.h | 4 ++-- include/linux/atomic.h | 2 +- kernel/bpf/syscall.c | 4 ++-- kernel/trace/tracing_map.c | 2 +- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_object.c | 4 ++-- 36 files changed, 56 insertions(+), 56 deletions(-) diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index 85867d3cea64..1850abfff37b 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -193,7 +193,7 @@ ATOMIC_OPS(xor, xor) #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -201,7 +201,7 @@ ATOMIC_OPS(xor, xor) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, new, old; smp_mb(); diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index c98b59ac0612..2873e07dcb21 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -309,7 +309,7 @@ ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3) #undef ATOMIC_OP /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -317,7 +317,7 @@ ATOMIC_OPS(xor, ^=, CTOP_INST_AXOR_DI_R2_R2_R3) * Atomically adds @a to @v, so long as it was not @u. 
* Returns the old value of @v */ -#define __atomic_add_unless(v, a, u) \ +#define atomic_fetch_add_unless(v, a, u) \ ({ \ int c, old; \ \ diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index 66d0e215a773..9d56d0727c9b 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -130,7 +130,7 @@ static inline int atomic_cmpxchg_relaxed(atomic_t *ptr, int old, int new) } #define atomic_cmpxchg_relaxed atomic_cmpxchg_relaxed -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int oldval, newval; unsigned long tmp; @@ -215,7 +215,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new) return ret; } -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h index c0235e0ff849..264d20339f74 100644 --- a/arch/arm64/include/asm/atomic.h +++ b/arch/arm64/include/asm/atomic.h @@ -125,7 +125,7 @@ #define atomic_dec_and_test(v) (atomic_dec_return(v) == 0) #define atomic_sub_and_test(i, v) (atomic_sub_return((i), (v)) == 0) #define atomic_add_negative(i, v) (atomic_add_return((i), (v)) < 0) -#define __atomic_add_unless(v, a, u) ___atomic_add_unless(v, a, u,) +#define atomic_fetch_add_unless(v, a, u) ___atomic_add_unless(v, a, u,) #define atomic_andnot atomic_andnot /* diff --git a/arch/frv/include/asm/atomic.h b/arch/frv/include/asm/atomic.h index e93c9494503a..a4d17c0272b3 100644 --- a/arch/frv/include/asm/atomic.h +++ b/arch/frv/include/asm/atomic.h @@ -146,7 +146,7 @@ static inline void atomic64_dec(atomic64_t *v) #define atomic64_cmpxchg(v, old, new) (__cmpxchg_64(old, new, &(v)->counter)) #define atomic64_xchg(v, new) (__xchg_64(new, &(v)->counter)) -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h index 941e7554e886..4465cfc30a3a 100644 --- a/arch/h8300/include/asm/atomic.h +++ b/arch/h8300/include/asm/atomic.h @@ -94,7 +94,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new) return ret; } -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int ret; h8300flags flags; diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index d4e283b4f335..afb210546675 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -164,7 +164,7 @@ ATOMIC_OPS(xor) #undef ATOMIC_OP /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer to value * @a: amount to add * @u: unless value is equal to u @@ -173,7 +173,7 @@ ATOMIC_OPS(xor) * */ -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int __oldval; register int tmp; diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 28e02c99be6d..5daf6c0fe3bb 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -237,7 +237,7 @@ ATOMIC64_FETCH_OP(xor, ^) (cmpxchg(&((v)->counter), old, new)) #define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) -static __inline__ 
int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h index 8bf67e55ff54..123fb4ca942a 100644 --- a/arch/m32r/include/asm/atomic.h +++ b/arch/m32r/include/asm/atomic.h @@ -249,7 +249,7 @@ static __inline__ int atomic_dec_return(atomic_t *v) #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -257,7 +257,7 @@ static __inline__ int atomic_dec_return(atomic_t *v) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h index e993e2860ee1..8022d9ea1213 100644 --- a/arch/m68k/include/asm/atomic.h +++ b/arch/m68k/include/asm/atomic.h @@ -211,7 +211,7 @@ static inline int atomic_add_negative(int i, atomic_t *v) return c != 0; } -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/metag/include/asm/atomic_lnkget.h b/arch/metag/include/asm/atomic_lnkget.h index 17e8c61c946d..4346e3831f6a 100644 --- a/arch/metag/include/asm/atomic_lnkget.h +++ b/arch/metag/include/asm/atomic_lnkget.h @@ -154,7 +154,7 @@ static inline int atomic_xchg(atomic_t *v, int new) return old; } -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int result, temp; diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h index 2ce8fa3a79c2..a0c8ff162fbb 100644 --- a/arch/metag/include/asm/atomic_lock1.h +++ b/arch/metag/include/asm/atomic_lock1.h @@ -122,7 +122,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new) #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int ret; unsigned long flags; diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index 0ab176bdb8e8..02fc1553cf9b 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -275,7 +275,7 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v) #define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -283,7 +283,7 @@ static __inline__ int atomic_sub_if_positive(int i, atomic_t * v) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. 
*/ -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h index 36389efd45e8..6c42d6a0a907 100644 --- a/arch/mn10300/include/asm/atomic.h +++ b/arch/mn10300/include/asm/atomic.h @@ -144,7 +144,7 @@ static inline void atomic_dec(atomic_t *v) #define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0) #define atomic_inc_and_test(v) (atomic_add_return(1, (v)) == 0) -#define __atomic_add_unless(v, a, u) \ +#define atomic_fetch_add_unless(v, a, u) \ ({ \ int c, old; \ c = atomic_read(v); \ diff --git a/arch/openrisc/include/asm/atomic.h b/arch/openrisc/include/asm/atomic.h index 146e1660f00e..b589fac39b92 100644 --- a/arch/openrisc/include/asm/atomic.h +++ b/arch/openrisc/include/asm/atomic.h @@ -100,7 +100,7 @@ ATOMIC_OP(xor) * * This is often used through atomic_inc_not_zero() */ -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int old, tmp; @@ -119,7 +119,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) return old; } -#define __atomic_add_unless __atomic_add_unless +#define atomic_fetch_add_unless atomic_fetch_add_unless #include diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 614bcc7673f5..3c48acb14035 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -78,7 +78,7 @@ static __inline__ int atomic_read(const atomic_t *v) #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -86,7 +86,7 @@ static __inline__ int atomic_read(const atomic_t *v) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index 682b3e6a1e21..1483261080a1 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -218,7 +218,7 @@ static __inline__ int atomic_dec_return_relaxed(atomic_t *v) #define atomic_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new)) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -226,13 +226,13 @@ static __inline__ int atomic_dec_return_relaxed(atomic_t *v) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. 
*/ -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int t; __asm__ __volatile__ ( PPC_ATOMIC_ENTRY_BARRIER -"1: lwarx %0,0,%1 # __atomic_add_unless\n\ +"1: lwarx %0,0,%1 # atomic_fetch_add_unless\n\ cmpw 0,%0,%3 \n\ beq 2f \n\ add %0,%2,%0 \n" @@ -538,7 +538,7 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u) __asm__ __volatile__ ( PPC_ATOMIC_ENTRY_BARRIER -"1: ldarx %0,0,%1 # __atomic_add_unless\n\ +"1: ldarx %0,0,%1 # atomic_fetch_add_unless\n\ cmpd 0,%0,%3 \n\ beq 2f \n\ add %0,%2,%0 \n" diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index 4b55532f15c4..c2858cdd8c29 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -90,7 +90,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new) return __atomic_cmpxchg(&v->counter, old, new); } -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h index 0fd0099f43cc..ef45931ebac5 100644 --- a/arch/sh/include/asm/atomic.h +++ b/arch/sh/include/asm/atomic.h @@ -46,7 +46,7 @@ #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n))) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -54,7 +54,7 @@ * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index 0c3b3b4a9963..a139319a3d1d 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -27,7 +27,7 @@ int atomic_fetch_or(int, atomic_t *); int atomic_fetch_xor(int, atomic_t *); int atomic_cmpxchg(atomic_t *, int, int); int atomic_xchg(atomic_t *, int); -int __atomic_add_unless(atomic_t *, int, int); +int atomic_fetch_add_unless(atomic_t *, int, int); void atomic_set(atomic_t *, int); #define atomic_set_release(v, i) atomic_set((v), (i)) diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index 28db058d471b..f416fd3d2708 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -89,7 +89,7 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c index 465a901a0ada..281fa634bb1a 100644 --- a/arch/sparc/lib/atomic32.c +++ b/arch/sparc/lib/atomic32.c @@ -95,7 +95,7 @@ int atomic_cmpxchg(atomic_t *v, int old, int new) } EXPORT_SYMBOL(atomic_cmpxchg); -int __atomic_add_unless(atomic_t *v, int a, int u) +int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int ret; unsigned long flags; @@ -107,7 +107,7 @@ int __atomic_add_unless(atomic_t *v, int a, int u) spin_unlock_irqrestore(ATOMIC_HASH(v), flags); return ret; } -EXPORT_SYMBOL(__atomic_add_unless); 
+EXPORT_SYMBOL(atomic_fetch_add_unless); /* Atomic operations are already serializing */ void atomic_set(atomic_t *v, int i) diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h index 53a423e7cb92..75759d13bf81 100644 --- a/arch/tile/include/asm/atomic_32.h +++ b/arch/tile/include/asm/atomic_32.h @@ -72,7 +72,7 @@ static inline int atomic_add_return(int i, atomic_t *v) } /** - * __atomic_add_unless - add unless the number is already a given value + * atomic_fetch_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -80,7 +80,7 @@ static inline int atomic_add_return(int i, atomic_t *v) * Atomically adds @a to @v, so long as @v was not already @u. * Returns the old value of @v. */ -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { smp_mb(); /* barrier for proper semantics */ return _atomic_xchg_add_unless(&v->counter, a, u); diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h index 4cefa0c9fd81..b72cfe4d29b0 100644 --- a/arch/tile/include/asm/atomic_64.h +++ b/arch/tile/include/asm/atomic_64.h @@ -97,7 +97,7 @@ static inline void atomic_xor(int i, atomic_t *v) } while (guess != oldval); } -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int guess, oldval = v->counter; do { diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 46ca345cb674..2ecac924f4d9 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -254,7 +254,7 @@ static inline int arch_atomic_fetch_xor(int i, atomic_t *v) } /** - * __arch_atomic_add_unless - add unless the number is already a given value + * arch_atomic_fetch_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -262,7 +262,7 @@ static inline int arch_atomic_fetch_xor(int i, atomic_t *v) * Atomically adds @a to @v, so long as @v was not already @u. * Returns the old value of @v. */ -static __always_inline int __arch_atomic_add_unless(atomic_t *v, int a, int u) +static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c = arch_atomic_read(v); diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index e7a23f2a519a..4188e56c06c9 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -275,7 +275,7 @@ ATOMIC_OPS(xor) #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) /** - * __atomic_add_unless - add unless the number is a given value + * atomic_fetch_add_unless - add unless the number is a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -283,7 +283,7 @@ ATOMIC_OPS(xor) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. 
*/ -static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) +static __inline__ int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 859d227504f7..45928e4da2b2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -60,7 +60,7 @@ static int atomic_inc_return_safe(atomic_t *v) { unsigned int counter; - counter = (unsigned int)__atomic_add_unless(v, 1, 0); + counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0); if (counter <= (unsigned int)INT_MAX) return (int)counter; diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 1984d6cee3e0..39bc14f68d70 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -121,7 +121,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj, bool exclusive) * this lock. */ if (!exclusive) - return __atomic_add_unless(&uobj->usecnt, 1, -1) == -1 ? + return atomic_fetch_add_unless(&uobj->usecnt, 1, -1) == -1 ? -EBUSY : 0; /* lock is either WRITE or DESTROY - should be exclusive */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 7dc9c78a1c31..805ce7f2318a 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -617,7 +617,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, trace_afs_notify_call(rxcall, call); call->need_attention = true; - u = __atomic_add_unless(&call->usage, 1, 0); + u = atomic_fetch_add_unless(&call->usage, 1, 0); if (u != 0) { trace_afs_call(call, afs_call_trace_wake, u + 1, atomic_read(&afs_outstanding_calls), diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h index 66f964a9a8ff..d6de08113679 100644 --- a/include/asm-generic/atomic-instrumented.h +++ b/include/asm-generic/atomic-instrumented.h @@ -84,10 +84,10 @@ static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 ne } #endif -static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) +static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { kasan_check_write(v, sizeof(*v)); - return __arch_atomic_add_unless(v, a, u); + return arch_atomic_fetch_add_unless(v, a, u); } diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 3f38eb03649c..d2115b9fa1b4 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -223,8 +223,8 @@ static inline void atomic_dec(atomic_t *v) #define atomic_xchg(ptr, v) (xchg(&(ptr)->counter, (v))) #define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) -#ifndef __atomic_add_unless -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +#ifndef atomic_fetch_add_unless +static inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 01ce3997cb42..9cc982936675 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -530,7 +530,7 @@ */ static inline int atomic_add_unless(atomic_t *v, int a, int u) { - return __atomic_add_unless(v, a, u) != u; + return atomic_fetch_add_unless(v, a, u) != u; } /** diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c5280fad1ba2..aa5ce6569e4b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -576,7 +576,7 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, { int refold; - refold = __atomic_add_unless(&map->refcnt, 1, 0); + refold = atomic_fetch_add_unless(&map->refcnt, 1, 
0); if (refold >= BPF_MAX_REFCNT) { __bpf_map_put(map, false); @@ -1159,7 +1159,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; - refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0); + refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0); if (refold >= BPF_MAX_REFCNT) { __bpf_prog_put(prog, false); diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 6354c5f24c7e..f822aaaa32a6 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -355,7 +355,7 @@ static struct tracing_map_elt *get_free_elt(struct tracing_map *map) struct tracing_map_elt *elt = NULL; int idx; - idx = __atomic_add_unless(&map->next_elt, 1, map->max_elts); + idx = atomic_fetch_add_unless(&map->next_elt, 1, map->max_elts); if (idx < map->max_elts) { elt = *(TRACING_MAP_ELT(map->elts, idx)); if (map->ops && map->ops->elt_init) diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 1c98a026b41a..159cea6d26b6 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -412,7 +412,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, bool rxrpc_queue_call(struct rxrpc_call *call) { const void *here = __builtin_return_address(0); - int n = __atomic_add_unless(&call->usage, 1, 0); + int n = atomic_fetch_add_unless(&call->usage, 1, 0); if (n == 0) return false; if (rxrpc_queue_work(&call->processor)) diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 0e5087b9e07c..26b9f1614728 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -250,7 +250,7 @@ void rxrpc_kill_connection(struct rxrpc_connection *conn) bool rxrpc_queue_conn(struct rxrpc_connection *conn) { const void *here = __builtin_return_address(0); - int n = __atomic_add_unless(&conn->usage, 1, 0); + int n = atomic_fetch_add_unless(&conn->usage, 1, 0); if (n == 0) return false; if (rxrpc_queue_work(&conn->processor)) @@ -293,7 +293,7 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn) const void *here = __builtin_return_address(0); if (conn) { - int n = __atomic_add_unless(&conn->usage, 1, 0); + int n = atomic_fetch_add_unless(&conn->usage, 1, 0); if (n > 0) trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here); else From a5afbb696a71a3ec351eccb95e13ec9357fdaa2a Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 19 Jun 2018 00:04:24 +0100 Subject: [PATCH 0707/1640] UPSTREAM: bpf: fix attach type BPF_LIRC_MODE2 dependency wrt CONFIG_CGROUP_BPF If the kernel is compiled with CONFIG_CGROUP_BPF not enabled, it is not possible to attach, detach or query IR BPF programs to /dev/lircN devices, making them impossible to use. For embedded devices, it should be possible to use IR decoding without cgroups or CONFIG_CGROUP_BPF enabled. This change requires some refactoring, since bpf_prog_{attach,detach,query} functions are now always compiled, but their code paths for cgroups need moving out. Rather than a #ifdef CONFIG_CGROUP_BPF in kernel/bpf/syscall.c, moving them to kernel/bpf/cgroup.c and kernel/bpf/sockmap.c does not require #ifdefs since that is already conditionally compiled. 
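For context, a rough userspace sketch of the attach path this enables is below (the helper name lirc_mode2_attach() is hypothetical; target_fd is the lirc chardev, not a cgroup), which is why the BPF_PROG_ATTACH syscall command has to exist even when CONFIG_CGROUP_BPF is disabled:

----
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Hypothetical helper, for illustration only. */
static int lirc_mode2_attach(int lirc_fd, int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = lirc_fd;	/* fd of /dev/lircN, not a cgroup */
	attr.attach_bpf_fd = prog_fd;	/* a BPF_PROG_TYPE_LIRC_MODE2 prog */
	attr.attach_type = BPF_LIRC_MODE2;

	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}
----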
Fixes: f4364dcfc86d ("media: rc: introduce BPF_PROG_LIRC_MODE2") Signed-off-by: Sean Young Signed-off-by: Daniel Borkmann --- drivers/media/rc/bpf-lirc.c | 14 +----- include/linux/bpf-cgroup.h | 26 ++++++++++ include/linux/bpf.h | 8 +++ include/linux/bpf_lirc.h | 5 +- kernel/bpf/cgroup.c | 54 ++++++++++++++++++++ kernel/bpf/sockmap.c | 18 +++++++ kernel/bpf/syscall.c | 99 ++++++++----------------------------- 7 files changed, 132 insertions(+), 92 deletions(-) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 0f88b4f779c8..7f4aed108b41 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -208,29 +208,19 @@ void lirc_bpf_free(struct rc_dev *rcdev) bpf_prog_array_free(rcdev->raw->progs); } -int lirc_prog_attach(const union bpf_attr *attr) +int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { - struct bpf_prog *prog; struct rc_dev *rcdev; int ret; if (attr->attach_flags) return -EINVAL; - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_LIRC_MODE2); - if (IS_ERR(prog)) - return PTR_ERR(prog); - rcdev = rc_dev_get_from_fd(attr->target_fd); - if (IS_ERR(rcdev)) { - bpf_prog_put(prog); + if (IS_ERR(rcdev)) return PTR_ERR(rcdev); - } ret = lirc_bpf_attach(rcdev, prog); - if (ret) - bpf_prog_put(prog); put_device(&rcdev->dev); diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 975fb4cf1bb7..79795c5fa7c3 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -188,12 +188,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, \ __ret; \ }) +int cgroup_bpf_prog_attach(const union bpf_attr *attr, + enum bpf_prog_type ptype, struct bpf_prog *prog); +int cgroup_bpf_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype); +int cgroup_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); #else +struct bpf_prog; struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, + enum bpf_prog_type ptype, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) +{ + return -EINVAL; +} + +static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} + #define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5108fd83f56a..c971a85355ca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -712,6 +712,8 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); +int sockmap_get_from_fd(const union bpf_attr *attr, int type, + struct bpf_prog *prog); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { @@ -730,6 +732,12 @@ static inline int sock_map_prog(struct bpf_map *map, { return -EOPNOTSUPP; } + +static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type, + struct bpf_prog *prog) +{ + return -EINVAL; +} #endif #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/linux/bpf_lirc.h b/include/linux/bpf_lirc.h 
index 5f8a4283092d..9d9ff755ec29 100644 --- a/include/linux/bpf_lirc.h +++ b/include/linux/bpf_lirc.h @@ -5,11 +5,12 @@ #include #ifdef CONFIG_BPF_LIRC_MODE2 -int lirc_prog_attach(const union bpf_attr *attr); +int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int lirc_prog_detach(const union bpf_attr *attr); int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); #else -static inline int lirc_prog_attach(const union bpf_attr *attr) +static inline int lirc_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) { return -EINVAL; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cfb36a755854..badabb0b435c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -427,6 +427,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return ret; } +int cgroup_bpf_prog_attach(const union bpf_attr *attr, + enum bpf_prog_type ptype, struct bpf_prog *prog) +{ + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); + cgroup_put(cgrp); + return ret; +} + +int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + prog = NULL; + + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + if (prog) + bpf_prog_put(prog); + + cgroup_put(cgrp); + return ret; +} + +int cgroup_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + ret = cgroup_bpf_query(cgrp, attr, uattr); + + cgroup_put(cgrp); + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index ed0603f86df2..ec3fbe7f8682 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1918,6 +1918,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return 0; } +int sockmap_get_from_fd(const union bpf_attr *attr, int type, + struct bpf_prog *prog) +{ + int ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int err; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = sock_map_prog(map, prog, attr->attach_type); + fdput(f); + return err; +} + static void *sock_map_lookup(struct bpf_map *map, void *key) { return NULL; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index aa5ce6569e4b..24a141e25361 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1506,8 +1506,6 @@ out_free_tp: return err; } -#ifdef CONFIG_CGROUP_BPF - static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { @@ -1522,40 +1520,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, - int type, bool attach) -{ - struct bpf_prog *prog = NULL; - int ufd = attr->target_fd; - struct bpf_map *map; - struct fd f; - int err; - - f = fdget(ufd); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return PTR_ERR(map); - - if 
(attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, type); - if (IS_ERR(prog)) { - fdput(f); - return PTR_ERR(prog); - } - } - - err = sock_map_prog(map, prog, attr->attach_type); - if (err) { - fdput(f); - if (prog) - bpf_prog_put(prog); - return err; - } - - fdput(f); - return 0; -} - #define BPF_F_ATTACH_MASK \ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) @@ -1563,7 +1527,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; - struct cgroup *cgrp; int ret; if (!capable(CAP_NET_ADMIN)) @@ -1600,12 +1563,15 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); + ptype = BPF_PROG_TYPE_SK_MSG; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); + ptype = BPF_PROG_TYPE_SK_SKB; + break; case BPF_LIRC_MODE2: - return lirc_prog_attach(attr); + ptype = BPF_PROG_TYPE_LIRC_MODE2; + break; default: return -EINVAL; } @@ -1619,18 +1585,20 @@ static int bpf_prog_attach(const union bpf_attr *attr) return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) { - bpf_prog_put(prog); - return PTR_ERR(cgrp); + switch (ptype) { + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + ret = sockmap_get_from_fd(attr, ptype, prog); + break; + case BPF_PROG_TYPE_LIRC_MODE2: + ret = lirc_prog_attach(attr, prog); + break; + default: + ret = cgroup_bpf_prog_attach(attr, ptype, prog); } - ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, - attr->attach_flags); if (ret) bpf_prog_put(prog); - cgroup_put(cgrp); - return ret; } @@ -1639,9 +1607,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - struct bpf_prog *prog; - struct cgroup *cgrp; - int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1674,29 +1639,17 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); default: return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - - prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); - if (IS_ERR(prog)) - prog = NULL; - - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); - if (prog) - bpf_prog_put(prog); - cgroup_put(cgrp); - return ret; + return cgroup_bpf_prog_detach(attr, ptype); } #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt @@ -1704,9 +1657,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - struct cgroup *cgrp; - int ret; - if (!capable(CAP_NET_ADMIN)) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) @@ -1734,14 +1684,9 @@ static int bpf_prog_query(const union bpf_attr *attr, default: return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->query.target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - ret = cgroup_bpf_query(cgrp, attr, uattr); - cgroup_put(cgrp); - return ret; + + return cgroup_bpf_prog_query(attr, 
uattr); } -#endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -2391,7 +2336,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; -#ifdef CONFIG_CGROUP_BPF case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); break; @@ -2401,7 +2345,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_QUERY: err = bpf_prog_query(&attr, uattr); break; -#endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); break; From 6216031b474a11d9545045f80363255a4f3a5a55 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 Jun 2018 16:21:18 -0700 Subject: [PATCH 0708/1640] BACKPORT: bpf: Change bpf_fib_lookup to return lookup status For ACLs implemented using either FIB rules or FIB entries, the BPF program needs the FIB lookup status to be able to drop the packet. Since the bpf_fib_lookup API has not reached a released kernel yet, change the return code to contain an encoding of the FIB lookup result and return the nexthop device index in the params struct. In addition, inform the BPF program of any post FIB lookup reason as to why the packet needs to go up the stack. The fib result for unicast routes must have an egress device, so remove the check that it is non-NULL. Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 28 ++++++++++--- net/core/filter.c | 86 +++++++++++++++++++++++++--------------- 2 files changed, 77 insertions(+), 37 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8fb5a01c1776..c5845fb9f2f3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1857,7 +1857,8 @@ union bpf_attr { * is resolved), the nexthop address is returned in ipv4_dst * or ipv6_dst based on family, smac is set to mac address of * egress device, dmac is set to nexthop mac address, rt_metric - * is set to metric from route (IPv4/IPv6 only). + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. * * *plen* argument is the size of the passed in struct. * *flags* argument can be a combination of one or more of the @@ -1873,9 +1874,10 @@ union bpf_attr { * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. * Return - * Egress device index on success, 0 if packet needs to continue - * up the stack for further processing or a negative error in case - * of failure. 
+ * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * * packet is not forwarded or needs assist from full stack * * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) * Description @@ -2658,6 +2660,18 @@ struct bpf_raw_tracepoint_args { #define BPF_FIB_LOOKUP_DIRECT BIT(0) #define BPF_FIB_LOOKUP_OUTPUT BIT(1) +enum { + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ + BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ +}; + struct bpf_fib_lookup { /* input: network family for lookup (AF_INET, AF_INET6) * output: network family of egress nexthop @@ -2671,7 +2685,11 @@ struct bpf_fib_lookup { /* total length of packet from network header - used for MTU check */ __u16 tot_len; - __u32 ifindex; /* L3 device index for lookup */ + + /* input: L3 device index for lookup + * output: device index from FIB lookup + */ + __u32 ifindex; union { /* inputs to lookup */ diff --git a/net/core/filter.c b/net/core/filter.c index de41b5ef0ccb..dba6f03ad968 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4080,8 +4080,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; + params->ifindex = dev->ifindex; - return dev->ifindex; + return 0; } #endif @@ -4105,7 +4106,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, /* verify forwarding is enabled on this interface */ in_dev = __in_dev_get_rcu(dev); if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) - return 0; + return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl4.flowi4_iif = 1; @@ -4130,7 +4131,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, tb = fib_get_table(net, tbid); if (unlikely(!tb)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { @@ -4142,8 +4143,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); } - if (err || res.type != RTN_UNICAST) - return 0; + if (err) { + /* map fib lookup errors to RTN_ type */ + if (err == -EINVAL) + return BPF_FIB_LKUP_RET_BLACKHOLE; + if (err == -EHOSTUNREACH) + return BPF_FIB_LKUP_RET_UNREACHABLE; + if (err == -EACCES) + return BPF_FIB_LKUP_RET_PROHIBIT; + + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + + if (res.type != RTN_UNICAST) + return BPF_FIB_LKUP_RET_NOT_FWDED; if (res.fi->fib_nhs > 1) fib_select_path(net, &res, &fl4, NULL); @@ -4151,19 +4164,16 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); if (params->tot_len > mtu) - return 0; + return BPF_FIB_LKUP_RET_FRAG_NEEDED; } nh = &res.fi->fib_nh[res.nh_sel]; /* do not handle lwt encaps right now */ if (nh->nh_lwtstate) - return 0; + return 
BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nh->nh_dev; - if (unlikely(!dev)) - return 0; - if (nh->nh_gw) params->ipv4_dst = nh->nh_gw; @@ -4173,10 +4183,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, * rcu_read_lock_bh is not needed here */ neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); - if (neigh) - return bpf_fib_set_fwd_params(params, neigh, dev); + if (!neigh) + return BPF_FIB_LKUP_RET_NO_NEIGH; - return 0; + return bpf_fib_set_fwd_params(params, neigh, dev); } #endif @@ -4197,7 +4207,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -4205,7 +4215,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, idev = __in6_dev_get_safely(dev); if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) - return 0; + return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl6.flowi6_iif = 1; @@ -4232,7 +4242,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); } else { @@ -4245,11 +4255,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, } if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; - if (unlikely(f6i->fib6_flags & RTF_REJECT || - f6i->fib6_type != RTN_UNICAST)) - return 0; + if (unlikely(f6i->fib6_flags & RTF_REJECT)) { + switch (f6i->fib6_type) { + case RTN_BLACKHOLE: + return BPF_FIB_LKUP_RET_BLACKHOLE; + case RTN_UNREACHABLE: + return BPF_FIB_LKUP_RET_UNREACHABLE; + case RTN_PROHIBIT: + return BPF_FIB_LKUP_RET_PROHIBIT; + default: + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + } + + if (f6i->fib6_type != RTN_UNICAST) + return BPF_FIB_LKUP_RET_NOT_FWDED; if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, @@ -4259,11 +4281,11 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); if (params->tot_len > mtu) - return 0; + return BPF_FIB_LKUP_RET_FRAG_NEEDED; } if (f6i->fib6_nh.nh_lwtstate) - return 0; + return BPF_FIB_LKUP_RET_UNSUPP_LWT; if (f6i->fib6_flags & RTF_GATEWAY) *dst = f6i->fib6_nh.nh_gw; @@ -4277,10 +4299,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, */ neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, ndisc_hashfn, dst, dev); - if (neigh) - return bpf_fib_set_fwd_params(params, neigh, dev); + if (!neigh) + return BPF_FIB_LKUP_RET_NO_NEIGH; - return 0; + return bpf_fib_set_fwd_params(params, neigh, dev); } #endif @@ -4322,7 +4344,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { struct net *net = dev_net(skb->dev); - int index = -EAFNOSUPPORT; + int rc = -EAFNOSUPPORT; if (plen < sizeof(*params)) return -EINVAL; @@ -4333,25 +4355,25 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - index = bpf_ipv4_fib_lookup(net, params, flags, false); + rc = bpf_ipv4_fib_lookup(net, params, flags, false); 
break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - index = bpf_ipv6_fib_lookup(net, params, flags, false); + rc = bpf_ipv6_fib_lookup(net, params, flags, false); break; #endif } - if (index > 0) { + if (!rc) { struct net_device *dev; - dev = dev_get_by_index_rcu(net, index); + dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) - index = 0; + rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; } - return index; + return rc; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { From 3ae01340d394923995da650311b450efdc58913d Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 25 Jan 2018 13:20:11 -0800 Subject: [PATCH 0709/1640] BACKPORT: openvswitch: add erspan version I and II support The patch adds support for openvswitch to configure erspan v1 and v2. The OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS attr is added to uapi as a binary blob to support all ERSPAN v1 and v2 fields. Note that the previous commit "openvswitch: Add erspan tunnel support." was reverted since it was not properly designed. Signed-off-by: William Tu Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/flow_netlink.c | 52 +++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 36d0b161e066..9cb40ab76aa8 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -360,6 +360,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ OVS_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ OVS_TUNNEL_KEY_ATTR_PAD, + OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* struct erspan_metadata */ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index a4a4260615d6..4bbc8db9f9f6 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "flow_netlink.h" @@ -315,7 +316,8 @@ size_t ovs_tun_key_attr_size(void) + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ - /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS is mutually exclusive with + /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS and + * OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS is mutually exclusive with * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. */ + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ @@ -371,6 +373,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] .next = ovs_vxlan_ext_key_lens }, [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = OVS_ATTR_VARIABLE }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.
*/ @@ -593,6 +596,33 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, return 0; } +static int erspan_tun_opt_from_nlattr(const struct nlattr *a, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + + BUILD_BUG_ON(sizeof(struct erspan_metadata) > + sizeof(match->key->tun_opts)); + + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR(log, "ERSPAN option length err (len %d, max %zu).", + nla_len(a), sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (!is_mask) + SW_FLOW_KEY_PUT(match, tun_opts_len, + sizeof(struct erspan_metadata), false); + else + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + + opt_key_offset = TUN_METADATA_OFFSET(nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a), + nla_len(a), is_mask); + return 0; +} + static int ip_tun_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) @@ -700,6 +730,20 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_PAD: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = erspan_tun_opt_from_nlattr(a, match, is_mask, + log); + if (err) + return err; + + tun_flags |= TUNNEL_ERSPAN_OPT; + opts_type = type; + break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); @@ -824,6 +868,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, else if (output->tun_flags & TUNNEL_VXLAN_OPT && vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; + else if (output->tun_flags & TUNNEL_ERSPAN_OPT && + nla_put(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + swkey_tun_opts_len, tun_opts)) + return -EMSGSIZE; } return 0; @@ -2192,6 +2240,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + break; } }; From 9ca1ff270530a0816e69daf722b6fe0cb669c436 Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 5 Dec 2017 15:15:44 -0800 Subject: [PATCH 0710/1640] BACKPORT: ip6_gre: add ip6 erspan collect_md mode Similar to ip6 gretap and ip4 gretap, the patch allows erspan tunnel to operate in collect metadata mode. bpf_skb_[gs]et_tunnel_key() helpers can make use of it right away. Signed-off-by: William Tu Signed-off-by: David S. 
Miller --- net/ipv6/ip6_gre.c | 110 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 25 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4e99bd77e0be..704e653077ca 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -529,8 +529,37 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len, false, false) < 0) return PACKET_REJECT; - tunnel->parms.index = ntohl(index); - ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + if (tunnel->parms.collect_md) { + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + struct erspan_metadata *md; + __be64 tun_id; + __be16 flags; + + tpi->flags |= TUNNEL_KEY; + flags = tpi->flags; + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, + sizeof(*md)); + if (!tun_dst) + return PACKET_REJECT; + + info = &tun_dst->u.tun_info; + md = ip_tunnel_info_opts(info); + if (!md) + return PACKET_REJECT; + + md->index = index; + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + info->options_len = sizeof(*md); + + ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + + } else { + tunnel->parms.index = ntohl(index); + ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); + } return PACKET_RCVD; } @@ -868,42 +897,73 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, if (gre_handle_offloads(skb, false)) goto tx_err; - switch (skb->protocol) { - case htons(ETH_P_IP): - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, - &dsfield, &encap_limit); - break; - case htons(ETH_P_IPV6): - if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) - goto tx_err; - if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, - &dsfield, &encap_limit)) - goto tx_err; - break; - default: - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - break; - } - if (skb->len > dev->mtu + dev->hard_header_len) { pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } - erspan_build_header(skb, t->parms.o_key, t->parms.index, - truncate, false); t->parms.o_flags &= ~TUNNEL_KEY; - IPCB(skb)->flags = 0; - fl6.daddr = t->parms.raddr; + + /* For collect_md mode, derive fl6 from the tunnel key, + * for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}. + */ + if (t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct erspan_metadata *md; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || + !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -EINVAL; + + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_GRE; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); + + dsfield = key->tos; + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto tx_err; + + erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), + ntohl(md->index), truncate, false); + + } else { + switch (skb->protocol) { + case htons(ETH_P_IP): + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, + &dsfield, &encap_limit); + break; + case htons(ETH_P_IPV6): + if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + goto tx_err; + if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, + &dsfield, &encap_limit)) + goto tx_err; + break; + default: + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + break; + } + + erspan_build_header(skb, t->parms.o_key, t->parms.index, + truncate, false); + fl6.daddr = t->parms.raddr; + } /* Push GRE header. 
*/ gre_build_header(skb, 8, TUNNEL_SEQ, htons(ETH_P_ERSPAN), 0, htonl(t->o_seqno++)); /* TooBig packet may have updated dst->dev's mtu */ - if (dst && dst_mtu(dst) > dst->dev->mtu) + if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, From c622a225ee63c7fe5a3bb9b97ad8592f1935bf03 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Tue, 26 Jun 2018 21:39:36 -0700 Subject: [PATCH 0711/1640] BACKPORT: net: check tunnel option type in tunnel flags Check the tunnel option type stored in tunnel flags when creating options for tunnels, thereby ensuring we do not set geneve, vxlan or erspan tunnel options on interfaces that are not associated with them. Make sure all users of the infrastructure set the correct flags; for the BPF helper we have to set all bits to keep backward compatibility. Signed-off-by: Pieter Jansen van Vuuren Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/geneve.c | 6 ++++-- drivers/net/vxlan.c | 3 ++- include/net/ip_tunnels.h | 8 ++++++-- net/core/filter.c | 2 +- net/ipv4/ip_gre.c | 2 ++ net/ipv6/ip6_gre.c | 2 ++ net/openvswitch/flow_netlink.c | 7 ++++++- 7 files changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index a152b2902788..29badd77c8b9 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -234,7 +234,8 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, } /* Update tunnel dst according to Geneve options. */ ip_tunnel_info_opts_set(&tun_dst->u.tun_info, - gnvh->options, gnvh->opt_len * 4); + gnvh->options, gnvh->opt_len * 4, + TUNNEL_GENEVE_OPT); } else { /* Drop packets w/ critical options, * since we don't support any...
@@ -691,7 +692,8 @@ static void geneve_build_header(struct genevehdr *geneveh, geneveh->proto_type = htons(ETH_P_TEB); geneveh->rsvd2 = 0; - ip_tunnel_info_opts_get(geneveh->options, info); + if (info->key.tun_flags & TUNNEL_GENEVE_OPT) + ip_tunnel_info_opts_get(geneveh->options, info); } static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3aa49417cdfd..768394d96538 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2181,7 +2181,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vni = tunnel_id_to_key32(info->key.tun_id); ifindex = 0; dst_cache = &info->dst_cache; - if (info->options_len) { + if (info->options_len && + info->key.tun_flags & TUNNEL_VXLAN_OPT) { if (info->options_len < sizeof(*md)) goto drop; md = ip_tunnel_info_opts(info); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 7844e393f905..caa337a39929 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -486,10 +486,12 @@ static inline void ip_tunnel_info_opts_get(void *to, } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, - const void *from, int len) + const void *from, int len, + __be16 flags) { memcpy(ip_tunnel_info_opts(info), from, len); info->options_len = len; + info->key.tun_flags |= flags; } static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) @@ -531,9 +533,11 @@ static inline void ip_tunnel_info_opts_get(void *to, } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, - const void *from, int len) + const void *from, int len, + __be16 flags) { info->options_len = 0; + info->key.tun_flags |= flags; } #endif /* CONFIG_INET */ diff --git a/net/core/filter.c b/net/core/filter.c index dba6f03ad968..172aae9e5b2a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3584,7 +3584,7 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; - ip_tunnel_info_opts_set(info, from, size); + ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); return 0; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f905a59b7368..5ffd206a1a22 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -578,6 +578,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; key = &tun_info->key; + if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + goto err_free_rt; /* ERSPAN has fixed 8 byte GRE header */ tunnel_hlen = 8 + sizeof(struct erspanhdr); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 704e653077ca..32e1091a77fc 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -927,6 +927,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = key->tos; + if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + goto tx_err; md = ip_tunnel_info_opts(tun_info); if (!md) goto tx_err; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 4bbc8db9f9f6..01281514aec3 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2225,7 +2225,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct ovs_tunnel_info *ovs_tun; struct nlattr *a; int err = 0, start, opts_type; + __be16 dst_opt_type; + dst_opt_type = 0; ovs_match_init(&match, &key, true, NULL); opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) @@ -2237,10 
+2239,13 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, err = validate_geneve_opts(&key); if (err < 0) return err; + dst_opt_type = TUNNEL_GENEVE_OPT; break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + dst_opt_type = TUNNEL_VXLAN_OPT; break; case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + dst_opt_type = TUNNEL_ERSPAN_OPT; break; } }; @@ -2283,7 +2288,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, */ ip_tunnel_info_opts_set(tun_info, TUN_METADATA_OPTS(&key, key.tun_opts_len), - key.tun_opts_len); + key.tun_opts_len, dst_opt_type); add_nested_action_end(*sfa, start); return err; From 9fd51ffd11cde263afe1cae0fd1a1c52b0be3448 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 28 Jun 2018 23:34:59 +0200 Subject: [PATCH 0712/1640] BACKPORT: bpf: undo prog rejection on read-only lock failure Partially undo commit 9facc336876f ("bpf: reject any prog that failed read-only lock") since it caused a regression, that is, syzkaller managed to cause a panic via fault injection deep in the set_memory_ro() path by letting an allocation fail: In x86's __change_page_attr_set_clr() it was able to change the attributes of the primary mapping but not in the alias mapping via cpa_process_alias(), so the second, inner call to the __change_page_attr() via __change_page_attr_set_clr() had to split a larger page and failed in the alloc_pages() with the artificially triggered allocation error which is then propagated down to the call site. Thus, for set_memory_ro() this means that it returned with an error, but from debugging a probe_kernel_write() revealed EFAULT on that memory since the primary mapping did get changed. Therefore the subsequent hdr->locked = 0 reset triggered the panic as it was performed on read-only memory, so the call-site assumption that it would either succeed /or/ not succeed at all was in fact wrong, since there's no such rollback in set_memory_*() calls from a partial change of mappings; in other words, we're left in a state that is "half done". A later undo via set_memory_rw() does succeed, though, due to matching permissions on that part (aka due to the try_preserve_large_page() succeeding). While reproducing locally with explicitly triggering this error, the initial splitting only happens on rare occasions, and in the real world it would additionally need oom conditions, but that said, it could partially fail. Therefore, it is definitely wrong to bail out on set_memory_ro() error and reject the program with the set_memory_*() semantics we have today. We shouldn't have gone the extra mile, since no other user in the tree today in fact checks for any set_memory_*() errors, e.g. neither module_enable_ro() / module_disable_ro() for module RO/NX handling, which is mostly default these days, nor the kprobes core with alloc_insn_page() / free_insn_page(), as examples that could be invoked long after bootup; and the original 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks") did not check either when it first got introduced to BPF, so "improving" it by bailing out was clearly not right when set_memory_*() cannot handle it today. Kees suggested that if set_memory_*() can fail, we should annotate it with __must_check, and all callers need to deal with it gracefully given those set_memory_*() markings aren't "advisory", but they're expected to actually do what they say.
This might be an option worth moving forward with in the future, but it would at the same time require that set_memory_*() calls from supporting archs are guaranteed to be "atomic" in that they provide rollback if part of the range fails. Once that is the case, the transition from RW -> RO could be made more robust that way, while the subsequent RO -> RW transition /must/ continue guaranteeing that the undo part always succeeds. Reported-by: syzbot+a4eb8c7766952a1ca872@syzkaller.appspotmail.com Reported-by: syzbot+d866d1925855328eac3b@syzkaller.appspotmail.com Fixes: 9facc336876f ("bpf: reject any prog that failed read-only lock") Cc: Laura Abbott Cc: Kees Cook Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 55 ++++++------------------------------------ kernel/bpf/core.c | 28 --------------------- 2 files changed, 8 insertions(+), 75 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 5b187afa8261..c3df91e4e068 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -485,8 +485,7 @@ struct bpf_binary_header { #ifdef CONFIG_CFI_CLANG u32 magic; #endif - u16 pages; - u16 locked:1; + u32 pages; u8 image[]; }; @@ -494,7 +493,7 @@ struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ jit_requested:1,/* archs need to JIT the prog */ - locked:1, /* Program image locked? */ + undo_set_mem:1, /* Passed set_memory_ro() checkpoint */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ @@ -742,46 +741,24 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - fp->locked = 1; - if (set_memory_ro((unsigned long)fp, fp->pages)) - fp->locked = 0; -#endif + fp->undo_set_mem = 1; + set_memory_ro((unsigned long)fp, fp->pages); } static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) { -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - if (fp->locked) { - WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages)); - /* In case set_memory_rw() fails, we want to be the first - * to crash here instead of some random place later on. - */ - fp->locked = 0; - } -#endif + if (fp->undo_set_mem) + set_memory_rw((unsigned long)fp, fp->pages); } static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - hdr->locked = 1; - if (set_memory_ro((unsigned long)hdr, hdr->pages)) - hdr->locked = 0; -#endif + set_memory_ro((unsigned long)hdr, hdr->pages); } static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) { -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - if (hdr->locked) { - WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); - /* In case set_memory_rw() fails, we want to be the first - * to crash here instead of some random place later on.
- */ - hdr->locked = 0; - } -#endif + set_memory_rw((unsigned long)hdr, hdr->pages); } static inline struct bpf_binary_header * @@ -793,22 +770,6 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp) return (void *)addr; } -#ifdef CONFIG_ARCH_HAS_SET_MEMORY -static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp) -{ - if (!fp->locked) - return -ENOLCK; - if (fp->jited) { - const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - - if (!hdr->locked) - return -ENOLCK; - } - - return 0; -} -#endif - int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f9ccff971a73..af398a32c4aa 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -665,8 +665,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_jit_set_header_magic(hdr); hdr->pages = pages; - hdr->locked = 0; - hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -1540,22 +1538,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return 0; } -static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp) -{ -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - int i, err; - - for (i = 0; i < fp->aux->func_cnt; i++) { - err = bpf_prog_check_pages_ro_single(fp->aux->func[i]); - if (err) - return err; - } - - return bpf_prog_check_pages_ro_single(fp); -#endif - return 0; -} - static void bpf_prog_select_func(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON @@ -1614,17 +1596,7 @@ finalize: * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); - if (*err) - return fp; - /* Checkpoint: at this point onwards any cBPF -> eBPF or - * native eBPF program is read-only. If we failed to change - * the page attributes (e.g. allocation failure from - * splitting large pages), then reject the whole program - * in order to guarantee not ending up with any W+X pages - * from BPF side in kernel. - */ - *err = bpf_prog_check_pages_ro_locked(fp); return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); From b4c1c71834ca33320c5d09d5e5351a0c2867919c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 30 Jun 2018 06:17:36 -0700 Subject: [PATCH 0713/1640] UPSTREAM: bpf: sockmap, fix crash when ipv6 sock is added This fixes a crash where we assign tcp_prot to IPv6 sockets instead of tcpv6_prot. Previously we overwrote the sk->prot field with tcp_prot even in the AF_INET6 case. This patch ensures the correct tcp_prot and tcpv6_prot are used. Tested with 'netserver -6' and 'netperf -H [IPv6]' as well as 'netperf -H [IPv4]'. The ESTABLISHED check resolves the previously crashing case here. 
Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Reported-by: syzbot+5c063698bdbfac19f363@syzkaller.appspotmail.com Acked-by: Martin KaFai Lau Signed-off-by: John Fastabend Signed-off-by: Wei Wang Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 58 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index ec3fbe7f8682..04a1e177eda8 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -140,6 +140,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +static void bpf_tcp_close(struct sock *sk, long timeout); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { @@ -161,7 +162,42 @@ out: return !empty; } -static struct proto tcp_bpf_proto; +enum { + SOCKMAP_IPV4, + SOCKMAP_IPV6, + SOCKMAP_NUM_PROTS, +}; + +enum { + SOCKMAP_BASE, + SOCKMAP_TX, + SOCKMAP_NUM_CONFIGS, +}; + +static struct proto *saved_tcpv6_prot __read_mostly; +static DEFINE_SPINLOCK(tcpv6_prot_lock); +static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; +static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], + struct proto *base) +{ + prot[SOCKMAP_BASE] = *base; + prot[SOCKMAP_BASE].close = bpf_tcp_close; + prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; + prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; + + prot[SOCKMAP_TX] = prot[SOCKMAP_BASE]; + prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg; + prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage; +} + +static void update_sk_prot(struct sock *sk, struct smap_psock *psock) +{ + int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4; + int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE; + + sk->sk_prot = &bpf_tcp_prots[family][conf]; +} + static int bpf_tcp_init(struct sock *sk) { struct smap_psock *psock; @@ -181,14 +217,17 @@ static int bpf_tcp_init(struct sock *sk) psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; - if (psock->bpf_tx_msg) { - tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; - tcp_bpf_proto.sendpage = bpf_tcp_sendpage; - tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; - tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; + /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ + if (sk->sk_family == AF_INET6 && + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { + spin_lock_bh(&tcpv6_prot_lock); + if (likely(sk->sk_prot != saved_tcpv6_prot)) { + build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot); + smp_store_release(&saved_tcpv6_prot, sk->sk_prot); + } + spin_unlock_bh(&tcpv6_prot_lock); } - - sk->sk_prot = &tcp_bpf_proto; + update_sk_prot(sk, psock); rcu_read_unlock(); return 0; } @@ -1111,8 +1150,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock, static int bpf_tcp_ulp_register(void) { - tcp_bpf_proto = tcp_prot; - tcp_bpf_proto.close = bpf_tcp_close; + build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); /* Once BPF TX ULP is registered it is never unregistered. It * will be in the ULP list for the lifetime of the system. Doing * duplicate registers is not a problem. 
From 1c07cb24393d2962fb83ade05717f028515ce918 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 30 Jun 2018 06:17:41 -0700 Subject: [PATCH 0714/1640] UPSTREAM: bpf: sockmap, fix smap_list_map_remove when psock is in many maps If a hashmap is free'd with open socks, it removes the reference to the hash entry from the psock. If that is the last reference to the psock then it will also be free'd by the reference counting logic. However, the current logic that removes the hash reference from the list of references is broken. In smap_list_remove() we first check if the sockmap entry matches and then check if the hashmap entry matches. But the sockmap entry will still always match because it is NULL in this case, which causes the first entry to be removed from the list. If this is always the "right" entry (because the user adds/removes entries in order) then everything is OK, but otherwise a subsequent bpf_tcp_close() may reference a free'd object. To fix this, create two list handlers: one for sockmap and one for sockhash. Reported-by: syzbot+0ce137753c78f7b6acc1@syzkaller.appspotmail.com Fixes: 81110384441a ("bpf: sockmap, add hash map support") Acked-by: Martin KaFai Lau Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 04a1e177eda8..d15af42f796e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1605,17 +1605,27 @@ free_stab: return ERR_PTR(err); } -static void smap_list_remove(struct smap_psock *psock, - struct sock **entry, - struct htab_elem *hash_link) +static void smap_list_map_remove(struct smap_psock *psock, + struct sock **entry) { struct smap_psock_map_entry *e, *tmp; list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry || e->hash_link == hash_link) { + if (e->entry == entry) + list_del(&e->list); + } +} + +static void smap_list_hash_remove(struct smap_psock *psock, + struct htab_elem *hash_link) +{ + struct smap_psock_map_entry *e, *tmp; + + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + struct htab_elem *c = e->hash_link; + + if (c == hash_link) list_del(&e->list); - break; - } } } @@ -1650,7 +1660,7 @@ static void sock_map_free(struct bpf_map *map) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, &stab->sock_map[i], NULL); + smap_list_map_remove(psock, &stab->sock_map[i]); smap_release_sock(psock, sock); } write_unlock_bh(&sock->sk_callback_lock); @@ -1709,7 +1719,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (psock->bpf_parse) smap_stop_sock(psock, sock); - smap_list_remove(psock, &stab->sock_map[k], NULL); + smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); out: write_unlock_bh(&sock->sk_callback_lock); @@ -1911,7 +1921,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - smap_list_remove(opsock, &stab->sock_map[i], NULL); + smap_list_map_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); } @@ -2145,7 +2155,7 @@ static void sock_hash_free(struct bpf_map *map) * (psock) to be null and queued for garbage collection.
*/ if (likely(psock)) { - smap_list_remove(psock, NULL, l); + smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } write_unlock_bh(&sock->sk_callback_lock); @@ -2325,7 +2335,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, psock = smap_psock_sk(l_old->sk); hlist_del_rcu(&l_old->hash_node); - smap_list_remove(psock, NULL, l_old); + smap_list_hash_remove(psock, l_old); smap_release_sock(psock, l_old->sk); free_htab_elem(htab, l_old); } @@ -2393,7 +2403,7 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, NULL, l); + smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } write_unlock_bh(&sock->sk_callback_lock); From 3e8729d5348a5b83c9052e8f91c7dad785ba14ed Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 30 Jun 2018 06:17:47 -0700 Subject: [PATCH 0715/1640] UPSTREAM: bpf: sockhash fix omitted bucket lock in sock_close First, the sk_callback_lock was being used to protect both the sock callback hooks and the psock->maps list. This got overly convoluted after the addition of sockhash (in sockmap it made some sense because maps and callbacks were tightly coupled), so let's split out a specific lock for maps and only use the callback lock for its intended purpose. This fixes a couple of cases where we missed using the maps lock when it was in fact needed. Also, this makes it easier to follow the code because now we can put the locking closer to the actual code it is serializing. Next, in sock_hash_delete_elem() the pattern was as follows, sock_hash_delete_elem() [...] spin_lock(bucket_lock) l = lookup_elem_raw() if (l) hlist_del_rcu() write_lock(sk_callback_lock) .... destroy psock ... write_unlock(sk_callback_lock) spin_unlock(bucket_lock) The ordering is necessary because we only know the {p}sock after dereferencing the hash table, which we can't do unless we have the bucket lock held. Once we have the bucket lock and the psock element, it is deleted from the hashmap to ensure any other path doing a lookup will fail. Finally, the refcnt is decremented and, if zero, the psock is destroyed. In parallel with the above (or free'ing the map), a tcp close event may trigger tcp_close(), which at the moment omits the bucket lock altogether (oops!); the flow looks like this, bpf_tcp_close() [...] write_lock(sk_callback_lock) for each psock->maps // list of maps this sock is part of hlist_del_rcu(ref_hash_node); .... destroy psock ... write_unlock(sk_callback_lock) Obviously, and demonstrated by syzbot, this is broken because we can have multiple threads deleting entries via hlist_del_rcu(). To fix this we might be tempted to wrap the hlist operation in a bucket lock, but that would create a lock inversion problem. In summary, to follow the locking rules the psock's maps list needs the sk_callback_lock (after this patch, maps_lock), but we need the bucket lock to do the hlist_del_rcu. To resolve the lock inversion problem, pop the head of the maps list repeatedly and remove the reference until no more are left. If a delete happens in parallel from the BPF API that is OK as well, because it will do a similar action: look up the sock in the map/hash, delete it from the map/hash, and dec the refcnt. We check for this case before doing a destroy on the psock to ensure we don't have two threads tearing down a psock.
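A stand-alone sketch of this pop-then-lock shape, with ordinary pthread mutexes standing in for the kernel spinlocks and hypothetical names throughout (the kernel version described next additionally re-looks the element up under the bucket lock before destroying it):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct entry { struct entry *next; int bucket; };

    static pthread_mutex_t maps_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t bucket_lock[2] = { PTHREAD_MUTEX_INITIALIZER,
                                              PTHREAD_MUTEX_INITIALIZER };
    static struct entry *maps;

    /* Detach one entry while holding only maps_lock. */
    static struct entry *map_pop(void)
    {
        struct entry *e;

        pthread_mutex_lock(&maps_lock);
        e = maps;
        if (e)
            maps = e->next;
        pthread_mutex_unlock(&maps_lock);
        return e;
    }

    /* Never hold maps_lock while taking a bucket lock, so the delete
     * path (which takes the bucket lock first) cannot deadlock with us. */
    static void teardown(void)
    {
        struct entry *e;

        while ((e = map_pop())) {
            pthread_mutex_lock(&bucket_lock[e->bucket]);
            /* a re-lookup goes here; destroy only if still linked */
            pthread_mutex_unlock(&bucket_lock[e->bucket]);
            free(e);
        }
    }

    int main(void)
    {
        for (int i = 0; i < 4; i++) {
            struct entry *e = malloc(sizeof(*e));
            e->bucket = i & 1;
            e->next = maps;
            maps = e;
        }
        teardown();
        printf("list empty: %d\n", maps == NULL);
        return 0;
    }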
The new logic is as follows, bpf_tcp_close() e = psock_map_pop(psock->maps) // done with map lock bucket_lock() // lock hash list bucket l = lookup_elem_raw(head, hash, key, key_size); if (l) { // only get here if the element was not already removed hlist_del_rcu() ... destroy psock... } bucket_unlock() And finally, for all of the above to work, add the missing locking around map operations as described above. Then add RCU annotations and use rcu_dereference/rcu_assign_pointer to manage values relying on RCU so that the object is not free'd from sock_hash_free() while it is being referenced in bpf_tcp_close(). Reported-by: syzbot+0ce137753c78f7b6acc1@syzkaller.appspotmail.com Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 145 ++++++++++++++++++++++++++++--------------- 1 file changed, 96 insertions(+), 49 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index d15af42f796e..c8b7e9cd3fd4 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -72,6 +72,7 @@ struct bpf_htab { u32 n_buckets; u32 elem_size; struct bpf_sock_progs progs; + struct rcu_head rcu; }; struct htab_elem { @@ -89,8 +90,8 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; struct sock **entry; - struct htab_elem *hash_link; - struct bpf_htab *htab; + struct htab_elem __rcu *hash_link; + struct bpf_htab __rcu *htab; }; struct smap_psock { @@ -120,6 +121,7 @@ struct smap_psock { struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; + spinlock_t maps_lock; /* Back reference used when sock callback trigger sockmap operations */ struct sock *sock; @@ -258,16 +260,54 @@ out: rcu_read_unlock(); } +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, + u32 hash, void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) { + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + } + + return NULL; +} + +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { atomic_dec(&htab->count); kfree_rcu(l, rcu); } +static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, + struct smap_psock *psock) +{ + struct smap_psock_map_entry *e; + + spin_lock_bh(&psock->maps_lock); + e = list_first_entry_or_null(&psock->maps, + struct smap_psock_map_entry, + list); + if (e) + list_del(&e->list); + spin_unlock_bh(&psock->maps_lock); + return e; +} + static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); - struct smap_psock_map_entry *e, *tmp; + struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; struct sock *osk; @@ -286,7 +326,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout) */ close_fun = psock->save_close; - write_lock_bh(&sk->sk_callback_lock); if (psock->cork) { free_start_sg(psock->sock, psock->cork); kfree(psock->cork); @@ -299,20 +338,38 @@ static void bpf_tcp_close(struct sock *sk, long timeout) kfree(md); } - list_for_each_entry_safe(e, tmp, &psock->maps, list) { + e = psock_map_pop(sk, psock); while (e) { if (e->entry) { osk = cmpxchg(e->entry, sk, NULL); if (osk == sk) { - list_del(&e->list); smap_release_sock(psock,
sk); } } else { - hlist_del_rcu(&e->hash_link->hash_node); - smap_release_sock(psock, e->hash_link->sk); - free_htab_elem(e->htab, e->hash_link); + struct htab_elem *link = rcu_dereference(e->hash_link); + struct bpf_htab *htab = rcu_dereference(e->htab); + struct hlist_head *head; + struct htab_elem *l; + struct bucket *b; + + b = __select_bucket(htab, link->hash); + head = &b->head; + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, + link->hash, link->key, + htab->map.key_size); + /* If another thread deleted this object skip deletion. + * The refcnt on psock may or may not be zero. + */ + if (l) { + hlist_del_rcu(&link->hash_node); + smap_release_sock(psock, link->sk); + free_htab_elem(htab, link); + } + raw_spin_unlock_bh(&b->lock); } + e = psock_map_pop(sk, psock); } - write_unlock_bh(&sk->sk_callback_lock); rcu_read_unlock(); close_fun(sk, timeout); } @@ -1398,7 +1455,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { tcp_cleanup_ulp(sock); + write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); clear_bit(SMAP_TX_RUNNING, &psock->state); rcu_assign_sk_user_data(sock, NULL); call_rcu_sched(&psock->rcu, smap_destroy_psock); @@ -1549,6 +1608,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node) INIT_LIST_HEAD(&psock->maps); INIT_LIST_HEAD(&psock->ingress); refcount_set(&psock->refcnt, 1); + spin_lock_init(&psock->maps_lock); rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -1610,10 +1670,12 @@ static void smap_list_map_remove(struct smap_psock *psock, { struct smap_psock_map_entry *e, *tmp; + spin_lock_bh(&psock->maps_lock); list_for_each_entry_safe(e, tmp, &psock->maps, list) { if (e->entry == entry) list_del(&e->list); } + spin_unlock_bh(&psock->maps_lock); } static void smap_list_hash_remove(struct smap_psock *psock, @@ -1621,12 +1683,14 @@ static void smap_list_hash_remove(struct smap_psock *psock, { struct smap_psock_map_entry *e, *tmp; + spin_lock_bh(&psock->maps_lock); list_for_each_entry_safe(e, tmp, &psock->maps, list) { - struct htab_elem *c = e->hash_link; + struct htab_elem *c = rcu_dereference(e->hash_link); if (c == hash_link) list_del(&e->list); } + spin_unlock_bh(&psock->maps_lock); } static void sock_map_free(struct bpf_map *map) @@ -1652,7 +1716,6 @@ static void sock_map_free(struct bpf_map *map) if (!sock) continue; - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -1663,7 +1726,6 @@ static void sock_map_free(struct bpf_map *map) smap_list_map_remove(psock, &stab->sock_map[i]); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); @@ -1712,7 +1774,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (!sock) return -EINVAL; - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); if (!psock) goto out; @@ -1722,7 +1783,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); out: - write_unlock_bh(&sock->sk_callback_lock); return 0; } @@ -1803,7 +1863,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, } } - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* 2. 
Do not allow inheriting programs if psock exists and has @@ -1860,7 +1919,9 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, if (err) goto out_free; smap_init_progs(psock, verdict, parse); + write_lock_bh(&sock->sk_callback_lock); smap_start_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); } /* 4. Place psock in sockmap for use and stop any programs on @@ -1870,9 +1931,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, */ if (map_link) { e->entry = map_link; + spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); } - write_unlock_bh(&sock->sk_callback_lock); return err; out_free: smap_release_sock(psock, sock); @@ -1883,7 +1945,6 @@ out_progs: } if (tx_msg) bpf_prog_put(tx_msg); - write_unlock_bh(&sock->sk_callback_lock); kfree(e); return err; } @@ -1920,10 +1981,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, if (osock) { struct smap_psock *opsock = smap_psock_sk(osock); - write_lock_bh(&osock->sk_callback_lock); smap_list_map_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); - write_unlock_bh(&osock->sk_callback_lock); } out: return err; @@ -2112,14 +2171,13 @@ free_htab: return ERR_PTR(err); } -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +static void __bpf_htab_free(struct rcu_head *rcu) { - return &htab->buckets[hash & (htab->n_buckets - 1)]; -} + struct bpf_htab *htab; -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &__select_bucket(htab, hash)->head; + htab = container_of(rcu, struct bpf_htab, rcu); + bpf_map_area_free(htab->buckets); + kfree(htab); } static void sock_hash_free(struct bpf_map *map) @@ -2138,16 +2196,18 @@ static void sock_hash_free(struct bpf_map *map) */ rcu_read_lock(); for (i = 0; i < htab->n_buckets; i++) { - struct hlist_head *head = select_bucket(htab, i); + struct bucket *b = __select_bucket(htab, i); + struct hlist_head *head; struct hlist_node *n; struct htab_elem *l; + raw_spin_lock_bh(&b->lock); + head = &b->head; hlist_for_each_entry_safe(l, n, head, hash_node) { struct sock *sock = l->sk; struct smap_psock *psock; hlist_del_rcu(&l->hash_node); - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get * the sk_callback_lock before this case but after xchg @@ -2158,13 +2218,12 @@ static void sock_hash_free(struct bpf_map *map) smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); - kfree(l); + free_htab_elem(htab, l); } + raw_spin_unlock_bh(&b->lock); } rcu_read_unlock(); - bpf_map_area_free(htab->buckets); - kfree(htab); + call_rcu(&htab->rcu, __bpf_htab_free); } static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, @@ -2191,19 +2250,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, return l_new; } -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, - u32 hash, void *key, u32 key_size) -{ - struct htab_elem *l; - - hlist_for_each_entry_rcu(l, head, hash_node) { - if (l->hash == hash && !memcmp(&l->key, key, key_size)) - return l; - } - - return NULL; -} - static inline u32 htab_map_hash(const void *key, u32 key_len) { return jhash(key, key_len, 0); @@ -2323,9 +2369,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto bucket_err; } - e->hash_link = l_new; - e->htab = container_of(map, struct bpf_htab, map); + 
rcu_assign_pointer(e->hash_link, l_new); + rcu_assign_pointer(e->htab, + container_of(map, struct bpf_htab, map)); + spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); /* add new element to the head of the list, so that * concurrent search will find it before old elem @@ -2395,7 +2444,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) struct smap_psock *psock; hlist_del_rcu(&l->hash_node); - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -2406,7 +2454,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); free_htab_elem(htab, l); ret = 0; } From 8d57dc22ca3097e79b1f313d0cee4d5f5ab3e2f3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 30 Jun 2018 06:17:52 -0700 Subject: [PATCH 0716/1640] UPSTREAM: bpf: sockhash, add release routine Add map_release_uref pointer to hashmap ops. This was dropped when original sockhash code was ported into bpf-next before initial commit. Fixes: 81110384441a ("bpf: sockmap, add hash map support") Acked-by: Martin KaFai Lau Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index c8b7e9cd3fd4..3fa0325417a1 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2499,6 +2499,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, + .map_release_uref = sock_map_release, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, From 775f4ec8e0e9932f368f9052289f03a65af6ea61 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 4 Jul 2018 17:34:37 -0500 Subject: [PATCH 0717/1640] UPSTREAM: net: core: filter: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Warning level 2 was used: -Wimplicit-fallthrough=2 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/filter.c b/net/core/filter.c index 172aae9e5b2a..7111315b1475 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4669,6 +4669,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); + /* else: fall through */ default: return NULL; } From 7eb849e8617da3ec979f2ccdc7100c0dd2c5c506 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:05:56 -0700 Subject: [PATCH 0718/1640] UPSTREAM: bpf: sockmap, error path can not release psock in multi-map case The current code, in the error path of sock_hash_ctx_update_elem, checks if the sock has a psock in the user data and if so decrements the reference count of the psock. However, if the error happens early in the error path we may have never incremented the psock reference count and if the psock exists because the sock is in another map then we may inadvertently decrement the reference count. Fix this by making the error path only call smap_release_sock if the error happens after the increment. 
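The general rule the fix restores is that each error label should undo exactly what has been acquired by the time it becomes reachable. A minimal stand-alone sketch of that shape (hypothetical names, not the kernel function):

    #include <stdio.h>

    static int refs;

    static int take_ref(void) { refs++; return 0; }
    static void drop_ref(void) { refs--; }
    static int do_alloc(int fail) { return fail ? -1 : 0; }

    /* Failures before take_ref() return directly; failures after it
     * jump to a label that drops exactly the reference that was taken. */
    static int example_update(int fail_early, int fail_late)
    {
        int err;

        if (fail_early)
            return -1;          /* before the increment: nothing to undo */

        err = take_ref();
        if (err)
            return err;

        err = do_alloc(fail_late);
        if (err)
            goto out_ref;       /* after the increment: drop it */

        return 0;
    out_ref:
        drop_ref();
        return err;
    }

    int main(void)
    {
        example_update(1, 0);
        example_update(0, 1);
        printf("refs after both error paths: %d\n", refs);  /* expect 0 */
        return 0;
    }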
Reported-by: syzbot+d464d2c20c717ef5a6a8@syzkaller.appspotmail.com Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 3fa0325417a1..0b87f174ebbb 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1899,7 +1899,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); if (!e) { err = -ENOMEM; - goto out_progs; + goto out_free; } } @@ -2345,7 +2345,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, if (err) goto err; - /* bpf_map_update_elem() can be called in_irq() */ + /* psock is valid here because otherwise above *ctx_update_elem would + * have thrown an error. It is safe to skip error check. + */ + psock = smap_psock_sk(sock); raw_spin_lock_bh(&b->lock); l_old = lookup_elem_raw(head, hash, key, key_size); if (l_old && map_flags == BPF_NOEXIST) { @@ -2363,12 +2366,6 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto bucket_err; } - psock = smap_psock_sk(sock); - if (unlikely(!psock)) { - err = -EINVAL; - goto bucket_err; - } - rcu_assign_pointer(e->hash_link, l_new); rcu_assign_pointer(e->htab, container_of(map, struct bpf_htab, map)); @@ -2391,12 +2388,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, raw_spin_unlock_bh(&b->lock); return 0; bucket_err: + smap_release_sock(psock, sock); raw_spin_unlock_bh(&b->lock); err: kfree(e); - psock = smap_psock_sk(sock); - if (psock) - smap_release_sock(psock, sock); return err; } From 43a374f8035c45572d65bbf8a66e22a4fc1a24c9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:06:01 -0700 Subject: [PATCH 0719/1640] UPSTREAM: bpf: sockmap, hash table is RCU so readers do not need locks This removes locking from readers of the RCU hash table. It's not necessary. Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0b87f174ebbb..e8bcb1fb1173 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2470,10 +2470,8 @@ struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - raw_spin_lock_bh(&b->lock); l = lookup_elem_raw(head, hash, key, key_size); sk = l ? l->sk : NULL; - raw_spin_unlock_bh(&b->lock); return sk; } From 4e90b0a9a2f69ae8260bf9712d5f02678113f0f6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:50:04 -0700 Subject: [PATCH 0720/1640] UPSTREAM: bpf: sockhash, disallow bpf_tcp_close and update in parallel After the latest lock updates there is no longer anything preventing a close and recvmsg call from running in parallel. Additionally, we can race update with close if we close a socket and simultaneously update it via the BPF userspace API (note the cgroup ops are already run with sock_lock held). To resolve this, take sock_lock in the close and update paths.
Reported-by: syzbot+b680e42077a0d7c9a0c4@syzkaller.appspotmail.com Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 15 +++++++++++++++ kernel/bpf/syscall.c | 4 +++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index e8bcb1fb1173..e08d89493dba 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -312,10 +312,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout) struct smap_psock *psock; struct sock *osk; + lock_sock(sk); rcu_read_lock(); psock = smap_psock_sk(sk); if (unlikely(!psock)) { rcu_read_unlock(); + release_sock(sk); return sk->sk_prot->close(sk, timeout); } @@ -371,6 +373,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) e = psock_map_pop(sk, psock); } rcu_read_unlock(); + release_sock(sk); close_fun(sk, timeout); } @@ -2072,7 +2075,13 @@ static int sock_map_update_elem(struct bpf_map *map, return -EOPNOTSUPP; } + lock_sock(skops.sk); + preempt_disable(); + rcu_read_lock(); err = sock_map_ctx_update_elem(&skops, map, key, flags); + rcu_read_unlock(); + preempt_enable(); + release_sock(skops.sk); fput(socket->file); return err; } @@ -2413,7 +2422,13 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + lock_sock(skops.sk); + preempt_disable(); + rcu_read_lock(); err = sock_hash_ctx_update_elem(&skops, map, key, flags); + rcu_read_unlock(); + preempt_enable(); + release_sock(skops.sk); fput(socket->file); return err; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 24a141e25361..40996cc7d31e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -750,7 +750,9 @@ static int map_update_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_update_elem(map, key, value, attr->flags); goto out; - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH || + map->map_type == BPF_MAP_TYPE_SOCKMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } From 032c249dc42f6afe70a571531c4fd9ec5250cc62 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:50:10 -0700 Subject: [PATCH 0721/1640] UPSTREAM: bpf: sockmap, consume_skb in close path Currently, when a sock is closed and the bpf_tcp_close() callback is used we remove memory but do not free the skb. Call consume_skb() if the skb is attached to the buffer. 
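The ownership rule behind the change, in miniature: when the buffer's pages are owned by an attached skb, releasing them is the skb's job, so teardown does a single consume_skb() instead of a put_page() per fragment. A small stand-alone sketch with hypothetical names (integer "refcounts" standing in for real page references):

    #include <stdio.h>

    struct page_ref { int refs; };

    struct buf {
        struct page_ref *pages[2];
        int n;
        int skb_owned;              /* plays the md->skb != NULL case */
    };

    static void put_page_(struct page_ref *p) { p->refs--; }

    /* Freeing the skb drops the references it holds on its pages. */
    static void consume_skb_(struct buf *b)
    {
        for (int i = 0; i < b->n; i++)
            put_page_(b->pages[i]);
    }

    static void free_sg_(struct buf *b)
    {
        for (int i = 0; i < b->n; i++)
            if (!b->skb_owned)
                put_page_(b->pages[i]);  /* we hold the page refs      */
        if (b->skb_owned)
            consume_skb_(b);             /* the skb holds them instead */
    }

    int main(void)
    {
        struct page_ref p0 = { 1 }, p1 = { 1 };
        struct buf b = { { &p0, &p1 }, 2, 1 };

        free_sg_(&b);
        printf("refs: %d %d\n", p0.refs, p1.refs);  /* expect 0 0 */
        return 0;
    }

Dropping the references on both paths, as the old code did, would double-free the pages; dropping them on neither would leak the skb.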
Reported-by: syzbot+d464d2c20c717ef5a6a8@syzkaller.appspotmail.com Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index e08d89493dba..8a42316796db 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -571,7 +571,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) while (sg[i].length) { free += sg[i].length; sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); + if (!md->skb) + put_page(sg_page(&sg[i])); sg[i].length = 0; sg[i].page_link = 0; sg[i].offset = 0; @@ -580,6 +581,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) if (i == MAX_SKB_FRAGS) i = 0; } + if (md->skb) + consume_skb(md->skb); return free; } From 7c3d1849aff65303d06a49dc3b8a823ef9ce73e3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:50 +0200 Subject: [PATCH 0722/1640] BACKPORT: bpf: rename bpf_compute_data_end into bpf_compute_data_pointers Just do the rename into bpf_compute_data_pointers() as we'll add one more pointer here to recompute. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/filter.h | 9 ++++++--- kernel/bpf/sockmap.c | 4 ++-- net/bpf/test_run.c | 2 +- net/core/filter.c | 14 +++++++------- net/core/lwt_bpf.c | 2 +- net/sched/act_bpf.c | 4 ++-- net/sched/cls_bpf.c | 4 ++-- 7 files changed, 21 insertions(+), 18 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index c3df91e4e068..595b2015755d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -606,10 +606,13 @@ struct sk_msg_buff { struct list_head list; }; -/* compute the linear packet data range [data, data_end) which - * will be accessed by cls_bpf, act_bpf and lwt programs +/* Compute the linear packet data range [data, data_end) which + * will be accessed by various program types (cls_bpf, act_bpf, + * lwt, ...). Subsystems allowing direct data access must (!) + * ensure that cb[] area can be written to when BPF program is + * invoked (otherwise cb[] save/restore is necessary). */ -static inline void bpf_compute_data_end(struct sk_buff *skb) +static inline void bpf_compute_data_pointers(struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 8a42316796db..6eca01262e0a 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1236,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); + bpf_compute_data_pointers(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -1494,7 +1494,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. 
*/ skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index e8e4ba86c6ab..d733d5e1eec3 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -135,7 +135,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (is_l2) __skb_push(skb, hh_len); if (is_direct_pkt_access) - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); retval = bpf_test_run(prog, skb, repeat, &duration); if (!is_l2) { if (skb_headroom(skb) < hh_len) { diff --git a/net/core/filter.c b/net/core/filter.c index 7111315b1475..3aebb7de6e0d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1611,7 +1611,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, { int err = __bpf_try_make_writable(skb, write_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return err; } @@ -2452,7 +2452,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2473,7 +2473,7 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2677,7 +2677,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2811,7 +2811,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) ret = shrink ? 
bpf_skb_net_shrink(skb, len_diff_abs) : bpf_skb_net_grow(skb, len_diff_abs); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2902,7 +2902,7 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, skb_gso_reset(skb); } - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2943,7 +2943,7 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, skb_reset_mac_len(skb); } - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return 0; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 65313c766ab3..680782c53225 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, */ preempt_disable(); rcu_read_lock(); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); rcu_read_unlock(); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index bdc8885c0448..6bc400025566 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); } rcu_read_unlock(); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 3096b2f8b0a9..b41718504205 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -102,11 +102,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); } From 971b1053f60964aa6d770434c427eb3ccff8e531 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:50:15 -0700 Subject: [PATCH 0723/1640] BACKPORT: bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb In commit 'bpf: bpf_compute_data uses incorrect cb structure' (8108a7751512) we added the routine bpf_compute_data_end_sk_skb() to compute the correct data_end values, but this has since been lost. In kernel v4.14 this was correct and the above patch was applied in its entirety. Then, when v4.14 was merged into the v4.15-rc1 net-next tree, we lost the piece that renamed bpf_compute_data_pointers to the new function bpf_compute_data_end_sk_skb. This happened in e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net"), when it conflicted with the following rename patch, 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers"). Finally, after a refactor I thought even the function bpf_compute_data_end_sk_skb() was no longer needed and it was erroneously removed. However, we never reverted the sk_skb_convert_ctx_access() usage of tcp_skb_cb, which had been committed and survived the merge conflict. Here we fix this by adding back the helper and *_data_end_sk_skb() usage. Using the bpf_skc_data_end mapping is not correct because it expects a qdisc_skb_cb object but at the sock layer this is not the case.
Even though it happens to work here, because we don't overwrite any data in-use at the socket layer and the cb structure is cleared later, this has the potential to create some subtle issues. But, even more concretely, the filter.c access check uses tcp_skb_cb. And by some act of chance though, struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; /* 0 28 */ /* XXX 4 bytes hole, try to pack */ void * data_meta; /* 32 8 */ void * data_end; /* 40 8 */ /* size: 48, cachelines: 1, members: 3 */ /* sum members: 44, holes: 1, sum holes: 4 */ /* last cacheline: 48 bytes */ }; and then tcp_skb_cb, struct tcp_skb_cb { [...] struct { __u32 flags; /* 24 4 */ struct sock * sk_redir; /* 32 8 */ void * data_end; /* 40 8 */ } bpf; /* 24 */ }; So when we use offsetof() to track down the byte offset we get 40 in either case and everything continues to work. Fix this mess and use the correct structures; it's unclear how long this might actually keep working until someone moves the structs around. Reported-by: Martin KaFai Lau Fixes: e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") Fixes: 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 4 ++ kernel/bpf/sockmap.c | 4 +- net/core/filter.c | 98 ++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 97 insertions(+), 9 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index f09047cc016e..8bbdcf82494a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -892,6 +892,10 @@ struct tcp_skb_cb { #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) +static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); +} #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6eca01262e0a..8a42316796db 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1236,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -1494,7 +1494,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); diff --git a/net/core/filter.c b/net/core/filter.c index 3aebb7de6e0d..56db22fa5987 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1762,6 +1762,37 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = { .arg2_type = ARG_ANYTHING, }; +static inline int sk_skb_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + int err = __bpf_try_make_writable(skb, write_len); + + bpf_compute_data_end_sk_skb(skb); + return err; +} + +BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case.
By this we overcome limitations of only current + * headroom being accessible. + */ + return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto sk_skb_pull_data_proto = { + .func = sk_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -2864,8 +2895,8 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) return __skb_trim_rcsum(skb, new_len); } -BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, - u64, flags) +static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, + u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 min_len = __bpf_skb_min_len(skb); @@ -2901,6 +2932,13 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } + return ret; +} + +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_pointers(skb); return ret; @@ -2915,8 +2953,26 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, +BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); + + bpf_compute_data_end_sk_skb(skb); + return ret; +} + +static const struct bpf_func_proto sk_skb_change_tail_proto = { + .func = sk_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, + u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; @@ -2943,8 +2999,16 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, skb_reset_mac_len(skb); } + return ret; +} + +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + bpf_compute_data_pointers(skb); - return 0; + return ret; } static const struct bpf_func_proto bpf_skb_change_head_proto = { @@ -2956,6 +3020,23 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + + bpf_compute_data_end_sk_skb(skb); + return ret; +} + +static const struct bpf_func_proto sk_skb_change_head_proto = { + .func = sk_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 
0 : @@ -4624,9 +4705,12 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_store_bytes || func == bpf_skb_change_proto || func == bpf_skb_change_head || + func == sk_skb_change_head || func == bpf_skb_change_tail || + func == sk_skb_change_tail || func == bpf_skb_adjust_room || func == bpf_skb_pull_data || + func == sk_skb_pull_data || func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || @@ -4883,11 +4967,11 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: - return &bpf_skb_pull_data_proto; + return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: - return &bpf_skb_change_tail_proto; + return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: - return &bpf_skb_change_head_proto; + return &sk_skb_change_head_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: From 6f1a73d7be471f0a98457c1f6808a84cee76c317 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 6 Jul 2018 11:49:00 +0900 Subject: [PATCH 0724/1640] UPSTREAM: xdp: XDP_REDIRECT should check IFF_UP and MTU Otherwise we end up with attempting to send packets from down devices or to send oversized packets, which may cause unexpected driver/device behaviour. Generic XDP has already done this check, so reuse the logic in native XDP. Fixes: 814abfabef3c ("xdp: add bpf_redirect helper function") Signed-off-by: Toshiaki Makita Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 6 +++--- kernel/bpf/devmap.c | 7 ++++++- net/core/filter.c | 9 +++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 595b2015755d..fadc99166e35 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -834,8 +834,8 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); -static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, - struct net_device *fwd) +static inline int xdp_ok_fwd_dev(const struct net_device *fwd, + unsigned int pktlen) { unsigned int len; @@ -843,7 +843,7 @@ static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, return -ENETDOWN; len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) + if (pktlen > len) return -EMSGSIZE; return 0; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 448f4a2070b0..c3c1d567829b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -337,10 +337,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; struct xdp_frame *xdpf; + int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -353,7 +358,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, { int err; - err = __xdp_generic_ok_fwd_dev(skb, dst->dev); + err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; skb->dev = dst->dev; diff --git a/net/core/filter.c b/net/core/filter.c index 56db22fa5987..d8096c846dd4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3129,12 +3129,16 @@ static int __bpf_tx_xdp(struct net_device *dev, u32 index) { struct xdp_frame *xdpf; - int sent; + int err, sent; if 
(!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -3368,7 +3372,8 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, goto err; } - if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) goto err; skb->dev = fwd; From 347abf8bc90aa3b3813afc9d9cd29b78d67081c9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 6 Jul 2018 14:34:29 -0700 Subject: [PATCH 0725/1640] UPSTREAM: bpf: include errno.h from bpf-cgroup.h Commit fdb5c4531c1e ("bpf: fix attach type BPF_LIRC_MODE2 dependency wrt CONFIG_CGROUP_BPF") caused some build issues, detected by 0-DAY kernel test infrastructure. The problem is that cgroup_bpf_prog_attach/detach/query() functions can return -EINVAL error code, which is not defined. Fix this adding errno.h to includes. Fixes: fdb5c4531c1e ("bpf: fix attach type BPF_LIRC_MODE2 dependency wrt CONFIG_CGROUP_BPF") Signed-off-by: Roman Gushchin Cc: Sean Young Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 79795c5fa7c3..d50c2f0a655a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -2,6 +2,7 @@ #ifndef _BPF_CGROUP_H #define _BPF_CGROUP_H +#include #include #include From a3bd7946c78728aae8c820fbae055f916c6c563b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 10 Jul 2018 00:43:22 +0200 Subject: [PATCH 0726/1640] UPSTREAM: bpf: fix ldx in ld_abs rewrite for large offsets Mark reported that syzkaller triggered a KASAN detected slab-out-of-bounds bug in ___bpf_prog_run() with a BPF_LD | BPF_ABS word load at offset 0x8001. After further investigation it became clear that the issue was the BPF_LDX_MEM() which takes offset as an argument whereas it cannot encode larger than S16_MAX offsets into it. For this synthetical case we need to move the full address into tmp register instead and do the LDX without immediate value. 
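For illustration only (not part of this patch): the rewrite is reached from classic BPF, so a plain socket filter with a large absolute offset is enough to exercise it. A minimal user-space sketch, with the offset taken from the syzkaller report and everything else invented:

    #include <stdio.h>
    #include <sys/socket.h>
    #include <linux/filter.h>

    int main(void)
    {
            /* BPF_LD | BPF_ABS with k = 0x8001 (> S16_MAX) used to be
             * rewritten into a BPF_LDX_MEM whose 16-bit offset field
             * cannot hold the value; with the JIT disabled the
             * interpreter could then read out of bounds. */
            struct sock_filter insns[] = {
                    { BPF_LD | BPF_W | BPF_ABS, 0, 0, 0x8001 },
                    { BPF_RET | BPF_K, 0, 0, 0 },
            };
            struct sock_fprog fprog = {
                    .len = sizeof(insns) / sizeof(insns[0]),
                    .filter = insns,
            };
            int fd = socket(AF_INET, SOCK_DGRAM, 0);

            if (fd < 0)
                    return 1;
            if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
                           &fprog, sizeof(fprog)))
                    perror("SO_ATTACH_FILTER");
            return 0;
    }

With the fix, the same filter is translated into the mov/add/ldx sequence below instead of a truncated single load.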
Fixes: e0cea7ce988c ("bpf: implement ld_abs/ld_ind in native bpf") Reported-by: syzbot Reported-by: Mark Rutland Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index d8096c846dd4..736df839afd1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -459,11 +459,21 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) (!unaligned_ok && offset >= 0 && offset + ip_align >= 0 && offset + ip_align % size == 0))) { + bool ldx_off_ok = offset <= S16_MAX; + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); - *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian); - *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, - offset); + *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, + size, 2 + endian + (!ldx_off_ok * 2)); + if (ldx_off_ok) { + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, + BPF_REG_D, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset); + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, + BPF_REG_TMP, 0); + } if (endian) *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); *insn++ = BPF_JMP_A(8); From dff1eca0f735bded93163fd5a090e24a8d20002d Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Tue, 10 Jul 2018 16:54:02 +0000 Subject: [PATCH 0727/1640] UPSTREAM: bpf: fix availability probing for seg6 helpers bpf_lwt_seg6_* helpers require CONFIG_IPV6_SEG6_BPF, and currently return -EOPNOTSUPP to indicate unavailability. This patch forces the BPF verifier to reject programs using these helpers when !CONFIG_IPV6_SEG6_BPF, allowing users to more easily probe if they are available or not. 
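For illustration only (not from the patch): a loader can now probe for the helper by attempting to load a throwaway program that calls it and checking the verifier's reaction. The program layout, the use of libbpf's bpf_load_program() and the "unknown func" string match are assumptions of this sketch:

    #include <stdio.h>
    #include <string.h>
    #include <linux/bpf.h>
    #include <bpf/bpf.h>            /* bpf_load_program() */

    static char log_buf[4096];

    int main(void)
    {
            struct bpf_insn insns[] = {
                    /* r2 = 0, r3 = 0: dummy offset/len arguments */
                    { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = 2 },
                    { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = 3 },
                    /* call bpf_lwt_seg6_adjust_srh(r1 = ctx, r2, r3) */
                    { .code = BPF_JMP | BPF_CALL,
                      .imm  = BPF_FUNC_lwt_seg6_adjust_srh },
                    { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = 0 },
                    { .code = BPF_JMP | BPF_EXIT },
            };
            int fd = bpf_load_program(BPF_PROG_TYPE_LWT_SEG6LOCAL, insns,
                                      sizeof(insns) / sizeof(insns[0]),
                                      "GPL", 0, log_buf, sizeof(log_buf));

            printf("bpf_lwt_seg6_adjust_srh: %s\n",
                   fd < 0 && strstr(log_buf, "unknown func") ?
                   "not available" : "available");
            return 0;
    }

Before this patch the program would load either way, and unavailability only surfaced as -EOPNOTSUPP at run time.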
Signed-off-by: Mathieu Xhonneux Signed-off-by: Daniel Borkmann --- net/core/filter.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 736df839afd1..0fdba8c69b30 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4542,10 +4542,10 @@ static const struct bpf_func_proto bpf_lwt_push_encap_proto = { .arg4_type = ARG_CONST_SIZE }; +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len) { -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); void *srh_tlvs, *srh_end, *ptr; @@ -4571,9 +4571,6 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, memcpy(skb->data + offset, from, len); return 0; -#else /* CONFIG_IPV6_SEG6_BPF */ - return -EOPNOTSUPP; -#endif } static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { @@ -4589,7 +4586,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, u32, action, void *, param, u32, param_len) { -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh; @@ -4637,9 +4633,6 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, default: return -EINVAL; } -#else /* CONFIG_IPV6_SEG6_BPF */ - return -EOPNOTSUPP; -#endif } static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { @@ -4655,7 +4648,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, s32, len) { -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); void *srh_end, *srh_tlvs, *ptr; @@ -4699,9 +4691,6 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, srh_state->hdrlen += len; srh_state->valid = 0; return 0; -#else /* CONFIG_IPV6_SEG6_BPF */ - return -EOPNOTSUPP; -#endif } static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { @@ -4712,6 +4701,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; +#endif /* CONFIG_IPV6_SEG6_BPF */ bool bpf_helper_changes_pkt_data(void *func) { @@ -4733,11 +4723,12 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_xdp_adjust_tail || - func == bpf_lwt_push_encap || +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || func == bpf_lwt_seg6_adjust_srh || - func == bpf_lwt_seg6_action - ) + func == bpf_lwt_seg6_action || +#endif + func == bpf_lwt_push_encap) return true; return false; @@ -5077,12 +5068,14 @@ static const struct bpf_func_proto * lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_FUNC_lwt_seg6_store_bytes: return &bpf_lwt_seg6_store_bytes_proto; case BPF_FUNC_lwt_seg6_action: return &bpf_lwt_seg6_action_proto; case BPF_FUNC_lwt_seg6_adjust_srh: return &bpf_lwt_seg6_adjust_srh_proto; +#endif default: return lwt_out_func_proto(func_id, prog); } From 695d92d3f9ea0894c270a7d11cf3c10d2eb89f79 Mon Sep 17 00:00:00 2001 From: Okash Khawaja Date: Tue, 10 Jul 2018 14:33:07 -0700 Subject: [PATCH 0728/1640] UPSTREAM: bpf: btf: Fix bitfield extraction for big endian When extracting bitfield from a number, 
btf_int_bits_seq_show() builds a mask and accesses least significant byte of the number in a way specific to little-endian. This patch fixes that by checking endianness of the machine and then shifting left and right the unneeded bits. Thanks to Martin Lau for the help in navigating potential pitfalls when dealing with endianess and for the final solution. Fixes: b00b8daec828 ("bpf: btf: Add pretty print capability for data with BTF type info") Signed-off-by: Okash Khawaja Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2d49d18b793a..e016ac3afa24 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -991,16 +991,13 @@ static void btf_int_bits_seq_show(const struct btf *btf, void *data, u8 bits_offset, struct seq_file *m) { + u16 left_shift_bits, right_shift_bits; u32 int_data = btf_type_int(t); u16 nr_bits = BTF_INT_BITS(int_data); u16 total_bits_offset; u16 nr_copy_bytes; u16 nr_copy_bits; - u8 nr_upper_bits; - union { - u64 u64_num; - u8 u8_nums[8]; - } print_num; + u64 print_num; total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); @@ -1008,21 +1005,20 @@ static void btf_int_bits_seq_show(const struct btf *btf, nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); - print_num.u64_num = 0; - memcpy(&print_num.u64_num, data, nr_copy_bytes); + print_num = 0; + memcpy(&print_num, data, nr_copy_bytes); - /* Ditch the higher order bits */ - nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits); - if (nr_upper_bits) { - /* We need to mask out some bits of the upper byte. */ - u8 mask = (1 << nr_upper_bits) - 1; +#ifdef __BIG_ENDIAN_BITFIELD + left_shift_bits = bits_offset; +#else + left_shift_bits = BITS_PER_U64 - nr_copy_bits; +#endif + right_shift_bits = BITS_PER_U64 - nr_bits; - print_num.u8_nums[nr_copy_bytes - 1] &= mask; - } + print_num <<= left_shift_bits; + print_num >>= right_shift_bits; - print_num.u64_num >>= bits_offset; - - seq_printf(m, "0x%llx", print_num.u64_num); + seq_printf(m, "0x%llx", print_num); } static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, From 53ad1e6ef55c625fb041ad3b0b1b946af1266074 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 12 Jul 2018 12:52:22 +0100 Subject: [PATCH 0729/1640] UPSTREAM: bpf: fix documentation for eBPF helpers Minor formatting edits for eBPF helpers documentation, including blank lines removal, fix of item list for return values in bpf_fib_lookup(), and missing prefix on bpf_skb_load_bytes_relative(). Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c5845fb9f2f3..eb6ad8963e96 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1826,7 +1826,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. 
* - * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) + * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -1877,7 +1877,7 @@ union bpf_attr { * * < 0 if any input argument is invalid * * 0 on success (packet is forwarded, nexthop neighbor exists) * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the - * * packet is not forwarded or needs assist from full stack + * packet is not forwarded or needs assist from full stack * * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) * Description @@ -2033,7 +2033,6 @@ union bpf_attr { * This helper is only available is the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". - * * Return * 0 * @@ -2053,7 +2052,6 @@ union bpf_attr { * This helper is only available is the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". - * * Return * 0 * From e86e067c47fd31bcba9376489907fb1b4ae1eeaa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 12 Jul 2018 21:44:28 +0200 Subject: [PATCH 0730/1640] BACKPORT: bpf: don't leave partial mangled prog in jit_subprogs error path syzkaller managed to trigger the following bug through fault injection: [...] [ 141.043668] verifier bug. No program starts at insn 3 [ 141.044648] WARNING: CPU: 3 PID: 4072 at kernel/bpf/verifier.c:1613 get_callee_stack_depth kernel/bpf/verifier.c:1612 [inline] [ 141.044648] WARNING: CPU: 3 PID: 4072 at kernel/bpf/verifier.c:1613 fixup_call_args kernel/bpf/verifier.c:5587 [inline] [ 141.044648] WARNING: CPU: 3 PID: 4072 at kernel/bpf/verifier.c:1613 bpf_check+0x525e/0x5e60 kernel/bpf/verifier.c:5952 [ 141.047355] CPU: 3 PID: 4072 Comm: a.out Not tainted 4.18.0-rc4+ #51 [ 141.048446] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),BIOS 1.10.2-1 04/01/2014 [ 141.049877] Call Trace: [ 141.050324] __dump_stack lib/dump_stack.c:77 [inline] [ 141.050324] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 [ 141.050950] ? dump_stack_print_info.cold.2+0x52/0x52 lib/dump_stack.c:60 [ 141.051837] panic+0x238/0x4e7 kernel/panic.c:184 [ 141.052386] ? add_taint.cold.5+0x16/0x16 kernel/panic.c:385 [ 141.053101] ? __warn.cold.8+0x148/0x1ba kernel/panic.c:537 [ 141.053814] ? __warn.cold.8+0x117/0x1ba kernel/panic.c:530 [ 141.054506] ? get_callee_stack_depth kernel/bpf/verifier.c:1612 [inline] [ 141.054506] ? fixup_call_args kernel/bpf/verifier.c:5587 [inline] [ 141.054506] ? bpf_check+0x525e/0x5e60 kernel/bpf/verifier.c:5952 [ 141.055163] __warn.cold.8+0x163/0x1ba kernel/panic.c:538 [ 141.055820] ? get_callee_stack_depth kernel/bpf/verifier.c:1612 [inline] [ 141.055820] ? fixup_call_args kernel/bpf/verifier.c:5587 [inline] [ 141.055820] ? bpf_check+0x525e/0x5e60 kernel/bpf/verifier.c:5952 [...] What happens in jit_subprogs() is that kcalloc() for the subprog func buffer is failing with NULL where we then bail out. Latter is a plain return -ENOMEM, and this is definitely not okay since earlier in the loop we are walking all subprogs and temporarily rewrite insn->off to remember the subprog id as well as insn->imm to temporarily point the call to __bpf_call_base + 1 for the initial JIT pass. Thus, bailing out in such state and handing this over to the interpreter is troublesome since later/subsequent e.g. 
find_subprog() lookups are based on wrong insn->imm. Therefore, once we hit this point, we need to jump to out_free path where we undo all changes from earlier loop, so that interpreter can work on unmodified insn->{off,imm}. Another point is that should find_subprog() fail in jit_subprogs() due to a verifier bug, then we also should not simply defer the program to the interpreter since also here we did partial modifications. Instead we should just bail out entirely and return an error to the user who is trying to load the program. Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Reported-by: syzbot+7d427828b2ea6e592804@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0979671e624d..d8d540b1fc8c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5704,6 +5704,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; + /* Upon error here we cannot fall back to interpreter but + * need a hard reject of the program. Thus -EFAULT is + * propagated in any case. + */ subprog = find_subprog(env, i + insn->imm + 1); if (subprog < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", @@ -5724,7 +5728,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); if (!func) - return -ENOMEM; + goto out_undo_insn; for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; @@ -5789,7 +5793,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) tmp = bpf_int_jit_compile(func[i]); if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); - err = -EFAULT; + err = -ENOTSUPP; goto out_free; } cond_resched(); @@ -5826,6 +5830,7 @@ out_free: if (func[i]) bpf_jit_free(func[i]); kfree(func); +out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { @@ -5845,11 +5850,16 @@ static int fixup_call_args(struct bpf_verifier_env *env) struct bpf_insn *insn = prog->insnsi; int i, depth; #endif + int err; - if (env->prog->jit_requested) - if (jit_subprogs(env) == 0) + err = 0; + if (env->prog->jit_requested) { + err = jit_subprogs(env); + if (err == 0) return 0; - + if (err == -EFAULT) + return err; + } #ifndef CONFIG_BPF_JIT_ALWAYS_ON for (i = 0; i < prog->len; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || @@ -5860,8 +5870,9 @@ static int fixup_call_args(struct bpf_verifier_env *env) return depth; bpf_patch_call_args(insn, depth); } + err = 0; #endif - return 0; + return err; } /* fixup insn->imm field of bpf_call instructions From d70d990e98c4a0bf0f262ca0da992d62ca3bfe39 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 11 Jul 2018 17:33:32 -0700 Subject: [PATCH 0731/1640] UPSTREAM: bpf: Add BPF_SOCK_OPS_TCP_LISTEN_CB Add new TCP-BPF callback that is called on listen(2) right after socket transition to TCP_LISTEN state. It fills the gap for listening sockets in TCP-BPF. For example BPF program can set BPF_SOCK_OPS_STATE_CB_FLAG when socket becomes listening and track later transition from TCP_LISTEN to TCP_CLOSE with BPF_SOCK_OPS_STATE_CB callback. 
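A minimal sock_ops sketch of that example (illustration only; the section name, includes and the map update are conventions assumed here, not part of this patch):

    #include <linux/bpf.h>
    #include "bpf_helpers.h"        /* SEC(), helper prototypes */

    SEC("sockops")
    int track_listeners(struct bpf_sock_ops *skops)
    {
            switch (skops->op) {
            case BPF_SOCK_OPS_TCP_LISTEN_CB:
                    /* socket just entered TCP_LISTEN:
                     * subscribe to its state transitions */
                    bpf_sock_ops_cb_flags_set(skops,
                                              BPF_SOCK_OPS_STATE_CB_FLAG);
                    break;
            case BPF_SOCK_OPS_STATE_CB:
                    /* args[0] = old state, args[1] = new state */
                    if (skops->args[1] == BPF_TCP_CLOSE) {
                            /* listener went away: update a map here */
                    }
                    break;
            }
            return 1;
    }

    char _license[] SEC("license") = "GPL";

Attached to a cgroup as BPF_CGROUP_SOCK_OPS, this covers the listener's whole lifetime.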
Before there was no way to do it with TCP-BPF and other options were much harder to work with. E.g. socket state tracking can be done with tracepoints (either raw or regular) but they can't be attached to cgroup and their lifetime has to be managed separately. Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 +++ net/ipv4/af_inet.c | 1 + 2 files changed, 4 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index eb6ad8963e96..5837002914de 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2601,6 +2601,9 @@ enum { * Arg1: old_state * Arg2: new_state */ + BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after + * socket transition to LISTEN state. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f16a6a765267..05edbe2d725b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog) err = inet_csk_listen_start(sk, backlog); if (err) goto out; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } sk->sk_max_ack_backlog = backlog; err = 0; From 1a81711ac875c8c45def930684b08a8d6a948320 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 Jul 2018 09:38:59 +0100 Subject: [PATCH 0732/1640] UPSTREAM: bpf: sockmap: remove redundant pointer sg Pointer sg is being assigned but is never used hence it is redundant and can be removed. Cleans up clang warning: warning: variable 'sg' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 8a42316796db..24127c3e4066 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -725,11 +725,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, { bool ingress = !!(md->flags & BPF_F_INGRESS); struct smap_psock *psock; - struct scatterlist *sg; int err = 0; - sg = md->sg_data; - rcu_read_lock(); psock = smap_psock_sk(sk); if (unlikely(!psock)) From dc2feff47c84f6e32f6cd7a562978b456c2e9893 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:23 -0700 Subject: [PATCH 0733/1640] UPSTREAM: bpf: offload: rename bpf_offload_dev_match() to bpf_offload_prog_map_match() A set of new API functions exported for the drivers will soon use 'bpf_offload_dev_' as a prefix. Rename the bpf_offload_dev_match() which is internal to the core (used by the verifier) to avoid any confusion. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- kernel/bpf/offload.c | 2 +- kernel/bpf/verifier.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c971a85355ca..dedff4e4ebf1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -664,7 +664,7 @@ int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key); -bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map); +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index ac747d5cf7c6..6184e48703f4 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -468,7 +468,7 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) return 0; } -bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; struct bpf_prog_offload *offload; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d8d540b1fc8c..0679bfa45f2d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5328,7 +5328,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && - !bpf_offload_dev_match(prog, map)) { + !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); return -EINVAL; } From ec0000fb2aea25225bffa05d51c9e72f286b4953 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:24 -0700 Subject: [PATCH 0734/1640] BACKPORT: bpf: offload: aggregate offloads per-device Currently we have two lists of offloaded objects - programs and maps. Netdevice unregister notifier scans those lists to orphan objects associated with device being unregistered. This puts unnecessary (even if negligible) burden on all netdev unregister calls in BPF- -enabled kernel. The lists of objects may potentially get long making the linear scan even more problematic. There haven't been complaints about this mechanisms so far, but it is suboptimal. Instead of relying on notifiers, make the few BPF-capable drivers register explicitly for BPF offloads. The programs and maps will now be collected per-device not on a global list, and only scanned for removal when driver unregisters from BPF offloads. 
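For a hypothetical driver, opting in is then roughly the following (names invented, error handling trimmed; a sketch against the API added below, not code from this patch):

    #include <linux/bpf.h>
    #include <linux/rtnetlink.h>

    static int mydrv_port_open_bpf(struct net_device *netdev)
    {
            /* replaces reliance on the netdev unregister notifier */
            return bpf_offload_dev_netdev_register(netdev);
    }

    static void mydrv_port_close_bpf(struct net_device *netdev)
    {
            /* the unregister path asserts RTNL */
            rtnl_lock();
            bpf_offload_dev_netdev_unregister(netdev);
            rtnl_unlock();
    }

Orphaning of the port's programs and maps now happens inside the unregister call rather than in a global notifier.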
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 3 + kernel/bpf/offload.c | 148 +++++++++++++++++++++++++++++-------------- 2 files changed, 102 insertions(+), 49 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dedff4e4ebf1..f909f0a3033b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -666,6 +666,9 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); +int bpf_offload_dev_netdev_register(struct net_device *netdev); +void bpf_offload_dev_netdev_unregister(struct net_device *netdev); + #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 6184e48703f4..cd64a26807aa 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -18,19 +18,37 @@ #include #include #include +#include #include #include #include +#include #include #include -/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members +/* Protects offdevs, members of bpf_offload_netdev and offload members * of all progs. * RTNL lock cannot be taken when holding this lock. */ static DECLARE_RWSEM(bpf_devs_lock); -static LIST_HEAD(bpf_prog_offload_devs); -static LIST_HEAD(bpf_map_offload_devs); + +struct bpf_offload_netdev { + struct rhash_head l; + struct net_device *netdev; + struct list_head progs; + struct list_head maps; +}; + +static const struct rhashtable_params offdevs_params = { + .nelem_hint = 4, + .key_len = sizeof(struct net_device *), + .key_offset = offsetof(struct bpf_offload_netdev, netdev), + .head_offset = offsetof(struct bpf_offload_netdev, l), + .automatic_shrinking = true, +}; + +static struct rhashtable offdevs; +static bool offdevs_inited; static int bpf_dev_offload_check(struct net_device *netdev) { @@ -41,8 +59,19 @@ static int bpf_dev_offload_check(struct net_device *netdev) return 0; } +static struct bpf_offload_netdev * +bpf_offload_find_netdev(struct net_device *netdev) +{ + lockdep_assert_held(&bpf_devs_lock); + + if (!offdevs_inited) + return NULL; + return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); +} + int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { + struct bpf_offload_netdev *ondev; struct bpf_prog_offload *offload; int err; @@ -66,12 +95,13 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) goto err_maybe_put; down_write(&bpf_devs_lock); - if (offload->netdev->reg_state != NETREG_REGISTERED) { + ondev = bpf_offload_find_netdev(offload->netdev); + if (!ondev) { err = -EINVAL; goto err_unlock; } prog->aux->offload = offload; - list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + list_add_tail(&offload->offloads, &ondev->progs); dev_put(offload->netdev); up_write(&bpf_devs_lock); @@ -294,6 +324,7 @@ static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { struct net *net = current->nsproxy->net_ns; + struct bpf_offload_netdev *ondev; struct bpf_offloaded_map *offmap; int err; @@ -316,11 +347,17 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) if (err) goto err_unlock; + ondev = bpf_offload_find_netdev(offmap->netdev); + if (!ondev) { + err = -EINVAL; + goto err_unlock; + } + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); if (err) goto err_unlock; - list_add_tail(&offmap->offloads, 
&bpf_map_offload_devs); + list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); rtnl_unlock(); @@ -489,56 +526,69 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) return ret; } -static void bpf_offload_orphan_all_progs(struct net_device *netdev) +int bpf_offload_dev_netdev_register(struct net_device *netdev) { - struct bpf_prog_offload *offload, *tmp; + struct bpf_offload_netdev *ondev; + int err; - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); + down_write(&bpf_devs_lock); + if (!offdevs_inited) { + err = rhashtable_init(&offdevs, &offdevs_params); + if (err) + return err; + offdevs_inited = true; + } + up_write(&bpf_devs_lock); + + ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); + if (!ondev) + return -ENOMEM; + + ondev->netdev = netdev; + INIT_LIST_HEAD(&ondev->progs); + INIT_LIST_HEAD(&ondev->maps); + + down_write(&bpf_devs_lock); + err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); + if (err) { + netdev_warn(netdev, "failed to register for BPF offload\n"); + goto err_unlock_free; + } + + up_write(&bpf_devs_lock); + return 0; + +err_unlock_free: + up_write(&bpf_devs_lock); + kfree(ondev); + return err; } +EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); -static void bpf_offload_orphan_all_maps(struct net_device *netdev) +void bpf_offload_dev_netdev_unregister(struct net_device *netdev) { - struct bpf_offloaded_map *offmap, *tmp; - - list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) - if (offmap->netdev == netdev) - __bpf_map_offload_destroy(offmap); -} - -static int bpf_offload_notification(struct notifier_block *notifier, - ulong event, void *ptr) -{ - struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_offloaded_map *offmap, *mtmp; + struct bpf_prog_offload *offload, *ptmp; + struct bpf_offload_netdev *ondev; ASSERT_RTNL(); - switch (event) { - case NETDEV_UNREGISTER: - /* ignore namespace changes */ - if (netdev->reg_state != NETREG_UNREGISTERING) - break; + down_write(&bpf_devs_lock); + ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); + if (WARN_ON(!ondev)) + goto unlock; - down_write(&bpf_devs_lock); - bpf_offload_orphan_all_progs(netdev); - bpf_offload_orphan_all_maps(netdev); - up_write(&bpf_devs_lock); - break; - default: - break; - } - return NOTIFY_OK; + WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); + + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); + + WARN_ON(!list_empty(&ondev->progs)); + WARN_ON(!list_empty(&ondev->maps)); + kfree(ondev); +unlock: + up_write(&bpf_devs_lock); } - -static struct notifier_block bpf_offload_notifier = { - .notifier_call = bpf_offload_notification, -}; - -static int __init bpf_offload_init(void) -{ - register_netdevice_notifier(&bpf_offload_notifier); - return 0; -} - -subsys_initcall(bpf_offload_init); +EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); From b8652d6630729c654668f41aece9fd072630a166 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:25 -0700 Subject: [PATCH 0735/1640] BACKPORT: bpf: offload: keep the offload state per-ASIC Create a higher-level entity to represent a device/ASIC to allow programs and maps to be shared between device ports. 
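A multi-port device would then hold one bpf_offload_dev per ASIC and register each port against it, roughly as in this hypothetical sketch (struct and names invented, unwinding omitted):

    struct mydrv_asic {
            struct bpf_offload_dev *bpf_dev;
            struct net_device *ports[2];
    };

    static int mydrv_asic_init_bpf(struct mydrv_asic *asic)
    {
            int i, err;

            asic->bpf_dev = bpf_offload_dev_create();
            if (IS_ERR(asic->bpf_dev))
                    return PTR_ERR(asic->bpf_dev);

            for (i = 0; i < 2; i++) {
                    err = bpf_offload_dev_netdev_register(asic->bpf_dev,
                                                          asic->ports[i]);
                    if (err)
                            return err;
            }
            return 0;
    }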
The extra work is required to make sure we don't destroy BPF objects as soon as the netdev for which they were loaded gets destroyed, as other ports may still be using them. When netdev goes away all of its BPF objects will be moved to other netdevs of the device, and only destroyed when last netdev is unregistered. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 9 +++-- kernel/bpf/offload.c | 79 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 70 insertions(+), 18 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f909f0a3033b..fbc292601d3b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -90,6 +90,7 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; }; +struct bpf_offload_dev; struct bpf_offloaded_map; struct bpf_map_dev_ops { @@ -666,8 +667,12 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); -int bpf_offload_dev_netdev_register(struct net_device *netdev); -void bpf_offload_dev_netdev_unregister(struct net_device *netdev); +struct bpf_offload_dev *bpf_offload_dev_create(void); +void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev); +int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev); +void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index cd64a26807aa..925575f64ff1 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -32,11 +32,17 @@ */ static DECLARE_RWSEM(bpf_devs_lock); +struct bpf_offload_dev { + struct list_head netdevs; +}; + struct bpf_offload_netdev { struct rhash_head l; struct net_device *netdev; + struct bpf_offload_dev *offdev; struct list_head progs; struct list_head maps; + struct list_head offdev_netdevs; }; static const struct rhashtable_params offdevs_params = { @@ -526,25 +532,18 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) return ret; } -int bpf_offload_dev_netdev_register(struct net_device *netdev) +int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev) { struct bpf_offload_netdev *ondev; int err; - down_write(&bpf_devs_lock); - if (!offdevs_inited) { - err = rhashtable_init(&offdevs, &offdevs_params); - if (err) - return err; - offdevs_inited = true; - } - up_write(&bpf_devs_lock); - ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); if (!ondev) return -ENOMEM; ondev->netdev = netdev; + ondev->offdev = offdev; INIT_LIST_HEAD(&ondev->progs); INIT_LIST_HEAD(&ondev->maps); @@ -555,6 +554,7 @@ int bpf_offload_dev_netdev_register(struct net_device *netdev) goto err_unlock_free; } + list_add(&ondev->offdev_netdevs, &offdev->netdevs); up_write(&bpf_devs_lock); return 0; @@ -565,11 +565,12 @@ err_unlock_free: } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); -void bpf_offload_dev_netdev_unregister(struct net_device *netdev) +void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev) { + struct bpf_offload_netdev *ondev, *altdev; struct bpf_offloaded_map *offmap, *mtmp; struct bpf_prog_offload *offload, *ptmp; - struct bpf_offload_netdev *ondev; ASSERT_RTNL(); @@ -579,11 +580,26 @@ void bpf_offload_dev_netdev_unregister(struct net_device 
*netdev) goto unlock; WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); + list_del(&ondev->offdev_netdevs); - list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) - __bpf_prog_offload_destroy(offload->prog); - list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) - __bpf_map_offload_destroy(offmap); + /* Try to move the objects to another netdev of the device */ + altdev = list_first_entry_or_null(&offdev->netdevs, + struct bpf_offload_netdev, + offdev_netdevs); + if (altdev) { + list_for_each_entry(offload, &ondev->progs, offloads) + offload->netdev = altdev->netdev; + list_splice_init(&ondev->progs, &altdev->progs); + + list_for_each_entry(offmap, &ondev->maps, offloads) + offmap->netdev = altdev->netdev; + list_splice_init(&ondev->maps, &altdev->maps); + } else { + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); + } WARN_ON(!list_empty(&ondev->progs)); WARN_ON(!list_empty(&ondev->maps)); @@ -592,3 +608,34 @@ unlock: up_write(&bpf_devs_lock); } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); + +struct bpf_offload_dev *bpf_offload_dev_create(void) +{ + struct bpf_offload_dev *offdev; + int err; + + down_write(&bpf_devs_lock); + if (!offdevs_inited) { + err = rhashtable_init(&offdevs, &offdevs_params); + if (err) + return ERR_PTR(err); + offdevs_inited = true; + } + up_write(&bpf_devs_lock); + + offdev = kzalloc(sizeof(*offdev), GFP_KERNEL); + if (!offdev) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&offdev->netdevs); + + return offdev; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_create); + +void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) +{ + WARN_ON(!list_empty(&offdev->netdevs)); + kfree(offdev); +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); From b3808bbe9a8617b48d30b5402a2a3a87111ded68 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:26 -0700 Subject: [PATCH 0736/1640] UPSTREAM: bpf: offload: allow program and map sharing per-ASIC Allow programs and maps to be re-used across different netdevs, as long as they belong to the same struct bpf_offload_dev. Update the bpf_offload_prog_map_match() helper for the verifier and export a new helper for the drivers to use when checking programs at attachment time. 
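On the driver side the attachment-time check then looks roughly like this hypothetical ndo_bpf fragment (illustration only, not from this patch):

    /* reject offloaded programs loaded against a different ASIC */
    static int mydrv_xdp_setup(struct net_device *netdev,
                               struct bpf_prog *prog)
    {
            if (prog && bpf_prog_is_dev_bound(prog->aux) &&
                !bpf_offload_dev_match(prog, netdev))
                    return -EINVAL;
            /* ... hand the program to the firmware ... */
            return 0;
    }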
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/offload.c | 42 +++++++++++++++++++++++++++++++++++------- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fbc292601d3b..133c36357e2e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -673,6 +673,7 @@ int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev); +bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 925575f64ff1..177a52436394 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -511,22 +511,50 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) return 0; } -bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) +static bool __bpf_offload_dev_match(struct bpf_prog *prog, + struct net_device *netdev) { - struct bpf_offloaded_map *offmap; + struct bpf_offload_netdev *ondev1, *ondev2; struct bpf_prog_offload *offload; - bool ret; if (!bpf_prog_is_dev_bound(prog->aux)) return false; - if (!bpf_map_is_dev_bound(map)) - return bpf_map_offload_neutral(map); + + offload = prog->aux->offload; + if (!offload) + return false; + if (offload->netdev == netdev) + return true; + + ondev1 = bpf_offload_find_netdev(offload->netdev); + ondev2 = bpf_offload_find_netdev(netdev); + + return ondev1 && ondev2 && ondev1->offdev == ondev2->offdev; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) +{ + bool ret; down_read(&bpf_devs_lock); - offload = prog->aux->offload; + ret = __bpf_offload_dev_match(prog, netdev); + up_read(&bpf_devs_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_match); + +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap; + bool ret; + + if (!bpf_map_is_dev_bound(map)) + return bpf_map_offload_neutral(map); offmap = map_to_offmap(map); - ret = offload && offload->netdev == offmap->netdev; + down_read(&bpf_devs_lock); + ret = __bpf_offload_dev_match(prog, offmap->netdev); up_read(&bpf_devs_lock); return ret; From 7a835ae1102f6260f894554e72ef2531b3e5f6e3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 19 Jul 2018 22:14:31 -0700 Subject: [PATCH 0737/1640] UPSTREAM: bpf: btf: Clean up BTF_INT_BITS() in uapi btf.h This patch shrinks the BTF_INT_BITS() mask. The current btf_int_check_meta() ensures the nr_bits of an integer cannot exceed 64. Hence, it is mostly an uapi cleanup. The actual btf usage (i.e. seq_show()) is also modified to use u8 instead of u16. The verification (e.g. btf_int_check_meta()) path stays as is to deal with invalid BTF situation. 
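For reference, a worked example of the encoding being tightened (values spelled out for illustration, not taken from the patch):

    #include <stdio.h>
    #include <stdbool.h>
    #include <linux/btf.h>

    int main(void)
    {
            /* int_data for a signed 32-bit integer at bit offset 0:
             *   encoding -> bits 24-27 (BTF_INT_SIGNED)
             *   offset   -> bits 16-23
             *   nr_bits  -> low byte; <= 64, so a u8 always holds it
             */
            __u32 int_data = (1U << 24) | (0 << 16) | 32;   /* 0x01000020 */

            __u8 nr_bits = BTF_INT_BITS(int_data);          /* 32 */
            __u8 offset  = BTF_INT_OFFSET(int_data);        /* 0  */
            bool sign    = BTF_INT_ENCODING(int_data) & BTF_INT_SIGNED;

            printf("bits=%u off=%u signed=%d\n", nr_bits, offset, sign);
            return 0;
    }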
Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 2 +- kernel/bpf/btf.c | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 0b5ddbe135a4..972265f32871 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -76,7 +76,7 @@ struct btf_type { */ #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) -#define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) +#define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) /* Attributes stored in the BTF_INT_ENCODING */ #define BTF_INT_SIGNED (1 << 0) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e016ac3afa24..9704934252b3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -450,7 +450,7 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) */ static bool btf_type_int_is_regular(const struct btf_type *t) { - u16 nr_bits, nr_bytes; + u8 nr_bits, nr_bytes; u32 int_data; int_data = btf_type_int(t); @@ -993,12 +993,16 @@ static void btf_int_bits_seq_show(const struct btf *btf, { u16 left_shift_bits, right_shift_bits; u32 int_data = btf_type_int(t); - u16 nr_bits = BTF_INT_BITS(int_data); - u16 total_bits_offset; - u16 nr_copy_bytes; - u16 nr_copy_bits; + u8 nr_bits = BTF_INT_BITS(int_data); + u8 total_bits_offset; + u8 nr_copy_bytes; + u8 nr_copy_bits; u64 print_num; + /* + * bits_offset is at most 7. + * BTF_INT_OFFSET() cannot exceed 64 bits. + */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); @@ -1028,7 +1032,7 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, u32 int_data = btf_type_int(t); u8 encoding = BTF_INT_ENCODING(int_data); bool sign = encoding & BTF_INT_SIGNED; - u32 nr_bits = BTF_INT_BITS(int_data); + u8 nr_bits = BTF_INT_BITS(int_data); if (bits_offset || BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(nr_bits)) { From 317711ecaf0428f769b3c0bab5b9f715df338666 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 19 Jul 2018 15:56:59 +0800 Subject: [PATCH 0738/1640] UPSTREAM: bpfilter: Fix mismatch in function argument types Fix following warning: net/ipv4/bpfilter/sockopt.c:28:5: error: symbol 'bpfilter_ip_set_sockopt' redeclared with different type net/ipv4/bpfilter/sockopt.c:34:5: error: symbol 'bpfilter_ip_get_sockopt' redeclared with different type Signed-off-by: YueHaibing Acked-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpfilter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index 687b1760bb9f..f02cee0225d4 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -5,10 +5,10 @@ #include struct sock; -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval, +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen); -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval, - int *optlen); +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, + int __user *optlen); extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set); From 37cf08b51ca27f3beffb307613f0f78ddf663e95 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 20 Jul 2018 17:38:37 -0700 Subject: [PATCH 0739/1640] BACKPORT: bpf: btf: Ensure the member->offset is in the right order This patch ensures the member->offset of a struct is in the correct order (i.e the later member's offset cannot go backward). The current "pahole -J" BTF encoder does not generate something like this. However, checking this can ensure future encoder will not violate this. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9704934252b3..2590700237c1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1519,9 +1519,9 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, { bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; const struct btf_member *member; + u32 meta_needed, last_offset; struct btf *btf = env->btf; u32 struct_size = t->size; - u32 meta_needed; u16 i; meta_needed = btf_type_vlen(t) * sizeof(*member); @@ -1534,6 +1534,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, NULL); + last_offset = 0; for_each_member(i, t, member) { if (!btf_name_offset_valid(btf, member->name_off)) { btf_verifier_log_member(env, t, member, @@ -1555,6 +1556,16 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* + * ">" instead of ">=" because the last member could be + * "char a[0];" + */ + if (last_offset > member->offset) { + btf_verifier_log_member(env, t, member, + "Invalid member bits_offset"); + return -EINVAL; + } + if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { btf_verifier_log_member(env, t, member, "Memmber bits_offset exceeds its struct size"); @@ -1562,6 +1573,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, } btf_verifier_log_member(env, t, member, NULL); + last_offset = member->offset; } return meta_needed; From e3ecf4c2195c01e5e5c7c7b3f76229cfdc16a690 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 26 Jul 2018 09:57:59 -0700 Subject: [PATCH 0740/1640] BACKPORT: bpf: btf: Use exact btf value_size match in map_check_btf() The current map_check_btf() in BPF_MAP_TYPE_ARRAY rejects '> map->value_size' to ensure map_seq_show_elem() will not access things beyond an array element. Yonghong suggested that using '!=' is a more correct check. The 8 bytes round_up on value_size is stored in array->elem_size. Hence, using '!=' on map->value_size is a proper check. 
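Concretely (sizes invented for illustration): a map created with value_size = 12 gets array->elem_size = 16 after the round_up, while map->value_size stays 12. Under the old check a BTF value type like the 8-byte one below slipped through even though it does not describe the element; with '!=' only the exactly 12-byte type is accepted:

    struct map_value {              /* 12 bytes == map->value_size: ok */
            __u32 a, b, c;
    };

    struct too_small {              /* 8 bytes: passed '>', fails '!=' */
            __u32 a, b;
    };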
This patch also adds new tests to check the btf array key type and value type. Two of these new tests verify the btf's value_size (the change in this patch). It also fixes two existing tests that wrongly encoded a btf's type size (pprint_test) and the value_type_id (in one of the raw_tests[]). However, that do not affect these two BTF verification tests before or after this test changes. These two tests mainly failed at array creation time after this patch. Fixes: a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") Suggested-by: Yonghong Song Acked-by: Yonghong Song Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 0bd1006f6063..4a2a3b202f46 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -378,7 +378,7 @@ static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, return -EINVAL; value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size > map->value_size) + if (!value_type || value_size != map->value_size) return -EINVAL; return 0; From 711dab4663d0891862c9a3d812210aea1b7052d9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 28 Jul 2018 00:17:56 +0200 Subject: [PATCH 0741/1640] UPSTREAM: bpf: fix bpf_skb_load_bytes_relative pkt length check The len > skb_headlen(skb) cannot be used as a maximum upper bound for the packet length since it does not have any relation to the full linear packet length when filtering is used from upper layers (e.g. in case of reuseport BPF programs) as by then skb->data, skb->len already got mangled through __skb_pull() and others. Fixes: 4e1ec56cdc59 ("bpf: add skb_load_bytes_relative helper") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau --- net/core/filter.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 0fdba8c69b30..e660b9ea48a8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1712,24 +1712,26 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { + u8 *end = skb_tail_pointer(skb); + u8 *net = skb_network_header(skb); + u8 *mac = skb_mac_header(skb); u8 *ptr; - if (unlikely(offset > 0xffff || len > skb_headlen(skb))) + if (unlikely(offset > 0xffff || len > (end - mac))) goto err_clear; switch (start_header) { case BPF_HDR_START_MAC: - ptr = skb_mac_header(skb) + offset; + ptr = mac + offset; break; case BPF_HDR_START_NET: - ptr = skb_network_header(skb) + offset; + ptr = net + offset; break; default: goto err_clear; } - if (likely(ptr >= skb_mac_header(skb) && - ptr + len <= skb_tail_pointer(skb))) { + if (likely(ptr >= mac && ptr + len <= end)) { memcpy(to, ptr, len); return 0; } From 314f418d9b435aa63fe8ff431969f13df90978f7 Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Thu, 26 Jul 2018 02:10:40 +0000 Subject: [PATCH 0742/1640] UPSTREAM: bpf: add End.DT6 action to bpf_lwt_seg6_action helper The seg6local LWT provides the End.DT6 action, which allows to decapsulate an outer IPv6 header containing a Segment Routing Header (SRH), full specification is available here: https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-05 This patch adds this action now to the seg6local BPF interface. 
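For illustration, a seg6local BPF program could now invoke the action roughly as follows (the table id, section name and the BPF_REDIRECT return convention are assumptions of this sketch, not spelled out by the patch):

    #include <linux/bpf.h>
    #include <linux/seg6_local.h>
    #include "bpf_helpers.h"        /* SEC(), helper prototypes */

    SEC("lwt_seg6local")
    int do_end_dt6(struct __sk_buff *skb)
    {
            int table_id = 254;     /* e.g. the main routing table */

            if (bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_DT6,
                                    &table_id, sizeof(table_id)) < 0)
                    return BPF_DROP;
            /* the action already resolved the nexthop */
            return BPF_REDIRECT;
    }

    char _license[] SEC("license") = "GPL";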
Since it is not mandatory that the inner IPv6 header also contains a SRH, seg6_bpf_srh_state has been extended with a pointer to a possible SRH of the outermost IPv6 header. This helps assessing if the validation must be triggered or not, and avoids some calls to ipv6_find_hdr. v3: s/1/true, s/0/false for boolean values v2: - changed true/false -> 1/0 - preempt_enable no longer called in first conditional block Signed-off-by: Mathieu Xhonneux Signed-off-by: Daniel Borkmann --- include/net/seg6_local.h | 4 +- net/core/filter.c | 88 +++++++++++++++++++++++++++------------- net/ipv6/seg6_local.c | 50 +++++++++++++++-------- 3 files changed, 94 insertions(+), 48 deletions(-) diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h index 661fd5b4d3e0..08359e2d8b35 100644 --- a/include/net/seg6_local.h +++ b/include/net/seg6_local.h @@ -21,10 +21,12 @@ extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id); +extern bool seg6_bpf_has_valid_srh(struct sk_buff *skb); struct seg6_bpf_srh_state { - bool valid; + struct ipv6_sr_hdr *srh; u16 hdrlen; + bool valid; }; DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); diff --git a/net/core/filter.c b/net/core/filter.c index e660b9ea48a8..a35c912fcdcc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4550,26 +4550,28 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_tlvs, *srh_end, *ptr; - struct ipv6_sr_hdr *srh; int srhoff = 0; - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + if (srh == NULL) return -EINVAL; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (ptr >= srh_tlvs && ptr + len <= srh_end) - srh_state->valid = 0; + srh_state->valid = false; else if (ptr < (void *)&srh->flags || ptr + len > (void *)&srh->segments) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); memcpy(skb->data + offset, from, len); return 0; @@ -4585,52 +4587,78 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .arg4_type = ARG_CONST_SIZE }; +static void bpf_update_srh_state(struct sk_buff *skb) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + int srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { + srh_state->srh = NULL; + } else { + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_state->hdrlen = srh_state->srh->hdrlen << 3; + srh_state->valid = true; + } +} + BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, u32, action, void *, param, u32, param_len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); - struct ipv6_sr_hdr *srh; - int srhoff = 0; + int hdroff = 0; int err; - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) - return -EINVAL; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); - - if (!srh_state->valid) { - if (unlikely((srh_state->hdrlen & 7) != 0)) - return -EBADMSG; - - srh->hdrlen = (u8)(srh_state->hdrlen >> 3); - if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) - return -EBADMSG; - - srh_state->valid 
= 1; - } - switch (action) { case SEG6_LOCAL_ACTION_END_X: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; if (param_len != sizeof(struct in6_addr)) return -EINVAL; return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); case SEG6_LOCAL_ACTION_END_T: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_DT6: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + if (param_len != sizeof(int)) + return -EINVAL; + + if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) + return -EBADMSG; + if (!pskb_pull(skb, hdroff)) + return -EBADMSG; + + skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->encapsulation = 0; + + bpf_compute_data_pointers(skb); + bpf_update_srh_state(skb); + return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_B6: + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, param, param_len); if (!err) - srh_state->hdrlen = - ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + bpf_update_srh_state(skb); + return err; case SEG6_LOCAL_ACTION_END_B6_ENCAP: + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, param, param_len); if (!err) - srh_state->hdrlen = - ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + bpf_update_srh_state(skb); + return err; default: return -EINVAL; @@ -4652,15 +4680,14 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_end, *srh_tlvs, *ptr; - struct ipv6_sr_hdr *srh; struct ipv6hdr *hdr; int srhoff = 0; int ret; - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + if (unlikely(srh == NULL)) return -EINVAL; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + ((srh->first_segment + 1) << 4)); @@ -4690,8 +4717,11 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, hdr = (struct ipv6hdr *)skb->data; hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen += len; - srh_state->valid = 0; + srh_state->valid = false; return 0; } diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 0800ac74cedc..4d7087aeef5d 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -459,36 +459,57 @@ drop: DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); +bool seg6_bpf_has_valid_srh(struct sk_buff *skb) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; + + if (unlikely(srh == NULL)) + return false; + + if (unlikely(!srh_state->valid)) { + if ((srh_state->hdrlen & 7) != 0) + return false; + + srh->hdrlen = (u8)(srh_state->hdrlen >> 3); + if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)) + return false; + + srh_state->valid = true; + } + + return true; +} + static int input_action_end_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); - struct seg6_bpf_srh_state local_srh_state; struct 
ipv6_sr_hdr *srh; - int srhoff = 0; int ret; srh = get_and_validate_srh(skb); - if (!srh) - goto drop; + if (!srh) { + kfree_skb(skb); + return -EINVAL; + } advance_nextseg(srh, &ipv6_hdr(skb)->daddr); /* preempt_disable is needed to protect the per-CPU buffer srh_state, * which is also accessed by the bpf_lwt_seg6_* helpers */ preempt_disable(); + srh_state->srh = srh; srh_state->hdrlen = srh->hdrlen << 3; - srh_state->valid = 1; + srh_state->valid = true; rcu_read_lock(); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb); rcu_read_unlock(); - local_srh_state = *srh_state; - preempt_enable(); - switch (ret) { case BPF_OK: case BPF_REDIRECT: @@ -500,24 +521,17 @@ static int input_action_end_bpf(struct sk_buff *skb, goto drop; } - if (unlikely((local_srh_state.hdrlen & 7) != 0)) - goto drop; - - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) - goto drop; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); - srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3); - - if (!local_srh_state.valid && - unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) goto drop; + preempt_enable(); if (ret != BPF_REDIRECT) seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); drop: + preempt_enable(); kfree_skb(skb); return -EINVAL; } From 9a79b3aee0dd13358148dde9f49e31a41f03e1fe Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 30 Jul 2018 17:42:28 -0700 Subject: [PATCH 0743/1640] UPSTREAM: bpf: Support bpf_get_socket_cookie in more prog types The bpf_get_socket_cookie() helper can be used to identify skbs that correspond to the same socket. Though a socket cookie can be useful in many other use-cases where a socket is available in the program context. Specifically BPF_PROG_TYPE_CGROUP_SOCK_ADDR and BPF_PROG_TYPE_SOCK_OPS programs can benefit from it so that one of them can augment a value in a map prepared earlier by the other program for the same socket. The patch adds support to call bpf_get_socket_cookie() from BPF_PROG_TYPE_CGROUP_SOCK_ADDR and BPF_PROG_TYPE_SOCK_OPS. It doesn't introduce new helpers. Instead it reuses the same helper name bpf_get_socket_cookie() but adds support to this helper to accept `struct bpf_sock_addr` and `struct bpf_sock_ops`. Documentation in bpf.h is changed in a way that should not break automatic generation of markdown. Signed-off-by: Andrey Ignatov Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 14 ++++++++++++++ net/core/filter.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5837002914de..2afb34e17f28 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1371,6 +1371,20 @@ union bpf_attr { * A 8-byte long non-decreasing number on success, or 0 if the * socket field is missing inside *skb*. * + * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) + * Description + * Equivalent to bpf_get_socket_cookie() helper that accepts + * *skb*, but gets socket from **struct bpf_sock_addr** context. + * Return + * An 8-byte long non-decreasing number. + * + * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) + * Description + * Equivalent to bpf_get_socket_cookie() helper that accepts + * *skb*, but gets socket from **struct bpf_sock_ops** context. + * Return + * An 8-byte long non-decreasing number. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. 
If the socket diff --git a/net/core/filter.c b/net/core/filter.c index a35c912fcdcc..4a9bae13e12d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3815,6 +3815,30 @@ static const struct bpf_func_proto bpf_get_socket_cookie_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) +{ + return sock_gen_cookie(ctx->sk); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { + .func = bpf_get_socket_cookie_sock_addr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) +{ + return sock_gen_cookie(ctx->sk); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { + .func = bpf_get_socket_cookie_sock_ops, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -4828,6 +4852,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) default: return NULL; } + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_addr_proto; default: return bpf_base_func_proto(func_id); } @@ -4972,6 +4998,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_map_update_proto; case BPF_FUNC_sock_hash_update: return &bpf_sock_hash_update_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_ops_proto; default: return bpf_base_func_proto(func_id); } From 7a6b6b21c8076e278dcee039eb7281b58006b21f Mon Sep 17 00:00:00 2001 From: Arthur Fabre Date: Tue, 31 Jul 2018 18:17:22 +0100 Subject: [PATCH 0744/1640] BACKPORT: bpf: verifier: MOV64 don't mark dst reg unbounded When check_alu_op() handles a BPF_MOV64 between two registers, it calls check_reg_arg(DST_OP) on the dst register, marking it as unbounded. If the src and dst register are the same, this marks the src as unbounded, which can lead to unexpected errors for further checks that rely on bounds info. For example: BPF_MOV64_IMM(BPF_REG_2, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_2), BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), Results in: "math between ctx pointer and register with unbounded min value is not allowed" check_alu_op() now uses check_reg_arg(DST_OP_NO_MARK), and MOVs that need to mark the dst register (MOVIMM, MOV32) do so. Added a test case for MOV64 dst == src, and dst != src. 
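For illustration, a sketch of the dst != src flavour of the test case mentioned above (reconstructed here, not quoted from the commit; the surrounding program setup is assumed):

	BPF_MOV64_IMM(BPF_REG_3, 0),		/* R3 has known value 0 */
	BPF_MOV64_REG(BPF_REG_2, BPF_REG_3),	/* R2 inherits R3's bounds */
	BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), /* ctx + [0, 0] */
	BPF_MOV64_IMM(BPF_REG_0, 0),
	BPF_EXIT_INSN(),

With DST_OP_NO_MARK the move copies the known bounds of the src register into the dst register instead of marking it unbounded, so the pointer arithmetic on BPF_REG_1 now verifies in both the dst == src and dst != src variants.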
Signed-off-by: Arthur Fabre Acked-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0679bfa45f2d..ba8c29620415 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3495,8 +3495,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } - /* check dest operand */ - err = check_reg_arg(env, insn->dst_reg, DST_OP); + /* check dest operand, mark as required later */ + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; @@ -3522,6 +3522,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* case: R = imm * remember the value we stored into this reg */ + /* clear any state __mark_reg_known doesn't set */ + mark_reg_unknown(env, regs, insn->dst_reg); regs[insn->dst_reg].type = SCALAR_VALUE; if (BPF_CLASS(insn->code) == BPF_ALU64) { __mark_reg_known(regs + insn->dst_reg, From cfba61181315f9dbf3fb1dbc75164ce760f46395 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:17 -0700 Subject: [PATCH 0745/1640] UPSTREAM: bpf: add ability to charge bpf maps memory dynamically This commit extends the existing bpf maps memory charging API to support dynamic charging/uncharging. This is required to account memory used by maps, if all entries are created dynamically after the map initialization. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 62 ++++++++++++++++++++++++++++------------ 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 133c36357e2e..16d6eb6025b7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -440,6 +440,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); +int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); +void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 40996cc7d31e..eb57031abd54 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -182,32 +182,60 @@ int bpf_map_precharge_memlock(u32 pages) return 0; } -static int bpf_map_charge_memlock(struct bpf_map *map) +static int bpf_charge_memlock(struct user_struct *user, u32 pages) { - struct user_struct *user = get_current_user(); - unsigned long memlock_limit; + unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - atomic_long_add(map->pages, &user->locked_vm); - - if (atomic_long_read(&user->locked_vm) > memlock_limit) { - atomic_long_sub(map->pages, &user->locked_vm); - free_uid(user); + if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { + atomic_long_sub(pages, &user->locked_vm); return -EPERM; } - map->user = user; return 0; } -static void bpf_map_uncharge_memlock(struct bpf_map *map) +static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) +{ + atomic_long_sub(pages, &user->locked_vm); +} + +static int bpf_map_init_memlock(struct bpf_map *map) +{ + struct 
user_struct *user = get_current_user(); + int ret; + + ret = bpf_charge_memlock(user, map->pages); + if (ret) { + free_uid(user); + return ret; + } + map->user = user; + return ret; +} + +static void bpf_map_release_memlock(struct bpf_map *map) { struct user_struct *user = map->user; - - atomic_long_sub(map->pages, &user->locked_vm); + bpf_uncharge_memlock(user, map->pages); free_uid(user); } +int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) +{ + int ret; + + ret = bpf_charge_memlock(map->user, pages); + if (ret) + return ret; + map->pages += pages; + return ret; +} + +void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) +{ + bpf_uncharge_memlock(map->user, pages); + map->pages -= pages; +} + static int bpf_map_alloc_id(struct bpf_map *map) { int id; @@ -257,7 +285,7 @@ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - bpf_map_uncharge_memlock(map); + bpf_map_release_memlock(map); security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); @@ -493,7 +521,7 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_nouncharge; - err = bpf_map_charge_memlock(map); + err = bpf_map_init_memlock(map); if (err) goto free_map_sec; @@ -516,7 +544,7 @@ static int map_create(union bpf_attr *attr) return err; free_map: - bpf_map_uncharge_memlock(map); + bpf_map_release_memlock(map); free_map_sec: security_bpf_map_free(map); free_map_nouncharge: From 792730fafd07289be124619536043baac19a9c97 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:18 -0700 Subject: [PATCH 0746/1640] UPSTREAM: bpf: introduce cgroup storage maps This commit introduces BPF_MAP_TYPE_CGROUP_STORAGE maps: a special type of map implementing the cgroup storage. From the userspace point of view it's almost a generic hash map with the (cgroup inode id, attachment type) pair used as a key. The only difference is that some operations are restricted: 1) a user can't create new entries, 2) a user can't remove existing entries. The lookup from userspace is O(log(n)). 
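For illustration, a minimal userspace sketch of such a lookup (not part of the commit; the helper name here is made up, and the map fd and cgroup inode id are assumed to come from elsewhere):

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Hypothetical example: read the storage of one (cgroup, attach type)
	 * pair. 'map_fd' must refer to a BPF_MAP_TYPE_CGROUP_STORAGE map and
	 * 'value' must point to value_size bytes.
	 */
	static int cgroup_storage_lookup_user(int map_fd, __u64 cgroup_ino,
					      __u32 attach_type, void *value)
	{
		struct bpf_cgroup_storage_key key = {
			.cgroup_inode_id = cgroup_ino,
			.attach_type = attach_type,
		};
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.map_fd = map_fd;
		attr.key = (__u64)(unsigned long)&key;
		attr.value = (__u64)(unsigned long)value;

		return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
	}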
Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 38 ++++ include/linux/bpf.h | 1 + include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 6 + kernel/bpf/Makefile | 1 + kernel/bpf/local_storage.c | 376 +++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 + kernel/bpf/verifier.c | 12 ++ 8 files changed, 440 insertions(+) create mode 100644 kernel/bpf/local_storage.c diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d50c2f0a655a..7d00d58869ed 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -4,19 +4,39 @@ #include <linux/errno.h> #include <linux/jump_label.h> +#include <linux/rbtree.h> #include <uapi/linux/bpf.h> struct sock; struct sockaddr; struct cgroup; struct sk_buff; +struct bpf_map; +struct bpf_prog; struct bpf_sock_ops_kern; +struct bpf_cgroup_storage; #ifdef CONFIG_CGROUP_BPF extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +struct bpf_cgroup_storage_map; + +struct bpf_storage_buffer { + struct rcu_head rcu; + char data[0]; +}; + +struct bpf_cgroup_storage { + struct bpf_storage_buffer *buf; + struct bpf_cgroup_storage_map *map; + struct bpf_cgroup_storage_key key; + struct list_head list; + struct rb_node node; + struct rcu_head rcu; +}; + struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; @@ -77,6 +97,15 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog); +void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); +void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, + struct cgroup *cgroup, + enum bpf_attach_type type); +void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); +int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); +void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. 
*/ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -221,6 +250,15 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, + struct bpf_map *map) { return 0; } +static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, + struct bpf_map *map) {} +static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( + struct bpf_prog *prog) { return 0; } +static inline void bpf_cgroup_storage_free( + struct bpf_cgroup_storage *storage) {} + #define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 16d6eb6025b7..92a0fd73e001 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -287,6 +287,7 @@ struct bpf_prog_aux { struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ + struct bpf_map *cgroup_storage; char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY void *security; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index c5700c2d5549..add08be53b6f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -37,6 +37,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) #ifdef CONFIG_CGROUPS BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) #endif +#ifdef CONFIG_CGROUP_BPF +BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2afb34e17f28..8c8331a6fc2e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -75,6 +75,11 @@ struct bpf_lpm_trie_key { __u8 data[0]; /* Arbitrary size */ }; +struct bpf_cgroup_storage_key { + __u64 cgroup_inode_id; /* cgroup inode id */ + __u32 attach_type; /* program attach type */ +}; + /* BPF syscall commands, see bpf(2) man-page for details. 
*/ enum bpf_cmd { BPF_MAP_CREATE, @@ -120,6 +125,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, + BPF_MAP_TYPE_CGROUP_STORAGE, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1a7e6c1faf91..593abdb60ffa 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,6 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c new file mode 100644 index 000000000000..f23d3fdeba23 --- /dev/null +++ b/kernel/bpf/local_storage.c @@ -0,0 +1,376 @@ +//SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf-cgroup.h> +#include <linux/bpf.h> +#include <linux/bug.h> +#include <linux/filter.h> +#include <linux/mm.h> +#include <linux/rbtree.h> +#include <linux/slab.h> + +#ifdef CONFIG_CGROUP_BPF + +#define LOCAL_STORAGE_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + +struct bpf_cgroup_storage_map { + struct bpf_map map; + + spinlock_t lock; + struct bpf_prog *prog; + struct rb_root root; + struct list_head list; +}; + +static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) +{ + return container_of(map, struct bpf_cgroup_storage_map, map); +} + +static int bpf_cgroup_storage_key_cmp( + const struct bpf_cgroup_storage_key *key1, + const struct bpf_cgroup_storage_key *key2) +{ + if (key1->cgroup_inode_id < key2->cgroup_inode_id) + return -1; + else if (key1->cgroup_inode_id > key2->cgroup_inode_id) + return 1; + else if (key1->attach_type < key2->attach_type) + return -1; + else if (key1->attach_type > key2->attach_type) + return 1; + return 0; +} + +static struct bpf_cgroup_storage *cgroup_storage_lookup( + struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, + bool locked) +{ + struct rb_root *root = &map->root; + struct rb_node *node; + + if (!locked) + spin_lock_bh(&map->lock); + + node = root->rb_node; + while (node) { + struct bpf_cgroup_storage *storage; + + storage = container_of(node, struct bpf_cgroup_storage, node); + + switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { + case -1: + node = node->rb_left; + break; + case 1: + node = node->rb_right; + break; + default: + if (!locked) + spin_unlock_bh(&map->lock); + return storage; + } + } + + if (!locked) + spin_unlock_bh(&map->lock); + + return NULL; +} + +static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, + struct bpf_cgroup_storage *storage) +{ + struct rb_root *root = &map->root; + struct rb_node **new = &(root->rb_node), *parent = NULL; + + while (*new) { + struct bpf_cgroup_storage *this; + + this = container_of(*new, struct bpf_cgroup_storage, node); + + parent = *new; + switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { + case -1: + new = &((*new)->rb_left); + break; + case 1: + new = &((*new)->rb_right); + break; + default: + return -EEXIST; + } + } + + rb_link_node(&storage->node, parent, new); + rb_insert_color(&storage->node, root); + + return 0; +} + +static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + + storage = cgroup_storage_lookup(map, key, false); + if (!storage) + return NULL; + + return 
&READ_ONCE(storage->buf)->data[0]; +} + +static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, + void *value, u64 flags) +{ + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + struct bpf_storage_buffer *new; + + if (flags & BPF_NOEXIST) + return -EINVAL; + + storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, + key, false); + if (!storage) + return -ENOENT; + + new = kmalloc_node(sizeof(struct bpf_storage_buffer) + + map->value_size, __GFP_ZERO | GFP_USER, + map->numa_node); + if (!new) + return -ENOMEM; + + memcpy(&new->data[0], value, map->value_size); + + new = xchg(&storage->buf, new); + kfree_rcu(new, rcu); + + return 0; +} + +static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, + void *_next_key) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage_key *next = _next_key; + struct bpf_cgroup_storage *storage; + + spin_lock_bh(&map->lock); + + if (list_empty(&map->list)) + goto enoent; + + if (key) { + storage = cgroup_storage_lookup(map, key, true); + if (!storage) + goto enoent; + + storage = list_next_entry(storage, list); + if (!storage) + goto enoent; + } else { + storage = list_first_entry(&map->list, + struct bpf_cgroup_storage, list); + } + + spin_unlock_bh(&map->lock); + next->attach_type = storage->key.attach_type; + next->cgroup_inode_id = storage->key.cgroup_inode_id; + return 0; + +enoent: + spin_unlock_bh(&map->lock); + return -ENOENT; +} + +static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) +{ + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_cgroup_storage_map *map; + + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) + return ERR_PTR(-EINVAL); + + if (attr->value_size > PAGE_SIZE) + return ERR_PTR(-E2BIG); + + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); + + if (attr->max_entries) + /* max_entries is not used and enforced to be 0 */ + return ERR_PTR(-EINVAL); + + map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), + __GFP_ZERO | GFP_USER, numa_node); + if (!map) + return ERR_PTR(-ENOMEM); + + map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), + PAGE_SIZE) >> PAGE_SHIFT; + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&map->map, attr); + + spin_lock_init(&map->lock); + map->root = RB_ROOT; + INIT_LIST_HEAD(&map->list); + + return &map->map; +} + +static void cgroup_storage_map_free(struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + + WARN_ON(!RB_EMPTY_ROOT(&map->root)); + WARN_ON(!list_empty(&map->list)); + + kfree(map); +} + +static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +const struct bpf_map_ops cgroup_storage_map_ops = { + .map_alloc = cgroup_storage_map_alloc, + .map_free = cgroup_storage_map_free, + .map_get_next_key = cgroup_storage_get_next_key, + .map_lookup_elem = cgroup_storage_lookup_elem, + .map_update_elem = cgroup_storage_update_elem, + .map_delete_elem = cgroup_storage_delete_elem, +}; + +int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + int ret = -EBUSY; + + spin_lock_bh(&map->lock); + + if (map->prog && map->prog != prog) + goto unlock; + if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map) + goto unlock; + + map->prog = prog; + 
prog->aux->cgroup_storage = _map; + ret = 0; +unlock: + spin_unlock_bh(&map->lock); + + return ret; +} + +void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + + spin_lock_bh(&map->lock); + if (map->prog == prog) { + WARN_ON(prog->aux->cgroup_storage != _map); + map->prog = NULL; + prog->aux->cgroup_storage = NULL; + } + spin_unlock_bh(&map->lock); +} + +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog) +{ + struct bpf_cgroup_storage *storage; + struct bpf_map *map; + u32 pages; + + map = prog->aux->cgroup_storage; + if (!map) + return NULL; + + pages = round_up(sizeof(struct bpf_cgroup_storage) + + sizeof(struct bpf_storage_buffer) + + map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + if (bpf_map_charge_memlock(map, pages)) + return ERR_PTR(-EPERM); + + storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), + __GFP_ZERO | GFP_USER, map->numa_node); + if (!storage) { + bpf_map_uncharge_memlock(map, pages); + return ERR_PTR(-ENOMEM); + } + + storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + + map->value_size, __GFP_ZERO | GFP_USER, + map->numa_node); + if (!storage->buf) { + bpf_map_uncharge_memlock(map, pages); + kfree(storage); + return ERR_PTR(-ENOMEM); + } + + storage->map = (struct bpf_cgroup_storage_map *)map; + + return storage; +} + +void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) +{ + u32 pages; + struct bpf_map *map; + + if (!storage) + return; + + map = &storage->map->map; + pages = round_up(sizeof(struct bpf_cgroup_storage) + + sizeof(struct bpf_storage_buffer) + + map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + bpf_map_uncharge_memlock(map, pages); + + kfree_rcu(storage->buf, rcu); + kfree_rcu(storage, rcu); +} + +void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, + struct cgroup *cgroup, + enum bpf_attach_type type) +{ + struct bpf_cgroup_storage_map *map; + + if (!storage) + return; + + storage->key.attach_type = type; + storage->key.cgroup_inode_id = cgroup->kn->id.id; + + map = storage->map; + + spin_lock_bh(&map->lock); + WARN_ON(cgroup_storage_insert(map, storage)); + list_add(&storage->list, &map->list); + spin_unlock_bh(&map->lock); +} + +void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) +{ + struct bpf_cgroup_storage_map *map; + struct rb_root *root; + + if (!storage) + return; + + map = storage->map; + + spin_lock_bh(&map->lock); + root = &map->root; + rb_erase(&storage->node, root); + + list_del(&storage->list); + spin_unlock_bh(&map->lock); +} + +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index eb57031abd54..d90a0853eb1a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -974,6 +974,9 @@ static void free_used_maps(struct bpf_prog_aux *aux) { int i; + if (aux->cgroup_storage) + bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage); + for (i = 0; i < aux->used_map_cnt; i++) bpf_map_put(aux->used_maps[i]); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ba8c29620415..b489f547ab93 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5428,6 +5428,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && + bpf_cgroup_storage_assign(env->prog, map)) { + verbose(env, + "only one cgroup storage is allowed\n"); + fdput(f); + return -EBUSY; + } + fdput(f); next_insn: insn++; @@ -5454,6 +5462,10 @@ static void 
release_maps(struct bpf_verifier_env *env) { int i; + if (env->prog->aux->cgroup_storage) + bpf_cgroup_storage_release(env->prog, + env->prog->aux->cgroup_storage); + for (i = 0; i < env->used_map_cnt; i++) bpf_map_put(env->used_maps[i]); } From 335151868069a43533576d022b2af1b37cca82bd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:19 -0700 Subject: [PATCH 0747/1640] UPSTREAM: bpf: pass a pointer to a cgroup storage using pcpu variable This commit introduces the bpf_cgroup_storage_set() helper, which will be used to pass a pointer to a cgroup storage to the bpf helper. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 15 +++++++++++++++ kernel/bpf/local_storage.c | 2 ++ 2 files changed, 17 insertions(+) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 7d00d58869ed..9a144ddbbc8f 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -4,6 +4,7 @@ #include <linux/errno.h> #include <linux/jump_label.h> +#include <linux/percpu.h> #include <linux/rbtree.h> #include <uapi/linux/bpf.h> @@ -21,6 +22,8 @@ struct bpf_cgroup_storage; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +DECLARE_PER_CPU(void*, bpf_cgroup_storage); + struct bpf_cgroup_storage_map; struct bpf_storage_buffer { @@ -97,6 +100,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) +{ + struct bpf_storage_buffer *buf; + + if (!storage) + return; + + buf = READ_ONCE(storage->buf); + this_cpu_write(bpf_cgroup_storage, &buf->data[0]); +} + struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog); void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, @@ -250,6 +264,7 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map) { return 0; } static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index f23d3fdeba23..fc4e37f68f2a 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,6 +7,8 @@ #include <linux/rbtree.h> #include <linux/slab.h> +DEFINE_PER_CPU(void*, bpf_cgroup_storage); + #ifdef CONFIG_CGROUP_BPF #define LOCAL_STORAGE_CREATE_FLAG_MASK \ From 829b4834a98859253a39a84813c8404d2a6be4d4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:20 -0700 Subject: [PATCH 0748/1640] UPSTREAM: bpf: allocate cgroup storage entries on attaching bpf programs If a bpf program is using cgroup local storage, allocate a bpf_cgroup_storage structure automatically on attaching the program to a cgroup and save the pointer into the corresponding bpf_prog_list entry. Similarly, release the cgroup local storage on detaching of the bpf program. 
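For illustration, the userspace operation that now triggers the allocation (a sketch, not part of this patch; the fds, the helper name, and the chosen attach type are assumptions):

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Hypothetical example: attach 'prog_fd' to 'cgroup_fd'. If the
	 * program uses a cgroup storage map, __cgroup_bpf_attach() allocates
	 * the bpf_cgroup_storage entry on this path, and BPF_PROG_DETACH
	 * releases it again.
	 */
	static int attach_cgroup_prog(int cgroup_fd, int prog_fd)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.target_fd = cgroup_fd;
		attr.attach_bpf_fd = prog_fd;
		attr.attach_type = BPF_CGROUP_INET_EGRESS;

		return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
	}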
Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 1 + kernel/bpf/cgroup.c | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 9a144ddbbc8f..f91b0f8ff3a9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -43,6 +43,7 @@ struct bpf_cgroup_storage { struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; + struct bpf_cgroup_storage *storage; }; struct bpf_prog_array; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index badabb0b435c..935274c86bfe 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -34,6 +34,8 @@ void cgroup_bpf_put(struct cgroup *cgrp) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); bpf_prog_put(pl->prog); + bpf_cgroup_storage_unlink(pl->storage); + bpf_cgroup_storage_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -188,6 +190,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; + struct bpf_cgroup_storage *storage, *old_storage = NULL; struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; @@ -210,31 +213,47 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; + storage = bpf_cgroup_storage_alloc(prog); + if (IS_ERR(storage)) + return -ENOMEM; + if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) - if (pl->prog == prog) + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog) { /* disallow attaching the same prog twice */ + bpf_cgroup_storage_free(storage); return -EINVAL; + } + } pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) + if (!pl) { + bpf_cgroup_storage_free(storage); return -ENOMEM; + } + pl_was_allocated = true; pl->prog = prog; + pl->storage = storage; list_add_tail(&pl->node, progs); } else { if (list_empty(progs)) { pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) + if (!pl) { + bpf_cgroup_storage_free(storage); return -ENOMEM; + } pl_was_allocated = true; list_add_tail(&pl->node, progs); } else { pl = list_first_entry(progs, typeof(*pl), node); old_prog = pl->prog; + old_storage = pl->storage; + bpf_cgroup_storage_unlink(old_storage); pl_was_allocated = false; } pl->prog = prog; + pl->storage = storage; } cgrp->bpf.flags[type] = flags; @@ -257,10 +276,13 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } static_branch_inc(&cgroup_bpf_enabled_key); + if (old_storage) + bpf_cgroup_storage_free(old_storage); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_cgroup_storage_link(storage, cgrp, type); return 0; cleanup: @@ -276,6 +298,9 @@ cleanup: /* and cleanup the prog list */ pl->prog = old_prog; + bpf_cgroup_storage_free(pl->storage); + pl->storage = old_storage; + bpf_cgroup_storage_link(old_storage, cgrp, type); if (pl_was_allocated) { list_del(&pl->node); kfree(pl); @@ -356,6 +381,8 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); + bpf_cgroup_storage_unlink(pl->storage); + bpf_cgroup_storage_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ From 
5ece7747def81ee66ec921200b071115d1c71b1c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:21 -0700 Subject: [PATCH 0749/1640] UPSTREAM: bpf: extend bpf_prog_array to store pointers to the cgroup storage This patch converts bpf_prog_array from an array of prog pointers to an array of struct bpf_prog_array_item elements. This allows a cgroup storage pointer to be saved efficiently for each bpf program attached to a cgroup. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- drivers/media/rc/bpf-lirc.c | 10 +++-- include/linux/bpf.h | 19 +++++++--- kernel/bpf/cgroup.c | 21 +++++----- kernel/bpf/core.c | 76 +++++++++++++++++++------------------ 4 files changed, 70 insertions(+), 56 deletions(-) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 7f4aed108b41..5a0e26e47f59 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -196,14 +196,16 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample) */ void lirc_bpf_free(struct rc_dev *rcdev) { - struct bpf_prog **progs; + struct bpf_prog_array_item *item; if (!rcdev->raw->progs) return; - progs = rcu_dereference(rcdev->raw->progs)->progs; - while (*progs) - bpf_prog_put(*progs++); + item = rcu_dereference(rcdev->raw->progs)->items; + while (item->prog) { + bpf_prog_put(item->prog); + item++; + } bpf_prog_array_free(rcdev->raw->progs); } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 92a0fd73e001..8df18866d373 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -354,9 +354,14 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, * The 'struct bpf_prog_array *' should only be replaced with xchg() * since other cpus are walking the array of pointers in parallel. 
*/ +struct bpf_prog_array_item { + struct bpf_prog *prog; + struct bpf_cgroup_storage *cgroup_storage; +}; + struct bpf_prog_array { struct rcu_head rcu; - struct bpf_prog *progs[0]; + struct bpf_prog_array_item items[0]; }; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); @@ -377,7 +382,8 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ ({ \ - struct bpf_prog **_prog, *__prog; \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ u32 _ret = 1; \ preempt_disable(); \ @@ -385,10 +391,11 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, _array = rcu_dereference(array); \ if (unlikely(check_non_null && !_array))\ goto _out; \ - _prog = _array->progs; \ - while ((__prog = READ_ONCE(*_prog))) { \ - _ret &= func(__prog, ctx); \ - _prog++; \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + _ret &= func(_prog, ctx); \ + _item++; \ } \ _out: \ rcu_read_unlock(); \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 935274c86bfe..ddfa6cc13e57 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -117,15 +117,18 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - list_for_each_entry(pl, - &p->bpf.progs[type], node) { - if (!pl->prog) - continue; - progs->progs[cnt++] = pl->prog; - } - p = cgroup_parent(p); - } while (p); + if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + continue; + + list_for_each_entry(pl, &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + + progs->items[cnt].prog = pl->prog; + progs->items[cnt].cgroup_storage = pl->storage; + cnt++; + } + } while ((p = cgroup_parent(p))); rcu_assign_pointer(*array, progs); return 0; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index af398a32c4aa..4e3d8af37e8d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1632,7 +1632,8 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { if (prog_cnt) return kzalloc(sizeof(struct bpf_prog_array) + - sizeof(struct bpf_prog *) * (prog_cnt + 1), + sizeof(struct bpf_prog_array_item) * + (prog_cnt + 1), flags); return &empty_prog_array.hdr; @@ -1646,43 +1647,45 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } -int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +int bpf_prog_array_length(struct bpf_prog_array __rcu *array) { - struct bpf_prog **prog; + struct bpf_prog_array_item *item; u32 cnt = 0; rcu_read_lock(); - prog = rcu_dereference(progs)->progs; - for (; *prog; prog++) - if (*prog != &dummy_bpf_prog.prog) + item = rcu_dereference(array)->items; + for (; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) cnt++; rcu_read_unlock(); return cnt; } -static bool bpf_prog_array_copy_core(struct bpf_prog **prog, + +static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, u32 *prog_ids, u32 request_cnt) { + struct bpf_prog_array_item *item; int i = 0; - for (; *prog; prog++) { - if (*prog == &dummy_bpf_prog.prog) + item = rcu_dereference(array)->items; + for (; item->prog; item++) { + if (item->prog == &dummy_bpf_prog.prog) continue; - prog_ids[i] = (*prog)->aux->id; + prog_ids[i] = item->prog->aux->id; if (++i == request_cnt) { - prog++; + item++; break; } } - return !!(*prog); + return !!(item->prog); } -int 
bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, __u32 __user *prog_ids, u32 cnt) { - struct bpf_prog **prog; unsigned long err = 0; bool nospc; u32 *ids; @@ -1701,8 +1704,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, if (!ids) return -ENOMEM; rcu_read_lock(); - prog = rcu_dereference(progs)->progs; - nospc = bpf_prog_array_copy_core(prog, ids, cnt); + nospc = bpf_prog_array_copy_core(array, ids, cnt); rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); @@ -1713,14 +1715,14 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return 0; } -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, struct bpf_prog *old_prog) { - struct bpf_prog **prog = progs->progs; + struct bpf_prog_array_item *item = array->items; - for (; *prog; prog++) - if (*prog == old_prog) { - WRITE_ONCE(*prog, &dummy_bpf_prog.prog); + for (; item->prog; item++) + if (item->prog == old_prog) { + WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } @@ -1731,7 +1733,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; - struct bpf_prog **existing_prog; + struct bpf_prog_array_item *existing; struct bpf_prog_array *array; bool found_exclude = false; int new_prog_idx = 0; @@ -1740,15 +1742,15 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, * the new array. */ if (old_array) { - existing_prog = old_array->progs; - for (; *existing_prog; existing_prog++) { - if (*existing_prog == exclude_prog) { + existing = old_array->items; + for (; existing->prog; existing++) { + if (existing->prog == exclude_prog) { found_exclude = true; continue; } - if (*existing_prog != &dummy_bpf_prog.prog) + if (existing->prog != &dummy_bpf_prog.prog) carry_prog_cnt++; - if (*existing_prog == include_prog) + if (existing->prog == include_prog) return -EEXIST; } } @@ -1774,15 +1776,17 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, /* Fill in the new prog array */ if (carry_prog_cnt) { - existing_prog = old_array->progs; - for (; *existing_prog; existing_prog++) - if (*existing_prog != exclude_prog && - *existing_prog != &dummy_bpf_prog.prog) - array->progs[new_prog_idx++] = *existing_prog; + existing = old_array->items; + for (; existing->prog; existing++) + if (existing->prog != exclude_prog && + existing->prog != &dummy_bpf_prog.prog) { + array->items[new_prog_idx++].prog = + existing->prog; + } } if (include_prog) - array->progs[new_prog_idx++] = include_prog; - array->progs[new_prog_idx] = NULL; + array->items[new_prog_idx++].prog = include_prog; + array->items[new_prog_idx].prog = NULL; *new_array = array; return 0; } @@ -1791,7 +1795,6 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { - struct bpf_prog **prog; u32 cnt = 0; if (array) @@ -1804,8 +1807,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, return 0; /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ - prog = rcu_dereference_check(array, 1)->progs; - return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC + return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? 
-ENOSPC : 0; } From fc4ba65ef648da358a7270de34fe5c4bc30e7b1f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:22 -0700 Subject: [PATCH 0750/1640] UPSTREAM: bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE BPF_MAP_TYPE_CGROUP_STORAGE maps are special in a way that the access from the bpf program side is lookup-free. That means the result is guaranteed to be a valid pointer to the cgroup storage; no NULL-check is required. This patch introduces the BPF_PTR_TO_MAP_VALUE return type, which is required to make the verifier accept programs that do not check the map value pointer for NULL. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8df18866d373..b2a85c6d2b6b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -160,6 +160,7 @@ enum bpf_arg_type { enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ + RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b489f547ab93..636c0a9b4b49 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2516,8 +2516,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || + fn->ret_type == RET_PTR_TO_MAP_VALUE) { + if (fn->ret_type == RET_PTR_TO_MAP_VALUE) + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + else + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; From 9d0ff803af82d8a892f55f30a28cd33c5c6f9694 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:23 -0700 Subject: [PATCH 0751/1640] UPSTREAM: bpf: don't allow create maps of cgroup local storages As there is a one-to-one relation between a bpf program and a cgroup local storage map, there is no sense in creating a map of cgroup local storage maps. Forbid it explicitly to avoid possible side effects. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/map_in_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 1878aace6a5c..051c5e40792c 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -24,7 +24,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) * is a runtime binding. Doing static check alone * in the verifier is not enough. 
*/ - if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) { fdput(f); return ERR_PTR(-ENOTSUPP); } From 553c5f47c8d957b3243853fbb087ff3af4218a6f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:24 -0700 Subject: [PATCH 0752/1640] BACKPORT: bpf: introduce the bpf_get_local_storage() helper function The bpf_get_local_storage() helper function is used to get a pointer to the bpf local storage from a bpf program. It takes a pointer to a storage map and flags as arguments. Right now it accepts only cgroup storage maps, and flags argument has to be 0. Further it can be extended to support other types of local storage: e.g. thread local storage etc. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 18 ++++++++++++++++++ kernel/bpf/cgroup.c | 2 ++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 20 ++++++++++++++++++++ kernel/bpf/verifier.c | 18 ++++++++++++++++++ net/core/filter.c | 23 ++++++++++++++++++++++- 7 files changed, 83 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b2a85c6d2b6b..e2acc9f012eb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -805,6 +805,8 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_local_storage_proto; + /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8c8331a6fc2e..05b4aa406331 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2095,6 +2095,24 @@ union bpf_attr { * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. + * + * void* get_local_storage(void *map, u64 flags) + * Description + * Get the pointer to the local storage area. + * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the bpf program type, a local storage area + * can be shared between multiple instances of the bpf program, + * running simultaneously. + * + * A user should care about the synchronization by himself. + * For example, by using the BPF_STX_XADD instruction to alter + * the shared data. + * Return + * Pointer to the local storage area. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ddfa6cc13e57..0a4fe5a7dc91 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -684,6 +684,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_delete_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4e3d8af37e8d..26aa24deaf02 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1886,6 +1886,7 @@ const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3c347cfed616..c54e3ac03389 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -205,4 +205,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +DECLARE_PER_CPU(void*, bpf_cgroup_storage); + +BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) +{ + /* map and flags arguments are not used now, + * but provide an ability to extend the API + * for other types of local storages. + * verifier checks that their values are correct. + */ + return (unsigned long) this_cpu_read(bpf_cgroup_storage); +} + +const struct bpf_func_proto bpf_get_local_storage_proto = { + .func = bpf_get_local_storage, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 636c0a9b4b49..8e3fdd5315ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2148,6 +2148,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_CGROUP_STORAGE: + if (func_id != BPF_FUNC_get_local_storage) + goto error; + break; /* devmap returns a pointer to a live net_device ifindex that we cannot * allow to be modified from bpf side. So do not allow lookup elements * for now. @@ -2230,6 +2234,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; + case BPF_FUNC_get_local_storage: + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + goto error; + break; default: break; } @@ -2504,6 +2512,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs = cur_regs(env); + + /* check that flags argument in get_local_storage(map, flags) is 0, + * this is required because get_local_storage() can't return an error. 
+ */ + if (func_id == BPF_FUNC_get_local_storage && + !register_is_null(®s[BPF_REG_2])) { + verbose(env, "get_local_storage() doesn't support non-zero flags\n"); + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); diff --git a/net/core/filter.c b/net/core/filter.c index 4a9bae13e12d..5b9368c23999 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4830,6 +4830,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) */ case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -4854,6 +4856,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -4876,6 +4880,17 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; + default: + return sk_filter_func_proto(func_id, prog); + } +} + static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5000,6 +5015,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5019,6 +5036,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5046,6 +5065,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -6850,7 +6871,7 @@ const struct bpf_prog_ops xdp_prog_ops = { }; const struct bpf_verifier_ops cg_skb_verifier_ops = { - .get_func_proto = sk_filter_func_proto, + .get_func_proto = cg_skb_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; From 2611adc8796cc059d66a12979048ccc8478d88da Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:27 -0700 Subject: [PATCH 0753/1640] UPSTREAM: bpf/test_run: support cgroup local storage Allocate a temporary cgroup storage to use for bpf program test runs. Because the test program is not actually attached to a cgroup, the storage is allocated manually just for the execution of the bpf program. If the program is executed multiple times, the storage is not zeroed on each run, emulating multiple runs of the program, attached to a real cgroup. 
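For illustration, a minimal sketch of a repeated test run from userspace (not part of the commit; the helper name and parameters are assumptions). Because one temporary storage backs all iterations, state the program accumulates in its cgroup storage carries over from run to run:

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Hypothetical example: run 'prog_fd' 100 times over one packet. */
	static int test_run_repeated(int prog_fd, void *pkt, __u32 pkt_len,
				     __u32 *retval)
	{
		union bpf_attr attr;
		int err;

		memset(&attr, 0, sizeof(attr));
		attr.test.prog_fd = prog_fd;
		attr.test.data_in = (__u64)(unsigned long)pkt;
		attr.test.data_size_in = pkt_len;
		attr.test.repeat = 100;

		err = syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
		if (!err && retval)
			*retval = attr.test.retval;
		return err;
	}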
Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- net/bpf/test_run.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index d733d5e1eec3..e0c6dfae42d8 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -11,12 +11,14 @@ #include #include -static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx) +static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, + struct bpf_cgroup_storage *storage) { u32 ret; preempt_disable(); rcu_read_lock(); + bpf_cgroup_storage_set(storage); ret = BPF_PROG_RUN(prog, ctx); rcu_read_unlock(); preempt_enable(); @@ -26,14 +28,19 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx) static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) { + struct bpf_cgroup_storage *storage = NULL; u64 time_start, time_spent = 0; u32 ret = 0, i; + storage = bpf_cgroup_storage_alloc(prog); + if (IS_ERR(storage)) + return PTR_ERR(storage); + if (!repeat) repeat = 1; time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - ret = bpf_test_run_one(prog, ctx); + ret = bpf_test_run_one(prog, ctx, storage); if (need_resched()) { if (signal_pending(current)) break; @@ -46,6 +53,8 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) do_div(time_spent, repeat); *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; + bpf_cgroup_storage_free(storage); + return ret; } From 9a025706068e108da22e3e21a6be1606ac97168e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 6 Aug 2018 14:27:28 -0700 Subject: [PATCH 0754/1640] UPSTREAM: bpf: introduce update_effective_progs() The __cgroup_bpf_attach() and __cgroup_bpf_detach() functions have a good amount of duplicated code, which can be eliminated by introducing the update_effective_progs() helper function. update_effective_progs() calls compute_effective_progs() and then, in case of success, calls activate_effective_progs() for each descendant cgroup. In case of failure (OOM), it releases the allocated prog arrays and returns the error code. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 99 +++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 54 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 0a4fe5a7dc91..6a7d931bbc55 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -177,6 +177,45 @@ cleanup: return -ENOMEM; } +static int update_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type) +{ + struct cgroup_subsys_state *css; + int err; + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + return 0; + +cleanup: + /* oom while computing effective. 
Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + return err; +} + #define BPF_CGROUP_MAX_PROGS 64 /** @@ -194,7 +233,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage, *old_storage = NULL; - struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; int err; @@ -261,22 +299,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, cgrp->bpf.flags[type] = flags; - /* allocate and recompute effective prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - err = compute_effective_progs(desc, type, &desc->bpf.inactive); - if (err) - goto cleanup; - } - - /* all allocations were successful. Activate all prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - activate_effective_progs(desc, type, desc->bpf.inactive); - desc->bpf.inactive = NULL; - } + err = update_effective_progs(cgrp, type); + if (err) + goto cleanup; static_branch_inc(&cgroup_bpf_enabled_key); if (old_storage) @@ -289,16 +314,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, return 0; cleanup: - /* oom while computing effective. Free all computed effective arrays - * since they were not activated - */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - bpf_prog_array_free(desc->bpf.inactive); - desc->bpf.inactive = NULL; - } - /* and cleanup the prog list */ pl->prog = old_prog; bpf_cgroup_storage_free(pl->storage); @@ -326,7 +341,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog *old_prog = NULL; - struct cgroup_subsys_state *css; struct bpf_prog_list *pl; int err; @@ -365,22 +379,9 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; } - /* allocate and recompute effective prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - err = compute_effective_progs(desc, type, &desc->bpf.inactive); - if (err) - goto cleanup; - } - - /* all allocations were successful. Activate all prog arrays */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - activate_effective_progs(desc, type, desc->bpf.inactive); - desc->bpf.inactive = NULL; - } + err = update_effective_progs(cgrp, type); + if (err) + goto cleanup; /* now can actually delete it from this cgroup list */ list_del(&pl->node); @@ -396,16 +397,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, return 0; cleanup: - /* oom while computing effective. 
Free all computed effective arrays - * since they were not activated - */ - css_for_each_descendant_pre(css, &cgrp->self) { - struct cgroup *desc = container_of(css, struct cgroup, self); - - bpf_prog_array_free(desc->bpf.inactive); - desc->bpf.inactive = NULL; - } - /* and restore back old_prog */ pl->prog = old_prog; return err; From 63399953cc624b43042956966cfcd52173bf17b4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 8 Aug 2018 19:23:13 +0200 Subject: [PATCH 0755/1640] UPSTREAM: bpf, sockmap: fix bpf_tcp_sendmsg sock error handling While working on the bpf_tcp_sendmsg() code, I noticed that when a sk->sk_err is set we error out with err = sk->sk_err. However, this is problematic since sk->sk_err is a positive error value and therefore we will neither go into sk_stream_error() nor will we report an error back to user space. I hit this case with EPIPE: since EPIPE is a positive value, user space thought sendmsg() had succeeded and that we had submitted 32 bytes. Fix it by negating the sk->sk_err value. Fixes: 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 24127c3e4066..4a301ec712be 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1050,7 +1050,7 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int copy; if (sk->sk_err) { - err = sk->sk_err; + err = -sk->sk_err; goto out_err; } From c7f7d31f1768e196195d35cc2511152238a0c783 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 8 Aug 2018 19:23:14 +0200 Subject: [PATCH 0756/1640] UPSTREAM: bpf, sockmap: fix leak in bpf_tcp_sendmsg wait for mem path In bpf_tcp_sendmsg() the sk_alloc_sg() may fail. In the case of ENOMEM, it may also mean that we've partially filled the scatterlist entries with pages. Later jumping to sk_stream_wait_memory() we could further fail with an error for several reasons, however we fail to call free_start_sg() if the local sk_msg_buff was used. Fixes: 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 4a301ec712be..daf92f9df000 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1045,7 +1045,7 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); while (msg_data_left(msg)) { - struct sk_msg_buff *m; + struct sk_msg_buff *m = NULL; bool enospc = false; int copy; @@ -1113,8 +1113,11 @@ wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: err = sk_stream_wait_memory(sk, &timeo); - if (err) + if (err) { + if (m && m != psock->cork) + free_start_sg(sk, m); goto out_err; + } } out_err: if (err < 0) From f6dce64ec8774b9cf09c568ed63ef0afa47addaf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 8 Aug 2018 23:00:34 +0200 Subject: [PATCH 0757/1640] UPSTREAM: xdp: fix bug in cpumap teardown code path When removing a cpumap entry, a number of synchronization steps happen. Eventually the teardown code __cpu_map_entry_free is invoked from/via call_rcu.
The teardown code __cpu_map_entry_free() flushes remaining xdp_frames by invoking bq_flush_to_queue, which calls xdp_return_frame_rx_napi(). The issue is that the teardown code is not running in the RX NAPI code path. Thus, it is not allowed to invoke the NAPI variant of xdp_return_frame. This bug was found and triggered by using the --stress-mode option to the samples/bpf program xdp_redirect_cpu. It is hard to trigger, because the ptr_ring has to be full, the cpumap bulk queue holds at most 8 packets, and a remote CPU is racing to empty the ptr_ring queue. Fixes: 389ab7f01af9 ("xdp: introduce xdp_return_frame_rx_napi") Tested-by: Jean-Tsung Hsiao Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e0918d180f08..46f5f29605d4 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -69,7 +69,7 @@ struct bpf_cpu_map { }; static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq); + struct xdp_bulk_queue *bq, bool in_napi_ctx); static u64 cpu_map_bitmap_size(const union bpf_attr *attr) { @@ -375,7 +375,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(rcpu, bq); + bq_flush_to_queue(rcpu, bq, false); } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ @@ -558,7 +558,7 @@ const struct bpf_map_ops cpu_map_ops = { }; static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq) + struct xdp_bulk_queue *bq, bool in_napi_ctx) { unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; @@ -578,7 +578,10 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame_rx_napi(xdpf); + if (likely(in_napi_ctx)) + xdp_return_frame_rx_napi(xdpf); + else + xdp_return_frame(xdpf); } processed++; } @@ -598,7 +601,7 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(rcpu, bq); + bq_flush_to_queue(rcpu, bq, true); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver code invoking us to finish, due to driver @@ -661,7 +664,7 @@ void __cpu_map_flush(struct bpf_map *map) /* Flush all frames in bulkq to real queue */ bq = this_cpu_ptr(rcpu->bulkq); - bq_flush_to_queue(rcpu, bq); + bq_flush_to_queue(rcpu, bq, true); /* If already running, costs spin_lock_irqsave + smp_mb */ wake_up_process(rcpu->kthread); From c948f50e0320517670195937ffc004df118b0377 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 8 Aug 2018 23:00:45 +0200 Subject: [PATCH 0758/1640] UPSTREAM: xdp: fix bug in devmap teardown code path Like the cpumap teardown, the devmap teardown code also flushes remaining xdp_frames via bq_xmit_all() when a map entry is removed. The code can call xdp_return_frame_rx_napi() from the wrong context in case ndo_xdp_xmit() fails.
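Both teardown fixes follow the same pattern: the flush routine is told by its caller whether it runs in NAPI context and picks the matching frame-return variant. A condensed kernel-side sketch of that pattern (the wrapper name is ours; the two xdp_return_frame variants are the real APIs):

/* Drop an xdp_frame safely in either context: flushes driven from the
 * RX NAPI path may use the faster recycling variant, while teardown
 * paths running from call_rcu must use the plain one. */
static void bq_drop_frame(struct xdp_frame *xdpf, bool in_napi_ctx)
{
	if (likely(in_napi_ctx))
		xdp_return_frame_rx_napi(xdpf);
	else
		xdp_return_frame(xdpf);
}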
Fixes: 389ab7f01af9 ("xdp: introduce xdp_return_frame_rx_napi") Fixes: 735fc4054b3a ("xdp: change ndo_xdp_xmit API to support bulking") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index c3c1d567829b..11e691223c90 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -220,7 +220,8 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) } static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq, u32 flags) + struct xdp_bulk_queue *bq, u32 flags, + bool in_napi_ctx) { struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; @@ -257,7 +258,10 @@ error: struct xdp_frame *xdpf = bq->q[i]; /* RX path under NAPI protection, can return frames faster */ - xdp_return_frame_rx_napi(xdpf); + if (likely(in_napi_ctx)) + xdp_return_frame_rx_napi(xdpf); + else + xdp_return_frame(xdpf); drops++; } goto out; @@ -289,7 +293,7 @@ void __dev_map_flush(struct bpf_map *map) __clear_bit(bit, bitmap); bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); } } @@ -319,7 +323,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq, 0); + bq_xmit_all(obj, bq, 0, true); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -388,7 +392,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) __clear_bit(dev->bit, bitmap); bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); } } } From b8830c4b7500e7ee328d8c7aaf36299aa7b173f6 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:15 +0900 Subject: [PATCH 0759/1640] UPSTREAM: bpf: Make redirect_info accessible from modules We are going to add a kern_flags field to redirect_info for kernel-internal use. In order to avoid a function call to access the flags, make redirect_info accessible from modules. Also, as it is now non-static, add the bpf_ prefix to redirect_info. v6: - Fix sparse warning around EXPORT_SYMBOL. Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 10 ++++++++++ net/core/filter.c | 29 +++++++++++------------------ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index fadc99166e35..0219780d9310 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -606,6 +606,16 @@ struct sk_msg_buff { struct list_head list; }; +struct bpf_redirect_info { + u32 ifindex; + u32 flags; + struct bpf_map *map; + struct bpf_map *map_to_flush; + unsigned long map_owner; +}; + +DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!)
diff --git a/net/core/filter.c b/net/core/filter.c index 5b9368c23999..a869c7311e72 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2087,19 +2087,12 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; -struct redirect_info { - u32 ifindex; - u32 flags; - struct bpf_map *map; - struct bpf_map *map_to_flush; - unsigned long map_owner; -}; - -static DEFINE_PER_CPU(struct redirect_info, redirect_info); +DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags & ~(BPF_F_INGRESS))) return TC_ACT_SHOT; @@ -2112,7 +2105,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) int skb_do_redirect(struct sk_buff *skb) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); @@ -3201,7 +3194,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, void xdp_do_flush_map(void) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; @@ -3246,7 +3239,7 @@ static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3286,7 +3279,7 @@ err: int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *fwd; u32 index = ri->ifindex; int err; @@ -3318,7 +3311,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3369,7 +3362,7 @@ err: int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->ifindex; struct net_device *fwd; int err = 0; @@ -3400,7 +3393,7 @@ EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; @@ -3424,7 +3417,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, unsigned long, map_owner) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; From 
59a8f7abe7a9bbce7ee4a6466859dba153a8b808 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 9 Aug 2018 08:55:19 -0700 Subject: [PATCH 0760/1640] UPSTREAM: bpf: fix bpffs non-array map seq_show issue In function map_seq_next() of kernel/bpf/inode.c, the first key will be the "0" regardless of the map type. This works for arrays. But for hash types, if key "0" happens to be in the map, the bpffs map show will miss some items if key "0" is not the first element of the first bucket. This patch fixes the issue by guaranteeing to get the first element, when the seq_show has just started, by passing a NULL key pointer to the map_get_next_key() callback. This way, no elements will be missed in the bpffs hash table show even if key "0" is in the map. Fixes: a26ca7c982cb5 ("bpf: btf: Add pretty print support to the basic arraymap") Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index d50c7d7dbba2..2ebc5bfea762 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -196,19 +196,21 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct bpf_map *map = seq_file_to_map(m); void *key = map_iter(m)->key; + void *prev_key; if (map_iter(m)->done) return NULL; if (unlikely(v == SEQ_START_TOKEN)) - goto done; + prev_key = NULL; + else + prev_key = key; - if (map->ops->map_get_next_key(map, key, key)) { + if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; return NULL; } -done: ++(*pos); return key; } From f0446a1255828583057925c17f914416b08c295d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 9 Aug 2018 08:55:20 -0700 Subject: [PATCH 0761/1640] UPSTREAM: bpf: btf: add pretty print for hash/lru_hash maps Commit a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") added pretty-print support to the array map. This patch adds pretty print for hash and lru_hash maps. The following example shows the pretty-print result of a pinned hashmap: struct map_value { int count_a; int count_b; }; cat /sys/fs/bpf/pinned_hash_map: 87907: {87907,87908} 57354: {37354,57355} 76625: {76625,76626} ... Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 36b383f2781d..65a024ad3f2c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -11,9 +11,11 @@ * General Public License for more details.
*/ #include +#include #include #include #include +#include #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -1176,6 +1178,44 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } +static void htab_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + rcu_read_lock(); + + value = htab_map_lookup_elem(map, key); + if (!value) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_puts(m, "\n"); + + rcu_read_unlock(); +} + +static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size != map->value_size) + return -EINVAL; + + return 0; +} + const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1185,6 +1225,8 @@ const struct bpf_map_ops htab_map_ops = { .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, + .map_seq_show_elem = htab_map_seq_show_elem, + .map_check_btf = htab_map_check_btf, }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1197,6 +1239,8 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, + .map_seq_show_elem = htab_map_seq_show_elem, + .map_check_btf = htab_map_check_btf, }; /* Called from eBPF program */ From 97aa8c1fdf501d7dea468cf3a52ca269fb44f48c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:22 -0700 Subject: [PATCH 0762/1640] BACKPORT: net: Add ID (if needed) to sock_reuseport and expose reuseport_lock A later patch will introduce a BPF_MAP_TYPE_REUSEPORT_ARRAY which allows a SO_REUSEPORT sk to be added to a bpf map. When a sk is removed from reuse->socks[], it also needs to be removed from the bpf map. Also, when adding a sk to a bpf map, the bpf map needs to ensure it is indeed in a reuse->socks[]. Hence, reuseport_lock is needed by the bpf map to ensure its map_update_elem() and map_delete_elem() operations are in sync with the reuse->socks[]. The BPF_MAP_TYPE_REUSEPORT_ARRAY map will only acquire the reuseport_lock after ensuring the sk being added is already in a reuseport group (i.e. reuse->socks[]). The map_lookup_elem() will be lockless. This patch also adds an ID to sock_reuseport. A later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT which allows a bpf prog to select a sk from a bpf map. It is inflexible to statically enforce that a bpf map can only contain sks belonging to a particular reuse->socks[] (i.e. the same IP:PORT) at bpf verification time. For example, think about the map-in-map situation where the inner map can be changed dynamically at runtime and the outer map may have inner maps belonging to different reuseport groups. Hence, when the bpf prog (in the new BPF_PROG_TYPE_SK_REUSEPORT type) selects a sk, this selected sk has to be checked to ensure it belongs to the requesting reuseport group (i.e. the group serving that IP:PORT).
The "sk->sk_reuseport_cb" pointer cannot be used for this checking purpose because the pointer value will change after reuseport_grow(). Instead of saving all checking conditions like the ones preced calling "reuseport_add_sock()" and compare them everytime a bpf_prog is run, a 32bits ID is introduced to survive the reuseport_grow(). The ID is only acquired if any of the reuse->socks[] is added to the newly introduced "BPF_MAP_TYPE_REUSEPORT_ARRAY" map. If "BPF_MAP_TYPE_REUSEPORT_ARRAY" is not used, the changes in this patch is a no-op. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/net/sock_reuseport.h | 6 ++++++ net/core/sock_reuseport.c | 27 ++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 0054b3a9b923..c5330848cae1 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -5,13 +5,18 @@ #include #include #include +#include #include +extern spinlock_t reuseport_lock; + struct sock_reuseport { struct rcu_head rcu; u16 max_socks; /* length of socks */ u16 num_socks; /* elements in socks */ + /* ID stays the same even after the size of socks[] grows. */ + unsigned int reuseport_id; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; @@ -25,5 +30,6 @@ extern struct sock *reuseport_select_sock(struct sock *sk, int hdr_len); extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); +int reuseport_get_id(struct sock_reuseport *reuse); #endif /* _SOCK_REUSEPORT_H */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 676092d7bd81..24d198b9e418 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -8,11 +8,33 @@ #include #include +#include #include #define INIT_SOCKS 128 -static DEFINE_SPINLOCK(reuseport_lock); +DEFINE_SPINLOCK(reuseport_lock); + +#define REUSEPORT_MIN_ID 1 +static DEFINE_IDA(reuseport_ida); + +int reuseport_get_id(struct sock_reuseport *reuse) +{ + int id; + + if (reuse->reuseport_id) + return reuse->reuseport_id; + + id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0, + /* Called under reuseport_lock */ + GFP_ATOMIC); + if (id < 0) + return id; + + reuse->reuseport_id = id; + + return reuse->reuseport_id; +} static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { @@ -78,6 +100,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->max_socks = more_socks_size; more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; + more_reuse->reuseport_id = reuse->reuseport_id; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); @@ -101,6 +124,8 @@ static void reuseport_free_rcu(struct rcu_head *head) reuse = container_of(head, struct sock_reuseport, rcu); if (reuse->prog) bpf_prog_destroy(reuse->prog); + if (reuse->reuseport_id) + ida_simple_remove(&reuseport_ida, reuse->reuseport_id); kfree(reuse); } From 54514c7d0601db037ba4143646c67d69b4432d21 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:24 -0700 Subject: [PATCH 0763/1640] BACKPORT: bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY This patch introduces a new map type BPF_MAP_TYPE_REUSEPORT_SOCKARRAY. To unleash the full potential of a bpf prog, it is essential for the userspace to be capable of directly setting up a bpf map which can then be consumed by the bpf prog to make decision. 
In this case, deciding which SO_REUSEPORT sk should serve the incoming request. By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, userspace has total control of and visibility into where a SO_REUSEPORT sk should be located in a bpf map. A later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that the bpf prog can directly select a sk from the bpf map. That will raise the programmability of the bpf prog attached to a reuseport group (a group of sks serving the same IP:PORT). For example, in UDP, the bpf prog can peek into the payload (e.g. through the "data" pointer introduced in the later patch) to learn the application level's connection information and then decide which sk to pick from a bpf map. The userspace can tightly couple the sk's location in a bpf map with the application logic in generating the UDP payload's connection information. This connection-info contract/API stays within userspace. Also, when used with map-in-map, the userspace can switch the old-server-process's inner map to a new-server-process's inner map in one call "bpf_map_update_elem(outer_map, &index, &new_reuseport_array)". The bpf prog will then direct incoming requests to the new process instead of the old process. The old process can finish draining the pending requests (e.g. by "accept()") before closing the old-fds. [Note that deleting a fd from a bpf map does not necessarily mean the fd is closed] During map_update_elem(), only a SO_REUSEPORT sk (i.e. one which has already been added to a reuse->socks[]) can be used. That means a SO_REUSEPORT sk that has done "bind()" for UDP or "bind()+listen()" for TCP. These conditions are ensured in "reuseport_array_update_check()". A SO_REUSEPORT sk can only be added once to a map (i.e. the same sk cannot be added twice even to the same map). SO_REUSEPORT already allows another sk to be created for the same IP:PORT. There is no need to re-create a similar usage on the BPF side. When a SO_REUSEPORT sk is deleted from the "reuse->socks[]" (e.g. "close()"), it will notify the bpf map to remove it from the map also. It is done through "bpf_sk_reuseport_detach()" and it will only be called if >=1 of the "reuse->socks[]" has ever been added to a bpf map. The map_update()/map_delete() has to be in sync with the "reuse->socks[]". Hence, the same "reuseport_lock" used by "reuse->socks[]" has to be used here also. Care has been taken to ensure the lock is only acquired when the sk being added passes some strict tests, and freeing the map does not require the reuseport_lock. The reuseport_array will also support lookup from the syscall side. It will return a sock_gen_cookie(). The sock_gen_cookie() is on-demand (i.e. a sk's cookie is not generated until the very first map_lookup_elem()). The lookup cookie is 64 bits, but it goes against the logical userspace expectation of a 32-bit sizeof(fd) (and as other fd-based bpf maps do also). It may catch users by surprise if we enforce value_size=8 while userspace still passes a 32-bit fd during update. Supporting different value_size between lookup and update seems unintuitive also. We also need to consider whether other existing fd-based maps will want to return a 64-bit value from the syscall's lookup in the future. Hence, reuseport_array supports both value_size 4 and 8, and we assume users will usually use value_size=4. The syscall's lookup will return ENOSPC on value_size=4. It will only return the 64-bit value from sock_gen_cookie() when the user consciously chooses value_size=8 (as a signal that lookup is desired), which then requires a 64-bit value in both lookup and update.
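From the syscall side, using the new map type is plain bpf_map_update_elem()/bpf_map_lookup_elem(). A hypothetical libbpf sketch (the function name and index are illustrative; sock_fd must be a "bind()"-ed SO_REUSEPORT socket as described above):

#include <stdint.h>
#include <bpf/bpf.h>

/* Place a SO_REUSEPORT socket at index 0 of a reuseport sockarray that
 * was created with value_size=8. The fd is passed widened to 64 bits on
 * update; a later lookup of the same slot returns the 64-bit socket
 * cookie rather than the fd. */
static int add_reuseport_sk(int map_fd, int sock_fd)
{
	uint32_t key = 0;
	uint64_t value = sock_fd;	/* must be <= S32_MAX */

	return bpf_map_update_elem(map_fd, &key, &value, BPF_NOEXIST);
}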
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 28 +++ include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 3 + kernel/bpf/arraymap.c | 2 +- kernel/bpf/reuseport_array.c | 363 +++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 + net/core/sock_reuseport.c | 8 + 8 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/reuseport_array.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e2acc9f012eb..ad2d1c71a07d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -529,6 +529,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); +int array_map_alloc_check(union bpf_attr *attr); static inline bool unprivileged_ebpf_enabled(void) { @@ -785,6 +786,33 @@ static inline void __xsk_map_flush(struct bpf_map *map) } #endif +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); +#else +static inline void bpf_sk_reuseport_detach(struct sock *sk) +{ +} + +#ifdef CONFIG_BPF_SYSCALL +static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, + void *key, void *value) +{ + return -EOPNOTSUPP; +} + +static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, + void *key, void *value, + u64 map_flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_SYSCALL */ +#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index add08be53b6f..14fd6c02d258 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -60,4 +60,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) #endif +#ifdef CONFIG_INET +BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) +#endif #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 05b4aa406331..cb9f5adebf37 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -126,6 +126,7 @@ enum bpf_map_type { BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 593abdb60ffa..ffc39a7e028d 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -24,3 +24,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 4a2a3b202f46..019f19fe56d2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static int array_map_alloc_check(union bpf_attr *attr) +int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); 
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c new file mode 100644 index 000000000000..18e225de80ff --- /dev/null +++ b/kernel/bpf/reuseport_array.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Facebook + */ +#include +#include +#include +#include + +struct reuseport_array { + struct bpf_map map; + struct sock __rcu *ptrs[]; +}; + +static struct reuseport_array *reuseport_array(struct bpf_map *map) +{ + return (struct reuseport_array *)map; +} + +/* The caller must hold the reuseport_lock */ +void bpf_sk_reuseport_detach(struct sock *sk) +{ + struct sock __rcu **socks; + + write_lock_bh(&sk->sk_callback_lock); + socks = sk->sk_user_data; + if (socks) { + WRITE_ONCE(sk->sk_user_data, NULL); + /* + * Do not move this NULL assignment outside of + * sk->sk_callback_lock because there is + * a race with reuseport_array_free() + * which does not hold the reuseport_lock. + */ + RCU_INIT_POINTER(*socks, NULL); + } + write_unlock_bh(&sk->sk_callback_lock); +} + +static int reuseport_array_alloc_check(union bpf_attr *attr) +{ + if (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) + return -EINVAL; + + return array_map_alloc_check(attr); +} + +static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return rcu_dereference(array->ptrs[index]); +} + +/* Called from syscall only */ +static int reuseport_array_delete_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + struct sock *sk; + int err; + + if (index >= map->max_entries) + return -E2BIG; + + if (!rcu_access_pointer(array->ptrs[index])) + return -ENOENT; + + spin_lock_bh(&reuseport_lock); + + sk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + WRITE_ONCE(sk->sk_user_data, NULL); + RCU_INIT_POINTER(array->ptrs[index], NULL); + write_unlock_bh(&sk->sk_callback_lock); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&reuseport_lock); + + return err; +} + +static void reuseport_array_free(struct bpf_map *map) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *sk; + u32 i; + + synchronize_rcu(); + + /* + * ops->map_*_elem() will not be able to access this + * array now. Hence, this function only races with + * bpf_sk_reuseport_detach() which was triggered by + * close() or disconnect(). + * + * This function and bpf_sk_reuseport_detach() are + * both removing sk from "array". Who removes it + * first does not matter. + * + * The only concern here is bpf_sk_reuseport_detach() + * may access "array" which is being freed here. + * bpf_sk_reuseport_detach() accesses this "array" + * through sk->sk_user_data _and_ with sk->sk_callback_lock + * held which is enough because this "array" is not freed + * until all sk->sk_user_data has stopped referencing this "array". + * + * Hence, due to the above, taking "reuseport_lock" is not + * needed here. + */ + + /* + * Since reuseport_lock is not taken, sk is accessed under + * rcu_read_lock() + */ + rcu_read_lock(); + for (i = 0; i < map->max_entries; i++) { + sk = rcu_dereference(array->ptrs[i]); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + /* + * No need for WRITE_ONCE(). At this point, + * no one is reading it without taking the + * sk->sk_callback_lock.
+ */ + sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); + RCU_INIT_POINTER(array->ptrs[i], NULL); + } + } + rcu_read_unlock(); + + /* + * Once we reach here, no sk->sk_user_data is referencing + * this "array" any more. "array" can be freed now. + */ + bpf_map_area_free(array); +} + +static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) +{ + int err, numa_node = bpf_map_attr_numa_node(attr); + struct reuseport_array *array; + u64 cost, array_size; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + array_size = sizeof(*array); + array_size += (u64)attr->max_entries * sizeof(struct sock *); + + /* make sure there is no u32 overflow later in round_up() */ + cost = array_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-ENOMEM); + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(cost); + if (err) + return ERR_PTR(err); + + /* allocate all map elements and zero-initialize them */ + array = bpf_map_area_alloc(array_size, numa_node); + if (!array) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&array->map, attr); + array->map.pages = cost; + + return &array->map; +} + +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct sock *sk; + int err; + + if (map->value_size != sizeof(u64)) + return -ENOSPC; + + rcu_read_lock(); + sk = reuseport_array_lookup_elem(map, key); + if (sk) { + *(u64 *)value = sock_gen_cookie(sk); + err = 0; + } else { + err = -ENOENT; + } + rcu_read_unlock(); + + return err; +} + +static int +reuseport_array_update_check(const struct reuseport_array *array, + const struct sock *nsk, + const struct sock *osk, + const struct sock_reuseport *nsk_reuse, + u32 map_flags) +{ + if (osk && map_flags == BPF_NOEXIST) + return -EEXIST; + + if (!osk && map_flags == BPF_EXIST) + return -ENOENT; + + if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) + return -ENOTSUPP; + + if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) + return -ENOTSUPP; + + if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) + return -ENOTSUPP; + + /* + * sk must be hashed (i.e. listening in the TCP case or bound + * in the UDP case) and + * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). + * + * Also, sk will be used in a bpf helper that is protected by + * rcu_read_lock(). + */ + if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) + return -EINVAL; + + /* READ_ONCE because the sk->sk_callback_lock may not be held here */ + if (READ_ONCE(nsk->sk_user_data)) + return -EBUSY; + + return 0; +} + +/* + * Called from syscall only. + * The "nsk" is held via the fd refcnt. + * The "osk" and "reuse" are protected by reuseport_lock.
+ */ +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *free_osk = NULL, *osk, *nsk; + struct sock_reuseport *reuse; + u32 index = *(u32 *)key; + struct socket *socket; + int err, fd; + + if (map_flags > BPF_EXIST) + return -EINVAL; + + if (index >= map->max_entries) + return -E2BIG; + + if (map->value_size == sizeof(u64)) { + u64 fd64 = *(u64 *)value; + + if (fd64 > S32_MAX) + return -EINVAL; + fd = fd64; + } else { + fd = *(int *)value; + } + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + nsk = socket->sk; + if (!nsk) { + err = -EINVAL; + goto put_file; + } + + /* Quick checks before taking reuseport_lock */ + err = reuseport_array_update_check(array, nsk, + rcu_access_pointer(array->ptrs[index]), + rcu_access_pointer(nsk->sk_reuseport_cb), + map_flags); + if (err) + goto put_file; + + spin_lock_bh(&reuseport_lock); + /* + * Some of the checks only need reuseport_lock + * but it is done under sk_callback_lock also + * for simplicity reason. + */ + write_lock_bh(&nsk->sk_callback_lock); + + osk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); + if (err) + goto put_file_unlock; + + /* Ensure reuse->reuseport_id is set */ + err = reuseport_get_id(reuse); + if (err < 0) + goto put_file_unlock; + + WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + rcu_assign_pointer(array->ptrs[index], nsk); + free_osk = osk; + err = 0; + +put_file_unlock: + write_unlock_bh(&nsk->sk_callback_lock); + + if (free_osk) { + write_lock_bh(&free_osk->sk_callback_lock); + WRITE_ONCE(free_osk->sk_user_data, NULL); + write_unlock_bh(&free_osk->sk_callback_lock); + } + + spin_unlock_bh(&reuseport_lock); +put_file: + fput(socket->file); + return err; +} + +/* Called from syscall */ +static int reuseport_array_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = key ? 
*(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +const struct bpf_map_ops reuseport_array_ops = { + .map_alloc_check = reuseport_array_alloc_check, + .map_alloc = reuseport_array_alloc, + .map_free = reuseport_array_free, + .map_lookup_elem = reuseport_array_lookup_elem, + .map_get_next_key = reuseport_array_get_next_key, + .map_delete_elem = reuseport_array_delete_elem, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d90a0853eb1a..a3da88e8427e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -685,6 +685,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) @@ -805,6 +807,10 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_fd_htab_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + /* rcu_read_lock() is not needed */ + err = bpf_fd_reuseport_array_update_elem(map, key, value, + attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 24d198b9e418..859d999df418 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -185,6 +185,14 @@ void reuseport_detach_sock(struct sock *sk) spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + + /* At least one sk in this reuseport group is added to + * a bpf map. Notify the bpf side. The bpf map logic will + * remove the sk if it is indeed added to a bpf map. + */ + if (reuse->reuseport_id) + bpf_sk_reuseport_detach(sk); + rcu_assign_pointer(sk->sk_reuseport_cb, NULL); for (i = 0; i < reuse->num_socks; i++) { From 15a200a1f41f8053f675cd4262b98a229233de1e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 13 Sep 2017 13:58:15 -0700 Subject: [PATCH 0764/1640] UPSTREAM: net: Convert int functions to bool The global function inet_rcv_saddr_equal and the static functions ipv6_rcv_saddr_equal and ipv4_rcv_saddr_equal currently return int. bool is slightly more descriptive for these functions, so change their return type from int to bool. Signed-off-by: Joe Perches Signed-off-by: David S.
Miller --- include/net/addrconf.h | 4 ++-- net/ipv4/inet_connection_sock.c | 36 ++++++++++++++++----------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 8aa6c6496dc9..7985a68033ea 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -95,8 +95,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); -int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard); +bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6a8e8ceb46e2..958eefeaed38 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -39,11 +39,11 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, * and 0.0.0.0 equals to 0.0.0.0 only */ -static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, - const struct in6_addr *sk2_rcv_saddr6, - __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, - bool sk1_ipv6only, bool sk2_ipv6only, - bool match_wildcard) +static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, + const struct in6_addr *sk2_rcv_saddr6, + __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk1_ipv6only, bool sk2_ipv6only, + bool match_wildcard) { int addr_type = ipv6_addr_type(sk1_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; @@ -52,29 +52,29 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) - return 1; + return true; if (!sk1_rcv_saddr || !sk2_rcv_saddr) return match_wildcard; } - return 0; + return false; } if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) - return 1; + return true; if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; + return true; if (addr_type == IPV6_ADDR_ANY && match_wildcard && !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; + return true; if (sk2_rcv_saddr6 && ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) - return 1; + return true; - return 0; + return false; } #endif @@ -82,20 +82,20 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, * match_wildcard == false: addresses must be exactly the same, i.e. 
* 0.0.0.0 only equals to 0.0.0.0 */ -static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, - bool sk2_ipv6only, bool match_wildcard) +static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk2_ipv6only, bool match_wildcard) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) - return 1; + return true; if (!sk1_rcv_saddr || !sk2_rcv_saddr) return match_wildcard; } - return 0; + return false; } -int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard) +bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) From 1af8c89812878e9b3f271010d1080d4263e1481e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:25 -0700 Subject: [PATCH 0765/1640] BACKPORT: bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY. Like other non SK_FILTER/CGROUP_SKB programs, it requires CAP_SYS_ADMIN. BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern" to store the bpf context instead of using the skb->cb[48]. At the SO_REUSEPORT sk lookup time, it is in the middle of transiting from a lower layer (ipv4/ipv6) to an upper layer (udp/tcp). At this point, it is not always clear where the bpf context can be appended in the skb->cb[48] to avoid saving-and-restoring cb[]. Even putting aside the differences between ipv4-vs-ipv6 and udp-vs-tcp, it is not clear whether the lower layer will only ever be ipv4 and ipv6, nor whether it will avoid touching the cb[] again before transiting to the upper layer. For example, in udp_gro_receive(), it uses the 48 byte NAPI_GRO_CB instead of IP[6]CB and it may still modify the cb[] after calling the udp[46]_lib_lookup_skb(). Because of the above, if skb->cb is used for the bpf ctx, saving-and-restoring is needed and likely the whole 48-byte cb[] has to be saved and restored. Instead of saving, setting and restoring the cb[], this patch opts to create a new "struct sk_reuseport_kern" and set the needed values in there. The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)" will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol specific usage at this point and it is also in line with the current sock_reuseport.c implementation (i.e. no protocol specific requirement). In "struct sk_reuseport_md", this patch exposes data/data_end/len with semantics similar to other existing usages. Together with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()", the bpf prog can peek anywhere in the skb. The "bind_inany" tells the bpf prog that the reuseport group is bind-ed to a local INANY address which cannot be learned from the skb. The new "bind_inany" is added to "struct sock_reuseport" which will be used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order to avoid repeating the "bind INANY" test on "sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can only be properly initialized when a "sk->sk_reuseport" enabled sk is added to a hashtable (i.e. during "reuseport_alloc()" and "reuseport_add_sock()"). The new "sk_select_reuseport()" is the main helper that the bpf prog will use to select a SO_REUSEPORT sk. It is the only function that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY. As mentioned in the earlier patch, the validity of a selected sk is checked at run time in "sk_select_reuseport()", as the sketch below illustrates.
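A minimal sketch of the BPF side, written against today's libbpf conventions (the BTF-style map definition, section names, and fixed index 0 are illustrative assumptions, not part of this patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} selector_map SEC(".maps");

SEC("sk_reuseport")
int select_sock(struct sk_reuseport_md *reuse_md)
{
	__u32 index = 0;

	/* Ask the kernel to pick the sk stored at index 0; the helper
	 * fails if that sk does not belong to this reuseport group. */
	if (bpf_sk_select_reuseport(reuse_md, &selector_map, &index, 0))
		return SK_DROP;

	return SK_PASS;	/* the kernel then uses the recorded selected_sk */
}

char _license[] SEC("license") = "GPL";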
Doing the check at verification time is difficult and inflexible (consider the map-in-map use case). The runtime check is to compare the selected sk's reuseport_id with the reuseport_id that we want. This helper will return -EXXX if the selected sk cannot serve the incoming request (e.g. the reuseport_id does not match). The bpf prog can decide if it wants to do SK_DROP at its discretion. When the bpf prog returns SK_PASS, the kernel will check if a valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL"). If it has, it will use the selected sk. If not, the kernel will select one from "reuse->socks[]" (as before this patch). The SK_DROP and SK_PASS handling logic will be in the next patch. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 3 + include/linux/filter.h | 15 ++ include/net/addrconf.h | 1 + include/net/sock_reuseport.h | 6 +- include/uapi/linux/bpf.h | 33 ++++ kernel/bpf/verifier.c | 9 ++ net/core/filter.c | 269 +++++++++++++++++++++++++++++++- net/core/sock_reuseport.c | 20 ++- net/ipv4/inet_connection_sock.c | 9 ++ net/ipv4/inet_hashtables.c | 5 +- net/ipv4/udp.c | 5 +- 11 files changed, 363 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 14fd6c02d258..cd26c090e7c0 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #endif +#ifdef CONFIG_INET +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/filter.h b/include/linux/filter.h index 0219780d9310..ba829a5d138a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -32,6 +32,7 @@ struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; +struct sock_reuseport; /* ArgX, context and stack frame pointer register positions.
Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -877,6 +878,20 @@ void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); struct sock *do_msg_redirect_map(struct sk_msg_buff *md); +#ifdef CONFIG_INET +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash); +#else +static inline struct sock * +bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + return NULL; +} +#endif + #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 7985a68033ea..0b07d5e0ab1b 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -97,6 +97,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); +bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index c5330848cae1..267507214cdf 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -17,12 +17,14 @@ struct sock_reuseport { u16 num_socks; /* elements in socks */ /* ID stays the same even after the size of socks[] grows. */ unsigned int reuseport_id; + bool bind_inany; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; -extern int reuseport_alloc(struct sock *sk); -extern int reuseport_add_sock(struct sock *sk, struct sock *sk2); +extern int reuseport_alloc(struct sock *sk, bool bind_inany); +extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, + bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cb9f5adebf37..f935738e45c3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -151,6 +151,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, }; enum bpf_attach_type { @@ -2114,6 +2115,14 @@ union bpf_attr { * the shared data. * Return * Pointer to the local storage area. + * + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map + * It checks the selected sk is matching the incoming + * request in the skb. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2458,6 +2467,30 @@ struct sk_msg_md { __u32 local_port; /* stored in host byte order */ }; +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + void *data; + void *data_end; /* End of directly accessible data */ + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". 
+ */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e3fdd5315ef..fb492914fe0f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1340,6 +1340,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_REUSEPORT: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; @@ -2187,6 +2188,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_hash) goto error; break; + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + if (func_id != BPF_FUNC_sk_select_reuseport) + goto error; + break; default: break; } @@ -2238,6 +2243,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) goto error; break; + case BPF_FUNC_sk_select_reuseport: + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + goto error; + break; default: break; } diff --git a/net/core/filter.c b/net/core/filter.c index a869c7311e72..dc02d74e7a4c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1462,7 +1462,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) return -ENOMEM; if (sk_unhashed(sk) && sk->sk_reuseport) { - err = reuseport_alloc(sk); + err = reuseport_alloc(sk, false); if (err) return err; } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -7023,3 +7023,270 @@ out: release_sock(sk); return ret; } + +#ifdef CONFIG_INET +struct sk_reuseport_kern { + struct sk_buff *skb; + struct sock *sk; + struct sock *selected_sk; + void *data_end; + u32 hash; + u32 reuseport_id; + bool bind_inany; +}; + +static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, + struct sock_reuseport *reuse, + struct sock *sk, struct sk_buff *skb, + u32 hash) +{ + reuse_kern->skb = skb; + reuse_kern->sk = sk; + reuse_kern->selected_sk = NULL; + reuse_kern->data_end = skb->data + skb_headlen(skb); + reuse_kern->hash = hash; + reuse_kern->reuseport_id = reuse->reuseport_id; + reuse_kern->bind_inany = reuse->bind_inany; +} + +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + struct sk_reuseport_kern reuse_kern; + enum sk_action action; + + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); + action = BPF_PROG_RUN(prog, &reuse_kern); + + if (action == SK_PASS) + return reuse_kern.selected_sk; + else + return ERR_PTR(-ECONNREFUSED); +} + +BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, + struct bpf_map *, map, void *, key, u32, flags) +{ + struct sock_reuseport *reuse; + struct sock *selected_sk; + + selected_sk = map->ops->map_lookup_elem(map, key); + if (!selected_sk) + return -ENOENT; + + reuse = rcu_dereference(selected_sk->sk_reuseport_cb); + if (!reuse) + /* selected_sk is unhashed (e.g. by close()) after the + * above map_lookup_elem(). Treat selected_sk has already + * been removed from the map. 
+ */ + return -ENOENT; + + if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { + struct sock *sk; + + if (unlikely(!reuse_kern->reuseport_id)) + /* There is a small race between adding the + * sk to the map and setting the + * reuse_kern->reuseport_id. + * Treat it as the sk has not been added to + * the bpf map yet. + */ + return -ENOENT; + + sk = reuse_kern->sk; + if (sk->sk_protocol != selected_sk->sk_protocol) + return -EPROTOTYPE; + else if (sk->sk_family != selected_sk->sk_family) + return -EAFNOSUPPORT; + + /* Catch all. Likely bound to a different sockaddr. */ + return -EBADFD; + } + + reuse_kern->selected_sk = selected_sk; + + return 0; +} + +static const struct bpf_func_proto sk_select_reuseport_proto = { + .func = sk_select_reuseport, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(sk_reuseport_load_bytes, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len) +{ + return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { + .func = sk_reuseport_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(sk_reuseport_load_bytes_relative, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len, u32, start_header) +{ + return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, + len, start_header); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { + .func = sk_reuseport_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_reuseport_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_select_reuseport: + return &sk_select_reuseport_proto; + case BPF_FUNC_skb_load_bytes: + return &sk_reuseport_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &sk_reuseport_load_bytes_relative_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool +sk_reuseport_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const u32 size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct sk_reuseport_md) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct sk_reuseport_md, data): + info->reg_type = PTR_TO_PACKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, hash): + return size == size_default; + + /* Fields that allow narrowing */ + case offsetof(struct sk_reuseport_md, eth_protocol): + if (size < FIELD_SIZEOF(struct sk_buff, protocol)) + return false; + case offsetof(struct sk_reuseport_md, ip_protocol): + case offsetof(struct sk_reuseport_md, bind_inany): + case offsetof(struct sk_reuseport_md, len): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + + default: + 
return false; + } +} + +#define SK_REUSEPORT_LOAD_FIELD(F) ({ \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + si->dst_reg, si->src_reg, \ + bpf_target_off(struct sk_reuseport_kern, F, \ + FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + target_size)); \ + }) + +#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sk_buff, \ + skb, \ + SKB_FIELD) + +#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD, BPF_SIZE, EXTRA_OFF) + +static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_reuseport_md, data): + SK_REUSEPORT_LOAD_SKB_FIELD(data); + break; + + case offsetof(struct sk_reuseport_md, len): + SK_REUSEPORT_LOAD_SKB_FIELD(len); + break; + + case offsetof(struct sk_reuseport_md, eth_protocol): + SK_REUSEPORT_LOAD_SKB_FIELD(protocol); + break; + + case offsetof(struct sk_reuseport_md, ip_protocol): + BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE); + SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, + BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian + * aware. No further narrowing or masking is needed. + */ + *target_size = 1; + break; + + case offsetof(struct sk_reuseport_md, data_end): + SK_REUSEPORT_LOAD_FIELD(data_end); + break; + + case offsetof(struct sk_reuseport_md, hash): + SK_REUSEPORT_LOAD_FIELD(hash); + break; + + case offsetof(struct sk_reuseport_md, bind_inany): + SK_REUSEPORT_LOAD_FIELD(bind_inany); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops sk_reuseport_verifier_ops = { + .get_func_proto = sk_reuseport_func_proto, + .is_valid_access = sk_reuseport_is_valid_access, + .convert_ctx_access = sk_reuseport_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_reuseport_prog_ops = { +}; +#endif /* CONFIG_INET */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 859d999df418..7a5e3493f0ff 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -51,7 +51,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) return reuse; } -int reuseport_alloc(struct sock *sk) +int reuseport_alloc(struct sock *sk, bool bind_inany) { struct sock_reuseport *reuse; @@ -63,9 +63,17 @@ int reuseport_alloc(struct sock *sk) /* Allocation attempts can occur concurrently via the setsockopt path * and the bind/hash path. Nothing to do when we lose the race. */ - if (rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock))) + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (reuse) { + /* Only set reuse->bind_inany if the bind_inany is true. + * Otherwise, it will overwrite the reuse->bind_inany + * which was set by the bind/hash path. 
+ */ + if (bind_inany) + reuse->bind_inany = bind_inany; goto out; + } reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) { @@ -75,6 +83,7 @@ int reuseport_alloc(struct sock *sk) reuse->socks[0] = sk; reuse->num_socks = 1; + reuse->bind_inany = bind_inany; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -101,6 +110,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; + more_reuse->bind_inany = reuse->bind_inany; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); @@ -135,12 +145,12 @@ static void reuseport_free_rcu(struct rcu_head *head) * @sk2: Socket belonging to the existing reuseport group. * May return ENOMEM and not add socket to group under memory pressure. */ -int reuseport_add_sock(struct sock *sk, struct sock *sk2) +int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) { struct sock_reuseport *old_reuse, *reuse; if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { - int err = reuseport_alloc(sk2); + int err = reuseport_alloc(sk2, bind_inany); if (err) return err; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 958eefeaed38..11f6f4e7ce40 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -112,6 +112,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, } EXPORT_SYMBOL(inet_rcv_saddr_equal); +bool inet_rcv_saddr_any(const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#endif + return !sk->sk_rcv_saddr; +} + void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c5092e2b5933..0a6223077fe4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -508,10 +508,11 @@ static int inet_reuseport_add_sock(struct sock *sk, inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int __inet_hash(struct sock *sk, struct sock *osk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d9f79c422c42..f78f4925b961 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -231,11 +231,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } /** From ef6ac44b94a6296d70906349c29d95ba33e75188 Mon Sep 17 00:00:00 2001 From: Tim Zimmermann Date: Sun, 10 Aug 2025 08:01:54 +0000 Subject: [PATCH 0766/1640] Revert "tcp/dccp: fix possible race __inet_lookup_established()" This reverts commit 94671cf125f8e08e093290451d511c197b005c82. 
--- include/linux/rculist_nulls.h | 37 ----------------------------------- include/net/inet_hashtables.h | 12 +++--------- include/net/sock.h | 5 ----- net/ipv4/inet_diag.c | 3 +-- net/ipv4/inet_hashtables.c | 18 ++++++++--------- net/ipv4/tcp_ipv4.c | 7 +++---- net/ipv6/inet6_hashtables.c | 3 +-- 7 files changed, 17 insertions(+), 68 deletions(-) diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index cf64a9492256..a7fe73a22ccc 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -100,43 +100,6 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, WRITE_ONCE(first->pprev, &n->next); } -/** - * hlist_nulls_add_tail_rcu - * @n: the element to add to the hash list. - * @h: the list to add to. - * - * Description: - * Adds the specified element to the specified hlist_nulls, - * while permitting racing traversals. - * - * The caller must take whatever precautions are necessary - * (such as holding appropriate locks) to avoid racing - * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() - * or hlist_nulls_del_rcu(), running on this same list. - * However, it is perfectly legal to run concurrently with - * the _rcu list-traversal primitives, such as - * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency - * problems on Alpha CPUs. Regardless of the type of CPU, the - * list-traversal primitive must be guarded by rcu_read_lock(). - */ -static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, - struct hlist_nulls_head *h) -{ - struct hlist_nulls_node *i, *last = NULL; - - /* Note: write side code, so rcu accessors are not needed. */ - for (i = h->first; !is_a_nulls(i); i = i->next) - last = i; - - if (last) { - n->next = last->next; - n->pprev = &last->next; - rcu_assign_pointer(hlist_next_rcu(last), n); - } else { - hlist_nulls_add_head_rcu(n, h); - } -} - /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index e079478bf5c9..bc913136b58f 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -106,18 +106,12 @@ struct inet_bind_hashbucket { struct hlist_head chain; }; -/* Sockets can be hashed in established or listening table. - * We must use different 'nulls' end-of-chain value for all hash buckets : - * A socket might transition from ESTABLISH to LISTEN state without - * RCU grace period. A lookup in ehash table needs to handle this case. +/* + * Sockets can be hashed in established or listening table */ -#define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; - union { - struct hlist_head head; - struct hlist_nulls_head nulls_head; - }; + struct hlist_head head; }; /* This is for listening sockets, thus all sockets which possess wildcards. 
*/ diff --git a/include/net/sock.h b/include/net/sock.h index 5741ff62cbe2..86aafa6abe87 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -715,11 +715,6 @@ static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_h hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } -static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list) -{ - hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); -} - static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { sock_hold(sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 462f85c76edd..c73b7ab7f8cb 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -908,12 +908,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; num = 0; ilb = &hashinfo->listening_hash[i]; spin_lock(&ilb->lock); - sk_nulls_for_each(sk, node, &ilb->nulls_head) { + sk_for_each(sk, &ilb->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 0a6223077fe4..f059518c00d4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -223,10 +223,9 @@ struct sock *__inet_lookup_listener(struct net *net, int score, hiscore = 0, matches = 0, reuseport = 0; bool exact_dif = inet_exact_dif_match(net, skb); struct sock *sk, *result = NULL; - struct hlist_nulls_node *node; u32 phash = 0; - sk_nulls_for_each_rcu(sk, node, &ilb->nulls_head) { + sk_for_each_rcu(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { @@ -496,11 +495,10 @@ static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; - const struct hlist_nulls_node *node; struct sock *sk2; kuid_t uid = sock_i_uid(sk); - sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { + sk_for_each_rcu(sk2, &ilb->head) { if (sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && @@ -536,9 +534,9 @@ int __inet_hash(struct sock *sk, struct sock *osk) } if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) - __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head); + hlist_add_tail_rcu(&sk->sk_node, &ilb->head); else - __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); + hlist_add_head_rcu(&sk->sk_node, &ilb->head); sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: @@ -581,7 +579,10 @@ void inet_unhash(struct sock *sk) spin_lock_bh(lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - done = __sk_nulls_del_node_init_rcu(sk); + if (listener) + done = __sk_del_node_init(sk); + else + done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); @@ -732,8 +733,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); - INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, - i + LISTENING_NULLS_BASE); + INIT_HLIST_HEAD(&h->listening_hash[i].head); } if (h != &tcp_hashinfo) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b4908f96aa84..3ca11273b6f9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1991,14 +1991,13 @@ static void *listening_get_next(struct 
seq_file *seq, void *cur) struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; struct sock *sk = cur; if (!sk) { get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock(&ilb->lock); - sk = sk_nulls_head(&ilb->nulls_head); + sk = sk_head(&ilb->head); st->offset = 0; goto get_sk; } @@ -2006,9 +2005,9 @@ get_head: ++st->num; ++st->offset; - sk = sk_nulls_next(sk); + sk = sk_next(sk); get_sk: - sk_nulls_for_each_from(sk, node) { + sk_for_each_from(sk) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == st->family) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 7d83ab627b09..4d2135e6aad1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -137,10 +137,9 @@ struct sock *inet6_lookup_listener(struct net *net, int score, hiscore = 0, matches = 0, reuseport = 0; bool exact_dif = inet6_exact_dif_match(net, skb); struct sock *sk, *result = NULL; - struct hlist_nulls_node *node; u32 phash = 0; - sk_nulls_for_each(sk, node, &ilb->nulls_head) { + sk_for_each(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; From 67d5edc0a8ae5f1a107e1b782ea2cb391217b954 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 30 Nov 2017 15:39:34 +0100 Subject: [PATCH 0767/1640] UPSTREAM: net/reuseport: drop legacy code Since commit e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") and commit c125e80b8868 ("soreuseport: fast reuseport TCP socket selection") the relevant reuseport socket matching the current packet is selected by the reuseport_select_sock() call. The only exceptions are invalid BPF filters/filters returning out-of-range indices. In the latter case the code implicitly falls back to using the hash demultiplexing, but instead of selecting the socket inside the reuseport_select_sock() function, it relies on the hash selection logic introduced with the early soreuseport implementation. With this patch, in case of a BPF filter returning a bad socket index value, we fall back to hash-based selection inside the reuseport_select_sock() body, so that we can drop some duplicate code in the ipv4 and ipv6 stack. This also allows faster lookup in the above scenario and will allow us to avoid computing the hash value for successful, BPF based demultiplexing - in a later patch. Signed-off-by: Paolo Abeni Acked-by: Craig Gallek Signed-off-by: David S. 
Miller --- net/core/sock_reuseport.c | 4 +++- net/ipv4/inet_hashtables.c | 11 ++--------- net/ipv4/udp.c | 22 ++++------------------ net/ipv6/inet6_hashtables.c | 11 ++--------- net/ipv6/udp.c | 22 ++++------------------ 5 files changed, 15 insertions(+), 55 deletions(-) diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 7a5e3493f0ff..b31d354fda9f 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -283,7 +283,9 @@ struct sock *reuseport_select_sock(struct sock *sk, if (prog && skb) sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); - else + + /* no bpf or invalid bpf result: fall back to hash usage */ + if (!sk2) sk2 = reuse->socks[reciprocal_scale(hash, socks)]; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index f059518c00d4..c20e2d5bc90f 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -220,32 +220,25 @@ struct sock *__inet_lookup_listener(struct net *net, { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore = 0, matches = 0, reuseport = 0; bool exact_dif = inet_exact_dif_match(net, skb); struct sock *sk, *result = NULL; + int score, hiscore = 0; u32 phash = 0; sk_for_each_rcu(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, phash, skb, doff); if (result) return result; - matches = 1; } result = sk; hiscore = score; - } else if (score == hiscore && reuseport) { - matches++; - if (reciprocal_scale(phash, matches) == 0) - result = sk; - phash = next_pseudo_random32(phash); } } return result; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f78f4925b961..5ba5034a2f90 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -455,7 +455,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct sk_buff *skb) { struct sock *sk, *result; - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; result = NULL; @@ -464,23 +464,16 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } badness = score; result = sk; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -498,7 +491,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; bool exact_dif = udp_lib_exact_dif_match(net, skb); - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; if (hslot->count > 10) { @@ -536,23 +529,16 @@ begin: score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } 
result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 4d2135e6aad1..880296105437 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -134,31 +134,24 @@ struct sock *inet6_lookup_listener(struct net *net, { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore = 0, matches = 0, reuseport = 0; bool exact_dif = inet6_exact_dif_match(net, skb); struct sock *sk, *result = NULL; + int score, hiscore = 0; u32 phash = 0; sk_for_each(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, phash, skb, doff); if (result) return result; - matches = 1; } result = sk; hiscore = score; - } else if (score == hiscore && reuseport) { - matches++; - if (reciprocal_scale(phash, matches) == 0) - result = sk; - phash = next_pseudo_random32(phash); } } return result; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4836492bac01..bd13bd21cb18 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -197,7 +197,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; result = NULL; @@ -206,8 +206,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); @@ -215,15 +214,9 @@ static struct sock *udp6_lib_lookup2(struct net *net, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -241,7 +234,7 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; bool exact_dif = udp6_lib_exact_dif_match(net, skb); - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; if (hslot->count > 10) { @@ -280,23 +273,16 @@ begin: score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; From 0afc9ebfa82db33f27d37e03da083666d6a9cfbb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 1 Dec 2017 12:52:29 -0800 Subject: [PATCH 0768/1640] UPSTREAM: inet: Add a count to struct inet_listen_hashbucket This patch adds a count to the 'struct 
inet_listen_hashbucket'. It counts how many sockets are hashed to a bucket. It will be used to decide if the (to-be-added) portaddr listener's hashtable should be used during inet[6]_lookup_listener(). Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 1 + net/ipv4/inet_hashtables.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index bc913136b58f..936d1a3baeba 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -111,6 +111,7 @@ struct inet_bind_hashbucket { */ struct inet_listen_hashbucket { spinlock_t lock; + unsigned int count; struct hlist_head head; }; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c20e2d5bc90f..1f0ee81e326b 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -530,6 +530,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) hlist_add_tail_rcu(&sk->sk_node, &ilb->head); else hlist_add_head_rcu(&sk->sk_node, &ilb->head); + ilb->count++; sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: @@ -556,6 +557,7 @@ EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct inet_listen_hashbucket *ilb; spinlock_t *lock; bool listener = false; int done; @@ -564,7 +566,8 @@ void inet_unhash(struct sock *sk) return; if (sk->sk_state == TCP_LISTEN) { - lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock; + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &ilb->lock; listener = true; } else { lock = inet_ehash_lockp(hashinfo, sk->sk_hash); @@ -576,8 +579,11 @@ void inet_unhash(struct sock *sk) done = __sk_del_node_init(sk); else done = __sk_nulls_del_node_init_rcu(sk); - if (done) + if (done) { + if (listener) + ilb->count--; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + } spin_unlock_bh(lock); } EXPORT_SYMBOL_GPL(inet_unhash); @@ -727,6 +733,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_HEAD(&h->listening_hash[i].head); + h->listening_hash[i].count = 0; } if (h != &tcp_hashinfo) From bf98b7b6fff5ef9616236cfb9bfc234bd28e3b6c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 1 Dec 2017 12:52:30 -0800 Subject: [PATCH 0769/1640] UPSTREAM: udp: Move udp[46]_portaddr_hash() to net/ip[v6].h This patch moves udp[46]_portaddr_hash() to net/ip[v6].h. The functions are renamed to ipv[46]_portaddr_hash(). They will be used by a later patch which adds a second listener hashtable hashed by the address and port. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/ip.h | 9 +++++++++ include/net/ipv6.h | 17 +++++++++++++++++ net/ipv4/udp.c | 22 ++++++++-------------- net/ipv6/udp.c | 32 ++++++++------------------------ 4 files changed, 42 insertions(+), 38 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 5de30eb9798e..c8dee1b1d431 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -26,12 +26,14 @@ #include #include #include +#include #include #include #include #include #include +#include #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ #define IPV4_MIN_MTU 68 /* RFC 791 */ @@ -541,6 +543,13 @@ static inline unsigned int ipv4_addr_hash(__be32 ip) return (__force unsigned int) ip; } +static inline u32 ipv4_portaddr_hash(const struct net *net, + __be32 saddr, + unsigned int port) +{ + return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; +} + bool ip_call_ra_chain(struct sk_buff *skb); /* diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fa50206bee5e..e4986d7a0fa2 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -22,6 +22,7 @@ #include #include #include +#include #define SIN6_LEN_RFC2133 24 @@ -613,6 +614,22 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) cpu_to_be32(0x0000ffff))) == 0UL; } +static inline u32 ipv6_portaddr_hash(const struct net *net, + const struct in6_addr *addr6, + unsigned int port) +{ + unsigned int hash, mix = net_hash_mix(net); + + if (ipv6_addr_any(addr6)) + hash = jhash_1word(0, mix); + else if (ipv6_addr_v4mapped(addr6)) + hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); + else + hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); + + return hash ^ port; +} + /* * Check for a RFC 4843 ORCHID address * (Overlay Routable Cryptographic Hash Identifiers) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5ba5034a2f90..92cea31cecef 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -367,18 +367,12 @@ fail: } EXPORT_SYMBOL(udp_lib_get_port); -static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, - unsigned int port) -{ - return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; -} - int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = - udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); + ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); unsigned int hash2_partial = - udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); + ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; @@ -495,7 +489,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, u32 hash = 0; if (hslot->count > 10) { - hash2 = udp4_portaddr_hash(net, daddr, hnum); + hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) @@ -506,7 +500,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; - hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; /* avoid searching the same slot again. 
*/ if (unlikely(slot2 == old_slot2)) @@ -1884,7 +1878,7 @@ EXPORT_SYMBOL(udp_lib_rehash); static void udp_v4_rehash(struct sock *sk) { - u16 new_hash = udp4_portaddr_hash(sock_net(sk), + u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); udp_lib_rehash(sk, new_hash); @@ -2096,9 +2090,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct sk_buff *nskb; if (use_hash2) { - hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & + hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & udptable->mask; - hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask; + hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); @@ -2506,7 +2500,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, int dif, int sdif) { unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); + unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index bd13bd21cb18..30779582d9bd 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -102,28 +102,12 @@ static u32 udp6_ehashfn(const struct net *net, udp6_ehash_secret + net_hash_mix(net)); } -static u32 udp6_portaddr_hash(const struct net *net, - const struct in6_addr *addr6, - unsigned int port) -{ - unsigned int hash, mix = net_hash_mix(net); - - if (ipv6_addr_any(addr6)) - hash = jhash_1word(0, mix); - else if (ipv6_addr_v4mapped(addr6)) - hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); - else - hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); - - return hash ^ port; -} - int udp_v6_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = - udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum); + ipv6_portaddr_hash(sock_net(sk), &in6addr_any, snum); unsigned int hash2_partial = - udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0); + ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; @@ -132,7 +116,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum) static void udp_v6_rehash(struct sock *sk) { - u16 new_hash = udp6_portaddr_hash(sock_net(sk), + u16 new_hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); @@ -238,7 +222,7 @@ struct sock *__udp6_lib_lookup(struct net *net, u32 hash = 0; if (hslot->count > 10) { - hash2 = udp6_portaddr_hash(net, daddr, hnum); + hash2 = ipv6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) @@ -249,7 +233,7 @@ struct sock *__udp6_lib_lookup(struct net *net, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; - hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum); + hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); slot2 = hash2 & udptable->mask; /* avoid searching the same slot again. 
*/ if (unlikely(slot2 == old_slot2)) @@ -743,9 +727,9 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct sk_buff *nskb; if (use_hash2) { - hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & + hash2_any = ipv6_portaddr_hash(net, &in6addr_any, hnum) & udptable->mask; - hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask; + hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); @@ -940,7 +924,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, int dif, int sdif) { unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum); + unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); From 83083b58b845f8e2a7e4f53858a658c6afb2c888 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 1 Dec 2017 12:52:31 -0800 Subject: [PATCH 0770/1640] BACKPORT: inet: Add a 2nd listener hashtable (port+addr) The current listener hashtable is hashed by port only. When a process listens on many IP addresses with the same port (e.g. [IP1]:443, [IP2]:443... [IPN]:443), inet[6]_lookup_listener() performance degrades to a linked-list walk, which makes it prone to SYN attacks. UDP had a similar issue and a second hashtable was added to resolve it. This patch adds a second hashtable for the listener's sockets. The second hashtable is hashed by port and address. It cannot reuse the existing skc_portaddr_node, which is shared with skc_bind_node, because the TCP listener needs to use skc_bind_node. Instead, this patch adds an hlist_node 'icsk_listen_portaddr_node' to the inet_connection_sock, which listeners (like TCP) also belong to. The new portaddr hashtable may need two lookups (first by IP:PORT, then by INADDR_ANY:PORT if IP:PORT is not found). Hence, it implements a cutoff similar to UDP's, such that the new portaddr hashtable is only consulted when the current port-only hashtable has more than 10 sockets in its linked list. lhash2 and lhash2_mask are added to 'struct inet_hashinfo'. I take this chance to plug a 4-byte hole: the existing bind_bucket_cachep is moved up first, and the new members (lhash2_mask and lhash2) are then added after the existing bhash_size. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/inet_connection_sock.h | 2 + include/net/inet_hashtables.h | 32 ++++-- net/ipv4/inet_hashtables.c | 168 +++++++++++++++++++++++++++-- net/ipv6/inet6_hashtables.c | 66 ++++++++++++ 4 files changed, 251 insertions(+), 17 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 3d85113c2276..f48c33be29c9 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -77,6 +77,7 @@ struct inet_connection_sock_af_ops { * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data + * @icsk_listen_portaddr_node hash to the portaddr listener hashtable * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -101,6 +102,7 @@ struct inet_connection_sock { const struct inet_connection_sock_af_ops *icsk_af_ops; const struct tcp_ulp_ops *icsk_ulp_ops; void *icsk_ulp_data; + struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:6, icsk_ca_setsockopt:1, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 936d1a3baeba..c0ebafcebff5 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -133,12 +133,13 @@ struct inet_hashinfo { /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. */ - struct inet_bind_hashbucket *bhash; - - unsigned int bhash_size; - /* 4 bytes hole on 64 bit */ - struct kmem_cache *bind_bucket_cachep; + struct inet_bind_hashbucket *bhash; + unsigned int bhash_size; + + /* The 2nd listener table hashed by local port and address */ + unsigned int lhash2_mask; + struct inet_listen_hashbucket *lhash2; /* All the above members are written once at bootup and * never written again _or_ are predominantly read-access. @@ -146,14 +147,25 @@ struct inet_hashinfo { * Now align to a new cache line as all the following members * might be often dirty. */ - /* All sockets in TCP_LISTEN state will be in here. This is the only - * table where wildcard'd TCP sockets can exist. Hash function here - * is just local port number. + /* All sockets in TCP_LISTEN state will be in listening_hash. + * This is the only table where wildcard'd TCP sockets can + * exist. listening_hash is only hashed by local port number. + * If lhash2 is initialized, the same socket will also be hashed + * to lhash2 by port and address. 
*/ struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE] ____cacheline_aligned_in_smp; }; +#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \ + hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node) + +static inline struct inet_listen_hashbucket * +inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) +{ + return &h->lhash2[hash & h->lhash2_mask]; +} + static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) @@ -209,6 +221,10 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child); void inet_put_port(struct sock *sk); void inet_hashinfo_init(struct inet_hashinfo *h); +void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, + unsigned long numentries, int scale, + unsigned long low_limit, + unsigned long high_limit); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 1f0ee81e326b..9e618ca8a7ac 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -172,6 +173,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) } EXPORT_SYMBOL_GPL(__inet_inherit_port); +static struct inet_listen_hashbucket * +inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) +{ + u32 hash; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + hash = ipv6_portaddr_hash(sock_net(sk), + &sk->sk_v6_rcv_saddr, + inet_sk(sk)->inet_num); + else +#endif + hash = ipv4_portaddr_hash(sock_net(sk), + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_num); + return inet_lhash2_bucket(h, hash); +} + +static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) +{ + struct inet_listen_hashbucket *ilb2; + + if (!h->lhash2) + return; + + ilb2 = inet_lhash2_bucket_sk(h, sk); + + spin_lock(&ilb2->lock); + if (sk->sk_reuseport && sk->sk_family == AF_INET6) + hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); + else + hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); + ilb2->count++; + spin_unlock(&ilb2->lock); +} + +static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) +{ + struct inet_listen_hashbucket *ilb2; + + if (!h->lhash2 || + WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) + return; + + ilb2 = inet_lhash2_bucket_sk(h, sk); + + spin_lock(&ilb2->lock); + hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); + ilb2->count--; + spin_unlock(&ilb2->lock); +} + static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif, bool exact_dif) @@ -211,6 +266,40 @@ static inline int compute_score(struct sock *sk, struct net *net, */ /* called with rcu_read_lock() : No refcount taken on the socket */ +static struct sock *inet_lhash2_lookup(struct net *net, + struct inet_listen_hashbucket *ilb2, + struct sk_buff *skb, int doff, + const __be32 saddr, __be16 sport, + const __be32 daddr, const unsigned short hnum, + const int dif, const int sdif) +{ + bool exact_dif = inet_exact_dif_match(net, skb); + struct inet_connection_sock *icsk; + struct sock *sk, *result = NULL; + int score, hiscore = 0; + u32 phash = 0; + + inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + score = compute_score(sk, net, hnum, daddr, + dif, sdif, exact_dif); + if (score > 
hiscore) { + if (sk->sk_reuseport) { + phash = inet_ehashfn(net, daddr, hnum, + saddr, sport); + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; + } + result = sk; + hiscore = score; + } + } + + return result; +} + struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -221,10 +310,42 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; bool exact_dif = inet_exact_dif_match(net, skb); + struct inet_listen_hashbucket *ilb2; struct sock *sk, *result = NULL; int score, hiscore = 0; + unsigned int hash2; u32 phash = 0; + if (ilb->count <= 10 || !hashinfo->lhash2) + goto port_lookup; + + /* Too many sk in the ilb bucket (which is hashed by port alone). + * Try lhash2 (which is hashed by port and addr) instead. + */ + + hash2 = ipv4_portaddr_hash(net, daddr, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + if (result) + return result; + + /* Lookup lhash2 with INADDR_ANY */ + + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + return inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + +port_lookup: sk_for_each_rcu(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); @@ -530,6 +651,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) hlist_add_tail_rcu(&sk->sk_node, &ilb->head); else hlist_add_head_rcu(&sk->sk_node, &ilb->head); + inet_hash2(hashinfo, sk); ilb->count++; sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -560,7 +682,6 @@ void inet_unhash(struct sock *sk) struct inet_listen_hashbucket *ilb; spinlock_t *lock; bool listener = false; - int done; if (sk_unhashed(sk)) return; @@ -573,17 +694,20 @@ void inet_unhash(struct sock *sk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); } spin_lock_bh(lock); + if (sk_unhashed(sk)) + goto unlock; + if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - if (listener) - done = __sk_del_node_init(sk); - else - done = __sk_nulls_del_node_init_rcu(sk); - if (done) { - if (listener) - ilb->count--; - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + if (listener) { + inet_unhash2(hashinfo, sk); + __sk_del_node_init(sk); + ilb->count--; + } else { + __sk_nulls_del_node_init_rcu(sk); } + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); +unlock: spin_unlock_bh(lock); } EXPORT_SYMBOL_GPL(inet_unhash); @@ -736,6 +860,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h) h->listening_hash[i].count = 0; } + h->lhash2 = NULL; + if (h != &tcp_hashinfo) return; @@ -747,6 +873,30 @@ void inet_hashinfo_init(struct inet_hashinfo *h) } EXPORT_SYMBOL_GPL(inet_hashinfo_init); +void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, + unsigned long numentries, int scale, + unsigned long low_limit, + unsigned long high_limit) +{ + unsigned int i; + + h->lhash2 = alloc_large_system_hash(name, + sizeof(*h->lhash2), + numentries, + scale, + 0, + NULL, + &h->lhash2_mask, + low_limit, + high_limit); + + for (i = 0; i <= h->lhash2_mask; i++) { + spin_lock_init(&h->lhash2[i].lock); + INIT_HLIST_HEAD(&h->lhash2[i].head); + h->lhash2[i].count = 0; + } +} 
+ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 880296105437..85a9bd097626 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -125,6 +125,40 @@ static inline int compute_score(struct sock *sk, struct net *net, } /* called with rcu_read_lock() */ +static struct sock *inet6_lhash2_lookup(struct net *net, + struct inet_listen_hashbucket *ilb2, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, const struct in6_addr *daddr, + const unsigned short hnum, const int dif, const int sdif) +{ + bool exact_dif = inet6_exact_dif_match(net, skb); + struct inet_connection_sock *icsk; + struct sock *sk, *result = NULL; + int score, hiscore = 0; + u32 phash = 0; + + inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + score = compute_score(sk, net, hnum, daddr, dif, sdif, + exact_dif); + if (score > hiscore) { + if (sk->sk_reuseport) { + phash = inet6_ehashfn(net, daddr, hnum, + saddr, sport); + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; + } + result = sk; + hiscore = score; + } + } + + return result; +} + struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -135,10 +169,42 @@ struct sock *inet6_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; bool exact_dif = inet6_exact_dif_match(net, skb); + struct inet_listen_hashbucket *ilb2; struct sock *sk, *result = NULL; int score, hiscore = 0; + unsigned int hash2; u32 phash = 0; + if (ilb->count <= 10 || !hashinfo->lhash2) + goto port_lookup; + + /* Too many sk in the ilb bucket (which is hashed by port alone). + * Try lhash2 (which is hashed by port and addr) instead. + */ + + hash2 = ipv6_portaddr_hash(net, daddr, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + result = inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + if (result) + return result; + + /* Lookup lhash2 with in6addr_any */ + + hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + return inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + +port_lookup: sk_for_each(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { From 101bbb8d436c4aa0ec057f50f8f895e29a851803 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Dec 2019 18:20:41 -0800 Subject: [PATCH 0771/1640] UPSTREAM: tcp/dccp: fix possible race __inet_lookup_established() commit 8dbd76e79a16b45b2ccb01d2f2e08dbf64e71e40 upstream. Michal Kubecek and Firo Yang did a very nice analysis of crashes happening in __inet_lookup_established(). Since a TCP socket can go from TCP_ESTABLISHED to TCP_LISTEN (via a close()/socket()/listen() cycle) without an RCU grace period, I should not have changed the listeners' linkage in their hash table. They must use the nulls protocol (Documentation/RCU/rculist_nulls.txt), so that a lookup can detect that a socket in a hash list was moved to another one. Since we added code in commit d296ba60d8e2 ("soreuseport: Resolve merge conflict for v4/v6 ordering fix"), we have to add the hlist_nulls_add_tail_rcu() helper.
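As an editorial illustration of why the nulls protocol closes this race, here is a minimal user-space model of a nulls-terminated lookup (a sketch under assumed names and pointer encoding, not the kernel implementation; the real primitives live in include/linux/rculist_nulls.h, and real readers also need rcu_read_lock(), READ_ONCE()-style accesses, and re-validation of a found socket, all elided here):

	#include <stddef.h>
	#include <stdint.h>

	struct node {
		struct node *next;	/* real pointer, or a "nulls" end marker */
		uint32_t key;
	};

	/* A "nulls" pointer has its low bit set and encodes the owning
	 * bucket id in the remaining bits, so a reader can tell which
	 * chain it finished on. An empty bucket holds MAKE_NULLS(slot)
	 * instead of a plain NULL.
	 */
	#define IS_NULLS(p)	(((uintptr_t)(p)) & 1)
	#define NULLS_VALUE(p)	(((uintptr_t)(p)) >> 1)
	#define MAKE_NULLS(v)	((struct node *)((((uintptr_t)(v)) << 1) | 1))

	static struct node *nulls_lookup(struct node *const *buckets,
					 size_t slot, uint32_t key)
	{
		struct node *n;

	begin:
		for (n = buckets[slot]; !IS_NULLS(n); n = n->next)
			if (n->key == key)
				return n;

		/* The chain ended on a nulls marker. If it is not the
		 * marker of the bucket we started from, a node we were
		 * walking was moved to another chain mid-traversal and a
		 * match may have been skipped: restart. A plain
		 * NULL-terminated hlist cannot detect this, which is
		 * exactly the bug being fixed.
		 */
		if (NULLS_VALUE(n) != slot)
			goto begin;

		return NULL;
	}

In the patch itself, each listening bucket is initialized with the nulls value i + LISTENING_NULLS_BASE, so a lookup that ends on an unexpected nulls value knows the chain it followed migrated while it was being read.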
Fixes: 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood") Signed-off-by: Eric Dumazet Reported-by: Michal Kubecek Reported-by: Firo Yang Reviewed-by: Michal Kubecek Link: https://lore.kernel.org/netdev/20191120083919.GH27852@unicorn.suse.cz/ Signed-off-by: Jakub Kicinski [stable-4.19: we also need to update code in __inet_lookup_listener() and inet6_lookup_listener() which has been removed in 5.0-rc1.] Signed-off-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- include/linux/rculist_nulls.h | 37 +++++++++++++++++++++++++++++++++++ include/net/inet_hashtables.h | 12 +++++++++--- include/net/sock.h | 5 +++++ net/ipv4/inet_diag.c | 3 ++- net/ipv4/inet_hashtables.c | 19 +++++++++--------- net/ipv4/tcp_ipv4.c | 7 ++++--- net/ipv6/inet6_hashtables.c | 3 ++- 7 files changed, 69 insertions(+), 17 deletions(-) diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index a7fe73a22ccc..cf64a9492256 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -100,6 +100,43 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, WRITE_ONCE(first->pprev, &n->next); } +/** + * hlist_nulls_add_tail_rcu + * @n: the element to add to the hash list. + * @h: the list to add to. + * + * Description: + * Adds the specified element to the specified hlist_nulls, + * while permitting racing traversals. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() + * or hlist_nulls_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. Regardless of the type of CPU, the + * list-traversal primitive must be guarded by rcu_read_lock(). + */ +static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, + struct hlist_nulls_head *h) +{ + struct hlist_nulls_node *i, *last = NULL; + + /* Note: write side code, so rcu accessors are not needed. */ + for (i = h->first; !is_a_nulls(i); i = i->next) + last = i; + + if (last) { + n->next = last->next; + n->pprev = &last->next; + rcu_assign_pointer(hlist_next_rcu(last), n); + } else { + hlist_nulls_add_head_rcu(n, h); + } +} + /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index c0ebafcebff5..2d04f3e06de1 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -106,13 +106,19 @@ struct inet_bind_hashbucket { struct hlist_head chain; }; -/* - * Sockets can be hashed in established or listening table +/* Sockets can be hashed in established or listening table. + * We must use different 'nulls' end-of-chain value for all hash buckets : + * A socket might transition from ESTABLISH to LISTEN state without + * RCU grace period. A lookup in ehash table needs to handle this case. */ +#define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; unsigned int count; - struct hlist_head head; + union { + struct hlist_head head; + struct hlist_nulls_head nulls_head; + }; }; /* This is for listening sockets, thus all sockets which possess wildcards. 
*/ diff --git a/include/net/sock.h b/include/net/sock.h index 86aafa6abe87..5741ff62cbe2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -715,6 +715,11 @@ static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_h hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } +static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list) +{ + hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); +} + static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { sock_hold(sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c73b7ab7f8cb..462f85c76edd 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -908,11 +908,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; num = 0; ilb = &hashinfo->listening_hash[i]; spin_lock(&ilb->lock); - sk_for_each(sk, &ilb->head) { + sk_nulls_for_each(sk, node, &ilb->nulls_head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 9e618ca8a7ac..845005d20599 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -312,6 +312,7 @@ struct sock *__inet_lookup_listener(struct net *net, bool exact_dif = inet_exact_dif_match(net, skb); struct inet_listen_hashbucket *ilb2; struct sock *sk, *result = NULL; + struct hlist_nulls_node *node; int score, hiscore = 0; unsigned int hash2; u32 phash = 0; @@ -346,7 +347,7 @@ struct sock *__inet_lookup_listener(struct net *net, dif, sdif); port_lookup: - sk_for_each_rcu(sk, &ilb->head) { + sk_nulls_for_each_rcu(sk, node, &ilb->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { @@ -609,10 +610,11 @@ static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; + const struct hlist_nulls_node *node; struct sock *sk2; kuid_t uid = sock_i_uid(sk); - sk_for_each_rcu(sk2, &ilb->head) { + sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { if (sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && @@ -648,9 +650,9 @@ int __inet_hash(struct sock *sk, struct sock *osk) } if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) - hlist_add_tail_rcu(&sk->sk_node, &ilb->head); + __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head); else - hlist_add_head_rcu(&sk->sk_node, &ilb->head); + __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); inet_hash2(hashinfo, sk); ilb->count++; sock_set_flag(sk, SOCK_RCU_FREE); @@ -701,11 +703,9 @@ void inet_unhash(struct sock *sk) reuseport_detach_sock(sk); if (listener) { inet_unhash2(hashinfo, sk); - __sk_del_node_init(sk); - ilb->count--; - } else { - __sk_nulls_del_node_init_rcu(sk); + ilb->count--; } + __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); unlock: spin_unlock_bh(lock); @@ -856,7 +856,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); - INIT_HLIST_HEAD(&h->listening_hash[i].head); + INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, + i + LISTENING_NULLS_BASE); h->listening_hash[i].count = 0; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3ca11273b6f9..b4908f96aa84 100644 --- 
a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1991,13 +1991,14 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; struct sock *sk = cur; if (!sk) { get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock(&ilb->lock); - sk = sk_head(&ilb->head); + sk = sk_nulls_head(&ilb->nulls_head); st->offset = 0; goto get_sk; } @@ -2005,9 +2006,9 @@ get_head: ++st->num; ++st->offset; - sk = sk_next(sk); + sk = sk_nulls_next(sk); get_sk: - sk_for_each_from(sk) { + sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == st->family) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 85a9bd097626..d03644849890 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -171,6 +171,7 @@ struct sock *inet6_lookup_listener(struct net *net, bool exact_dif = inet6_exact_dif_match(net, skb); struct inet_listen_hashbucket *ilb2; struct sock *sk, *result = NULL; + struct hlist_nulls_node *node; int score, hiscore = 0; unsigned int hash2; u32 phash = 0; @@ -205,7 +206,7 @@ struct sock *inet6_lookup_listener(struct net *net, dif, sdif); port_lookup: - sk_for_each(sk, &ilb->head) { + sk_nulls_for_each(sk, node, &ilb->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { if (sk->sk_reuseport) { From c78f1b2439c836da8cbe18474b2050e642a41be3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:26 -0700 Subject: [PATCH 0772/1640] UPSTREAM: bpf: Enable BPF_PROG_TYPE_SK_REUSEPORT bpf prog in reuseport selection This patch allows a BPF_PROG_TYPE_SK_REUSEPORT bpf prog to select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY introduced in the earlier patch. "bpf_run_sk_reuseport()" will return -ECONNREFUSED when the BPF_PROG_TYPE_SK_REUSEPORT prog returns SK_DROP. The callers, in inet[6]_hashtable.c and ipv[46]/udp.c, are modified to handle this case and return NULL immediately instead of continuing the sk search from its hashtable. It re-uses the existing SO_ATTACH_REUSEPORT_EBPF setsockopt to attach BPF_PROG_TYPE_SK_REUSEPORT. The "sk_reuseport_attach_bpf()" will check if the attaching bpf prog is in the new SK_REUSEPORT or the existing SOCKET_FILTER type and then check different things accordingly. One level of "__reuseport_attach_prog()" call is removed. The "sk_unhashed() && ..." and "sk->sk_reuseport_cb" tests are pushed back to "reuseport_attach_prog()" in sock_reuseport.c. sock_reuseport.c seems to have more knowledge on those test requirements than filter.c. In "reuseport_attach_prog()", after new_prog is attached to reuse->prog, the old_prog (if any) is also directly freed instead of returning the old_prog to the caller and asking the caller to free. The sysctl_optmem_max check is moved back to the "sk_reuseport_attach_filter()" and "sk_reuseport_attach_bpf()". As of other bpf prog types, the new BPF_PROG_TYPE_SK_REUSEPORT is only bounded by the usual "bpf_prog_charge_memlock()" during load time instead of bounded by both bpf_prog_charge_memlock and sysctl_optmem_max. 
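For illustration, a minimal userspace sketch of the attach flow described
above (the function name and the trimmed error handling are assumptions,
not part of this patch; prog_fd is assumed to be an already-loaded
BPF_PROG_TYPE_SK_REUSEPORT program):

#include <sys/socket.h>
#include <netinet/in.h>

static int reuseport_listener_sketch(int prog_fd, struct sockaddr_in *addr)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	/* SO_REUSEPORT must be enabled before bind() so the socket
	 * joins (or creates) a sock_reuseport group for the address. */
	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
	bind(fd, (struct sockaddr *)addr, sizeof(*addr));
	/* Same setsockopt as for SOCKET_FILTER progs; the kernel now
	 * dispatches on the program type in sk_reuseport_attach_bpf(). */
	setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
		   &prog_fd, sizeof(prog_fd));
	return listen(fd, 128) == 0 ? fd : -1;
}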
Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/net/sock_reuseport.h | 3 +- net/core/filter.c | 89 +++++++++++++++++++++--------------- net/core/sock_reuseport.c | 36 +++++++++++---- net/ipv4/inet_hashtables.c | 14 ++++-- net/ipv4/udp.c | 4 ++ net/ipv6/inet6_hashtables.c | 14 ++++-- net/ipv6/udp.c | 4 ++ 8 files changed, 107 insertions(+), 58 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index ba829a5d138a..e18cf8a295e3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -818,6 +818,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); +void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 267507214cdf..b0ff88240049 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -30,8 +30,7 @@ extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); -extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, - struct bpf_prog *prog); +extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); int reuseport_get_id(struct sock_reuseport *reuse); #endif /* _SOCK_REUSEPORT_H */ diff --git a/net/core/filter.c b/net/core/filter.c index dc02d74e7a4c..6981c0454411 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1453,30 +1453,6 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return 0; } -static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) -{ - struct bpf_prog *old_prog; - int err; - - if (bpf_prog_size(prog->len) > sysctl_optmem_max) - return -ENOMEM; - - if (sk_unhashed(sk) && sk->sk_reuseport) { - err = reuseport_alloc(sk, false); - if (err) - return err; - } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { - /* The socket wasn't bound with SO_REUSEPORT */ - return -EINVAL; - } - - old_prog = reuseport_attach_prog(sk, prog); - if (old_prog) - bpf_prog_destroy(old_prog); - - return 0; -} - static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { @@ -1550,13 +1526,15 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - err = __reuseport_attach_prog(prog, sk); - if (err < 0) { - __bpf_prog_release(prog); - return err; - } + if (bpf_prog_size(prog->len) > sysctl_optmem_max) + err = -ENOMEM; + else + err = reuseport_attach_prog(sk, prog); - return 0; + if (err) + __bpf_prog_release(prog); + + return err; } static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) @@ -1586,19 +1564,58 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { - struct bpf_prog *prog = __get_bpf(ufd, sk); + struct bpf_prog *prog; int err; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL) + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); - err = __reuseport_attach_prog(prog, sk); - if (err < 0) { - bpf_prog_put(prog); - return err; + if (prog->type == 
BPF_PROG_TYPE_SK_REUSEPORT) { + /* Like other non BPF_PROG_TYPE_SOCKET_FILTER + * bpf prog (e.g. sockmap). It depends on the + * limitation imposed by bpf_prog_load(). + * Hence, sysctl_optmem_max is not checked. + */ + if ((sk->sk_type != SOCK_STREAM && + sk->sk_type != SOCK_DGRAM) || + (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_TCP) || + (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6)) { + err = -ENOTSUPP; + goto err_prog_put; + } + } else { + /* BPF_PROG_TYPE_SOCKET_FILTER */ + if (bpf_prog_size(prog->len) > sysctl_optmem_max) { + err = -ENOMEM; + goto err_prog_put; + } } - return 0; + err = reuseport_attach_prog(sk, prog); +err_prog_put: + if (err) + bpf_prog_put(prog); + + return err; +} + +void sk_reuseport_prog_free(struct bpf_prog *prog) +{ + if (!prog) + return; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + bpf_prog_put(prog); + else + bpf_prog_destroy(prog); } struct bpf_scratchpad { diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index b31d354fda9f..4a55e9d2dd03 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #define INIT_SOCKS 128 @@ -132,8 +133,7 @@ static void reuseport_free_rcu(struct rcu_head *head) struct sock_reuseport *reuse; reuse = container_of(head, struct sock_reuseport, rcu); - if (reuse->prog) - bpf_prog_destroy(reuse->prog); + sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); if (reuse->reuseport_id) ida_simple_remove(&reuseport_ida, reuse->reuseport_id); kfree(reuse); @@ -218,9 +218,9 @@ void reuseport_detach_sock(struct sock *sk) } EXPORT_SYMBOL(reuseport_detach_sock); -static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, - struct bpf_prog *prog, struct sk_buff *skb, - int hdr_len) +static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, + struct bpf_prog *prog, struct sk_buff *skb, + int hdr_len) { struct sk_buff *nskb = NULL; u32 index; @@ -281,9 +281,15 @@ struct sock *reuseport_select_sock(struct sock *sk, /* paired with smp_wmb() in reuseport_add_sock() */ smp_rmb(); - if (prog && skb) - sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); + if (!prog || !skb) + goto select_by_hash; + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); + else + sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); + +select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ if (!sk2) sk2 = reuse->socks[reciprocal_scale(hash, socks)]; @@ -295,12 +301,21 @@ out: } EXPORT_SYMBOL(reuseport_select_sock); -struct bpf_prog * -reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) +int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; + if (sk_unhashed(sk) && sk->sk_reuseport) { + int err = reuseport_alloc(sk, false); + + if (err) + return err; + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { + /* The socket wasn't bound with SO_REUSEPORT */ + return -EINVAL; + } + spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); @@ -309,6 +324,7 @@ reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) rcu_assign_pointer(reuse->prog, prog); spin_unlock_bh(&reuseport_lock); - return old_prog; + sk_reuseport_prog_free(old_prog); + return 0; } EXPORT_SYMBOL(reuseport_attach_prog); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 845005d20599..7ef23d5fe1ff 
100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -333,7 +333,7 @@ struct sock *__inet_lookup_listener(struct net *net, saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with INADDR_ANY */ @@ -342,9 +342,10 @@ struct sock *__inet_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: sk_nulls_for_each_rcu(sk, node, &ilb->nulls_head) { @@ -357,12 +358,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 92cea31cecef..d1a0444985a1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -514,6 +514,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, daddr, hnum, dif, sdif, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -528,6 +530,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index d03644849890..d8391921363f 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -192,7 +192,7 @@ struct sock *inet6_lookup_listener(struct net *net, saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with in6addr_any */ @@ -201,9 +201,10 @@ struct sock *inet6_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet6_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: sk_nulls_for_each(sk, node, &ilb->nulls_head) { @@ -215,12 +216,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 30779582d9bd..4c060cb62266 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -248,6 +248,8 @@ struct sock *__udp6_lib_lookup(struct net *net, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -262,6 +264,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } From 716ee0391598801184f1f5b678fb08823dc57817 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 12 Aug 2018 01:59:17 +0200 Subject: [PATCH 0773/1640] UPSTREAM: bpf: decouple btf from seq bpf fs dump and enable more maps Commit a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") and 699c86d6ec21 ("bpf: btf: add pretty print for hash/lru_hash maps") enabled support for BTF and dumping via BPF fs for array and hash/lru map. 
However, both can be decoupled from each other such that regular BPF maps can be supported for attaching BTF key/value information, while not all maps necessarily need to dump via map_seq_show_elem() callback. The basic sanity check which is a prerequisite for all maps is that key/value size has to match in any case, and some maps can have extra checks via map_check_btf() callback, e.g. probing certain types or indicating no support in general. With that we can also enable retrieving BTF info for per-cpu map types and lpm. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Yonghong Song --- include/linux/bpf.h | 13 +++++++++---- kernel/bpf/arraymap.c | 26 ++++++++++++-------------- kernel/bpf/cpumap.c | 1 + kernel/bpf/devmap.c | 1 + kernel/bpf/hashtab.c | 20 +------------------- kernel/bpf/inode.c | 3 ++- kernel/bpf/local_storage.c | 1 + kernel/bpf/lpm_trie.c | 12 ++++++++++++ kernel/bpf/sockmap.c | 2 ++ kernel/bpf/stackmap.c | 1 + kernel/bpf/syscall.c | 36 ++++++++++++++++++++++++++++++++---- kernel/bpf/xskmap.c | 3 +-- 12 files changed, 75 insertions(+), 44 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ad2d1c71a07d..eef57b96996c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,7 +23,7 @@ struct bpf_prog; struct bpf_map; struct sock; struct seq_file; -struct btf; +struct btf_type; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { @@ -53,8 +53,9 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, - u32 key_type_id, u32 value_type_id); + int (*map_check_btf)(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type); }; struct bpf_map { @@ -123,9 +124,13 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map) static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { - return map->ops->map_seq_show_elem && map->ops->map_check_btf; + return map->btf && map->ops->map_seq_show_elem; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type); + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 019f19fe56d2..44f53c06629e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -358,27 +358,20 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) +static int array_map_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) { - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; u32 int_data; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes - * sure that the btf matches the attr used during map_create. + /* bpf array can only take a u32 key. This check makes sure + * that the btf matches the attr used during map_create. 
*/ - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || - BTF_INT_OFFSET(int_data)) - return -EINVAL; - - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size != map->value_size) + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) return -EINVAL; return 0; @@ -405,6 +398,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_check_btf = array_map_check_btf, }; static int fd_array_map_alloc_check(union bpf_attr *attr) @@ -547,6 +541,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, + .map_check_btf = map_check_no_btf, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -636,6 +631,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, + .map_check_btf = map_check_no_btf, }; #ifdef CONFIG_CGROUPS @@ -667,6 +663,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, + .map_check_btf = map_check_no_btf, }; #endif @@ -751,4 +748,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 46f5f29605d4..620bc5024d7d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -555,6 +555,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_update_elem = cpu_map_update_elem, .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, + .map_check_btf = map_check_no_btf, }; static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 11e691223c90..42cf66268420 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -491,6 +491,7 @@ const struct bpf_map_ops dev_map_ops = { .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int dev_map_notification(struct notifier_block *notifier, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 65a024ad3f2c..7f0559663e21 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1199,23 +1199,6 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) -{ - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; - - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || key_size != map->key_size) - return -EINVAL; - - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size != map->value_size) - return -EINVAL; - - return 0; -} - const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1226,7 +1209,6 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = 
htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, - .map_check_btf = htab_map_check_btf, }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1240,7 +1222,6 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, - .map_check_btf = htab_map_check_btf, }; /* Called from eBPF program */ @@ -1467,4 +1448,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 2ebc5bfea762..dc9d7ac8228d 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -334,7 +334,8 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) struct bpf_map *map = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, - map->btf ? &bpffs_map_fops : &bpffs_obj_fops); + bpf_map_support_seq_show(map) ? + &bpffs_map_fops : &bpffs_obj_fops); } static struct dentry * diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index fc4e37f68f2a..22ad967d1e5f 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -246,6 +246,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_lookup_elem = cgroup_storage_lookup_elem, .map_update_elem = cgroup_storage_update_elem, .map_delete_elem = cgroup_storage_delete_elem, + .map_check_btf = map_check_no_btf, }; int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index c86e165e1fcb..fa382ca9231c 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -10,11 +10,13 @@ */ #include +#include #include #include #include #include #include +#include /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -689,6 +691,15 @@ free_stack: return err; } +static int trie_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + /* Keys must have struct bpf_lpm_trie_key embedded. */ + return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? 
+ -EINVAL : 0; +} + const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, @@ -696,4 +707,5 @@ const struct bpf_map_ops trie_map_ops = { .map_lookup_elem = trie_lookup_elem, .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, + .map_check_btf = trie_check_btf, }; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index daf92f9df000..60656d33407c 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2501,6 +2501,7 @@ const struct bpf_map_ops sock_map_ops = { .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; const struct bpf_map_ops sock_hash_ops = { @@ -2511,6 +2512,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 491abd1556df..2af07b864cfa 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -613,6 +613,7 @@ const struct bpf_map_ops stack_map_ops = { .map_lookup_elem = stack_map_lookup_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int __init stack_map_init(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a3da88e8427e..6fde33165107 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -104,6 +104,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, const struct bpf_map_ops bpf_map_offload_ops = { .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, + .map_check_btf = map_check_no_btf, }; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) @@ -456,6 +457,34 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + return -ENOTSUPP; +} + +static int map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + int ret = 0; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size != map->value_size) + return -EINVAL; + + if (map->ops->map_check_btf) + ret = map->ops->map_check_btf(map, key_type, value_type); + + return ret; +} + #define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -490,8 +519,7 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - if (bpf_map_support_seq_show(map) && - (attr->btf_key_type_id || attr->btf_value_type_id)) { + if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; if (!attr->btf_key_type_id || !attr->btf_value_type_id) { @@ -505,8 +533,8 @@ static int map_create(union bpf_attr *attr) goto free_map_nouncharge; } - err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); if (err) { btf_put(btf); goto free_map_nouncharge; diff --git 
a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index b3c557476a8d..4ddf61e158f6 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -227,6 +227,5 @@ const struct bpf_map_ops xsk_map_ops = { .map_lookup_elem = xsk_map_lookup_elem, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, + .map_check_btf = map_check_no_btf, }; - - From 404b521c578cc6f04a4c5960a04fc9db5895e45b Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Sun, 12 Aug 2018 10:49:27 -0700 Subject: [PATCH 0774/1640] BACKPORT: bpf: Introduce bpf_skb_ancestor_cgroup_id helper == Problem description == It's useful to be able to identify cgroup associated with skb in TC so that a policy can be applied to this skb, and existing bpf_skb_cgroup_id helper can help with this. Though in real life cgroup hierarchy and hierarchy to apply a policy to don't map 1:1. It's often the case that there is a container and corresponding cgroup, but there are many more sub-cgroups inside container, e.g. because it's delegated to containerized application to control resources for its subsystems, or to separate application inside container from infra that belongs to containerization system (e.g. sshd). At the same time it may be useful to apply a policy to container as a whole. If multiple containers like this are run on a host (what is often the case) and many of them have sub-cgroups, it may not be possible to apply per-container policy in TC with existing helpers such as bpf_skb_under_cgroup or bpf_skb_cgroup_id: * bpf_skb_cgroup_id will return id of immediate cgroup associated with skb, i.e. if it's a sub-cgroup inside container, it can't be used to identify container's cgroup; * bpf_skb_under_cgroup can work only with one cgroup and doesn't scale, i.e. if there are N containers on a host and a policy has to be applied to M of them (0 <= M <= N), it'd require M calls to bpf_skb_under_cgroup, and, if M changes, it'd require to rebuild & load new BPF program. == Solution == The patch introduces new helper bpf_skb_ancestor_cgroup_id that can be used to get id of cgroup v2 that is an ancestor of cgroup associated with skb at specified level of cgroup hierarchy. That way admin can place all containers on one level of cgroup hierarchy (what is a good practice in general and already used in many configurations) and identify specific cgroup on this level no matter what sub-cgroup skb is associated with. E.g. if there is a cgroup hierarchy: root/ root/container1/ root/container1/app11/ root/container1/app11/sub-app-a/ root/container1/app12/ root/container2/ root/container2/app21/ root/container2/app22/ root/container2/app22/sub-app-b/ , then having skb associated with root/container1/app11/sub-app-a/ it's possible to get ancestor at level 1, what is container1 and apply policy for this container, or apply another policy if it's container2. Policies can be kept e.g. in a hash map where key is a container cgroup id and value is an action. Levels where container cgroups are created are usually known in advance whether cgroup hierarchy inside container may be hard to predict especially in case when its creation is delegated to containerized application. == Implementation details == The helper gets ancestor by walking parents up to specified level. Another option would be to get different kind of "id" from cgroup->ancestor_ids[level] and use it with idr_find() to get struct cgroup for ancestor. But that would require radix lookup what doesn't seem to be better (at least it's not obviously better). 
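To make the intended usage concrete, a sketch of a TC classifier keyed by
the container-level ancestor cgroup id (the map name, the policy encoding
and the libbpf-style map declaration are illustrative assumptions, not
part of this patch):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024);
	__type(key, __u64);   /* container cgroup id */
	__type(value, __u32); /* non-zero == drop */
} container_policy SEC(".maps");

SEC("tc")
int apply_container_policy(struct __sk_buff *skb)
{
	/* Level 1 = direct children of the cgroup v2 root, i.e.
	 * container1/ and container2/ in the hierarchy above. */
	__u64 cgid = bpf_skb_ancestor_cgroup_id(skb, 1);
	__u32 *drop;

	if (!cgid) /* no full socket, or no ancestor at this level */
		return TC_ACT_OK;
	drop = bpf_map_lookup_elem(&container_policy, &cgid);
	return (drop && *drop) ? TC_ACT_SHOT : TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";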
Format of return value of the new helper is same as that of bpf_skb_cgroup_id. Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- include/linux/cgroup.h | 30 ++++++++++++++++++++++++++++++ include/uapi/linux/bpf.h | 18 ++++++++++++++++++ net/core/filter.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 19290afc87bb..49ad1e6f2e21 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -565,6 +565,36 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, return cgrp->ancestor_ids[ancestor->level] == ancestor->id; } +/** + * cgroup_ancestor - find ancestor of cgroup + * @cgrp: cgroup to find ancestor of + * @ancestor_level: level of ancestor to find starting from root + * + * Find ancestor of cgroup at specified level starting from root if it exists + * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at + * @ancestor_level. + * + * This function is safe to call as long as @cgrp is accessible. + */ +static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, + int ancestor_level) +{ + struct cgroup *ptr; + + if (cgrp->level < ancestor_level) + return NULL; + + for (ptr = cgrp; + ptr && ptr->level > ancestor_level; + ptr = cgroup_parent(ptr)) + ; + + if (ptr && ptr->level == ancestor_level) + return ptr; + + return NULL; +} + /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f935738e45c3..7358e2785b3a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2093,6 +2093,24 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. 
+ * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based diff --git a/net/core/filter.c b/net/core/filter.c index 6981c0454411..8d7fadb2c118 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3779,6 +3779,32 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; + +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + struct sock *sk = skb_to_full_sk(skb); + struct cgroup *ancestor; + struct cgroup *cgrp; + + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ancestor = cgroup_ancestor(cgrp, ancestor_level); + if (!ancestor) + return 0; + + return ancestor->kn->id.id; +} + +static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { + .func = bpf_skb_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; #endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, @@ -4976,6 +5002,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; #endif default: return bpf_base_func_proto(func_id); From 617a113bf7878869cb8cb54bf0830ac476cfa1f1 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 14 Aug 2018 11:20:21 -0500 Subject: [PATCH 0775/1640] UPSTREAM: net: filter: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Addresses-Coverity-ID: 1472592 ("Missing break in switch") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/filter.c b/net/core/filter.c index 8d7fadb2c118..358d1410310c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7245,6 +7245,7 @@ sk_reuseport_is_valid_access(int off, int size, case offsetof(struct sk_reuseport_md, eth_protocol): if (size < FIELD_SIZEOF(struct sk_buff, protocol)) return false; + /* fall through */ case offsetof(struct sk_reuseport_md, ip_protocol): case offsetof(struct sk_reuseport_md, bind_inany): case offsetof(struct sk_reuseport_md, len): From 62e28b3477488d8b949f0bbac0e29c399c0bae5e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 14 Aug 2018 11:01:12 -0700 Subject: [PATCH 0776/1640] UPSTREAM: bpf: fix a rcu usage warning in bpf_prog_array_copy_core() Commit 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage") refactored the bpf_prog_array_copy_core() to accommodate new structure bpf_prog_array_item which contains bpf_prog array itself. In the old code, we had perf_event_query_prog_array(): mutex_lock(...) bpf_prog_array_copy_call(): prog = rcu_dereference_check(array, 1)->progs bpf_prog_array_copy_core(prog, ...) mutex_unlock(...) With the above commit, we had perf_event_query_prog_array(): mutex_lock(...) bpf_prog_array_copy_call(): bpf_prog_array_copy_core(array, ...): item = rcu_dereference(array)->items; ... mutex_unlock(...) The new code will trigger a lockdep rcu checking warning. The fix is to change rcu_dereference() to rcu_dereference_check() to prevent such a warning. 
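For reference, a sketch of the annotation pattern at play (bpf_event_mutex
is the lock held in the perf path shown above; the fix passes the
unconditional "1" because bpf_prog_array_copy_core() has callers holding
different locks, so no single lockdep expression applies):

	/* Inside an RCU read-side critical section: */
	rcu_read_lock();
	item = rcu_dereference(array)->items;
	rcu_read_unlock();

	/* Under a lock instead of rcu_read_lock(): tell lockdep
	 * which one, where a single lock covers all callers. */
	mutex_lock(&bpf_event_mutex);
	item = rcu_dereference_check(array,
				     lockdep_is_held(&bpf_event_mutex))->items;
	mutex_unlock(&bpf_event_mutex);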
Reported-by: syzbot+6e72317008eef84a216b@syzkaller.appspotmail.com Fixes: 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage") Cc: Roman Gushchin Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 26aa24deaf02..c072338d7e09 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1669,7 +1669,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, struct bpf_prog_array_item *item; int i = 0; - item = rcu_dereference(array)->items; + item = rcu_dereference_check(array, 1)->items; for (; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; From 12cdf13ff571592234dd82b3d0ed7e2e42d36b37 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:08 +0200 Subject: [PATCH 0777/1640] UPSTREAM: bpf, sockmap: fix leakage of smap_psock_map_entry While working on sockmap I noticed that we do not always kfree the struct smap_psock_map_entry list elements which track psocks attached to maps. In the case of sock_hash_ctx_update_elem(), these map entries are allocated outside of __sock_map_ctx_update_elem() with their linkage to the socket hash table filled. In the case of sock array, the map entries are allocated inside of __sock_map_ctx_update_elem() and added with their linkage to the psock->maps. Both additions are under psock->maps_lock each. Now, we drop these elements from their psock->maps list in a few occasions: i) in sock array via smap_list_map_remove() when an entry is either deleted from the map from user space, or updated via user space or BPF program where we drop the old socket at that map slot, or the sock array is freed via sock_map_free() and drops all its elements; ii) for sock hash via smap_list_hash_remove() in exactly the same occasions as just described for sock array; iii) in the bpf_tcp_close() where we remove the elements from the list via psock_map_pop() and iterate over them dropping themselves from either sock array or sock hash; and last but not least iv) once again in smap_gc_work() which is a callback for deferring the work once the psock refcount hit zero and thus the socket is being destroyed. Problem is that the only case where we kfree() the list entry is in case iv), which at that point should have an empty list in normal cases. So in cases from i) to iii) we unlink the elements without freeing where they go out of reach from us. Hence fix is to properly kfree() them as well to stop the leakage. Given these are all handled under psock->maps_lock there is no need for deferred RCU freeing. I later also ran with kmemleak detector and it confirmed the finding as well where in the state before the fix the object goes unreferenced while after the patch no kmemleak report related to BPF showed up. [...] unreferenced object 0xffff880378eadae0 (size 64): comm "test_sockmap", pid 2225, jiffies 4294720701 (age 43.504s) hex dump (first 32 bytes): 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................ 50 4d 75 5d 03 88 ff ff 00 00 00 00 00 00 00 00 PMu]............ 
backtrace: [<000000005225ac3c>] sock_map_ctx_update_elem.isra.21+0xd8/0x210 [<0000000045dd6d3c>] bpf_sock_map_update+0x29/0x60 [<00000000877723aa>] ___bpf_prog_run+0x1e1f/0x4960 [<000000002ef89e83>] 0xffffffffffffffff unreferenced object 0xffff880378ead240 (size 64): comm "test_sockmap", pid 2225, jiffies 4294720701 (age 43.504s) hex dump (first 32 bytes): 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................ 00 44 75 5d 03 88 ff ff 00 00 00 00 00 00 00 00 .Du]............ backtrace: [<000000005225ac3c>] sock_map_ctx_update_elem.isra.21+0xd8/0x210 [<0000000030e37a3a>] sock_map_update_elem+0x125/0x240 [<000000002e5ce36e>] map_update_elem+0x4eb/0x7b0 [<00000000db453cc9>] __x64_sys_bpf+0x1f9/0x360 [<0000000000763660>] do_syscall_64+0x9a/0x300 [<00000000422a2bb2>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<000000002ef89e83>] 0xffffffffffffffff [...] Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Fixes: 54fedb42c653 ("bpf: sockmap, fix smap_list_map_remove when psock is in many maps") Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 60656d33407c..8671cf77eaf3 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -370,6 +370,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) } raw_spin_unlock_bh(&b->lock); } + kfree(e); e = psock_map_pop(sk, psock); } rcu_read_unlock(); @@ -1678,8 +1679,10 @@ static void smap_list_map_remove(struct smap_psock *psock, spin_lock_bh(&psock->maps_lock); list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) + if (e->entry == entry) { list_del(&e->list); + kfree(e); + } } spin_unlock_bh(&psock->maps_lock); } @@ -1693,8 +1696,10 @@ static void smap_list_hash_remove(struct smap_psock *psock, list_for_each_entry_safe(e, tmp, &psock->maps, list) { struct htab_elem *c = rcu_dereference(e->hash_link); - if (c == hash_link) + if (c == hash_link) { list_del(&e->list); + kfree(e); + } } spin_unlock_bh(&psock->maps_lock); } From 6cdc357edbe94cc42c43b56f8d24462cb5fa197a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:09 +0200 Subject: [PATCH 0778/1640] UPSTREAM: bpf, sockmap: fix map elem deletion race with smap_stop_sock The smap_start_sock() and smap_stop_sock() are each protected under the sock->sk_callback_lock from their call-sites except in the case of sock_map_delete_elem() where we drop the old socket from the map slot. This is racy because the same sock could be part of multiple sock maps, so we run smap_stop_sock() in parallel, and given at that point psock->strp_enabled might be true on both CPUs, we might for example wrongly restore the sk->sk_data_ready / sk->sk_write_space. Therefore, hold the sock->sk_callback_lock as well on delete. Looks like 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support") had this right, but later on e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") removed it again from delete leaving this smap_stop_sock() instance unprotected. 
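The race, sketched (socket S is a member of two sockmaps):

	CPU0: delete S from map A        CPU1: delete S from map B
	-------------------------        -------------------------
	psock->strp_enabled is true,     psock->strp_enabled is true,
	  so smap_stop_sock(psock, S)      so smap_stop_sock(psock, S)
	  runs unlocked and restores       runs concurrently and may
	  S->sk_data_ready /               restore already-restored
	  S->sk_write_space                (stale) callbacks

Taking S->sk_callback_lock around the call, as every other call-site
already does, serializes the two stops so that the second one observes
the strparser already torn down and backs off.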
Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 8671cf77eaf3..3251a04d1814 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1789,8 +1789,11 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (!psock) goto out; - if (psock->bpf_parse) + if (psock->bpf_parse) { + write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); + } smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); out: From 6aed4af1a34ecc986403d00c7fa8c8b356c45b65 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:10 +0200 Subject: [PATCH 0779/1640] UPSTREAM: bpf, sockmap: fix sock_map_ctx_update_elem race with exist/noexist The current code in sock_map_ctx_update_elem() allows for BPF_EXIST and BPF_NOEXIST map update flags. While on array-like maps this approach is rather uncommon, e.g. bpf_fd_array_map_update_elem() and others enforce map update flags to be BPF_ANY such that xchg() can be used directly, the current implementation in sock map does not guarantee that such operation with BPF_EXIST / BPF_NOEXIST is atomic. The initial test does a READ_ONCE(stab->sock_map[i]) to fetch the socket from the slot which is then tested for NULL / non-NULL. However later after __sock_map_ctx_update_elem(), the actual update is done through osock = xchg(&stab->sock_map[i], sock). Problem is that in the meantime a different CPU could have updated / deleted a socket on that specific slot and thus flag contraints won't hold anymore. I've been thinking whether best would be to just break UAPI and do an enforcement of BPF_ANY to check if someone actually complains, however trouble is that already in BPF kselftest we use BPF_NOEXIST for the map update, and therefore it might have been copied into applications already. The fix to keep the current behavior intact would be to add a map lock similar to the sock hash bucket lock only for covering the whole map. 
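Concretely, the interleaving that breaks the flags contract (sketch):

	CPU0: update slot i, BPF_NOEXIST    CPU1: delete slot i
	--------------------------------    -------------------
	READ_ONCE(stab->sock_map[i])
	  != NULL -> plan to fail -EEXIST
	                                    xchg(&stab->sock_map[i], NULL)
	returns -EEXIST although the slot
	is already empty by the time the
	caller sees the error

With stab->lock held across both the flags test and the slot write, the
test and the update become atomic with respect to concurrent updates and
deletes on the same map.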
Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 108 +++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 50 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 3251a04d1814..1d092f399b39 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -58,6 +58,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; struct bpf_sock_progs progs; + raw_spinlock_t lock; }; struct bucket { @@ -89,9 +90,9 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; + struct bpf_map *map; struct sock **entry; struct htab_elem __rcu *hash_link; - struct bpf_htab __rcu *htab; }; struct smap_psock { @@ -343,13 +344,18 @@ static void bpf_tcp_close(struct sock *sk, long timeout) e = psock_map_pop(sk, psock); while (e) { if (e->entry) { - osk = cmpxchg(e->entry, sk, NULL); + struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map); + + raw_spin_lock_bh(&stab->lock); + osk = *e->entry; if (osk == sk) { + *e->entry = NULL; smap_release_sock(psock, sk); } + raw_spin_unlock_bh(&stab->lock); } else { struct htab_elem *link = rcu_dereference(e->hash_link); - struct bpf_htab *htab = rcu_dereference(e->htab); + struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map); struct hlist_head *head; struct htab_elem *l; struct bucket *b; @@ -1645,6 +1651,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); + raw_spin_lock_init(&stab->lock); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); @@ -1719,14 +1726,15 @@ static void sock_map_free(struct bpf_map *map) * and a grace period expire to ensure psock is really safe to remove. 
*/ rcu_read_lock(); + raw_spin_lock_bh(&stab->lock); for (i = 0; i < stab->map.max_entries; i++) { struct smap_psock *psock; struct sock *sock; - sock = xchg(&stab->sock_map[i], NULL); + sock = stab->sock_map[i]; if (!sock) continue; - + stab->sock_map[i] = NULL; psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -1738,6 +1746,7 @@ static void sock_map_free(struct bpf_map *map) smap_release_sock(psock, sock); } } + raw_spin_unlock_bh(&stab->lock); rcu_read_unlock(); sock_map_remove_complete(stab); @@ -1781,14 +1790,16 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (k >= map->max_entries) return -EINVAL; - sock = xchg(&stab->sock_map[k], NULL); + raw_spin_lock_bh(&stab->lock); + sock = stab->sock_map[k]; + stab->sock_map[k] = NULL; + raw_spin_unlock_bh(&stab->lock); if (!sock) return -EINVAL; psock = smap_psock_sk(sock); if (!psock) - goto out; - + return 0; if (psock->bpf_parse) { write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); @@ -1796,7 +1807,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) } smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); -out: return 0; } @@ -1832,11 +1842,9 @@ out: static int __sock_map_ctx_update_elem(struct bpf_map *map, struct bpf_sock_progs *progs, struct sock *sock, - struct sock **map_link, void *key) { struct bpf_prog *verdict, *parse, *tx_msg; - struct smap_psock_map_entry *e = NULL; struct smap_psock *psock; bool new = false; int err = 0; @@ -1909,14 +1917,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, new = true; } - if (map_link) { - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) { - err = -ENOMEM; - goto out_free; - } - } - /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. */ @@ -1938,17 +1938,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, write_unlock_bh(&sock->sk_callback_lock); } - /* 4. Place psock in sockmap for use and stop any programs on - * the old sock assuming its not the same sock we are replacing - * it with. Because we can only have a single set of programs if - * old_sock has a strp we can stop it. 
- */ - if (map_link) { - e->entry = map_link; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - } return err; out_free: smap_release_sock(psock, sock); @@ -1959,7 +1948,6 @@ out_progs: } if (tx_msg) bpf_prog_put(tx_msg); - kfree(e); return err; } @@ -1969,36 +1957,57 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_sock_progs *progs = &stab->progs; - struct sock *osock, *sock; + struct sock *osock, *sock = skops->sk; + struct smap_psock_map_entry *e; + struct smap_psock *psock; u32 i = *(u32 *)key; int err; if (unlikely(flags > BPF_EXIST)) return -EINVAL; - if (unlikely(i >= stab->map.max_entries)) return -E2BIG; - sock = READ_ONCE(stab->sock_map[i]); - if (flags == BPF_EXIST && !sock) - return -ENOENT; - else if (flags == BPF_NOEXIST && sock) - return -EEXIST; + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) + return -ENOMEM; - sock = skops->sk; - err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i], - key); + err = __sock_map_ctx_update_elem(map, progs, sock, key); if (err) goto out; - osock = xchg(&stab->sock_map[i], sock); - if (osock) { - struct smap_psock *opsock = smap_psock_sk(osock); - - smap_list_map_remove(opsock, &stab->sock_map[i]); - smap_release_sock(opsock, osock); + /* psock guaranteed to be present. */ + psock = smap_psock_sk(sock); + raw_spin_lock_bh(&stab->lock); + osock = stab->sock_map[i]; + if (osock && flags == BPF_NOEXIST) { + err = -EEXIST; + goto out_unlock; } + if (!osock && flags == BPF_EXIST) { + err = -ENOENT; + goto out_unlock; + } + + e->entry = &stab->sock_map[i]; + e->map = map; + spin_lock_bh(&psock->maps_lock); + list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); + + stab->sock_map[i] = sock; + if (osock) { + psock = smap_psock_sk(osock); + smap_list_map_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, osock); + } + raw_spin_unlock_bh(&stab->lock); + return 0; +out_unlock: + smap_release_sock(psock, sock); + raw_spin_unlock_bh(&stab->lock); out: + kfree(e); return err; } @@ -2361,7 +2370,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, b = __select_bucket(htab, hash); head = &b->head; - err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key); + err = __sock_map_ctx_update_elem(map, progs, sock, key); if (err) goto err; @@ -2387,8 +2396,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, } rcu_assign_pointer(e->hash_link, l_new); - rcu_assign_pointer(e->htab, - container_of(map, struct bpf_htab, map)); + e->map = map; spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); spin_unlock_bh(&psock->maps_lock); From b13f9b250aa36f15d9aadc5cf19c23781e96e87c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 Dec 2017 15:08:56 -0800 Subject: [PATCH 0780/1640] UPSTREAM: net: xdp: report flags program was installed with on query Some drivers enforce that flags on program replacement and removal must match the flags passed on install. This leaves the possibility open to enable simultaneous loading of XDP programs both to HW and DRV. Allow such drivers to report the flags back to the stack. 
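As an illustration, a query handler in a hypothetical driver "foo"
(mirroring the nfp change below; all "foo" names are assumptions):

static int foo_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
{
	struct foo_priv *priv = netdev_priv(netdev);

	switch (xdp->command) {
	case XDP_QUERY_PROG:
		xdp->prog_attached = !!priv->xdp_prog;
		xdp->prog_id = priv->xdp_prog ? priv->xdp_prog->aux->id : 0;
		/* New: echo back the install-time flags so the stack
		 * can verify them on replace/remove requests. */
		xdp->prog_flags = priv->xdp_prog ? priv->xdp_flags : 0;
		return 0;
	default:
		return -EINVAL;
	}
}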
Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 1 + include/linux/netdevice.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index fad9f9fbf2c1..411a29cf84ac 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3437,6 +3437,7 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) if (nn->dp.bpf_offload_xdp) xdp->prog_attached = XDP_ATTACHED_HW; xdp->prog_id = nn->xdp_prog ? nn->xdp_prog->aux->id : 0; + xdp->flags = nn->xdp_prog ? nn->xdp_flags : 0; return 0; default: return -EINVAL; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 82b8729a076c..27bbad0d5a12 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -837,6 +837,8 @@ struct netdev_bpf { struct { u8 prog_attached; u32 prog_id; + /* flags with which program was installed */ + u32 prog_flags; }; /* BPF_OFFLOAD_VERIFIER_PREP */ struct { From f5757b29c656348ae480603075471040410a559c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:40 -0700 Subject: [PATCH 0781/1640] BACKPORT: xdp: factor out common program/flags handling from drivers Basic operations drivers perform during xdp setup and query can be moved to helpers in the core. Encapsulate program and flags into a structure and add helpers. Note that the structure is intended as the "main" program information source in the driver. Most drivers will additionally place the program pointer in their fast path or ring structures. The helpers don't have a huge impact now, but they will decrease the code duplication when programs can be installed in HW and driver at the same time. Encapsulating the basic operations in helpers will hopefully also reduce the number of changes to drivers which adopt them. Helpers could really be static inline, but they depend on definition of struct netdev_bpf which means they'd have to be placed in netdevice.h, an already 4500 line header. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/net/xdp.h | 13 +++++++++++++ net/core/xdp.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/include/net/xdp.h b/include/net/xdp.h index 0c45f0f943ed..0f32d8dc02a6 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -135,4 +135,17 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) return unlikely(xdp->data_meta > xdp->data); } +struct xdp_attachment_info { + struct bpf_prog *prog; + u32 flags; +}; + +struct netdev_bpf; +int xdp_attachment_query(struct xdp_attachment_info *info, + struct netdev_bpf *bpf); +bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, + struct netdev_bpf *bpf); +void xdp_attachment_setup(struct xdp_attachment_info *info, + struct netdev_bpf *bpf); + #endif /* __LINUX_NET_XDP_H__ */ diff --git a/net/core/xdp.c b/net/core/xdp.c index cb8c4e061a5a..07f36d13229d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -3,8 +3,11 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. * Released under terms in GPL version 2. See COPYING. 
*/ +#include +#include #include #include +#include #include #include #include @@ -361,3 +364,34 @@ void xdp_return_buff(struct xdp_buff *xdp) __xdp_return(xdp->data, &xdp->rxq->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_buff); + +int xdp_attachment_query(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + bpf->prog_id = info->prog ? info->prog->aux->id : 0; + bpf->prog_flags = info->prog ? info->flags : 0; + return 0; +} +EXPORT_SYMBOL_GPL(xdp_attachment_query); + +bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) { + NL_SET_ERR_MSG(bpf->extack, + "program loaded with different flags"); + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok); + +void xdp_attachment_setup(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + if (info->prog) + bpf_prog_put(info->prog); + info->prog = bpf->prog; + info->flags = bpf->flags; +} +EXPORT_SYMBOL_GPL(xdp_attachment_setup); From 7998a6703c907caf28103980666d076f7597e82e Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:16 +0900 Subject: [PATCH 0782/1640] UPSTREAM: xdp: Helpers for disabling napi_direct of xdp_return_frame We need some mechanism to disable napi_direct on calling xdp_return_frame_rx_napi() from some context. When veth gets support of XDP_REDIRECT, it will redirects packets which are redirected from other devices. On redirection veth will reuse xdp_mem_info of the redirection source device to make return_frame work. But in this case .ndo_xdp_xmit() called from veth redirection uses xdp_mem_info which is not guarded by NAPI, because the .ndo_xdp_xmit() is not called directly from the rxq which owns the xdp_mem_info. This approach introduces a flag in bpf_redirect_info to indicate that napi_direct should be disabled even when _rx_napi variant is used as well as helper functions to use it. A NAPI handler who wants to use this flag needs to call xdp_set_return_frame_no_direct() before processing packets, and call xdp_clear_return_frame_no_direct() after xdp_do_flush_map() before exiting NAPI. v4: - Use bpf_redirect_info for storing the flag instead of xdp_mem_info to avoid per-frame copy cost. Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 25 +++++++++++++++++++++++++ net/core/xdp.c | 6 ++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index e18cf8a295e3..c5365e7abb10 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -613,10 +613,14 @@ struct bpf_redirect_info { struct bpf_map *map; struct bpf_map *map_to_flush; unsigned long map_owner; + u32 kern_flags; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +/* flags for bpf_redirect_info kern_flags */ +#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) 
@@ -846,6 +850,27 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +static inline bool xdp_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_set_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_clear_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; +} + static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { diff --git a/net/core/xdp.c b/net/core/xdp.c index 07f36d13229d..12c16e6a1d6c 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -328,10 +328,12 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (xa) + if (xa) { + napi_direct &= !xdp_return_frame_no_direct(); page_pool_put_page(xa->page_pool, page, napi_direct); - else + } else { put_page(page); + } rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: From ce3990296580d36c69c4aa5b8df32420d3780300 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 17 Aug 2018 23:26:14 +0200 Subject: [PATCH 0783/1640] UPSTREAM: bpf: fix redirect to map under tail calls Commits 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") and 7c3001313396 ("bpf: fix ri->map_owner pointer on bpf_prog_realloc") tried to mitigate that buggy programs using bpf_redirect_map() helper call do not leave stale maps behind. Idea was to add a map_owner cookie into the per CPU struct redirect_info which was set to prog->aux by the prog making the helper call as a proof that the map is not stale since the prog is implicitly holding a reference to it. This owner cookie could later on get compared with the program calling into BPF whether they match and therefore the redirect could proceed with processing the map safely. In (obvious) hindsight, this approach breaks down when tail calls are involved since the original caller's prog->aux pointer does not have to match the one from one of the progs out of the tail call chain, and therefore the xdp buffer will be dropped instead of redirected. A way around that would be to fix the issue differently (which also allows to remove related work in fast path at the same time): once the life-time of a redirect map has come to its end we use it's map free callback where we need to wait on synchronize_rcu() for current outstanding xdp buffers and remove such a map pointer from the redirect info if found to be present. At that time no program is using this map anymore so we simply invalidate the map pointers to NULL iff they previously pointed to that instance while making sure that the redirect path only reads out the map once. 
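A minimal userspace sketch of the invalidation scheme described above (C11 atomics stand in for the kernel's READ_ONCE()/cmpxchg(); the map type, CPU count and all names are invented for illustration):

  #include <stdatomic.h>

  #define NR_CPUS 4

  struct demo_map { int id; };

  /* one cached map pointer per CPU, standing in for ri->map */
  static _Atomic(struct demo_map *) cached_map[NR_CPUS];

  static void demo_clear_redirect_map(struct demo_map *map)
  {
          int cpu;

          for (cpu = 0; cpu < NR_CPUS; cpu++) {
                  struct demo_map *expected = map;

                  /* read first to avoid dirtying remote cachelines;
                   * the CAS then clears only slots that still point
                   * at the dying map
                   */
                  if (atomic_load(&cached_map[cpu]) != map)
                          continue;
                  atomic_compare_exchange_strong(&cached_map[cpu],
                                                 &expected, NULL);
          }
  }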
Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine") Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Reported-by: Sebastiano Miano Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +- include/trace/events/xdp.h | 5 ++- kernel/bpf/cpumap.c | 2 ++ kernel/bpf/devmap.c | 1 + kernel/bpf/verifier.c | 21 ------------ kernel/bpf/xskmap.c | 1 + net/core/filter.c | 68 +++++++++++++++++--------------------- 7 files changed, 38 insertions(+), 63 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index c5365e7abb10..e975d1329404 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -612,7 +612,6 @@ struct bpf_redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; - unsigned long map_owner; u32 kern_flags; }; @@ -850,6 +849,8 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +void bpf_clear_redirect_map(struct bpf_map *map); + static inline bool xdp_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 1ecf4c67fcf7..e95cb86b65cf 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -147,9 +147,8 @@ struct _bpf_dtab_netdev { #define devmap_ifindex(fwd, map) \ (!fwd ? 0 : \ - (!map ? 0 : \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ - ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))) + ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 620bc5024d7d..24aac0d0f412 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -479,6 +479,8 @@ static void cpu_map_free(struct bpf_map *map) * It does __not__ ensure pending flush operations (if any) are * complete. */ + + bpf_clear_redirect_map(map); synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 42cf66268420..1e525d70f833 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -161,6 +161,7 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); + bpf_clear_redirect_map(map); synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fb492914fe0f..c070671b727b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6172,27 +6172,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) goto patch_call_imm; } - if (insn->imm == BPF_FUNC_redirect_map) { - /* Note, we cannot use prog directly as imm as subsequent - * rewrites would still change the prog pointer. The only - * stable address we can use is aux, which also works with - * prog clones during blinding. 
- */ - u64 addr = (unsigned long)prog->aux; - struct bpf_insn r4_ld[] = { - BPF_LD_IMM64(BPF_REG_4, addr), - *insn, - }; - cnt = ARRAY_SIZE(r4_ld); - - new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - } patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 4ddf61e158f6..9f8463afda9c 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -75,6 +75,7 @@ static void xsk_map_free(struct bpf_map *map) struct xsk_map *m = container_of(map, struct xsk_map, map); int i; + bpf_clear_redirect_map(map); synchronize_net(); for (i = 0; i < map->max_entries; i++) { diff --git a/net/core/filter.c b/net/core/filter.c index 358d1410310c..5ea056a20974 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3247,31 +3247,33 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) } } -static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, - unsigned long aux) +void bpf_clear_redirect_map(struct bpf_map *map) { - return (unsigned long)xdp_prog->aux != aux; + struct bpf_redirect_info *ri; + int cpu; + + for_each_possible_cpu(cpu) { + ri = per_cpu_ptr(&bpf_redirect_info, cpu); + /* Avoid polluting remote cacheline due to writes if + * not needed. Once we pass this test, we need the + * cmpxchg() to make sure it hasn't been changed in + * the meantime by remote CPU. + */ + if (unlikely(READ_ONCE(ri->map) == map)) + cmpxchg(&ri->map, map, NULL); + } } static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + struct bpf_prog *xdp_prog, struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - unsigned long map_owner = ri->map_owner; - struct bpf_map *map = ri->map; u32 index = ri->ifindex; void *fwd = NULL; int err; ri->ifindex = 0; - ri->map = NULL; - ri->map_owner = 0; - - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } + WRITE_ONCE(ri->map, NULL); fwd = __xdp_map_lookup_elem(map, index); if (!fwd) { @@ -3297,12 +3299,13 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map = READ_ONCE(ri->map); struct net_device *fwd; u32 index = ri->ifindex; int err; - if (ri->map) - return xdp_do_redirect_map(dev, xdp, xdp_prog); + if (map) + return xdp_do_redirect_map(dev, xdp, xdp_prog, map); fwd = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; @@ -3326,24 +3329,17 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + struct bpf_prog *xdp_prog, + struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - unsigned long map_owner = ri->map_owner; - struct bpf_map *map = ri->map; u32 index = ri->ifindex; void *fwd = NULL; int err = 0; ri->ifindex = 0; - ri->map = NULL; - ri->map_owner = 0; + WRITE_ONCE(ri->map, NULL); - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } fwd = __xdp_map_lookup_elem(map, index); if (unlikely(!fwd)) { err = -EINVAL; @@ -3380,13 +3376,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct 
xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map = READ_ONCE(ri->map); u32 index = ri->ifindex; struct net_device *fwd; int err = 0; - if (ri->map) - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog); - + if (map) + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, + map); ri->ifindex = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); if (unlikely(!fwd)) { @@ -3417,8 +3414,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; - ri->map = NULL; - ri->map_owner = 0; + WRITE_ONCE(ri->map, NULL); return XDP_REDIRECT; } @@ -3431,8 +3427,8 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, - unsigned long, map_owner) +BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, + u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -3441,15 +3437,11 @@ BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags ri->ifindex = ifindex; ri->flags = flags; - ri->map = map; - ri->map_owner = map_owner; + WRITE_ONCE(ri->map, map); return XDP_REDIRECT; } -/* Note, arg4 is hidden from users and populated by the verifier - * with the right pointer. - */ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, From 9789402d87ddec7390f9f2cda75a763862b53907 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 21 Aug 2018 15:55:00 +0200 Subject: [PATCH 0784/1640] UPSTREAM: bpf, sockmap: fix sock_hash_alloc and reject zero-sized keys Currently, it is possible to create a sock hash map with key size of 0 and have the kernel return a fd back to user space. This is invalid for hash maps (and kernel also hasn't been tested for zero key size support in general at this point). Thus, reject such configuration. Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu --- kernel/bpf/sockmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 1d092f399b39..25446e430da9 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2143,7 +2143,9 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) return ERR_PTR(-EPERM); /* check sanity of attributes */ - if (attr->max_entries == 0 || attr->value_size != 4 || + if (attr->max_entries == 0 || + attr->key_size == 0 || + attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); From d95f57abf468299dce70af96a13f1b37c45ab3cb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 22 Aug 2018 18:09:17 +0200 Subject: [PATCH 0785/1640] UPSTREAM: bpf, sockmap: fix sock hash count in alloc_sock_hash_elem When we try to allocate a new sock hash entry and the allocation fails, then sock hash map fails to reduce the map element counter, meaning we keep accounting this element although it was never used. Fix it by dropping the element counter on error. 
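The accounting pattern behind the fix, as a small self-contained sketch (names and the atomic type are illustrative; the kernel uses atomic_inc()/atomic_dec() on htab->count):

  #include <stdatomic.h>
  #include <stdlib.h>

  struct demo_htab {
          atomic_int count;       /* elements currently accounted */
          int max_entries;
          size_t elem_size;
  };

  static void *demo_alloc_elem(struct demo_htab *h)
  {
          void *elem;

          if (atomic_fetch_add(&h->count, 1) >= h->max_entries) {
                  atomic_fetch_sub(&h->count, 1); /* table full: undo */
                  return NULL;
          }
          elem = malloc(h->elem_size);
          if (!elem)
                  /* the decrement this patch adds: never keep an
                   * element accounted that was never created
                   */
                  atomic_fetch_sub(&h->count, 1);
          return elem;
  }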
Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend --- kernel/bpf/sockmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 25446e430da9..cf5195c7c331 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2272,8 +2272,10 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, } l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); - if (!l_new) + if (!l_new) { + atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); + } memcpy(l_new->key, key, key_size); l_new->sk = sk; From b54b24e929ed0a17322261500c88314a020cee69 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 22 Aug 2018 23:49:37 +0200 Subject: [PATCH 0786/1640] UPSTREAM: bpf: use per htab salt for bucket hash All BPF hash and LRU maps currently have a known and global seed we feed into jhash() which is 0. This is suboptimal, thus fix it by generating a random seed upon hashtab setup time which we can later on feed into jhash() on lookup, update and deletions. Fixes: 0f8e4bd8a1fc8 ("bpf: add hashtable type of eBPF maps") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Reviewed-by: Eduardo Valentin --- kernel/bpf/hashtab.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 7f0559663e21..16081d8384bf 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "percpu_freelist.h" #include "bpf_lru_list.h" @@ -41,6 +42,7 @@ struct bpf_htab { atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ + u32 hashrnd; }; /* each htab element is struct htab_elem + key + value */ @@ -375,6 +377,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_htab; + htab->hashrnd = get_random_int(); for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); raw_spin_lock_init(&htab->buckets[i].lock); @@ -406,9 +409,9 @@ free_htab: return ERR_PTR(err); } -static inline u32 htab_map_hash(const void *key, u32 key_len) +static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) { - return jhash(key, key_len, 0); + return jhash(key, key_len, hashrnd); } static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) @@ -474,7 +477,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); head = select_bucket(htab, hash); @@ -613,7 +616,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) if (!key) goto find_first_elem; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); head = select_bucket(htab, hash); @@ -838,7 +841,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -894,7 +897,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, 
htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -959,7 +962,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -1012,7 +1015,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -1085,7 +1088,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; @@ -1117,7 +1120,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; - hash = htab_map_hash(key, key_size); + hash = htab_map_hash(key, key_size, htab->hashrnd); b = __select_bucket(htab, hash); head = &b->head; From c7a4209d3ac723c5e6f4a75f413c6f9415e4e702 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 Aug 2018 22:08:50 +0200 Subject: [PATCH 0787/1640] UPSTREAM: bpf, sockmap: fix potential use after free in bpf_tcp_close In bpf_tcp_close() we pop the psock linkage to a map via psock_map_pop(). A parallel update on the sock hash map can happen between psock_map_pop() and lookup_elem_raw() where we override the element under link->hash / link->key. In bpf_tcp_close()'s lookup_elem_raw() we subsequently only test whether an element is present, but we do not test whether the element is in fact the element we were looking for. We lock the sock in bpf_tcp_close() during that time, and so does sock_hash_update_elem(); however, the latter locks the sock which is newly updated, not the one we're purging from the hash table. This means that while one CPU is doing the lookup from bpf_tcp_close(), another CPU can do the map update in parallel, drop our sock from the hlist and release the psock. Subsequently the first CPU will find the new sock and attempt to drop and release the old sock yet another time. The fix is to check the elements for a match after lookup, similar to what we do in the sock map. Note that the hash tab elems are freed via RCU, so access to their link->hash / link->key is fine since we're under RCU read side there. Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf5195c7c331..01879e4d599a 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -369,7 +369,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) /* If another thread deleted this object skip deletion. * The refcnt on psock may or may not be zero.
*/ - if (l) { + if (l && l == link) { hlist_del_rcu(&link->hash_node); smap_release_sock(psock, link->sk); free_htab_elem(htab, link); From 39fc3fd7bf637ee735b3de57c25b9380932605ad Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 Aug 2018 22:08:51 +0200 Subject: [PATCH 0788/1640] UPSTREAM: bpf, sockmap: fix psock refcount leak in bpf_tcp_recvmsg In bpf_tcp_recvmsg() we first took a reference on the psock, however once we find that there are skbs in the normal socket's receive queue we return with processing them through tcp_recvmsg(). Problem is that we leak the taken reference on the psock in that path. Given we don't really do anything with the psock at this point, move the skb_queue_empty() test before we fetch the psock to fix this case. Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 01879e4d599a..26d8a3053407 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -912,6 +912,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); rcu_read_lock(); psock = smap_psock_sk(sk); @@ -922,9 +924,6 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out; rcu_read_unlock(); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); bytes_ready: while (copied != len) { From 85bdff72b054b2765219553160f7b414e582a200 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 27 Aug 2018 21:30:42 +0200 Subject: [PATCH 0789/1640] UPSTREAM: bpf: fix build error with clang Building the newly introduced BPF_PROG_TYPE_SK_REUSEPORT leads to a compile time error when building with clang: net/core/filter.o: In function `sk_reuseport_convert_ctx_access': ../net/core/filter.c:7284: undefined reference to `__compiletime_assert_7284' It seems that clang has issues resolving hweight_long at compile time. Since SK_FL_PROTO_MASK is a constant, we can use the interface for known constant arguments which works fine with clang. 
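The property being relied on can be shown in a few lines of userspace C: with a compile-time constant mask, a popcount check can be written so that both GCC and clang fold it at compile time (the mask value below is a stand-in, not the real SK_FL_PROTO_MASK):

  #include <assert.h>

  #define DEMO_PROTO_MASK 0x0000ff00U     /* illustrative mask only */

  /* compilation fails unless the mask covers exactly one byte */
  static_assert(__builtin_popcount(DEMO_PROTO_MASK) == 8,
                "protocol mask must be exactly 8 bits wide");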
Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Signed-off-by: Stefan Agner Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 5ea056a20974..6969ffb09cbd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7291,7 +7291,7 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct sk_reuseport_md, ip_protocol): - BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE); + BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, BPF_W, 0); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); From 8ad25dbb013c4ba59d787f81fdde80245beac905 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 24 Aug 2018 17:37:00 -0700 Subject: [PATCH 0790/1640] UPSTREAM: bpf: sockmap, decrement copied count correctly in redirect error case Currently, when a redirect occurs in sockmap and an error occurs in the redirect call we unwind the scatterlist once in the error path of bpf_tcp_sendmsg_do_redirect() and then again in sendmsg(). Then in the error path of sendmsg we decrement the copied count by the send size. However, it's possible we partially sent data before the error was generated. This can happen if do_tcp_sendpages() partially sends the scatterlist before encountering a memory pressure error. If this happens we need to decrement the copied value (the value tracking how many bytes were actually sent to the TCP stack) by the number of remaining bytes, _not_ the entire send size. Otherwise we risk confusing userspace. Also, we don't need two calls to free the scatterlist; one is good enough. So remove the one in bpf_tcp_sendmsg_do_redirect() and then properly reduce copied by the number of remaining bytes, which may in fact be the entire send size if no bytes were sent. To do this, use a bool to indicate whether free_start_sg() should do mem accounting or not.
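A condensed model of the corrected unwind (the ring, lengths and names are invented; the real code walks a scatterlist and also handles memory charging):

  #define RING 8

  struct demo_msg {
          int len[RING];
          int start, end;
  };

  /* returns the bytes still queued, zeroing them as it goes */
  static int demo_free_remaining(struct demo_msg *m)
  {
          int i = m->start, freed = 0;

          while (i != m->end) {
                  freed += m->len[i];
                  m->len[i] = 0;
                  i = (i + 1) % RING;
          }
          m->start = m->end;
          return freed;
  }

  /* caller, on send error:
   *      copied -= demo_free_remaining(m);   // not: copied -= send
   * so a partial send is not rolled back as if nothing was sent
   */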
Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 45 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 26d8a3053407..ce63e5801746 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -236,7 +236,7 @@ static int bpf_tcp_init(struct sock *sk) } static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); static void bpf_tcp_release(struct sock *sk) { @@ -248,7 +248,7 @@ static void bpf_tcp_release(struct sock *sk) goto out; if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } @@ -330,14 +330,14 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } @@ -570,14 +570,16 @@ static void free_bytes_sg(struct sock *sk, int bytes, md->sg_start = i; } -static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +static int free_sg(struct sock *sk, int start, + struct sk_msg_buff *md, bool charge) { struct scatterlist *sg = md->sg_data; int i = start, free = 0; while (sg[i].length) { free += sg[i].length; - sk_mem_uncharge(sk, sg[i].length); + if (charge) + sk_mem_uncharge(sk, sg[i].length); if (!md->skb) put_page(sg_page(&sg[i])); sg[i].length = 0; @@ -594,9 +596,9 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) return free; } -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) { - int free = free_sg(sk, md->sg_start, md); + int free = free_sg(sk, md->sg_start, md, charge); md->sg_start = md->sg_end; return free; @@ -604,7 +606,7 @@ static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) { - return free_sg(sk, md->sg_curr, md); + return free_sg(sk, md->sg_curr, md, true); } static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) @@ -718,7 +720,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, list_add_tail(&r->list, &psock->ingress); sk->sk_data_ready(sk); } else { - free_start_sg(sk, r); + free_start_sg(sk, r, true); kfree(r); } @@ -752,14 +754,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, release_sock(sk); } smap_release_sock(psock, sk); - if (unlikely(err)) - goto out; - return 0; + return err; out_rcu: rcu_read_unlock(); -out: - free_bytes_sg(NULL, send, md, false); - return err; + return 0; } static inline void bpf_md_init(struct smap_psock *psock) @@ -822,7 +820,7 @@ more_data: case __SK_PASS: err = bpf_tcp_push(sk, send, m, flags, true); if (unlikely(err)) { - *copied -= free_start_sg(sk, m); + *copied -= free_start_sg(sk, m, true); break; } @@ -845,16 +843,17 @@ more_data: lock_sock(sk); if (unlikely(err < 0)) { - free_start_sg(sk, m); + int free = free_start_sg(sk, m, false); + psock->sg_size = 0; if (!cork) - *copied -= send; + *copied -= free; } else { psock->sg_size -= 
send; } if (cork) { - free_start_sg(sk, m); + free_start_sg(sk, m, true); psock->sg_size = 0; kfree(m); m = NULL; @@ -1121,7 +1120,7 @@ wait_for_memory: err = sk_stream_wait_memory(sk, &timeo); if (err) { if (m && m != psock->cork) - free_start_sg(sk, m); + free_start_sg(sk, m, true); goto out_err; } } @@ -1580,13 +1579,13 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_tx_msg); if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } From f95c07b8bcb41171f6e2652aa3efd74a413b0965 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Aug 2018 16:15:35 +0200 Subject: [PATCH 0791/1640] UPSTREAM: bpf: fix several offset tests in bpf_msg_pull_data While recently going over bpf_msg_pull_data(), I noticed three issues which are fixed in here: 1) When we attempt to find the first scatterlist element (sge) for the start offset, we add len to the offset before we check for start < offset + len, whereas it should come after when we iterate to the next sge to accumulate the offsets. For example, given a start offset of 12 with a sge length of 8 for the first sge in the list would lead us to determine this sge as the first sge thinking it covers first 16 bytes where start is located, whereas start sits in subsequent sges so we would end up pulling in the wrong data. 2) After figuring out the starting sge, we have a short-cut test in !msg->sg_copy[i] && bytes <= len. This checks whether it's not needed to make the page at the sge private where we can just exit by updating msg->data and msg->data_end. However, the length test is not fully correct. bytes <= len checks whether the requested bytes (end - start offsets) fit into the sge's length. The part that is missing is that start must not be sge length aligned. Meaning, the start offset into the sge needs to be accounted as well on top of the requested bytes as otherwise we can access the sge out of bounds. For example the sge could have length of 8, our requested bytes could have length of 8, but at a start offset of 4, so we also would need to pull in 4 bytes of the next sge, when we jump to the out label we do set msg->data to sg_virt(&sg[i]) + start - offset and msg->data_end to msg->data + bytes which would be oob. 3) The subsequent bytes < copy test for finding the last sge has the same issue as in point 2) but also it tests for less than rather than less or equal to. Meaning if the sge length is of 8 and requested bytes of 8 while having the start aligned with the sge, we would unnecessarily go and pull in the next sge as well to make it private. 
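Issue 1) is easy to reproduce in miniature; the fix is to accumulate the offset only after the containment test (the array stands in for the scatterlist ring, values are made up):

  #include <stdio.h>

  #define RING 4

  static int demo_find_start(const int len[RING], int start, int *offp)
  {
          int offset = 0, i = 0;

          do {
                  if (start < offset + len[i])
                          break;          /* start lies in element i */
                  offset += len[i];       /* accumulate after the test */
                  i = (i + 1) % RING;
          } while (i != 0);

          *offp = offset;
          return i;
  }

  int main(void)
  {
          int len[RING] = { 8, 8, 8, 8 }, off;

          /* bumping offset before the test maps start=12 to element 0;
           * with the fix it correctly maps to element 1 at offset 8
           */
          printf("element %d\n", demo_find_start(len, 12, &off));
          return 0;
  }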
Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 6969ffb09cbd..ff4bbf728214 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2291,10 +2291,10 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) { unsigned int len = 0, offset = 0, copy = 0; + int bytes = end - start, bytes_sg_total; struct scatterlist *sg = msg->sg_data; int first_sg, last_sg, i, shift; unsigned char *p, *to, *from; - int bytes = end - start; struct page *page; if (unlikely(flags || end <= start)) @@ -2304,9 +2304,9 @@ BPF_CALL_4(bpf_msg_pull_data, i = msg->sg_start; do { len = sg[i].length; - offset += len; if (start < offset + len) break; + offset += len; i++; if (i == MAX_SKB_FRAGS) i = 0; @@ -2315,7 +2315,11 @@ BPF_CALL_4(bpf_msg_pull_data, if (unlikely(start >= offset + len)) return -EINVAL; - if (!msg->sg_copy[i] && bytes <= len) + /* The start may point into the sg element so we need to also + * account for the headroom. + */ + bytes_sg_total = start - offset + bytes; + if (!msg->sg_copy[i] && bytes_sg_total <= len) goto out; first_sg = i; @@ -2335,12 +2339,12 @@ BPF_CALL_4(bpf_msg_pull_data, i++; if (i == MAX_SKB_FRAGS) i = 0; - if (bytes < copy) + if (bytes_sg_total <= copy) break; } while (i != msg->sg_end); last_sg = i; - if (unlikely(copy < end - start)) + if (unlikely(bytes_sg_total > copy)) return -EINVAL; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy)); From afdb5de0b624787a7f00cc9474cda21f44bc6fd9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 29 Aug 2018 16:50:34 +0200 Subject: [PATCH 0792/1640] UPSTREAM: bpf: fix msg->data/data_end after sg shift repair in bpf_msg_pull_data In the current code, msg->data is set as sg_virt(&sg[i]) + start - offset and msg->data_end relative to it as msg->data + bytes. Using iterator i to point to the updated starting scatterlist element holds true in some cases; in the others we'd end up pointing out of bounds. It is /correct/ for these ones: 1) When first finding the starting scatterlist element (sge) where we find that the page is already privately owned by the msg and where the requested bytes and headroom fit into the sge's length. However, it's /incorrect/ for the following ones: 2) After we made the requested area private and updated the newly allocated page into the first_sg slot of the scatterlist ring; when we find that no shift repair of the ring is needed, where we bail out and update msg->data and msg->data_end. At that point i will point to last_sg, which in this case is the next elem of first_sg in the ring. The sge at that point might as well be invalid (e.g. i == msg->sg_end), yet we use it for setting the range via sg_virt(&sg[i]). The correct one would have been first_sg. 3) Similar to 2) but when we find that a shift repair of the ring is needed. In this case we fix up all sges and stop once we've reached the end. In this case i will point to the new msg->sg_end, and the sge at that point will be invalid. Again here the requested range sits in first_sg.
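All three cases reduce to one rule, shown here in miniature: derive the data window from the element that holds start (first_sg), never from the loop cursor i, which may have run past the valid part of the ring (buffers and indices below are invented for the demo):

  #include <stdio.h>

  int main(void)
  {
          char ring[4][9] = { "AAAAAAAA", "BBBBBBBB", "CCCCCCCC", "DDDDDDDD" };
          int first_sg = 1, i = 3;        /* cursor moved on during repair */
          int start = 12, offset = 8, bytes = 4;
          char *data = &ring[first_sg][start - offset]; /* not ring[i] */
          char *data_end = data + bytes;

          (void)i;        /* the stale cursor plays no role once fixed */
          printf("%.*s\n", (int)(data_end - data), data); /* BBBB */
          return 0;
  }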
Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index ff4bbf728214..98055de4fa2f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2315,6 +2315,7 @@ BPF_CALL_4(bpf_msg_pull_data, if (unlikely(start >= offset + len)) return -EINVAL; + first_sg = i; /* The start may point into the sg element so we need to also * account for the headroom. */ @@ -2322,8 +2323,6 @@ BPF_CALL_4(bpf_msg_pull_data, if (!msg->sg_copy[i] && bytes_sg_total <= len) goto out; - first_sg = i; - /* At this point we need to linearize multiple scatterlist * elements or a single shared page. Either way we need to * copy into a linear buffer exclusively owned by BPF. Then @@ -2405,7 +2404,7 @@ BPF_CALL_4(bpf_msg_pull_data, if (msg->sg_end < 0) msg->sg_end += MAX_SKB_FRAGS; out: - msg->data = sg_virt(&sg[i]) + start - offset; + msg->data = sg_virt(&sg[first_sg]) + start - offset; msg->data_end = msg->data + bytes; return 0; From 4edbba04cf48fbec3e3e45c67e3463f9b9fcc3e7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 29 Aug 2018 16:50:35 +0200 Subject: [PATCH 0793/1640] UPSTREAM: bpf: fix shift upon scatterlist ring wrap-around in bpf_msg_pull_data If first_sg and last_sg wraps around in the scatterlist ring, then we need to account for that in the shift as well. E.g. crafting such msgs where this is the case leads to a hang as shift becomes negative. E.g. consider the following scenario: first_sg := 14 |=> shift := -12 msg->sg_start := 10 last_sg := 3 | msg->sg_end := 5 round 1: i := 15, move_from := 3, sg[15] := sg[ 3] round 2: i := 0, move_from := -12, sg[ 0] := sg[-12] round 3: i := 1, move_from := -11, sg[ 1] := sg[-11] round 4: i := 2, move_from := -10, sg[ 2] := sg[-10] [...] round 13: i := 11, move_from := -1, sg[ 2] := sg[ -1] round 14: i := 12, move_from := 0, sg[ 2] := sg[ 0] round 15: i := 13, move_from := 1, sg[ 2] := sg[ 1] round 16: i := 14, move_from := 2, sg[ 2] := sg[ 2] round 17: i := 15, move_from := 3, sg[ 2] := sg[ 3] [...] This means we will loop forever and never hit the msg->sg_end condition to break out of the loop. When we see that the ring wraps around, then the shift should be MAX_SKB_FRAGS - first_sg + last_sg - 1. Meaning, the remainder slots from the tail of the ring and the head until last_sg combined. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 98055de4fa2f..bf2bbc87032b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2375,7 +2375,10 @@ BPF_CALL_4(bpf_msg_pull_data, * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ - shift = last_sg - first_sg - 1; + WARN_ON_ONCE(last_sg == first_sg); + shift = last_sg > first_sg ? 
+ last_sg - first_sg - 1 : + MAX_SKB_FRAGS - first_sg + last_sg - 1; if (!shift) goto out; From 574ffdf020d2d6422a281d15a2da1966635e99b2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 29 Aug 2018 16:50:36 +0200 Subject: [PATCH 0794/1640] UPSTREAM: bpf: fix sg shift repair start offset in bpf_msg_pull_data When we perform the sg shift repair for the scatterlist ring, we currently start out at i = first_sg + 1. However, this is not correct since the first_sg could point to the sge sitting at slot MAX_SKB_FRAGS - 1, and a subsequent i = MAX_SKB_FRAGS will access the scatterlist ring (sg) out of bounds. Add the sk_msg_iter_var() helper for iterating through the ring, and apply the same rule for advancing to the next ring element as we do elsewhere. Later work will use this helper also in other places. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index bf2bbc87032b..bc87b5dcbb94 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2287,6 +2287,13 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .arg2_type = ARG_ANYTHING, }; +#define sk_msg_iter_var(var) \ + do { \ + var++; \ + if (var == MAX_SKB_FRAGS) \ + var = 0; \ + } while (0) + BPF_CALL_4(bpf_msg_pull_data, struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) { @@ -2307,9 +2314,7 @@ BPF_CALL_4(bpf_msg_pull_data, if (start < offset + len) break; offset += len; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; + sk_msg_iter_var(i); } while (i != msg->sg_end); if (unlikely(start >= offset + len)) @@ -2335,9 +2340,7 @@ BPF_CALL_4(bpf_msg_pull_data, */ do { copy += sg[i].length; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; + sk_msg_iter_var(i); if (bytes_sg_total <= copy) break; } while (i != msg->sg_end); @@ -2363,9 +2366,7 @@ BPF_CALL_4(bpf_msg_pull_data, sg[i].length = 0; put_page(sg_page(&sg[i])); - i++; - if (i == MAX_SKB_FRAGS) - i = 0; + sk_msg_iter_var(i); } while (i != last_sg); sg[first_sg].length = copy; @@ -2382,7 +2383,8 @@ BPF_CALL_4(bpf_msg_pull_data, if (!shift) goto out; - i = first_sg + 1; + i = first_sg; + sk_msg_iter_var(i); do { int move_from; @@ -2399,9 +2401,7 @@ BPF_CALL_4(bpf_msg_pull_data, sg[move_from].page_link = 0; sg[move_from].offset = 0; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; + sk_msg_iter_var(i); } while (1); msg->sg_end -= shift; if (msg->sg_end < 0) From 178272697c88ac052a93c2a2c424376d6f205730 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Fri, 31 Aug 2018 23:45:16 +0200 Subject: [PATCH 0795/1640] UPSTREAM: bpf: Fix bpf_msg_pull_data() Helper bpf_msg_pull_data() mistakenly reuses variable 'offset' while linearizing multiple scatterlist elements. Variable 'offset' is used to find first starting scatterlist element i.e. msg->data = sg_virt(&sg[first_sg]) + start - offset" Use different variable name while linearizing multiple scatterlist elements so that value contained in variable 'offset' won't get overwritten. 
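The fix in isolation: 'offset' keeps recording where the requested window starts, while a separate cursor tracks the write position in the linear copy (invented names; the real code copies scatterlist elements and recycles their pages):

  #include <string.h>

  static unsigned int demo_linearize(char *dst, char *const src[],
                                     const int len[], int n)
  {
          unsigned int poffset = 0;   /* write cursor; the caller's
                                       * 'offset' stays untouched
                                       */
          int i;

          for (i = 0; i < n; i++) {
                  memcpy(dst + poffset, src[i], len[i]);
                  poffset += len[i];
          }
          return poffset;             /* total bytes linearized */
  }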
Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Tushar Dave Signed-off-by: Daniel Borkmann --- net/core/filter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index bc87b5dcbb94..fea771f147a7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2297,7 +2297,7 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { BPF_CALL_4(bpf_msg_pull_data, struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) { - unsigned int len = 0, offset = 0, copy = 0; + unsigned int len = 0, offset = 0, copy = 0, poffset = 0; int bytes = end - start, bytes_sg_total; struct scatterlist *sg = msg->sg_data; int first_sg, last_sg, i, shift; @@ -2353,16 +2353,15 @@ BPF_CALL_4(bpf_msg_pull_data, if (unlikely(!page)) return -ENOMEM; p = page_address(page); - offset = 0; i = first_sg; do { from = sg_virt(&sg[i]); len = sg[i].length; - to = p + offset; + to = p + poffset; memcpy(to, from, len); - offset += len; + poffset += len; sg[i].length = 0; put_page(sg_page(&sg[i])); From c273a16f35619a348c24644f932a4e01a692bc53 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 30 Aug 2018 21:25:02 -0700 Subject: [PATCH 0796/1640] UPSTREAM: bpf: avoid misuse of psock when TCP_ULP_BPF collides with another ULP Currently we check sk_user_data is non NULL to determine if the sk exists in a map. However, this is not sufficient to ensure the psock or the ULP ops are not in use by another user, such as kcm or TLS. To avoid this when adding a sock to a map also verify it is of the correct ULP type. Additionally, when releasing a psock verify that it is the TCP_ULP_BPF type before releasing the ULP. The error case where we abort an update due to ULP collision can cause this error path. For example, __sock_map_ctx_update_elem() [...] err = tcp_set_ulp_id(sock, TCP_ULP_BPF) <- collides with TLS if (err) <- so err out here goto out_free [...] out_free: smap_release_sock() <- calling tcp_cleanup_ulp releases the TLS ULP incorrectly. Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index ce63e5801746..488ef9663c01 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1462,10 +1462,16 @@ static void smap_destroy_psock(struct rcu_head *rcu) schedule_work(&psock->gc_work); } +static bool psock_is_smap_sk(struct sock *sk) +{ + return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops; +} + static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { - tcp_cleanup_ulp(sock); + if (psock_is_smap_sk(sock)) + tcp_cleanup_ulp(sock); write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); write_unlock_bh(&sock->sk_callback_lock); @@ -1892,6 +1898,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, * doesn't update user data. */ if (psock) { + if (!psock_is_smap_sk(sock)) { + err = -EBUSY; + goto out_progs; + } if (READ_ONCE(psock->bpf_parse) && parse) { err = -EBUSY; goto out_progs; From 94c1e9f4649f568a957afbda1b71d56471783978 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 12 Sep 2018 10:29:11 -0700 Subject: [PATCH 0797/1640] UPSTREAM: bpf: btf: Fix end boundary calculation for type section The end boundary math for type section is incorrect in btf_check_all_metas(). 
It just happens that hdr->type_off is always 0 for now because there are only two sections (type and string) and the string section must be at the end (ensured in btf_parse_str_sec). However, type_off may not be 0 if a new section is added later. This patch fixes it. Fixes: f80442a4cd18 ("bpf: btf: Change how section is supported in btf_header") Reported-by: Dmitry Vyukov Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2590700237c1..138f0302692e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1844,7 +1844,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env) hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; - end = btf->nohdr_data + hdr->type_len; + end = cur + hdr->type_len; env->log_type_id = 1; while (cur < end) { From eb7b4c4779fb79c1fc4f37589e8a21cb79d25956 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Wed, 12 Sep 2018 22:15:29 +0200 Subject: [PATCH 0798/1640] UPSTREAM: bpf: use __GFP_COMP while allocating page Helper bpf_msg_pull_data() can allocate multiple pages while linearizing multiple scatterlist elements into one shared page. However, if the shared page has size > PAGE_SIZE, using copy_page_to_iter() causes the warning below, e.g. [ 6367.019832] WARNING: CPU: 2 PID: 7410 at lib/iov_iter.c:825 page_copy_sane.part.8+0x0/0x8 To avoid this warning, use __GFP_COMP while allocating multiple contiguous pages. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Tushar Dave Signed-off-by: Daniel Borkmann --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index fea771f147a7..5330117417e4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2349,7 +2349,8 @@ BPF_CALL_4(bpf_msg_pull_data, if (unlikely(bytes_sg_total > copy)) return -EINVAL; - page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy)); + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, + get_order(copy)); if (unlikely(!page)) return -ENOMEM; p = page_address(page); From 00cc26b523a3fd33b88ed9a42489e36153a615c4 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 18 Sep 2018 09:01:44 -0700 Subject: [PATCH 0799/1640] UPSTREAM: bpf: sockmap only allow ESTABLISHED sock state After this patch we only allow socks that are in ESTABLISHED state or are being added via a sock_ops event that is transitioning into an ESTABLISHED state. By allowing sock_ops events we allow users to manage sockmaps directly from sock ops programs. The two supported sock_ops ops are BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB and BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB. Similar to TLS ULP this ensures sk_user_data is correct. Reported-by: Eric Dumazet Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 488ef9663c01..1f97b559892a 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2097,8 +2097,12 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state.
+ */ if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP) { + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { fput(socket->file); return -EOPNOTSUPP; } @@ -2453,6 +2457,16 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ + if (skops.sk->sk_type != SOCK_STREAM || + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { + fput(socket->file); + return -EOPNOTSUPP; + } + lock_sock(skops.sk); preempt_disable(); rcu_read_lock(); @@ -2543,10 +2557,22 @@ const struct bpf_map_ops sock_hash_ops = { .map_check_btf = map_check_no_btf, }; +static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; +} BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. This checks that the sock ops triggering the update is + * one indicating we are (or will be soon) in an ESTABLISHED state. + */ + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_map_ctx_update_elem(bpf_sock, map, key, flags); } @@ -2565,6 +2591,9 @@ BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); } From faf1bf14ca6e2f1981947aa869d5dc7fed40a336 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 18 Sep 2018 09:01:49 -0700 Subject: [PATCH 0800/1640] UPSTREAM: bpf: sockmap, fix transition through disconnect without close It is possible (via shutdown()) for TCP socks to go through the TCP_CLOSE state via tcp_disconnect() without actually calling tcp_close(), which would then call our bpf_tcp_close() callback. Because of this a user could disconnect a socket and then put it in a LISTEN state, which would break our assumptions about sockets always being in the ESTABLISHED state. To resolve this, rely on the unhash hook, which is called in the disconnect case, to remove the sock from the sockmap.
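The shape of the fix, reduced to its function-pointer plumbing: both teardown entry points funnel into one removal helper and then chain to the saved proto op (struct and names are illustrative; the real psock carries far more state and runs under RCU with the sock lock held):

  struct sock;

  struct demo_psock {
          void (*save_unhash)(struct sock *sk);
          void (*save_close)(struct sock *sk, long timeout);
  };

  static void demo_remove(struct sock *sk, struct demo_psock *p)
  {
          (void)sk; (void)p;      /* shared: drop sk from every map entry */
  }

  static void demo_unhash(struct sock *sk, struct demo_psock *p)
  {
          void (*unhash_fun)(struct sock *sk) = p->save_unhash;

          demo_remove(sk, p);     /* disconnect path: no close() involved */
          unhash_fun(sk);
  }

  static void demo_close(struct sock *sk, long timeout, struct demo_psock *p)
  {
          void (*close_fun)(struct sock *sk, long timeout) = p->save_close;

          demo_remove(sk, p);     /* same teardown as the unhash path */
          close_fun(sk, timeout);
  }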
Reported-by: Eric Dumazet Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 60 ++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 1f97b559892a..0a0f2ec75370 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -132,6 +132,7 @@ struct smap_psock { struct work_struct gc_work; struct proto *sk_proto; + void (*save_unhash)(struct sock *sk); void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); @@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +static void bpf_tcp_unhash(struct sock *sk); static void bpf_tcp_close(struct sock *sk, long timeout); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) @@ -184,6 +186,7 @@ static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], struct proto *base) { prot[SOCKMAP_BASE] = *base; + prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; prot[SOCKMAP_BASE].close = bpf_tcp_close; prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; @@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk) return -EBUSY; } + psock->save_unhash = sk->sk_prot->unhash; psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; @@ -305,30 +309,12 @@ static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, return e; } -static void bpf_tcp_close(struct sock *sk, long timeout) +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) { - void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; struct sock *osk; - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - - /* The psock may be destroyed anytime after exiting the RCU critial - * section so by the time we use close_fun the psock may no longer - * be valid. However, bpf_tcp_close is called with the sock lock - * held so the close hook and sk are still valid. 
- */ - close_fun = psock->save_close; - if (psock->cork) { free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); @@ -379,6 +365,42 @@ static void bpf_tcp_close(struct sock *sk, long timeout) kfree(e); e = psock_map_pop(sk, psock); } +} + +static void bpf_tcp_unhash(struct sock *sk) +{ + void (*unhash_fun)(struct sock *sk); + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + unhash_fun = psock->save_unhash; + bpf_tcp_remove(sk, psock); + rcu_read_unlock(); + unhash_fun(sk); +} + +static void bpf_tcp_close(struct sock *sk, long timeout) +{ + void (*close_fun)(struct sock *sk, long timeout); + struct smap_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + close_fun = psock->save_close; + bpf_tcp_remove(sk, psock); rcu_read_unlock(); release_sock(sk); close_fun(sk, timeout); From 982736f6133270d45687da9655c96574c54b7a19 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:33:21 +0100 Subject: [PATCH 0801/1640] UPSTREAM: bpf: harden flags check in cgroup_storage_update_elem() cgroup_storage_update_elem() shouldn't accept any flags argument values except BPF_ANY and BPF_EXIST to guarantee the backward compatibility, had a new flag value been added. Fixes: de9cbbaadba5 ("bpf: introduce cgroup storage maps") Signed-off-by: Roman Gushchin Reported-by: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/local_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 22ad967d1e5f..94126cbffc88 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -129,7 +129,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags & BPF_NOEXIST) + if (flags != BPF_ANY && flags != BPF_EXIST) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, From 1a3ff0d7d627d63fdec0768f708e6c9def8fdcde Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 2 Oct 2018 02:41:53 +0000 Subject: [PATCH 0802/1640] UPSTREAM: bpf: don't accept cgroup local storage with zero value size Explicitly forbid creating cgroup local storage maps with zero value size, as it makes no sense and might even cause a panic. 
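Both storage-map hardenings, condensed into one validator sketch (the key size and page limit are placeholders for the real sizeof()/PAGE_SIZE values, and the flag names are local stand-ins):

  #include <errno.h>

  enum demo_flags { DEMO_ANY, DEMO_NOEXIST, DEMO_EXIST };

  static int demo_check_attr(unsigned int key_size, unsigned int value_size)
  {
          if (key_size != 16)     /* must match the key struct exactly */
                  return -EINVAL;
          if (value_size == 0)    /* the patch above: zero is invalid */
                  return -EINVAL;
          if (value_size > 4096)  /* stand-in for PAGE_SIZE */
                  return -E2BIG;
          return 0;
  }

  static int demo_check_update_flags(enum demo_flags flags)
  {
          /* only ANY and EXIST are meaningful for this map; anything
           * else, including flags added later, must be rejected
           */
          return (flags == DEMO_ANY || flags == DEMO_EXIST) ? 0 : -EINVAL;
  }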
Reported-by: syzbot+18628320d3b14a5c459c@syzkaller.appspotmail.com Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/local_storage.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 94126cbffc88..830d7f095748 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -195,6 +195,9 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); + if (attr->value_size == 0) + return ERR_PTR(-EINVAL); + if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); From bb88f40c6181a624c20fa87014efdd7b89b13924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 8 Oct 2018 19:40:16 +0200 Subject: [PATCH 0803/1640] UPSTREAM: xsk: do not call synchronize_net() under RCU read lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XSKMAP update and delete functions called synchronize_net(), which can sleep. It is not allowed to sleep during an RCU read section. Instead we need to make sure that the sock sk_destruct (xsk_destruct) function is asynchronously called after an RCU grace period. Setting the SOCK_RCU_FREE flag for XDP sockets takes care of this. Fixes: fbfc504a24f5 ("bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP") Reported-by: Eric Dumazet Signed-off-by: Björn Töpel Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 10 ++-------- net/xdp/xsk.c | 2 ++ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 9f8463afda9c..47147c9e184d 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -192,11 +192,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, sock_hold(sock->sk); old_xs = xchg(&m->xsk_map[i], xs); - if (old_xs) { - /* Make sure we've flushed everything. */ - synchronize_net(); + if (old_xs) sock_put((struct sock *)old_xs); - } sockfd_put(sock); return 0; @@ -212,11 +209,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; old_xs = xchg(&m->xsk_map[k], NULL); - if (old_xs) { - /* Make sure we've flushed everything. */ - synchronize_net(); + if (old_xs) sock_put((struct sock *)old_xs); - } return 0; } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 5f88fda93219..344a67d86e9a 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -451,6 +451,8 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, sk->sk_destruct = xsk_destruct; sk_refcnt_debug_inc(sk); + sock_set_flag(sk, SOCK_RCU_FREE); + xs = xdp_sk(sk); mutex_init(&xs->mutex); From 48d7e7a691eb1bb3c3814529e19f0d0b032dd44f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 4 Sep 2018 19:13:44 -0700 Subject: [PATCH 0804/1640] UPSTREAM: bpf/verifier: fix verifier instability [ Upstream commit a9c676bc8fc58d00eea9836fb14ee43c0346416a ] Edward Cree says: In check_mem_access(), for the PTR_TO_CTX case, after check_ctx_access() has supplied a reg_type, the other members of the register state are set appropriately. Previously reg.range was set to 0, but as it is in a union with reg.map_ptr, which is larger, upper bytes of the latter were left in place. 
This then caused the memcmp() in regsafe() to fail, preventing some branches from being pruned (and occasionally causing the same program to take a varying number of processed insns on repeated verifier runs). Fix the instability by clearing bpf_reg_state in __mark_reg_[un]known() Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Debugged-by: Edward Cree Acked-by: Edward Cree Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c070671b727b..ef8cd3b49170 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -554,7 +554,9 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg); */ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) { - reg->id = 0; + /* Clear id, off, and union(map_ptr, range) */ + memset(((u8 *)reg) + sizeof(reg->type), 0, + offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); reg->var_off = tnum_const(imm); reg->smin_value = (s64)imm; reg->smax_value = (s64)imm; @@ -573,7 +575,6 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) static void __mark_reg_const_zero(struct bpf_reg_state *reg) { __mark_reg_known(reg, 0); - reg->off = 0; reg->type = SCALAR_VALUE; } @@ -684,9 +685,12 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) /* Mark a register as having a completely unknown (scalar) value. */ static void __mark_reg_unknown(struct bpf_reg_state *reg) { + /* + * Clear type, id, off, and union(map_ptr, range) and + * padding between 'type' and union + */ + memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); reg->type = SCALAR_VALUE; - reg->id = 0; - reg->off = 0; reg->var_off = tnum_unknown; reg->frameno = 0; __mark_reg_unbounded(reg); @@ -1756,9 +1760,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn else mark_reg_known_zero(env, regs, value_regno); - regs[value_regno].id = 0; - regs[value_regno].off = 0; - regs[value_regno].range = 0; regs[value_regno].type = reg_type; } @@ -2551,7 +2552,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() From 35cf150187d4558a556e034c631a5efe86e94577 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 2 Nov 2018 11:35:46 +0100 Subject: [PATCH 0805/1640] UPSTREAM: bpf: fix bpf_prog_get_info_by_fd to return 0 func_lens for unpriv [ Upstream commit 28c2fae726bf5003cd209b0d5910a642af98316f ] While dbecd7388476 ("bpf: get kernel symbol addresses via syscall") zeroed info.nr_jited_ksyms in bpf_prog_get_info_by_fd() for queries from unprivileged users, commit 815581c11cc2 ("bpf: get JITed image lengths of functions via syscall") forgot about doing so and therefore returns the #elems of the user set up buffer which is incorrect. It also needs to indicate a info.nr_jited_func_lens of zero. 
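A hedged userspace sketch of the contract being restored (structure and field names from the uapi; the prog fd and error handling are illustrative):

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int jited_counts_hidden(int prog_fd)
	{
		struct bpf_prog_info info;
		union bpf_attr attr;

		memset(&info, 0, sizeof(info));
		memset(&attr, 0, sizeof(attr));
		attr.info.bpf_fd = prog_fd;
		attr.info.info_len = sizeof(info);
		attr.info.info = (__u64)(unsigned long)&info;

		if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
			return -1;
		/* for an unprivileged caller both counts must read back as zero;
		 * before the fix, nr_jited_func_lens echoed whatever buffer size
		 * the caller had set up */
		return info.nr_jited_ksyms == 0 && info.nr_jited_func_lens == 0;
	}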
Fixes: 815581c11cc2 ("bpf: get JITed image lengths of functions via syscall") Signed-off-by: Daniel Borkmann Cc: Sandipan Das Cc: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/syscall.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6fde33165107..da8eba8bbe00 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1985,6 +1985,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; + info.nr_jited_func_lens = 0; goto done; } From b359f0ad8cc9803ff853898aff127c9e3ef061d6 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 14 Nov 2018 10:00:34 -0800 Subject: [PATCH 0806/1640] UPSTREAM: bpf: allocate local storage buffers using GFP_ATOMIC [ Upstream commit 569a933b03f3c48b392fe67c0086b3a6b9306b5a ] Naresh reported an issue with the non-atomic memory allocation of cgroup local storage buffers: [ 73.047526] BUG: sleeping function called from invalid context at /srv/oe/build/tmp-rpb-glibc/work-shared/intel-corei7-64/kernel-source/mm/slab.h:421 [ 73.060915] in_atomic(): 1, irqs_disabled(): 0, pid: 3157, name: test_cgroup_sto [ 73.068342] INFO: lockdep is turned off. [ 73.072293] CPU: 2 PID: 3157 Comm: test_cgroup_sto Not tainted 4.20.0-rc2-next-20181113 #1 [ 73.080548] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 07/27/2017 [ 73.088018] Call Trace: [ 73.090463] dump_stack+0x70/0xa5 [ 73.093783] ___might_sleep+0x152/0x240 [ 73.097619] __might_sleep+0x4a/0x80 [ 73.101191] __kmalloc_node+0x1cf/0x2f0 [ 73.105031] ? cgroup_storage_update_elem+0x46/0x90 [ 73.109909] cgroup_storage_update_elem+0x46/0x90 cgroup_storage_update_elem() (as well as other update map update callbacks) is called with disabled preemption, so GFP_ATOMIC allocation should be used: e.g. alloc_htab_elem() in hashtab.c. Reported-by: Naresh Kamboju Tested-by: Naresh Kamboju Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/local_storage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 830d7f095748..fc1605aee5ea 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -138,7 +138,8 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return -ENOENT; new = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, __GFP_ZERO | GFP_USER, + map->value_size, + __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, map->numa_node); if (!new) return -ENOMEM; From 77855a542583d237fe9bbcdbb38610d76beb77cc Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 16 Nov 2018 12:00:07 +0000 Subject: [PATCH 0807/1640] BACKPORT: bpf: fix off-by-one error in adjust_subprog_starts commit afd594240806acc138cf696c09f2f4829d55d02f upstream. When patching in a new sequence for the first insn of a subprog, the start of that subprog does not change (it's the first insn of the sequence), so adjust_subprog_starts should check start <= off (rather than < off). Also added a test to test_verifier.c (it's essentially the syz reproducer). 
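To see the boundary condition concretely, a minimal model with the bookkeeping reduced to an array of subprog starts (names are illustrative, not the kernel's):

	/* A patched sequence of 'len' insns replaces the single insn at 'off'.
	 * A subprog starting exactly at 'off' still starts at the first insn
	 * of the new sequence, so only starts strictly beyond 'off' shift. */
	static void adjust_starts(u32 *start, int cnt, u32 off, u32 len)
	{
		int i;

		for (i = 0; i < cnt; i++) {
			if (start[i] <= off)	/* '<' here was the off-by-one */
				continue;
			start[i] += len - 1;
		}
	}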
Fixes: cc8b0b92a169 ("bpf: introduce function calls (function boundaries)") Reported-by: syzbot+4fc427c7af994b0948be@syzkaller.appspotmail.com Signed-off-by: Edward Cree Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ef8cd3b49170..132ebe838a6d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5547,7 +5547,7 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len return; /* NOTE: fake 'exit' subprog should be updated as well. */ for (i = 0; i <= env->subprog_cnt; i++) { - if (env->subprog_info[i].start < off) + if (env->subprog_info[i].start <= off) continue; env->subprog_info[i].start += len - 1; } From b47615eec49a8227bac50c07f558343d030e0926 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Dec 2018 22:46:04 -0800 Subject: [PATCH 0808/1640] UPSTREAM: bpf: check pending signals while verifying programs [ Upstream commit c3494801cd1785e2c25f1a5735fa19ddcf9665da ] Malicious user space may try to force the verifier to use as much cpu time and memory as possible. Hence check for pending signals while verifying the program. Note that suspend of sys_bpf(PROG_LOAD) syscall will lead to EAGAIN, since the kernel has to release the resources used for program verification. Reported-by: Anatoly Trosinenko Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Edward Cree Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 132ebe838a6d..f9c6b60591a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5053,6 +5053,9 @@ static int do_check(struct bpf_verifier_env *env) goto process_bpf_exit; } + if (signal_pending(current)) + return -EAGAIN; + if (need_resched()) cond_resched(); From 88a977c55f35a8141d9771b78e83c929bb84e2a8 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Sat, 10 Nov 2018 22:15:13 -0800 Subject: [PATCH 0809/1640] UPSTREAM: bpf: Allow narrow loads with offset > 0 [ Upstream commit 46f53a65d2de3e1591636c22b626b09d8684fd71 ] Currently BPF verifier allows narrow loads for a context field only with offset zero. E.g. if there is a __u32 field then only the following loads are permitted: * off=0, size=1 (narrow); * off=0, size=2 (narrow); * off=0, size=4 (full). On the other hand LLVM can generate a load with offset different than zero that make sense from program logic point of view, but verifier doesn't accept it. E.g. tools/testing/selftests/bpf/sendmsg4_prog.c has code: #define DST_IP4 0xC0A801FEU /* 192.168.1.254 */ ... if ((ctx->user_ip4 >> 24) == (bpf_htonl(DST_IP4) >> 24) && where ctx is struct bpf_sock_addr. Some versions of LLVM can produce the following byte code for it: 8: 71 12 07 00 00 00 00 00 r2 = *(u8 *)(r1 + 7) 9: 67 02 00 00 18 00 00 00 r2 <<= 24 10: 18 03 00 00 00 00 00 fe 00 00 00 00 00 00 00 00 r3 = 4261412864 ll 12: 5d 32 07 00 00 00 00 00 if r2 != r3 goto +7 where `*(u8 *)(r1 + 7)` means narrow load for ctx->user_ip4 with size=1 and offset=3 (7 - sizeof(ctx->user_family) = 3). This load is currently rejected by verifier. Verifier code that rejects such loads is in bpf_ctx_narrow_access_ok() what means any is_valid_access implementation, that uses the function, works this way, e.g. 
bpf_skb_is_valid_access() for __sk_buff or sock_addr_is_valid_access() for bpf_sock_addr. The patch makes such loads supported. Offset can be in [0; size_default) but has to be multiple of load size. E.g. for __u32 field the following loads are supported now: * off=0, size=1 (narrow); * off=1, size=1 (narrow); * off=2, size=1 (narrow); * off=3, size=1 (narrow); * off=0, size=2 (narrow); * off=2, size=2 (narrow); * off=0, size=4 (full). Reported-by: Yonghong Song Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- include/linux/filter.h | 16 +--------------- kernel/bpf/verifier.c | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index e975d1329404..b4597fb3d4f1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -734,24 +734,10 @@ static inline u32 bpf_ctx_off_adjust_machine(u32 size) return size; } -static inline bool bpf_ctx_narrow_align_ok(u32 off, u32 size_access, - u32 size_default) -{ - size_default = bpf_ctx_off_adjust_machine(size_default); - size_access = bpf_ctx_off_adjust_machine(size_access); - -#ifdef __LITTLE_ENDIAN - return (off & (size_default - 1)) == 0; -#else - return (off & (size_default - 1)) + size_access == size_default; -#endif -} - static inline bool bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) { - return bpf_ctx_narrow_align_ok(off, size, size_default) && - size <= size_default && (size & (size - 1)) == 0; + return size <= size_default && (size & (size - 1)) == 0; } #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f9c6b60591a0..fff30405b02f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5605,10 +5605,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; + u32 target_size, size_default, off; struct bpf_prog *new_prog; enum bpf_access_type type; bool is_narrower_load; - u32 target_size; if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -5685,9 +5685,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * we will apply proper mask to the result. 
*/ is_narrower_load = size < ctx_field_size; + size_default = bpf_ctx_off_adjust_machine(ctx_field_size); + off = insn->off; if (is_narrower_load) { - u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size); - u32 off = insn->off; u8 size_code; if (type == BPF_WRITE) { @@ -5715,12 +5715,23 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { + u8 shift = (off & (size_default - 1)) * 8; + + if (ctx_field_size <= 4) { + if (shift) + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, + insn->dst_reg, + shift); insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, (1 << size * 8) - 1); - else + } else { + if (shift) + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, + insn->dst_reg, + shift); insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, (1 << size * 8) - 1); + } } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); From 2e719001944fa4d2a34416f5ecb5eda88f2b992d Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 7 Dec 2018 12:16:18 -0500 Subject: [PATCH 0810/1640] UPSTREAM: bpf: relax verifier restriction on BPF_MOV | BPF_ALU [ Upstream commit e434b8cdf788568ba65a0a0fd9f3cb41f3ca1803 ] Currently, the destination register is marked as unknown for a 32-bit sub-register move (BPF_MOV | BPF_ALU) whenever the source register type is SCALAR_VALUE. This is so conservative that some valid cases will be rejected. In particular, it may turn a constant scalar value into an unknown value, which can break assumptions of the verifier. For example, test_l4lb_noinline.c has the following C code: struct real_definition *dst 1: if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6)) 2: return TC_ACT_SHOT; 3: 4: if (dst->flags & F_IPV6) { get_packet_dst is responsible for initializing "dst" into a valid pointer and returning true (1), otherwise it returns false (0). The compiled instruction sequence using alu32 will be: 412: (54) (u32) r7 &= (u32) 1 413: (bc) (u32) r0 = (u32) r7 414: (95) exit insn 413, a BPF_MOV | BPF_ALU, however, will turn r0 into an unknown value even though r7 contains SCALAR_VALUE 1. This causes trouble when the verifier is walking the code path that hasn't initialized "dst" inside get_packet_dst, for which case 0 is returned; we would then expect the verifier to conclude that line 1 in the above C code passes the "if" check and therefore to skip the fall-through path starting at line 4. Now, because r0 returned from the callee has become an unknown value, the verifier won't skip analyzing the path starting at line 4, and "dst->flags" requires dereferencing the pointer "dst", which actually hasn't been initialized for this path. This patch relaxes the code marking the sub-register move destination. For a SCALAR_VALUE, it is safe to just copy the value from the source and then truncate it into 32-bit. A unit test is also included to demonstrate this issue; this test fails before the patch. This relaxation can let the verifier skip more paths for conditional comparison against an immediate. It also lets the verifier record a more accurate/strict value for a register in a given state; if that state ends up going through exit without rejection and is later used for state comparison, then it is possible that an inaccurate/permissive value would have been better. So the real impact on the number of verifier-processed insns is complex. But in all, without this fix, valid programs could be rejected. From real benchmarking on kernel selftests and Cilium bpf tests, there is no impact on the processed instruction number when tests are compiled with default compilation options. There are slight improvements when they are compiled with -mattr=+alu32 after this patch. Also, test_xdp_noinline/-mattr=+alu32 now passes verification; it was rejected before this fix. Insn processed before/after this patch: default -mattr=+alu32 Kernel selftest === test_xdp.o 371/371 369/369 test_l4lb.o 6345/6345 5623/5623 test_xdp_noinline.o 2971/2971 rejected/2727 test_tcp_estates.o 429/429 430/430 Cilium bpf === bpf_lb-DLB_L3.o: 2085/2085 1685/1687 bpf_lb-DLB_L4.o: 2287/2287 1986/1982 bpf_lb-DUNKNOWN.o: 690/690 622/622 bpf_lxc.o: 95033/95033 N/A bpf_netdev.o: 7245/7245 N/A bpf_overlay.o: 2898/2898 3085/2947 NOTE: - bpf_lxc.o and bpf_netdev.o compiled by -mattr=+alu32 are rejected by the verifier due to another issue inside the verifier with supporting alu32 binaries. - Each Cilium bpf program can generate several processed insn numbers; the above number is their sum. v1->v2: - Restrict the change to SCALAR_VALUE. - Update benchmark numbers on Cilium bpf tests. Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 16 ++++++++++++---- tools/testing/selftests/bpf/test_verifier.c | 13 +++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fff30405b02f..80056f25bc8f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3532,12 +3532,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (BPF_SRC(insn->code) == BPF_X) { + struct bpf_reg_state *src_reg = regs + insn->src_reg; + struct bpf_reg_state *dst_reg = regs + insn->dst_reg; + if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 * copy register state to dest reg */ - regs[insn->dst_reg] = regs[insn->src_reg]; - regs[insn->dst_reg].live |= REG_LIVE_WRITTEN; + *dst_reg = *src_reg; + dst_reg->live |= REG_LIVE_WRITTEN; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { @@ -3545,9 +3548,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; + } else if (src_reg->type == SCALAR_VALUE) { + *dst_reg = *src_reg; + dst_reg->live |= REG_LIVE_WRITTEN; + } else { + mark_reg_unknown(env, regs, + insn->dst_reg); } - mark_reg_unknown(env, regs, insn->dst_reg); - coerce_reg_to_size(&regs[insn->dst_reg], 4); + coerce_reg_to_size(dst_reg, 4); } } else { /* case: R = imm diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 58fe3fe7b212..97fafe5cdc55 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2157,6 +2157,19 @@ static struct bpf_test tests[] = { .result_unpriv = REJECT, .result = ACCEPT, }, + { + "alu32: mov u32 const", + .insns = { + BPF_MOV32_IMM(BPF_REG_7, 0), + BPF_ALU32_IMM(BPF_AND, BPF_REG_7, 1), + BPF_MOV32_REG(BPF_REG_0, BPF_REG_7), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, { "unpriv: partial copy of pointer", .insns = { From 99e43b117f34c0954892de5aef9fcff5fbb73573 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 28 Jan 2019 21:28:19 +0100 Subject: [PATCH 0811/1640] UPSTREAM: bpf: improve verifier branch analysis [ commit 4f7b3e82589e0de723780198ec7983e427144c0a upstream ] Pathological bpf programs may try to force the verifier to explode in the number of branch states: 20: (d5) if r1 s<= 0x24000028 goto pc+0 21: (b5)
if r0 <= 0xe1fa20 goto pc+2 22: (d5) if r1 s<= 0x7e goto pc+0 23: (b5) if r0 <= 0xe880e000 goto pc+0 24: (c5) if r0 s< 0x2100ecf4 goto pc+0 25: (d5) if r1 s<= 0xe880e000 goto pc+1 26: (c5) if r0 s< 0xf4041810 goto pc+0 27: (d5) if r1 s<= 0x1e007e goto pc+0 28: (b5) if r0 <= 0xe86be000 goto pc+0 29: (07) r0 += 16614 30: (c5) if r0 s< 0x6d0020da goto pc+0 31: (35) if r0 >= 0x2100ecf4 goto pc+0 Teach verifier to recognize always taken and always not taken branches. This analysis is already done for == and != comparison. Expand it to all other branches. It also helps real bpf programs to be verified faster: before after bpf_lb-DLB_L3.o 2003 1940 bpf_lb-DLB_L4.o 3173 3089 bpf_lb-DUNKNOWN.o 1080 1065 bpf_lxc-DDROP_ALL.o 29584 28052 bpf_lxc-DUNKNOWN.o 36916 35487 bpf_netdev.o 11188 10864 bpf_overlay.o 6679 6643 bpf_lcx_jit.o 39555 38437 Reported-by: Anatoly Trosinenko Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Edward Cree Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 93 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 13 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 80056f25bc8f..dbc83889823d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3722,6 +3722,79 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, } } +/* compute branch direction of the expression "if (reg opcode val) goto target;" + * and return: + * 1 - branch will be taken and "goto target" will be executed + * 0 - branch will not be taken and fall-through to next insn + * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10] + */ +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +{ + if (__is_pointer_value(false, reg)) + return -1; + + switch (opcode) { + case BPF_JEQ: + if (tnum_is_const(reg->var_off)) + return !!tnum_equals_const(reg->var_off, val); + break; + case BPF_JNE: + if (tnum_is_const(reg->var_off)) + return !tnum_equals_const(reg->var_off, val); + break; + case BPF_JGT: + if (reg->umin_value > val) + return 1; + else if (reg->umax_value <= val) + return 0; + break; + case BPF_JSGT: + if (reg->smin_value > (s64)val) + return 1; + else if (reg->smax_value < (s64)val) + return 0; + break; + case BPF_JLT: + if (reg->umax_value < val) + return 1; + else if (reg->umin_value >= val) + return 0; + break; + case BPF_JSLT: + if (reg->smax_value < (s64)val) + return 1; + else if (reg->smin_value >= (s64)val) + return 0; + break; + case BPF_JGE: + if (reg->umin_value >= val) + return 1; + else if (reg->umax_value < val) + return 0; + break; + case BPF_JSGE: + if (reg->smin_value >= (s64)val) + return 1; + else if (reg->smax_value < (s64)val) + return 0; + break; + case BPF_JLE: + if (reg->umax_value <= val) + return 1; + else if (reg->umin_value > val) + return 0; + break; + case BPF_JSLE: + if (reg->smax_value <= (s64)val) + return 1; + else if (reg->smin_value > (s64)val) + return 0; + break; + } + + return -1; +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. 
@@ -4115,21 +4188,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; - /* detect if R == 0 where R was initialized to zero earlier */ - if (BPF_SRC(insn->code) == BPF_K && - (opcode == BPF_JEQ || opcode == BPF_JNE) && - dst_reg->type == SCALAR_VALUE && - tnum_is_const(dst_reg->var_off)) { - if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || - (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { - /* if (imm == imm) goto pc+off; - * only follow the goto, ignore fall-through - */ + if (BPF_SRC(insn->code) == BPF_K) { + int pred = is_branch_taken(dst_reg, insn->imm, opcode); + + if (pred == 1) { + /* only follow the goto, ignore fall-through */ *insn_idx += insn->off; return 0; - } else { - /* if (imm != imm) goto pc+off; - * only follow fall-through branch, since + } else if (pred == 0) { + /* only follow fall-through branch, since * that's where the program will go */ return 0; From 48db9013400ab5b52f81304415b2f11f9d8dfed2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 28 Jan 2019 21:28:20 +0100 Subject: [PATCH 0812/1640] UPSTREAM: bpf: add per-insn complexity limit [ commit ceefbc96fa5c5b975d87bf8e89ba8416f6b764d9 upstream ] malicious bpf program may try to force the verifier to remember a lot of distinct verifier states. Put a limit to number of per-insn 'struct bpf_verifier_state'. Note that hitting the limit doesn't reject the program. It potentially makes the verifier do more steps to analyze the program. It means that malicious programs will hit BPF_COMPLEXITY_LIMIT_INSNS sooner instead of spending cpu time walking long link list. The limit of BPF_COMPLEXITY_LIMIT_STATES==64 affects cilium progs with slight increase in number of "steps" it takes to successfully verify the programs: before after bpf_lb-DLB_L3.o 1940 1940 bpf_lb-DLB_L4.o 3089 3089 bpf_lb-DUNKNOWN.o 1065 1065 bpf_lxc-DDROP_ALL.o 28052 | 28162 bpf_lxc-DUNKNOWN.o 35487 | 35541 bpf_netdev.o 10864 10864 bpf_overlay.o 6643 6643 bpf_lcx_jit.o 38437 38437 But it also makes malicious program to be rejected in 0.4 seconds vs 6.5 Hence apply this limit to unprivileged programs only. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Edward Cree Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dbc83889823d..91582199862f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -156,6 +156,7 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_COMPLEXITY_LIMIT_STATES 64 #define BPF_MAP_PTR_UNPRIV 1UL #define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ @@ -4994,7 +4995,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state; - int i, j, err; + int i, j, err, states_cnt = 0; sl = env->explored_states[insn_idx]; if (!sl) @@ -5021,8 +5022,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 1; } sl = sl->next; + states_cnt++; } + if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + return 0; + /* there were no equivalent states, remember current one. 
* technically the current state is not proven to be safe yet, * but it will either reach outer most bpf_exit (which means it's safe) From 43ad8c3af6ec6571c4a4328f52b1cdbcb325051d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 28 Jan 2019 21:28:22 +0100 Subject: [PATCH 0813/1640] UPSTREAM: bpf: move tmp variable into ax register in interpreter [ commit 144cd91c4c2bced6eb8a7e25e590f6618a11e854 upstream ] This change moves the on-stack 64 bit tmp variable in ___bpf_prog_run() into the hidden ax register. The latter is currently only used in JITs for constant blinding as a temporary scratch register, meaning the BPF interpreter will never see the use of ax. Therefore it is safe to use it for the cases where tmp has been used earlier. This is needed to later on allow restricted hidden use of ax in both interpreter and JITs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/core.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c072338d7e09..158e20212b4f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1061,7 +1061,6 @@ bool bpf_opcode_in_insntable(u8 code) */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { - u64 tmp; #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { @@ -1135,36 +1134,36 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - div64_u64_rem(DST, SRC, &tmp); - DST = tmp; + div64_u64_rem(DST, SRC, &AX); + DST = AX; CONT; ALU_MOD_X: - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); + AX = (u32) DST; + DST = do_div(AX, (u32) SRC); CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &tmp); - DST = tmp; + div64_u64_rem(DST, IMM, &AX); + DST = AX; CONT; ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); + AX = (u32) DST; + DST = do_div(AX, (u32) IMM); CONT; ALU64_DIV_X: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) SRC); + DST = (u32) AX; CONT; ALU64_DIV_K: DST = div64_u64(DST, IMM); CONT; ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) IMM); + DST = (u32) AX; CONT; ALU_END_TO_BE: switch (IMM) { @@ -1433,7 +1432,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ From db6f6dcbad4211de47350c25572aea41196d7490 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 8 Jan 2019 14:20:44 -0800 Subject: [PATCH 0814/1640] UPSTREAM: bpf: fix panic in stack_map_get_build_id() on i386 and arm32 [ Upstream commit beaf3d1901f4ea46fbd5c9d857227d99751de469 ] As Naresh reported, test_stacktrace_build_id() causes panic on i386 and arm32 systems. This is caused by page_address() returns NULL in certain cases. This patch fixes this error by using kmap_atomic/kunmap_atomic instead of page_address. 
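The underlying pattern, sketched (on 32-bit kernels a highmem page may have no permanent kernel mapping, which is why page_address() can return NULL, while kmap_atomic() always sets up a temporary one):

	void *page_addr;

	page_addr = kmap_atomic(page);	/* maps the page; cannot fail */
	/* ... parse the ELF header living in this page; no sleeping
	 * allowed between the pair, the mapping is per-CPU ... */
	kunmap_atomic(page_addr);	/* tears the temporary mapping down */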
Fixes: 615755a77b24 (" bpf: extend stackmap to save binary_build_id+offset instead of address") Reported-by: Naresh Kamboju Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/stackmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 2af07b864cfa..e4ccffd6bb33 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -266,7 +266,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma, return -EFAULT; /* page not mapped */ ret = -EINVAL; - page_addr = page_address(page); + page_addr = kmap_atomic(page); ehdr = (Elf32_Ehdr *)page_addr; /* compare magic x7f "ELF" */ @@ -282,6 +282,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma, else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) ret = stack_map_get_build_id_64(page_addr, build_id); out: + kunmap_atomic(page_addr); put_page(page); return ret; } From 2573790af4ec184b7c87f15313965bc996cd6845 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 16 Jan 2019 14:03:15 -0800 Subject: [PATCH 0815/1640] UPSTREAM: bpf: don't assume build-id length is always 20 bytes [ Upstream commit 0b698005a9d11c0e91141ec11a2c4918a129f703 ] Build-id length is not fixed to 20, it can be (`man ld` /--build-id): * 128-bit (uuid) * 160-bit (sha1) * any length specified in ld --build-id=0xhexstring To fix the issue of missing BPF_STACK_BUILD_ID_VALID for shorter build-ids, assume that build-id is somewhere in the range of 1 .. 20. Set the remaining bytes to zero. v2: * don't introduce new "len = min(BPF_BUILD_ID_SIZE, nhdr->n_descsz)", we already know that nhdr->n_descsz <= BPF_BUILD_ID_SIZE if we enter this 'if' condition Fixes: 615755a77b24 ("bpf: extend stackmap to save binary_build_id+offset instead of address") Acked-by: Song Liu Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/stackmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index e4ccffd6bb33..5acf8cf49379 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -186,11 +186,14 @@ static inline int stack_map_parse_build_id(void *page_addr, if (nhdr->n_type == BPF_BUILD_ID && nhdr->n_namesz == sizeof("GNU") && - nhdr->n_descsz == BPF_BUILD_ID_SIZE) { + nhdr->n_descsz > 0 && + nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { memcpy(build_id, note_start + note_offs + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - BPF_BUILD_ID_SIZE); + nhdr->n_descsz); + memset(build_id + nhdr->n_descsz, 0, + BPF_BUILD_ID_SIZE - nhdr->n_descsz); return 0; } new_offs = note_offs + sizeof(Elf32_Nhdr) + From 2cb707d6eef50a51be8b8e7342c45a44bad3d801 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 16 Jan 2019 14:03:16 -0800 Subject: [PATCH 0816/1640] UPSTREAM: bpf: zero out build_id for BPF_STACK_BUILD_ID_IP [ Upstream commit 4af396ae4836c4ecab61e975b8e61270c551894d ] When returning BPF_STACK_BUILD_ID_IP from stack_map_get_build_id_offset, make sure that build_id field is empty. Since we are using percpu free list, there is a possibility that we might reuse some previous bpf_stack_build_id with non-zero build_id. 
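For reference, the per-frame record userspace reads back, as defined in the uapi of this era (BPF_BUILD_ID_SIZE is 20; comments added here):

	struct bpf_stack_build_id {
		__s32		status;		/* ..._EMPTY, ..._VALID or ..._IP */
		unsigned char	build_id[BPF_BUILD_ID_SIZE];	/* zero-padded when
								 * the note is shorter */
		union {
			__u64	offset;	/* file offset, status == ..._VALID */
			__u64	ip;	/* raw address fallback, status == ..._IP */
		};
	};

Together the two fixes keep build_id[] well defined for consumers: shorter sha1/uuid/custom IDs are padded with zeroes, and the IP fallback no longer leaks stale bytes from a recycled element.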
Fixes: 615755a77b24 ("bpf: extend stackmap to save binary_build_id+offset instead of address") Acked-by: Song Liu Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/stackmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 5acf8cf49379..700e1ef25cd6 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -320,6 +320,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; + memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); } return; } @@ -330,6 +331,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; + memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); continue; } id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] From fcac8cb81776fe4844119621e8a64472a25a6c15 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Jan 2019 18:12:43 -0800 Subject: [PATCH 0817/1640] UPSTREAM: bpf: fix lockdep false positive in percpu_freelist [ Upstream commit a89fac57b5d080771efd4d71feaae19877cf68f0 ] Lockdep warns about false positive: [ 12.492084] 00000000e6b28347 (&head->lock){+...}, at: pcpu_freelist_push+0x2a/0x40 [ 12.492696] but this lock was taken by another, HARDIRQ-safe lock in the past: [ 12.493275] (&rq->lock){-.-.} [ 12.493276] [ 12.493276] [ 12.493276] and interrupts could create inverse lock ordering between them. [ 12.493276] [ 12.494435] [ 12.494435] other info that might help us debug this: [ 12.494979] Possible interrupt unsafe locking scenario: [ 12.494979] [ 12.495518] CPU0 CPU1 [ 12.495879] ---- ---- [ 12.496243] lock(&head->lock); [ 12.496502] local_irq_disable(); [ 12.496969] lock(&rq->lock); [ 12.497431] lock(&head->lock); [ 12.497890] [ 12.498104] lock(&rq->lock); [ 12.498368] [ 12.498368] *** DEADLOCK *** [ 12.498368] [ 12.498837] 1 lock held by dd/276: [ 12.499110] #0: 00000000c58cb2ee (rcu_read_lock){....}, at: trace_call_bpf+0x5e/0x240 [ 12.499747] [ 12.499747] the shortest dependencies between 2nd lock and 1st lock: [ 12.500389] -> (&rq->lock){-.-.} { [ 12.500669] IN-HARDIRQ-W at: [ 12.500934] _raw_spin_lock+0x2f/0x40 [ 12.501373] scheduler_tick+0x4c/0xf0 [ 12.501812] update_process_times+0x40/0x50 [ 12.502294] tick_periodic+0x27/0xb0 [ 12.502723] tick_handle_periodic+0x1f/0x60 [ 12.503203] timer_interrupt+0x11/0x20 [ 12.503651] __handle_irq_event_percpu+0x43/0x2c0 [ 12.504167] handle_irq_event_percpu+0x20/0x50 [ 12.504674] handle_irq_event+0x37/0x60 [ 12.505139] handle_level_irq+0xa7/0x120 [ 12.505601] handle_irq+0xa1/0x150 [ 12.506018] do_IRQ+0x77/0x140 [ 12.506411] ret_from_intr+0x0/0x1d [ 12.506834] _raw_spin_unlock_irqrestore+0x53/0x60 [ 12.507362] __setup_irq+0x481/0x730 [ 12.507789] setup_irq+0x49/0x80 [ 12.508195] hpet_time_init+0x21/0x32 [ 12.508644] x86_late_time_init+0xb/0x16 [ 12.509106] start_kernel+0x390/0x42a [ 12.509554] secondary_startup_64+0xa4/0xb0 [ 12.510034] IN-SOFTIRQ-W at: [ 12.510305] _raw_spin_lock+0x2f/0x40 [ 12.510772] try_to_wake_up+0x1c7/0x4e0 [ 12.511220] swake_up_locked+0x20/0x40 [ 12.511657] swake_up_one+0x1a/0x30 [ 12.512070] rcu_process_callbacks+0xc5/0x650 [ 12.512553] __do_softirq+0xe6/0x47b [ 12.512978] irq_exit+0xc3/0xd0 [ 12.513372] smp_apic_timer_interrupt+0xa9/0x250 [ 12.513876] apic_timer_interrupt+0xf/0x20 [ 12.514343] default_idle+0x1c/0x170 [ 
12.514765] do_idle+0x199/0x240 [ 12.515159] cpu_startup_entry+0x19/0x20 [ 12.515614] start_kernel+0x422/0x42a [ 12.516045] secondary_startup_64+0xa4/0xb0 [ 12.516521] INITIAL USE at: [ 12.516774] _raw_spin_lock_irqsave+0x38/0x50 [ 12.517258] rq_attach_root+0x16/0xd0 [ 12.517685] sched_init+0x2f2/0x3eb [ 12.518096] start_kernel+0x1fb/0x42a [ 12.518525] secondary_startup_64+0xa4/0xb0 [ 12.518986] } [ 12.519132] ... key at: [] __key.71384+0x0/0x8 [ 12.519649] ... acquired at: [ 12.519892] pcpu_freelist_pop+0x7b/0xd0 [ 12.520221] bpf_get_stackid+0x1d2/0x4d0 [ 12.520563] ___bpf_prog_run+0x8b4/0x11a0 [ 12.520887] [ 12.521008] -> (&head->lock){+...} { [ 12.521292] HARDIRQ-ON-W at: [ 12.521539] _raw_spin_lock+0x2f/0x40 [ 12.521950] pcpu_freelist_push+0x2a/0x40 [ 12.522396] bpf_get_stackid+0x494/0x4d0 [ 12.522828] ___bpf_prog_run+0x8b4/0x11a0 [ 12.523296] INITIAL USE at: [ 12.523537] _raw_spin_lock+0x2f/0x40 [ 12.523944] pcpu_freelist_populate+0xc0/0x120 [ 12.524417] htab_map_alloc+0x405/0x500 [ 12.524835] __do_sys_bpf+0x1a3/0x1a90 [ 12.525253] do_syscall_64+0x4a/0x180 [ 12.525659] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 12.526167] } [ 12.526311] ... key at: [] __key.13130+0x0/0x8 [ 12.526812] ... acquired at: [ 12.527047] __lock_acquire+0x521/0x1350 [ 12.527371] lock_acquire+0x98/0x190 [ 12.527680] _raw_spin_lock+0x2f/0x40 [ 12.527994] pcpu_freelist_push+0x2a/0x40 [ 12.528325] bpf_get_stackid+0x494/0x4d0 [ 12.528645] ___bpf_prog_run+0x8b4/0x11a0 [ 12.528970] [ 12.529092] [ 12.529092] stack backtrace: [ 12.529444] CPU: 0 PID: 276 Comm: dd Not tainted 5.0.0-rc3-00018-g2fa53f892422 #475 [ 12.530043] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 [ 12.530750] Call Trace: [ 12.530948] dump_stack+0x5f/0x8b [ 12.531248] check_usage_backwards+0x10c/0x120 [ 12.531598] ? ___bpf_prog_run+0x8b4/0x11a0 [ 12.531935] ? mark_lock+0x382/0x560 [ 12.532229] mark_lock+0x382/0x560 [ 12.532496] ? print_shortest_lock_dependencies+0x180/0x180 [ 12.532928] __lock_acquire+0x521/0x1350 [ 12.533271] ? find_get_entry+0x17f/0x2e0 [ 12.533586] ? find_get_entry+0x19c/0x2e0 [ 12.533902] ? lock_acquire+0x98/0x190 [ 12.534196] lock_acquire+0x98/0x190 [ 12.534482] ? pcpu_freelist_push+0x2a/0x40 [ 12.534810] _raw_spin_lock+0x2f/0x40 [ 12.535099] ? pcpu_freelist_push+0x2a/0x40 [ 12.535432] pcpu_freelist_push+0x2a/0x40 [ 12.535750] bpf_get_stackid+0x494/0x4d0 [ 12.536062] ___bpf_prog_run+0x8b4/0x11a0 It has been explained that is a false positive here: https://lkml.org/lkml/2018/7/25/756 Recap: - stackmap uses pcpu_freelist - The lock in pcpu_freelist is a percpu lock - stackmap is only used by tracing bpf_prog - A tracing bpf_prog cannot be run if another bpf_prog has already been running (ensured by the percpu bpf_prog_active counter). Eric pointed out that this lockdep splats stops other legit lockdep splats in selftests/bpf/test_progs.c. Fix this by calling local_irq_save/restore for stackmap. Another false positive had also been worked around by calling local_irq_save in commit 89ad2fa3f043 ("bpf: fix lockdep splat"). That commit added unnecessary irq_save/restore to fast path of bpf hash map. irqs are already disabled at that point, since htab is holding per bucket spin_lock with irqsave. Let's reduce overhead for htab by introducing __pcpu_freelist_push/pop function w/o irqsave and convert pcpu_freelist_push/pop to irqsave to be used elsewhere (right now only in stackmap). It stops lockdep false positive in stackmap with a bit of acceptable overhead. 
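A sketch of the resulting split, push side shown (pop is symmetric; bodies simplified):

	/* lock only: for callers that already run with IRQs disabled,
	 * e.g. htab update paths holding an irqsave bucket lock */
	void __pcpu_freelist_push(struct pcpu_freelist *s,
				  struct pcpu_freelist_node *node);

	/* irqsave wrapper: for callers that can race with IRQ context,
	 * right now only stackmap */
	void pcpu_freelist_push(struct pcpu_freelist *s,
				struct pcpu_freelist_node *node)
	{
		unsigned long flags;

		local_irq_save(flags);
		__pcpu_freelist_push(s, node);
		local_irq_restore(flags);
	}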
Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation") Reported-by: Naresh Kamboju Reported-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/percpu_freelist.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 7174ee2d84ee..0c1b4ba9e90e 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -88,10 +88,8 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - unsigned long flags; int orig_cpu, cpu; - local_irq_save(flags); orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); @@ -99,17 +97,15 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) node = head->first; if (node) { head->first = node->next; - raw_spin_unlock_irqrestore(&head->lock, flags); + raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; - if (cpu == orig_cpu) { - local_irq_restore(flags); + if (cpu == orig_cpu) return NULL; - } } } From 477bf1e1ec9fa7a56b3bd7b17fa32391f2ced33e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 30 Jan 2019 18:12:45 -0800 Subject: [PATCH 0818/1640] UPSTREAM: bpf: Fix syscall's stackmap lookup potential deadlock [ Upstream commit 7c4cd051add3d00bbff008a133c936c515eaa8fe ] The map_lookup_elem used to not acquiring spinlock in order to optimize the reader. It was true until commit 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation") The syscall's map_lookup_elem(stackmap) calls bpf_stackmap_copy(). bpf_stackmap_copy() may find the elem no longer needed after the copy is done. If that is the case, pcpu_freelist_push() saves this elem for reuse later. This push requires a spinlock. If a tracing bpf_prog got run in the middle of the syscall's map_lookup_elem(stackmap) and this tracing bpf_prog is calling bpf_get_stackid(stackmap) which also requires the same pcpu_freelist's spinlock, it may end up with a dead lock situation as reported by Eric Dumazet in https://patchwork.ozlabs.org/patch/1030266/ The situation is the same as the syscall's map_update_elem() which needs to acquire the pcpu_freelist's spinlock and could race with tracing bpf_prog. Hence, this patch fixes it by protecting bpf_stackmap_copy() with this_cpu_inc(bpf_prog_active) to prevent tracing bpf_prog from running. A later syscall's map_lookup_elem commit f1a2e44a3aec ("bpf: add queue and stack maps") also acquires a spinlock and races with tracing bpf_prog similarly. Hence, this patch is forward looking and protects the majority of the map lookups. bpf_map_offload_lookup_elem() is the exception since it is for network bpf_prog only (i.e. never called by tracing bpf_prog). 
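The guard in isolation (the helper name is invented for illustration; bpf_prog_active is the same per-cpu counter trace_call_bpf() tests before running a tracing program):

	static int stackmap_copy_guarded(struct bpf_map *map, void *key,
					 void *value)
	{
		int err;

		preempt_disable();
		this_cpu_inc(bpf_prog_active);	/* tracing progs on this CPU
						 * now bail out early */
		err = bpf_stackmap_copy(map, key, value);
		this_cpu_dec(bpf_prog_active);
		preempt_enable();
		return err;
	}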
Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation") Reported-by: Eric Dumazet Acked-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/syscall.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index da8eba8bbe00..840fa6c0ce25 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -702,8 +702,13 @@ static int map_lookup_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_lookup_elem(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + goto done; + } + + preempt_disable(); + this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -726,7 +731,10 @@ static int map_lookup_elem(union bpf_attr *attr) rcu_read_unlock(); err = ptr ? 0 : -ENOENT; } + this_cpu_dec(bpf_prog_active); + preempt_enable(); +done: if (err) goto free_value; From df17a51f3bc826052a57bcc57131ac87c533be8f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 7 Feb 2019 14:54:16 -0500 Subject: [PATCH 0819/1640] UPSTREAM: bpf: only adjust gso_size on bytestream protocols [ Upstream commit b90efd2258749e04e1b3f71ef0d716f2ac2337e0 ] bpf_skb_change_proto and bpf_skb_adjust_room change skb header length. For GSO packets they adjust gso_size to maintain the same MTU. The gso size can only be safely adjusted on bytestream protocols. Commit d02f51cbcf12 ("bpf: fix bpf_skb_adjust_net/bpf_skb_proto_xlat to deal with gso sctp skbs") excluded SKB_GSO_SCTP. Since then type SKB_GSO_UDP_L4 has been added, whose contents are one gso_size unit per datagram. Also exclude these. Move from a blacklist to a whitelist check to future proof against additional such new GSO types, e.g., for fraglist based GRO. Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT") Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- include/linux/skbuff.h | 6 ++++++ net/core/filter.c | 12 ++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1dabd92fc251..c88859d4eb7e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4147,6 +4147,12 @@ static inline bool skb_is_gso_sctp(const struct sk_buff *skb) return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP; } +static inline bool skb_is_gso_tcp(const struct sk_buff *skb) +{ + return skb_is_gso(skb) && + skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6); +} + static inline void skb_gso_reset(struct sk_buff *skb) { skb_shinfo(skb)->gso_size = 0; diff --git a/net/core/filter.c b/net/core/filter.c index 5330117417e4..6452608e0eaf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2621,8 +2621,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. 
*/ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); @@ -2661,8 +2660,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); @@ -2785,8 +2783,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); @@ -2815,8 +2812,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); From 136b0faa804ae328ab71b04a87e4c91eff24150a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 10 Feb 2019 12:52:35 -0800 Subject: [PATCH 0820/1640] UPSTREAM: bpf: fix lockdep false positive in stackmap [ Upstream commit 3defaf2f15b2bfd86c6664181ac009e91985f8ac ] Lockdep warns about false positive: [ 11.211460] ------------[ cut here ]------------ [ 11.211936] DEBUG_LOCKS_WARN_ON(depth <= 0) [ 11.211985] WARNING: CPU: 0 PID: 141 at ../kernel/locking/lockdep.c:3592 lock_release+0x1ad/0x280 [ 11.213134] Modules linked in: [ 11.214954] RIP: 0010:lock_release+0x1ad/0x280 [ 11.223508] Call Trace: [ 11.223705] [ 11.223874] ? __local_bh_enable+0x7a/0x80 [ 11.224199] up_read+0x1c/0xa0 [ 11.224446] do_up_read+0x12/0x20 [ 11.224713] irq_work_run_list+0x43/0x70 [ 11.225030] irq_work_run+0x26/0x50 [ 11.225310] smp_irq_work_interrupt+0x57/0x1f0 [ 11.225662] irq_work_interrupt+0xf/0x20 since rw_semaphore is released in a different task vs task that locked the sema. It is expected behavior. Fix the warning with up_read_non_owner() and rwsem_release() annotation. Fixes: bae77c5eb5b2 ("bpf: enable stackmap with build_id in nmi context") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/stackmap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 700e1ef25cd6..bca9321d7fd7 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -44,7 +44,7 @@ static void do_up_read(struct irq_work *entry) struct stack_map_irq_work *work; work = container_of(entry, struct stack_map_irq_work, irq_work); - up_read(work->sem); + up_read_non_owner(work->sem); work->sem = NULL; } @@ -344,6 +344,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } else { work->sem = ¤t->mm->mmap_sem; irq_work_queue(&work->irq_work); + /* + * The irq_work will release the mmap_sem with + * up_read_non_owner(). The rwsem_release() is called + * here to release the lock from lockdep's perspective. 
+ */ + rwsem_release(¤t->mm->mmap_sem.dep_map, 1, _RET_IP_); } } From 95d0205b6d71a2bd9bb39f608048b3af19aee58c Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Fri, 22 Feb 2019 14:19:08 +0100 Subject: [PATCH 0821/1640] BACKPORT: bpf, lpm: fix lookup bug in map_delete_elem [ Upstream commit 7c0cdf0b3940f63d9777c3fcf250a2f83859ca54 ] trie_delete_elem() was deleting an entry even though it was not matching if the prefixlen was correct. This patch adds a check on matchlen. Reproducer: $ sudo bpftool map create /sys/fs/bpf/mylpm type lpm_trie key 8 value 1 entries 128 name mylpm flags 1 $ sudo bpftool map update pinned /sys/fs/bpf/mylpm key hex 10 00 00 00 aa bb cc dd value hex 01 $ sudo bpftool map dump pinned /sys/fs/bpf/mylpm key: 10 00 00 00 aa bb cc dd value: 01 Found 1 element $ sudo bpftool map delete pinned /sys/fs/bpf/mylpm key hex 10 00 00 00 ff ff ff ff $ echo $? 0 $ sudo bpftool map dump pinned /sys/fs/bpf/mylpm Found 0 elements A similar reproducer is added in the selftests. Without the patch: $ sudo ./tools/testing/selftests/bpf/test_lpm_map test_lpm_map: test_lpm_map.c:485: test_lpm_delete: Assertion `bpf_map_delete_elem(map_fd, key) == -1 && errno == ENOENT' failed. Aborted With the patch: test_lpm_map runs without errors. Fixes: e454cf595853 ("bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE") Cc: Craig Gallek Signed-off-by: Alban Crequy Acked-by: Craig Gallek Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/lpm_trie.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index fa382ca9231c..5e17d703d179 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -435,6 +435,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) } if (!node || node->prefixlen != key->prefixlen || + node->prefixlen != matchlen || (node->flags & LPM_TREE_NODE_FLAG_IM)) { ret = -ENOENT; goto out; From 23bed7a9fa4a7a281e4d9c54042407b971b8560b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 6 Mar 2019 14:35:15 -0500 Subject: [PATCH 0822/1640] UPSTREAM: bpf: only test gso type on gso packets [ Upstream commit 4c3024debf62de4c6ac6d3cb4c0063be21d4f652 ] BPF can adjust gso only for tcp bytestreams. Fail on other gso types. But only on gso packets. It does not touch this field if !gso_size. 
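The resulting decision table, as a commented sketch of the guard used in the hunks below:

	/* !skb_is_gso(skb):  gso_size == 0, nothing to maintain -> allow
	 * GSO and TCP:       gso_size can be adjusted safely    -> allow
	 * GSO and non-TCP:   SCTP (GSO_BY_FRAGS), UDP_L4, ...   -> reject
	 */
	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
		return -ENOTSUPP;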
Fixes: b90efd225874 ("bpf: only adjust gso_size on bytestream protocols") Signed-off-by: Willem de Bruijn Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- include/linux/skbuff.h | 4 ++-- net/core/filter.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c88859d4eb7e..66d0d75bd03b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4147,10 +4147,10 @@ static inline bool skb_is_gso_sctp(const struct sk_buff *skb) return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP; } +/* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_tcp(const struct sk_buff *skb) { - return skb_is_gso(skb) && - skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6); + return skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6); } static inline void skb_gso_reset(struct sk_buff *skb) diff --git a/net/core/filter.c b/net/core/filter.c index 6452608e0eaf..ad9f93134ebf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2621,7 +2621,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - if (!skb_is_gso_tcp(skb)) + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); @@ -2660,7 +2660,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - if (!skb_is_gso_tcp(skb)) + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); @@ -2783,7 +2783,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; - if (!skb_is_gso_tcp(skb)) + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); @@ -2812,7 +2812,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; - if (!skb_is_gso_tcp(skb)) + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); From 08a7ee1b15cdfacbae745b76b1a51339053588e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Apr 2019 21:55:59 +0200 Subject: [PATCH 0823/1640] UPSTREAM: bpf: Fix preempt_enable_no_resched() abuse [ Upstream commit 0edd6b64d1939e9e9168ff27947995bb7751db5d ] Unless the very next line is schedule(), or implies it, one must not use preempt_enable_no_resched(). It can cause a preemption to go missing and thereby cause arbitrary delays, breaking the PREEMPT=y invariant. 
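As a rule of thumb, sketched side by side (illustrative fragments, not from the patch):

	preempt_disable();
	/* ... critical section ... */
	preempt_enable();		/* re-checks the resched flag: a wakeup
					 * that arrived meanwhile is acted on
					 * right here */

	preempt_disable();
	/* ... critical section ... */
	preempt_enable_no_resched();	/* skips that check, so it is only
					 * valid when the very next action
					 * is ... */
	schedule();			/* ... an explicit schedule() */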
Cc: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eef57b96996c..7402c10dabf2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -405,7 +405,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, } \ _out: \ rcu_read_unlock(); \ - preempt_enable_no_resched(); \ + preempt_enable(); \ _ret; \ }) From 172c976449368fbd436c734241657e4b23bfe6cb Mon Sep 17 00:00:00 2001 From: Krzesimir Nowak Date: Wed, 8 May 2019 18:08:58 +0200 Subject: [PATCH 0824/1640] UPSTREAM: bpf: fix undefined behavior in narrow load handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit e2f7fc0ac6957cabff4cecf6c721979b571af208 ] Commit 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") made the verifier add AND instructions to clear the unwanted bits with a mask when doing a narrow load. The mask is computed with (1 << size * 8) - 1 where "size" is the size of the narrow load. When doing a 4 byte load of an 8 byte field, the verifier shifts the literal 1 by 32 places to the left. This results in an overflow of a signed integer, which is undefined behavior. Typically, the computed mask was zero, so the result of the narrow load ended up being zero too. Cast the literal to long long to avoid overflows. Note that a narrow load of a 4 byte field does not have the undefined behavior, because the load size can only be either 1 or 2 bytes, so shifting 1 by 8 or 16 places will not overflow. And reading 4 bytes would not be a narrow load of a 4 byte field. Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") Reviewed-by: Alban Crequy Reviewed-by: Iago López Galeiras Signed-off-by: Krzesimir Nowak Cc: Yonghong Song Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 91582199862f..e398689bbe3b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5810,7 +5810,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->dst_reg, shift); insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1ULL << size * 8) - 1); } } From b6d9851ff8178b50faa1b3cbd3ae8b76e280b40b Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Wed, 12 Jun 2019 18:05:40 +0200 Subject: [PATCH 0825/1640] UPSTREAM: bpf: simplify definition of BPF_FIB_LOOKUP related flags commit b1d6c15b9d824a58c5415673f374fac19e8eccdf upstream. Previously, the BPF_FIB_LOOKUP_{DIRECT,OUTPUT} flags in the BPF UAPI were defined with the help of the BIT macro. This had the following issues: - In order to use any of the flags, a user was required to depend on <linux/bits.h>. - No other flag in bpf.h uses the macro, so it seems that an unwritten convention is to use (1 << (nr)) to define BPF-related flags.
Fixes: 87f5fc7e48dd ("bpf: Provide helper to do forwarding lookups in kernel FIB table") Signed-off-by: Martynas Pumputis Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7358e2785b3a..93f6d57461aa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2748,8 +2748,8 @@ struct bpf_raw_tracepoint_args { /* DIRECT: Skip the FIB rules and go to FIB table associated with device * OUTPUT: Do lookup from egress perspective; default is ingress */ -#define BPF_FIB_LOOKUP_DIRECT BIT(0) -#define BPF_FIB_LOOKUP_OUTPUT BIT(1) +#define BPF_FIB_LOOKUP_DIRECT (1U << 0) +#define BPF_FIB_LOOKUP_OUTPUT (1U << 1) enum { BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ From fff1c44b22555785f2fa9a6bb7ebab45b37326f1 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Sat, 8 Jun 2019 12:54:19 -0700 Subject: [PATCH 0826/1640] BACKPORT: bpf: lpm_trie: check left child of last leftmost node for NULL commit da2577fdd0932ea4eefe73903f1130ee366767d2 upstream. If the leftmost parent node of the tree does not have a child on the left side, then trie_get_next_key (and bpftool map dump) will not look at the child on the right. This leads to the traversal missing elements. Lookup is not affected. Update selftest to handle this case. Reproducer: bpftool map create /sys/fs/bpf/lpm type lpm_trie key 6 \ value 1 entries 256 name test_lpm flags 1 bpftool map update pinned /sys/fs/bpf/lpm key 8 0 0 0 0 0 value 1 bpftool map update pinned /sys/fs/bpf/lpm key 16 0 0 0 0 128 value 2 bpftool map dump pinned /sys/fs/bpf/lpm Returns only 1 element. (2 expected) Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE") Signed-off-by: Jonathan Lemon Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/lpm_trie.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 5e17d703d179..fcd3a15add41 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -679,9 +679,14 @@ find_leftmost: * have exact two children, so this function will never return NULL. */ for (node = search_root; node;) { - if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + if (node->flags & LPM_TREE_NODE_FLAG_IM) { + node = rcu_dereference(node->child[0]); + } else { next_node = node; - node = rcu_dereference(node->child[0]); + node = rcu_dereference(node->child[0]); + if (!node) + node = rcu_dereference(next_node->child[1]); + } } do_copy: next_key->prefixlen = next_node->prefixlen; From 0f2d34119bda66ead39d7371d275edd30e81422d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 7 Jun 2019 01:48:57 +0200 Subject: [PATCH 0827/1640] UPSTREAM: bpf: fix unconnected udp hooks commit 983695fa676568fc0fe5ddd995c7267aabc24632 upstream. The intention of the cgroup bind/connect/sendmsg BPF hooks is to act transparently to applications, as also stated in the original motivation in 7828f20e3779 ("Merge branch 'bpf-cgroup-bind-connect'"). When recently integrating the latter two hooks into Cilium to enable host based load-balancing with Kubernetes, I ran into the issue that pods couldn't start up as DNS got broken. Kubernetes typically sets up DNS as a service and is thus subject to load-balancing.
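Stepping back to the lpm_trie fix above, a hedged sketch of the reproducer's trie shape (the struct is hypothetical; "key 6" means a 4 byte prefixlen plus 2 data bytes):

	struct test_lpm_key {		/* hypothetical, mirrors "key 6" */
		__u32 prefixlen;	/* 8 for the first entry, 16 for the second */
		__u8  data[2];		/* {0, 0} and {0, 128} */
	};

The /8 entry is an ancestor of the /16 entry and ends up with only a right child (child[1]); the old leftmost walk followed child[0] alone, so the dump stopped after one element. The fixed loop falls back to next_node->child[1] when the left child is NULL.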
Upon further debugging, it turns out that the cgroupv2 sendmsg BPF hooks API is currently insufficient and thus not usable as-is for standard applications shipped with most distros. To break down the issue we ran into with a simple example: # cat /etc/resolv.conf nameserver 147.75.207.207 nameserver 147.75.207.208 For the purpose of a simple test, we set up above IPs as service IPs and transparently redirect traffic to a different DNS backend server for that node: # cilium service list ID Frontend Backend 1 147.75.207.207:53 1 => 8.8.8.8:53 2 147.75.207.208:53 1 => 8.8.8.8:53 The attached BPF program is basically selecting one of the backends if the service IP/port matches on the cgroup hook. DNS breaks here, because the hooks are not transparent enough to applications which have built-in msg_name address checks: # nslookup 1.1.1.1 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 [...] ;; connection timed out; no servers could be reached # dig 1.1.1.1 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 [...] ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1 ;; global options: +cmd ;; connection timed out; no servers could be reached For comparison, if none of the service IPs is used, and we tell nslookup to use 8.8.8.8 directly it works just fine, of course: # nslookup 1.1.1.1 8.8.8.8 1.1.1.1.in-addr.arpa name = one.one.one.one. In order to fix this and thus act more transparent to the application, this needs reverse translation on recvmsg() side. A minimal fix for this API is to add similar recvmsg() hooks behind the BPF cgroups static key such that the program can track state and replace the current sockaddr_in{,6} with the original service IP. From BPF side, this basically tracks the service tuple plus socket cookie in an LRU map where the reverse NAT can then be retrieved via map value as one example. Side-note: the BPF cgroups static key should be converted to a per-hook static key in future. Same example after this fix: # cilium service list ID Frontend Backend 1 147.75.207.207:53 1 => 8.8.8.8:53 2 147.75.207.208:53 1 => 8.8.8.8:53 Lookups work fine now: # nslookup 1.1.1.1 1.1.1.1.in-addr.arpa name = one.one.one.one. Authoritative answers can be found from: # dig 1.1.1.1 ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1 ;; global options: +cmd ;; Got answer: ;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 51550 ;; flags: qr rd ra ad; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 1 ;; OPT PSEUDOSECTION: ; EDNS: version: 0, flags:; udp: 512 ;; QUESTION SECTION: ;1.1.1.1. IN A ;; AUTHORITY SECTION: . 23426 IN SOA a.root-servers.net. nstld.verisign-grs.com. 2019052001 1800 900 604800 86400 ;; Query time: 17 msec ;; SERVER: 147.75.207.207#53(147.75.207.207) ;; WHEN: Tue May 21 12:59:38 UTC 2019 ;; MSG SIZE rcvd: 111 And from an actual packet level it shows that we're using the back end server when talking via 147.75.207.20{7,8} front end: # tcpdump -i any udp [...] 12:59:52.698732 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38) 12:59:52.698735 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. 
(38) 12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67) 12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67) [...] In order to be flexible and to have same semantics as in sendmsg BPF programs, we only allow return codes in [1,1] range. In the sendmsg case the program is called if msg->msg_name is present which can be the case in both, connected and unconnected UDP. The former only relies on the sockaddr_in{,6} passed via connect(2) if passed msg->msg_name was NULL. Therefore, on recvmsg side, we act in similar way to call into the BPF program whenever a non-NULL msg->msg_name was passed independent of sk->sk_state being TCP_ESTABLISHED or not. Note that for TCP case, the msg->msg_name is ignored in the regular recvmsg path and therefore not relevant. For the case of ip{,v6}_recv_error() paths, picked up via MSG_ERRQUEUE, the hook is not called. This is intentional as it aligns with the same semantics as in case of TCP cgroup BPF hooks right now. This might be better addressed in future through a different bpf_attach_type such that this case can be distinguished from the regular recvmsg paths, for example. Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg") Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Acked-by: Martynas Pumputis Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf-cgroup.h | 8 ++++++++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 8 ++++++++ kernel/bpf/verifier.c | 12 ++++++++---- net/core/filter.c | 2 ++ net/ipv4/udp.c | 4 ++++ net/ipv6/udp.c | 4 ++++ 7 files changed, 36 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index f91b0f8ff3a9..ad6b30137ac2 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -210,6 +210,12 @@ void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) +#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL) + +#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -290,6 +296,8 @@ static inline void bpf_cgroup_storage_free( #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 93f6d57461aa..ddc933f9a6c3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -172,6 +172,8 @@ enum bpf_attach_type { BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, + BPF_CGROUP_UDP4_RECVMSG = 19, + BPF_CGROUP_UDP6_RECVMSG, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 840fa6c0ce25..f5e4d6fbdd95 100644 --- a/kernel/bpf/syscall.c +++ 
b/kernel/bpf/syscall.c @@ -1343,6 +1343,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: return 0; default: return -EINVAL; @@ -1629,6 +1631,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1705,6 +1709,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1751,6 +1757,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e398689bbe3b..0bd3fa22128d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4409,9 +4409,12 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); switch (env->prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + range = tnum_range(1, 1); case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: break; @@ -4427,16 +4430,17 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { + char tn_buf[48]; + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "has value %s", tn_buf); } else { verbose(env, "has unknown scalar value"); } - verbose(env, " should have been 0 or 1\n"); + tnum_strn(tn_buf, sizeof(tn_buf), range); + verbose(env, " should have been in %s\n", tn_buf); return -EINVAL; } return 0; diff --git a/net/core/filter.c b/net/core/filter.c index ad9f93134ebf..a742ecef4751 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5562,6 +5562,7 @@ static bool sock_addr_is_valid_access(int off, int size, case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: break; default: return false; @@ -5572,6 +5573,7 @@ static bool sock_addr_is_valid_access(int off, int size, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP6_RECVMSG: break; default: return false; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d1a0444985a1..1615ccd99c28 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1736,6 +1736,10 @@ try_again: sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); + + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, + (struct sockaddr *)sin); } if (udp_sk(sk)->gro_enabled) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4c060cb62266..a503ca8d0800 100644 --- a/net/ipv6/udp.c +++ 
b/net/ipv6/udp.c @@ -434,6 +434,10 @@ try_again: inet6_iif(skb)); } *addr_len = sizeof(*sin6); + + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, + (struct sockaddr *)sin6); } if (udp_sk(sk)->gro_enabled) From 80df28edf6a71b204d1d2b1f8ac67c3e2a277c96 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:13 +0900 Subject: [PATCH 0828/1640] UPSTREAM: bpf, devmap: Fix premature entry free on destroying map [ Upstream commit d4dd153d551634683fccf8881f606fa9f3dfa1ef ] dev_map_free() waits for flush_needed bitmap to be empty in order to ensure all flush operations have completed before freeing its entries. However the corresponding clear_bit() was called before using the entries, so the entries could be used after free. All access to the entries needs to be done before clearing the bit. It seems commit a5e2da6e9787 ("bpf: netdev is never null in __dev_map_flush") accidentally changed the clear_bit() and memory access order. Note that the problem happens only in __dev_map_flush(), not in dev_map_flush_old(). dev_map_flush_old() is called only after nulling out the corresponding netdev_map entry, so dev_map_free() never frees the entry thus no such race happens there. Fixes: a5e2da6e9787 ("bpf: netdev is never null in __dev_map_flush") Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/devmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1e525d70f833..e001fb1a96b1 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -291,10 +291,10 @@ void __dev_map_flush(struct bpf_map *map) if (unlikely(!dev)) continue; - __clear_bit(bit, bitmap); - bq = this_cpu_ptr(dev->bulkq); bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); + + __clear_bit(bit, bitmap); } } From 830685d23edb21cc6dbeffa19cc14382322f7ebb Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:14 +0900 Subject: [PATCH 0829/1640] UPSTREAM: bpf, devmap: Add missing bulk queue free [ Upstream commit edabf4d9dd905acd60048ea1579943801e3a4876 ] dev_map_free() forgot to free bulk queue when freeing its entries. Fixes: 5d053f9da431 ("bpf: devmap prepare xdp frames for bulking") Signed-off-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/devmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e001fb1a96b1..a126d95d12de 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -186,6 +186,7 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } From 0b8a16b1eb46b50255e29cf5b72c7ef4d8121985 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:15 +0900 Subject: [PATCH 0830/1640] UPSTREAM: bpf, devmap: Add missing RCU read lock on flush [ Upstream commit 86723c8640633bee4b4588d3c7784ee7a0032f65 ] .ndo_xdp_xmit() assumes it is called under RCU. For example virtio_net uses RCU to detect it has setup the resources for tx. The assumption accidentally broke when introducing bulk queue in devmap. 
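As a hedged illustration of that assumption (the driver, fields and names below are hypothetical; virtio_net is the real example cited above):

	/* XDP tx state is published via RCU, so .ndo_xdp_xmit relies on
	 * running inside an RCU read-side critical section. */
	static int foo_xdp_xmit(struct net_device *dev, int n,
				struct xdp_frame **frames, u32 flags)
	{
		struct foo_priv *priv = netdev_priv(dev);
		struct foo_txq *txq = rcu_dereference(priv->xdp_txq);

		if (!txq)	/* tx resources not (or no longer) set up */
			return -ENXIO;
		/* ... enqueue the n frames ... */
		return n;
	}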
Fixes: 5d053f9da431 ("bpf: devmap prepare xdp frames for bulking") Reported-by: David Ahern Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/devmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a126d95d12de..1defea4b2755 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -282,6 +282,7 @@ void __dev_map_flush(struct bpf_map *map) unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); u32 bit; + rcu_read_lock(); for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); struct xdp_bulk_queue *bq; @@ -297,6 +298,7 @@ void __dev_map_flush(struct bpf_map *map) __clear_bit(bit, bitmap); } + rcu_read_unlock(); } /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or @@ -389,6 +391,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) int cpu; + rcu_read_lock(); for_each_online_cpu(cpu) { bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); __clear_bit(dev->bit, bitmap); @@ -396,6 +399,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) bq = per_cpu_ptr(dev->bulkq, cpu); bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); } + rcu_read_unlock(); } } From fcdbf0b9b862ad7903c350be7d1825ff0a5873b5 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Fri, 28 Jun 2019 07:08:45 +0300 Subject: [PATCH 0831/1640] BACKPORT: bpf: fix uapi bpf_prog_info fields alignment [ Upstream commit 0472301a28f6cf53a6bc5783e48a2d0bbff4682f ] Merge commit 1c8c5a9d38f60 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next") undid the fix from commit 36f9814a494 ("bpf: fix uapi hole for 32 bit compat applications") by taking the gpl_compatible 1-bit field definition from commit b85fab0e67b162 ("bpf: Add gpl_compatible flag to struct bpf_prog_info") as is. That breaks architectures with 16-bit alignment like m68k. Add a 31-bit pad after gpl_compatible to restore the alignment of the following fields. Thanks to Dmitry V. Levin for his analysis of this bug history. Signed-off-by: Baruch Siach Acked-by: Song Liu Cc: Jiri Olsa Cc: Daniel Borkmann Cc: Geert Uytterhoeven Cc: Linus Torvalds Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ddc933f9a6c3..8d75c1f73b11 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2528,7 +2528,7 @@ struct bpf_prog_info { char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; __u32 gpl_compatible:1; - __u32 :31; + __u32 :31; /* alignment pad */ __u64 netns_dev; __u64 netns_ino; __u32 nr_jited_ksyms; From 75abb6320136c638384bf97271ab8a0ad6395015 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 20 Aug 2019 17:50:25 +0200 Subject: [PATCH 0832/1640] UPSTREAM: bpf: allow narrow loads of some sk_reuseport_md fields with offset > 0 [ Upstream commit 2c238177bd7f4b14bdf7447cc1cd9bb791f147e6 ] test_select_reuseport fails on s390 due to the verifier rejecting test_select_reuseport_kern.o with the following message: ; data_check.eth_protocol = reuse_md->eth_protocol; 18: (69) r1 = *(u16 *)(r6 +22) invalid bpf_context access off=22 size=2 This is because on big-endian machines casts from __u32 to __u16 are generated by referencing the respective variable as __u16 with an offset of 2 (as opposed to 0 on little-endian machines).
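A hedged plain-C sketch of why the offset differs by endianness (mirroring the load the compiler emits for the cast):

	union {
		__u32 word;	/* e.g. reuse_md->eth_protocol */
		__u16 half[2];
	} u = { .word = 0x0800 };

	/* Casting __u32 to __u16 keeps the low 16 bits. They sit at byte
	 * offset 0 of the word on little-endian (u.half[0]) but at byte
	 * offset 2 on big-endian (u.half[1]) -- hence the off=22 size=2
	 * access above on a field whose base context offset is 20. */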
The verifier already has all the infrastructure in place to allow such accesses, it's just that they are not explicitly enabled for eth_protocol field. Enable them for eth_protocol field by using bpf_ctx_range instead of offsetof. Ditto for ip_protocol, bind_inany and len, since they already allow narrowing, and the same problem can arise when working with them. Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- net/core/filter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index a742ecef4751..4b6daf2cb4e2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7238,13 +7238,13 @@ sk_reuseport_is_valid_access(int off, int size, return size == size_default; /* Fields that allow narrowing */ - case offsetof(struct sk_reuseport_md, eth_protocol): + case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < FIELD_SIZEOF(struct sk_buff, protocol)) return false; /* fall through */ - case offsetof(struct sk_reuseport_md, ip_protocol): - case offsetof(struct sk_reuseport_md, bind_inany): - case offsetof(struct sk_reuseport_md, len): + case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): + case bpf_ctx_range(struct sk_reuseport_md, bind_inany): + case bpf_ctx_range(struct sk_reuseport_md, len): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); From 24a859fd772accb1e257723ece1c8016e656a252 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Sun, 7 Oct 2018 15:23:15 -0500 Subject: [PATCH 0833/1640] UPSTREAM: bpf: btf: Fix a missing check bug [ Upstream commit 8af03d1ae2e154a8be3631e8694b87007e1bdbc2 ] In btf_parse_hdr(), the length of the btf data header is firstly copied from the user space to 'hdr_len' and checked to see whether it is larger than 'btf_data_size'. If yes, an error code EINVAL is returned. Otherwise, the whole header is copied again from the user space to 'btf->hdr'. However, after the second copy, there is no check between 'btf->hdr->hdr_len' and 'hdr_len' to confirm that the two copies get the same value. Given that the btf data is in the user space, a malicious user can race to change the data between the two copies. By doing so, the user can provide malicious data to the kernel and cause undefined behavior. This patch adds a necessary check after the second copy, to make sure 'btf->hdr->hdr_len' has the same value as 'hdr_len'. Otherwise, an error code EINVAL will be returned. Signed-off-by: Wenwen Wang Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/btf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 138f0302692e..378cef70341c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2114,6 +2114,9 @@ static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, hdr = &btf->hdr; + if (hdr->hdr_len != hdr_len) + return -EINVAL; + btf_verifier_log_hdr(env, btf_data_size); if (hdr->magic != BTF_MAGIC) { From fc9c7277fba436656c19f831f429bdf793b2e0af Mon Sep 17 00:00:00 2001 From: Martin Lau Date: Wed, 24 Oct 2018 20:42:25 +0000 Subject: [PATCH 0834/1640] UPSTREAM: bpf, btf: fix a missing check bug in btf_parse [ Upstream commit 4a6998aff82a20a1aece86a186d8e5263f8b2315 ] Wenwen Wang reported: In btf_parse(), the header of the user-space btf data 'btf_data' is firstly parsed and verified through btf_parse_hdr(). 
In btf_parse_hdr(), the header is copied from user-space 'btf_data' to kernel-space 'btf->hdr' and then verified. If no error happens during the verification process, the whole data of 'btf_data', including the header, is then copied to 'data' in btf_parse(). It is obvious that the header is copied twice here. More importantly, no check is enforced after the second copy to make sure the headers obtained in these two copies are the same. Given that 'btf_data' resides in the user space, a malicious user can race to modify the header between these two copies. By doing so, the user can inject inconsistent data, which can cause undefined behavior of the kernel and introduce a potential security risk. This issue is similar to the one fixed in commit 8af03d1ae2e1 ("bpf: btf: Fix a missing check bug"). To fix it, this patch copies the user 'btf_data' *before* parsing / verifying the BTF header. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Martin KaFai Lau Co-developed-by: Wenwen Wang Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/btf.c | 55 ++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 378cef70341c..cfa27b7d1168 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2067,50 +2067,44 @@ static int btf_check_sec_info(struct btf_verifier_env *env, return 0; } -static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, - u32 btf_data_size) +static int btf_parse_hdr(struct btf_verifier_env *env) { + u32 hdr_len, hdr_copy, btf_data_size; const struct btf_header *hdr; - u32 hdr_len, hdr_copy; - /* - * Minimal part of the "struct btf_header" that - * contains the hdr_len.
- */ - struct btf_min_header { - u16 magic; - u8 version; - u8 flags; - u32 hdr_len; - } __user *min_hdr; struct btf *btf; int err; btf = env->btf; - min_hdr = btf_data; + btf_data_size = btf->data_size; - if (btf_data_size < sizeof(*min_hdr)) { + if (btf_data_size < + offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { btf_verifier_log(env, "hdr_len not found"); return -EINVAL; } - if (get_user(hdr_len, &min_hdr->hdr_len)) - return -EFAULT; - + hdr = btf->data; + hdr_len = hdr->hdr_len; if (btf_data_size < hdr_len) { btf_verifier_log(env, "btf_header not found"); return -EINVAL; } - err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); - if (err) { - if (err == -E2BIG) - btf_verifier_log(env, "Unsupported btf_header"); - return err; + /* Ensure the unsupported header fields are zero */ + if (hdr_len > sizeof(btf->hdr)) { + u8 *expected_zero = btf->data + sizeof(btf->hdr); + u8 *end = btf->data + hdr_len; + + for (; expected_zero < end; expected_zero++) { + if (*expected_zero) { + btf_verifier_log(env, "Unsupported btf_header"); + return -E2BIG; + } + } } hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); - if (copy_from_user(&btf->hdr, btf_data, hdr_copy)) - return -EFAULT; + memcpy(&btf->hdr, btf->data, hdr_copy); hdr = &btf->hdr; @@ -2186,10 +2180,6 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, } env->btf = btf; - err = btf_parse_hdr(env, btf_data, btf_data_size); - if (err) - goto errout; - data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { err = -ENOMEM; @@ -2198,13 +2188,18 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, btf->data = data; btf->data_size = btf_data_size; - btf->nohdr_data = btf->data + btf->hdr.hdr_len; if (copy_from_user(data, btf_data, btf_data_size)) { err = -EFAULT; goto errout; } + err = btf_parse_hdr(env); + if (err) + goto errout; + + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + err = btf_parse_str_sec(env); if (err) goto errout; From f687c7e8db2a6de11e8fad47a05d879ce207c7e1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 19 Dec 2018 17:00:23 +0100 Subject: [PATCH 0835/1640] UPSTREAM: bpf/cpumap: make sure frame_size for build_skb is aligned if headroom isn't [ Upstream commit 77ea5f4cbe2084db9ab021ba73fb7eadf1610884 ] The frame_size passed to build_skb must be aligned, else it is possible that the embedded struct skb_shared_info gets unaligned. For correctness make sure that xdpf->headroom is included in the alignment. No upstream drivers can hit this, as all XDP drivers provide an aligned headroom. This was discovered when playing with implementing XDP support for mvneta, which has a 2 byte DSA header, and this Marvell ARM64 platform didn't like doing atomic operations on an unaligned skb_shinfo(skb)->dataref address. Fixes: 1c601d829ab0 ("bpf: cpumap xdp_buff to skb conversion and allocation") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/cpumap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 24aac0d0f412..8974b3755670 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -183,7 +183,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness.
*/ - frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom + + frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); pkt_data_start = xdpf->data - xdpf->headroom; From c1f1ca7731b3c438e6e3470b470ca7a4317134e2 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:12 +0900 Subject: [PATCH 0836/1640] UPSTREAM: xdp: Helper function to clear kernel pointers in xdp_frame xdp_frame has kernel pointers which should not be readable from bpf programs. When we want to reuse the xdp_frame region but it may be read by bpf programs later, we can use this helper to clear the kernel pointers. This is more efficient than calling memset() for the entire struct. Signed-off-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/net/xdp.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/net/xdp.h b/include/net/xdp.h index 0f32d8dc02a6..5e49e060d550 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -79,6 +79,13 @@ struct xdp_frame { struct net_device *dev_rx; /* used by cpumap */ }; +/* Clear kernel pointers in xdp_frame */ +static inline void xdp_scrub_frame(struct xdp_frame *frame) +{ + frame->data = NULL; + frame->dev_rx = NULL; +} + /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) From 9c696adc539d82f9397f6135640ba96fde832e3c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 29 Mar 2019 10:18:00 +0100 Subject: [PATCH 0837/1640] UPSTREAM: xdp: fix cpumap redirect SKB creation bug [ Upstream commit 676e4a6fe703f2dae699ee9d56f14516f9ada4ea ] We want to avoid leaking pointer info from xdp_frame (that is placed at the top of the frame) like commit 6dfb970d3dbd ("xdp: avoid leaking info stored in frame data on page reuse"), and followup commit 97e19cce05e5 ("bpf: reserve xdp_frame size in xdp headroom") that reserves this headroom. These changes also affected how cpumap constructed SKBs: as the xdpf->headroom size changed, the skb data starting point was in effect shifted by 32 bytes (sizeof xdp_frame). This was still okay, as the cpumap frame_size calculation also included xdpf->headroom, which was reduced by the same amount. A bug was introduced in commit 77ea5f4cbe20 ("bpf/cpumap: make sure frame_size for build_skb is aligned if headroom isn't"), where the xdpf->headroom became part of the SKB_DATA_ALIGN rounding up. This round-up to find the frame_size is in principle still correct as it does not exceed the 2048 byte frame_size (which is the max for ixgbe and i40e), but the 32 byte offset of pkt_data_start puts this over the 2048 byte limit. This causes skb_shared_info to spill into the next frame. It is a little hard to trigger, as the SKB needs to use more than 15 skb_shinfo->frags[], as far as I can calculate. This does happen in practice for TCP streams when skb_try_coalesce() kicks in. KASAN can be used to detect these wrong memory accesses, I've seen: BUG: KASAN: use-after-free in skb_try_coalesce+0x3cb/0x760 BUG: KASAN: wild-memory-access in skb_release_data+0xe2/0x250 The veth driver also constructs an SKB from the xdp_frame in this way, but is not affected, as it doesn't reserve/deduct the room (used by xdp_frame) from the SKB headroom. Instead it clears the pointers via xdp_scrub_frame(), and allows the SKB to use this area.
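To make the offset arithmetic concrete, a hedged sketch assuming the 2048 byte frames cited above:

	/* hard_start                                    hard_start + 2048
	 *  | xdp_frame (32) | headroom | data (len) ... | skb_shared_info |
	 *
	 * After 77ea5f4cbe20:
	 *   frame_size = SKB_DATA_ALIGN(len + headroom) +
	 *                SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	 * may legitimately reach 2048, but build_skb() was handed
	 *   pkt_data_start = xdpf->data - xdpf->headroom;  == hard_start + 32
	 * so pkt_data_start + frame_size ends 32 bytes past the frame and
	 * skb_shared_info spills into the next one. The fix below folds the
	 * 32 bytes into hard_start_headroom and lets the SKB reuse the
	 * xdp_frame area once xdp_scrub_frame() has cleared its pointers. */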
(This does kill the idea of the SKB being able to access (mem) info from this area, but I guess it was a bad idea anyhow, and it was already killed by the veth changes.) Fixes: 77ea5f4cbe20 ("bpf/cpumap: make sure frame_size for build_skb is aligned if headroom isn't") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/cpumap.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8974b3755670..3c18260403dd 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -162,10 +162,14 @@ static void cpu_map_kthread_stop(struct work_struct *work) static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { + unsigned int hard_start_headroom; unsigned int frame_size; void *pkt_data_start; struct sk_buff *skb; + /* Part of headroom was reserved to xdpf */ + hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; + /* build_skb need to place skb_shared_info after SKB end, and * also want to know the memory "truesize". Thus, need to * know the memory frame size backing xdp_buff. @@ -183,15 +187,15 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. */ - frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) + + frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - pkt_data_start = xdpf->data - xdpf->headroom; + pkt_data_start = xdpf->data - hard_start_headroom; skb = build_skb(pkt_data_start, frame_size); if (!skb) return NULL; - skb_reserve(skb, xdpf->headroom); + skb_reserve(skb, hard_start_headroom); __skb_put(skb, xdpf->len); if (xdpf->metasize) skb_metadata_set(skb, xdpf->metasize); @@ -205,6 +209,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * - RX ring dev queue index (skb_record_rx_queue) */ + /* Allow SKB to reuse area used by xdp_frame */ + xdp_scrub_frame(xdpf); + return skb; } From cbf0c5b423113942d3b0bae07be5b810963da4bb Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 27 Nov 2018 13:23:27 -0800 Subject: [PATCH 0838/1640] UPSTREAM: bpf: btf: implement btf_name_valid_identifier() [ Upstream commit cdbb096adddb3f42584cecb5ec2e07c26815b71f ] Function btf_name_valid_identifier() has been implemented in bpf-next commit 2667a2626f4d ("bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO"). Backport this function so a later patch can use it. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/btf.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index cfa27b7d1168..f0f9109f59ba 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -426,6 +427,30 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) offset < btf->hdr.str_len; } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary.
+ */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ + /* offset must be valid */ + const char *src = &btf->strings[offset]; + const char *src_limit; + + if (!isalpha(*src) && *src != '_') + return false; + + /* set a limit on identifier length */ + src_limit = src + KSYM_NAME_LEN; + src++; + while (*src && src < src_limit) { + if (!isalnum(*src) && *src != '_') + return false; + src++; + } + + return !*src; +} + static const char *btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) From 9d5d4a23d59a9f17de49fcdc9e4dbea9306df42b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 27 Nov 2018 13:23:28 -0800 Subject: [PATCH 0839/1640] UPSTREAM: bpf: btf: check name validity for various types [ Upstream commit eb04bbb608e683f8fd3ef7f716e2fa32dd90861f ] This patch added name checking for the following types: . BTF_KIND_PTR, BTF_KIND_ARRAY, BTF_KIND_VOLATILE, BTF_KIND_CONST, BTF_KIND_RESTRICT: the name must be null . BTF_KIND_STRUCT, BTF_KIND_UNION: the struct/member name is either null or a valid identifier . BTF_KIND_ENUM: the enum type name is either null or a valid identifier; the enumerator name must be a valid identifier. . BTF_KIND_FWD: the name must be a valid identifier . BTF_KIND_TYPEDEF: the name must be a valid identifier For those places where a valid name is required, the name must be a valid C identifier. This can be relaxed later if we find use cases for a different (non-C) frontend. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/btf.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f0f9109f59ba..3e2413345e71 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1168,6 +1168,22 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* typedef type must have a valid name, and other ref types, + * volatile, const, restrict, should have a null name.
+ */ + if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + } else { + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + } + btf_verifier_log_type(env, t, NULL); return 0; @@ -1325,6 +1341,13 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* fwd type must have a valid name */ + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); return 0; @@ -1381,6 +1404,12 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* array type should not have a name */ + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; @@ -1557,6 +1586,13 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* struct type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); last_offset = 0; @@ -1568,6 +1604,12 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* struct member either no name or a valid one */ + if (member->name_off && + !btf_name_valid_identifier(btf, member->name_off)) { + btf_verifier_log_member(env, t, member, "Invalid name"); + return -EINVAL; + } /* A member cannot be in type void */ if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { btf_verifier_log_member(env, t, member, @@ -1755,6 +1797,13 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* enum type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { @@ -1764,6 +1813,14 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* enum member must have a valid name */ + if (!enums[i].name_off || + !btf_name_valid_identifier(btf, enums[i].name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log(env, "\t%s val=%d\n", btf_name_by_offset(btf, enums[i].name_off), enums[i].val); From ab2aea5bf517396969ddebc180d029c2e7f5e4ff Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 14 Oct 2019 10:12:23 -0700 Subject: [PATCH 0840/1640] UPSTREAM: bpf/stackmap: Fix deadlock with rq_lock in bpf_get_stack() [ Upstream commit eac9153f2b584c702cea02c1f1a57d85aa9aea42 ] bpf stackmap with build-id lookup (BPF_F_STACK_BUILD_ID) can trigger A-A deadlock on rq_lock(): rcu: INFO: rcu_sched detected stalls on CPUs/tasks: [...] 
Call Trace: try_to_wake_up+0x1ad/0x590 wake_up_q+0x54/0x80 rwsem_wake+0x8a/0xb0 bpf_get_stack+0x13c/0x150 bpf_prog_fbdaf42eded9fe46_on_event+0x5e3/0x1000 bpf_overflow_handler+0x60/0x100 __perf_event_overflow+0x4f/0xf0 perf_swevent_overflow+0x99/0xc0 ___perf_sw_event+0xe7/0x120 __schedule+0x47d/0x620 schedule+0x29/0x90 futex_wait_queue_me+0xb9/0x110 futex_wait+0x139/0x230 do_futex+0x2ac/0xa50 __x64_sys_futex+0x13c/0x180 do_syscall_64+0x42/0x100 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This can be reproduced by: 1. Start a multi-thread program that does parallel mmap() and malloc(); 2. taskset the program to 2 CPUs; 3. Attach a bpf program to trace_sched_switch and gather stackmap with build-id, e.g. with trace.py from bcc tools: trace.py -U -p -s t:sched:sched_switch A sample reproducer is attached at the end. This could also trigger a deadlock with other locks that are nested with rq_lock. Fix this by checking whether irqs are disabled. Since rq_lock and all other nested locks are irq safe, it is safe to do up_read() when irqs are not disabled. If the irqs are disabled, postpone the up_read() to irq_work. Fixes: 615755a77b24 ("bpf: extend stackmap to save binary_build_id+offset instead of address") Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191014171223.357174-1-songliubraving@fb.com Reproducer: ============================ 8< ============================ char *filename; void *worker(void *p) { void *ptr; int fd; char *pptr; fd = open(filename, O_RDONLY); if (fd < 0) return NULL; while (1) { struct timespec ts = {0, 1000 + rand() % 2000}; ptr = mmap(NULL, 4096 * 64, PROT_READ, MAP_PRIVATE, fd, 0); usleep(1); if (ptr == MAP_FAILED) { printf("failed to mmap\n"); break; } munmap(ptr, 4096 * 64); usleep(1); pptr = malloc(1); usleep(1); pptr[0] = 1; usleep(1); free(pptr); usleep(1); nanosleep(&ts, NULL); } close(fd); return NULL; } int main(int argc, char *argv[]) { void *ptr; int i; pthread_t threads[THREAD_COUNT]; if (argc < 2) return 0; filename = argv[1]; for (i = 0; i < THREAD_COUNT; i++) { if (pthread_create(threads + i, NULL, worker, NULL)) { fprintf(stderr, "Error creating thread\n"); return 0; } } for (i = 0; i < THREAD_COUNT; i++) pthread_join(threads[i], NULL); return 0; } ============================ 8< ============================ Signed-off-by: Sasha Levin --- kernel/bpf/stackmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index bca9321d7fd7..a41858db1441 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -298,7 +298,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, bool irq_work_busy = false; struct stack_map_irq_work *work = NULL; - if (in_nmi()) { + if (irqs_disabled()) { work = this_cpu_ptr(&up_read_work); if (work->irq_work.flags & IRQ_WORK_BUSY) /* cannot queue more up_read, fallback */ @@ -306,8 +306,9 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } /* - * We cannot do up_read() in nmi context. To do build_id lookup - * in nmi context, we need to run up_read() in irq_work. We use + * We cannot do up_read() when the irq is disabled, because of + * risk to deadlock with rq_lock. To do build_id lookup when the + * irqs are disabled, we need to run up_read() in irq_work. We use * a percpu variable to do the irq_work. If the irq_work is * already used by another lookup, we fall back to report ips.
* From a6cd9afde0b16812555be33557dbeca2c04dd92f Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 13 Dec 2019 18:08:17 +0000 Subject: [PATCH 0841/1640] UPSTREAM: bpf: Clear skb->tstamp in bpf_redirect when necessary [ Upstream commit 5133498f4ad1123a5ffd4c08df6431dab882cc32 ] Redirecting a packet from ingress to egress by using bpf_redirect breaks if the egress interface has an fq qdisc installed. This is the same problem as fixed in 'commit 8203e2d844d3 ("net: clear skb->tstamp in forwarding paths") Clear skb->tstamp when redirecting into the egress path. Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.") Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20191213180817.2510-1-lmb@cloudflare.com Signed-off-by: Sasha Levin --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/filter.c b/net/core/filter.c index 4b6daf2cb4e2..1c6333546cd5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2007,6 +2007,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) } skb->dev = dev; + skb->tstamp = 0; __this_cpu_inc(xmit_recursion); ret = dev_queue_xmit(skb); From 4c80cc3e810919a0377b2fb41b2206ffd04b44d9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jan 2020 21:47:33 +0100 Subject: [PATCH 0842/1640] UPSTREAM: bpf: Fix incorrect verifier simulation of ARSH under ALU32 commit 0af2ffc93a4b50948f9dad2786b7f1bd253bf0b9 upstream. Anatoly has been fuzzing with kBdysch harness and reported a hang in one of the outcomes: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (85) call bpf_get_socket_cookie#46 1: R0_w=invP(id=0) R10=fp0 1: (57) r0 &= 808464432 2: R0_w=invP(id=0,umax_value=808464432,var_off=(0x0; 0x30303030)) R10=fp0 2: (14) w0 -= 810299440 3: R0_w=invP(id=0,umax_value=4294967295,var_off=(0xcf800000; 0x3077fff0)) R10=fp0 3: (c4) w0 s>>= 1 4: R0_w=invP(id=0,umin_value=1740636160,umax_value=2147221496,var_off=(0x67c00000; 0x183bfff8)) R10=fp0 4: (76) if w0 s>= 0x30303030 goto pc+216 221: R0_w=invP(id=0,umin_value=1740636160,umax_value=2147221496,var_off=(0x67c00000; 0x183bfff8)) R10=fp0 221: (95) exit processed 6 insns (limit 1000000) [...] Taking a closer look, the program was xlated as follows: # ./bpftool p d x i 12 0: (85) call bpf_get_socket_cookie#7800896 1: (bf) r6 = r0 2: (57) r6 &= 808464432 3: (14) w6 -= 810299440 4: (c4) w6 s>>= 1 5: (76) if w6 s>= 0x30303030 goto pc+216 6: (05) goto pc-1 7: (05) goto pc-1 8: (05) goto pc-1 [...] 220: (05) goto pc-1 221: (05) goto pc-1 222: (95) exit Meaning, the visible effect is very similar to f54c7898ed1c ("bpf: Fix precision tracking for unbounded scalars"), that is, the fall-through branch in the instruction 5 is considered to be never taken given the conclusion from the min/max bounds tracking in w6, and therefore the dead-code sanitation rewrites it as goto pc-1. However, real-life input disagrees with verification analysis since a soft-lockup was observed. The bug sits in the analysis of the ARSH. The definition is that we shift the target register value right by K bits through shifting in copies of its sign bit. In adjust_scalar_min_max_vals(), we do first coerce the register into 32 bit mode, same happens after simulating the operation. 
However, for the case of simulating the actual ARSH, we don't take the mode into account and act as if it's always 64 bit, but location of sign bit is different: dst_reg->smin_value >>= umin_val; dst_reg->smax_value >>= umin_val; dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val); Consider an unknown R0 where bpf_get_socket_cookie() (or others) would for example return 0xffff. With the above ARSH simulation, we'd see the following results: [...] 1: R1=ctx(id=0,off=0,imm=0) R2_w=invP65535 R10=fp0 1: (85) call bpf_get_socket_cookie#46 2: R0_w=invP(id=0) R10=fp0 2: (57) r0 &= 808464432 -> R0_runtime = 0x3030 3: R0_w=invP(id=0,umax_value=808464432,var_off=(0x0; 0x30303030)) R10=fp0 3: (14) w0 -= 810299440 -> R0_runtime = 0xcfb40000 4: R0_w=invP(id=0,umax_value=4294967295,var_off=(0xcf800000; 0x3077fff0)) R10=fp0 (0xffffffff) 4: (c4) w0 s>>= 1 -> R0_runtime = 0xe7da0000 5: R0_w=invP(id=0,umin_value=1740636160,umax_value=2147221496,var_off=(0x67c00000; 0x183bfff8)) R10=fp0 (0x67c00000) (0x7ffbfff8) [...] In insn 3, we have a runtime value of 0xcfb40000, which is '1100 1111 1011 0100 0000 0000 0000 0000', the result after the shift has 0xe7da0000 that is '1110 0111 1101 1010 0000 0000 0000 0000', where the sign bit is correctly retained in 32 bit mode. In insn4, the umax was 0xffffffff, and changed into 0x7ffbfff8 after the shift, that is, '0111 1111 1111 1011 1111 1111 1111 1000' and means here that the simulation didn't retain the sign bit. With above logic, the updates happen on the 64 bit min/max bounds and given we coerced the register, the sign bits of the bounds are cleared as well, meaning, we need to force the simulation into s32 space for 32 bit alu mode. Verification after the fix below. We're first analyzing the fall-through branch on 32 bit signed >= test eventually leading to rejection of the program in this specific case: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r2 = 808464432 1: R1=ctx(id=0,off=0,imm=0) R2_w=invP808464432 R10=fp0 1: (85) call bpf_get_socket_cookie#46 2: R0_w=invP(id=0) R10=fp0 2: (bf) r6 = r0 3: R0_w=invP(id=0) R6_w=invP(id=0) R10=fp0 3: (57) r6 &= 808464432 4: R0_w=invP(id=0) R6_w=invP(id=0,umax_value=808464432,var_off=(0x0; 0x30303030)) R10=fp0 4: (14) w6 -= 810299440 5: R0_w=invP(id=0) R6_w=invP(id=0,umax_value=4294967295,var_off=(0xcf800000; 0x3077fff0)) R10=fp0 5: (c4) w6 s>>= 1 6: R0_w=invP(id=0) R6_w=invP(id=0,umin_value=3888119808,umax_value=4294705144,var_off=(0xe7c00000; 0x183bfff8)) R10=fp0 (0x67c00000) (0xfffbfff8) 6: (76) if w6 s>= 0x30303030 goto pc+216 7: R0_w=invP(id=0) R6_w=invP(id=0,umin_value=3888119808,umax_value=4294705144,var_off=(0xe7c00000; 0x183bfff8)) R10=fp0 7: (30) r0 = *(u8 *)skb[808464432] BPF_LD_[ABS|IND] uses reserved fields processed 8 insns (limit 1000000) [...] 
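A hedged plain-C sketch of the sign-bit difference, reusing the runtime value from the trace above:

	/* w0 s>>= 1 with w0 = 0xcfb40000, negative as a 32-bit value */
	u64 v = 0xcfb40000;
	u64 sim64 = (u64)((s64)v >> 1);		/* 0x67da0000: bit 63 is 0,
						 * nothing to sign-extend */
	u32 sim32 = (u32)((s32)(u32)v >> 1);	/* 0xe7da0000: bit 31 is the
						 * sign bit and is retained,
						 * as on the real CPU */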
Fixes: 9cbe1f5a32dc ("bpf/verifier: improve register value range tracking with ARSH") Reported-by: Anatoly Trosinenko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200115204733.16648-1-daniel@iogearbox.net Signed-off-by: Greg Kroah-Hartman --- include/linux/tnum.h | 2 +- kernel/bpf/tnum.c | 9 +++++++-- kernel/bpf/verifier.c | 13 ++++++++++--- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/tnum.h b/include/linux/tnum.h index c7dc2b5902c0..06b9c20cc77e 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -26,7 +26,7 @@ struct tnum tnum_lshift(struct tnum a, u8 shift); /* Shift (rsh) a tnum right (by a fixed shift) */ struct tnum tnum_rshift(struct tnum a, u8 shift); /* Shift (arsh) a tnum right (by a fixed min_shift) */ -struct tnum tnum_arshift(struct tnum a, u8 min_shift); +struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness); /* Add two tnums, return @a + @b */ struct tnum tnum_add(struct tnum a, struct tnum b); /* Subtract two tnums, return @a - @b */ diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 938d41211be7..84984c0fc3d3 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -43,14 +43,19 @@ struct tnum tnum_rshift(struct tnum a, u8 shift) return TNUM(a.value >> shift, a.mask >> shift); } -struct tnum tnum_arshift(struct tnum a, u8 min_shift) +struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness) { /* if a.value is negative, arithmetic shifting by minimum shift * will have larger negative offset compared to more shifting. * If a.value is nonnegative, arithmetic shifting by minimum shift * will have larger positive offset compare to more shifting. */ - return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); + if (insn_bitness == 32) + return TNUM((u32)(((s32)a.value) >> min_shift), + (u32)(((s32)a.mask) >> min_shift)); + else + return TNUM((s64)a.value >> min_shift, + (s64)a.mask >> min_shift); } struct tnum tnum_add(struct tnum a, struct tnum b) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0bd3fa22128d..3a0770a5a5db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3370,9 +3370,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Upon reaching here, src_known is true and * umax_val is equal to umin_val. */ - dst_reg->smin_value >>= umin_val; - dst_reg->smax_value >>= umin_val; - dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val); + if (insn_bitness == 32) { + dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val); + dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val); + } else { + dst_reg->smin_value >>= umin_val; + dst_reg->smax_value >>= umin_val; + } + + dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, + insn_bitness); /* blow away the dst_reg umin_value/umax_value and rely on * dst_reg var_off to refine the result. 
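The same effect on the tracked var_off, as a hedged example built with the TNUM() helper local to kernel/bpf/tnum.c and the tnum from the corrected log above:

	struct tnum a = TNUM(0xcf800000, 0x3077fff0);	/* before w6 s>>= 1 */
	struct tnum r = tnum_arshift(a, 1, 32);
	/* r = (0xe7c00000; 0x183bfff8), matching the fixed verifier log:
	 * the 32-bit variant sign-extends bit 31 of value and mask, while
	 * the 64-bit variant would shift in a zero and lose the negative
	 * range. */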
From 59e6d63dabcbb0c7bc5c9c8cfb4e52ddd74ed4af Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:43 -0700 Subject: [PATCH 0843/1640] UPSTREAM: bpf: Add missed newline in verifier verbose log [ Upstream commit 1fbd20f8b77b366ea4aeb92ade72daa7f36a7e3b ] check_stack_access() that prints verbose log is used in adjust_ptr_min_max_vals() that prints its own verbose log and now they stick together, e.g.: variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1R2 stack pointer arithmetic goes out of range, prohibited for !root Add missing newline so that log is more readable: variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1 R2 stack pointer arithmetic goes out of range, prohibited for !root Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3a0770a5a5db..b9c1940a6d2d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1251,7 +1251,7 @@ static int check_stack_access(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } From 9ebb55304e885474c3d625ad7fa6ccc3f9f9fcd0 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sat, 15 Jun 2019 22:53:48 +0000 Subject: [PATCH 0844/1640] UPSTREAM: bpf: fix the check that forwarding is enabled in bpf_ipv6_fib_lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 56f0f84e69c7a7f229dfa524b13b0ceb6ce9b09e ] The bpf_ipv6_fib_lookup function should return BPF_FIB_LKUP_RET_FWD_DISABLED when forwarding is disabled for the input device. However instead of checking if forwarding is enabled on the input device, it checked the global net->ipv6.devconf_all->forwarding flag. Change it to behave as expected. Fixes: 87f5fc7e48dd ("bpf: Provide helper to do forwarding lookups in kernel FIB table") Signed-off-by: Anton Protopopov Acked-by: Toke Høiland-Jørgensen Reviewed-by: David Ahern Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 1c6333546cd5..c71924bd6a20 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4367,7 +4367,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, return -ENODEV; idev = __in6_dev_get_safely(dev); - if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) + if (unlikely(!idev || !idev->cnf.forwarding)) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { From bec2f945f671890e81a0dfa23b630e9199489e1e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 4 Nov 2019 12:15:36 +0300 Subject: [PATCH 0845/1640] UPSTREAM: bpf, offload: Unlock on error in bpf_offload_dev_create() [ Upstream commit d0fbb51dfaa612f960519b798387be436e8f83c5 ] We need to drop the bpf_devs_lock on error before returning. 
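A hedged sketch of the rule being enforced; one common way to make this class of bug structurally impossible is a single unlock point (names follow the patch below):

	int err = 0;

	down_write(&bpf_devs_lock);
	if (!offdevs_inited) {
		err = rhashtable_init(&offdevs, &offdevs_params);
		if (!err)
			offdevs_inited = true;
	}
	up_write(&bpf_devs_lock);	/* reached on every path */
	if (err)
		return ERR_PTR(err);
	/* ... normal allocation continues here ... */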
Fixes: 9fd7c5559165 ("bpf: offload: aggregate offloads per-device") Signed-off-by: Dan Carpenter Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20191104091536.GB31509@mwanda Signed-off-by: Sasha Levin --- kernel/bpf/offload.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 177a52436394..86477f3894e5 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -645,8 +645,10 @@ struct bpf_offload_dev *bpf_offload_dev_create(void) down_write(&bpf_devs_lock); if (!offdevs_inited) { err = rhashtable_init(&offdevs, &offdevs_params); - if (err) + if (err) { + up_write(&bpf_devs_lock); return ERR_PTR(err); + } offdevs_inited = true; } up_write(&bpf_devs_lock); From a0bd82be74cb10ccba1cc515149bd534e23e628b Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 10 Jan 2020 09:04:37 +0800 Subject: [PATCH 0846/1640] UPSTREAM: bpf: Return -EBADRQC for invalid map type in __bpf_tx_xdp_map [ Upstream commit 0a29275b6300f39f78a87f2038bbfe5bdbaeca47 ] A negative value should be returned if map->map_type is invalid. Although that is impossible now, if we run into such a situation in the future, the xdp buffer could be leaked. Daniel Borkmann suggested: -EBADRQC should be returned to stay consistent with generic XDP for the tracepoint output and not to be confused with -EOPNOTSUPP from other locations like dev_map_enqueue() when ndo_xdp_xmit is missing and such. Suggested-by: Daniel Borkmann Signed-off-by: Li RongQing Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1578618277-18085-1-git-send-email-lirongqing@baidu.com Signed-off-by: Sasha Levin --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index c71924bd6a20..c1f7f3ff7c22 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3207,7 +3207,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, return err; } default: - break; + return -EBADRQC; } return 0; } From 4935295327e6200cc6fd880b65e7ed243e096313 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sat, 25 Jan 2020 12:10:02 +0300 Subject: [PATCH 0847/1640] UPSTREAM: bpf: map_seq_next should always increase position index [ Upstream commit 90435a7891a2259b0f74c5a1bc5600d0d64cba8f ] If the seq_file .next function does not change the position index, a read after some lseek can generate unexpected output.
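A toy model of the seq_file contract this fix restores (a simplified iterator; the kernel's .next additionally walks map keys): *pos must advance on every call, including the call that hits end-of-file, otherwise a read after lseek() can replay stale output:

    #include <stddef.h>

    #define NITEMS 4
    static int items[NITEMS] = { 10, 20, 30, 40 };

    /* modelled on map_seq_next() after the fix below */
    static void *toy_seq_next(long *pos)
    {
            (*pos)++;            /* advance unconditionally, up front */
            if (*pos >= NITEMS)
                    return NULL; /* early exit no longer skips the increment */
            return &items[*pos];
    }

    int main(void)
    {
            long pos = -1;

            while (toy_seq_next(&pos))
                    ;
            return (int)pos; /* ends at NITEMS, never stuck on the last item */
    }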
See also: https://bugzilla.kernel.org/show_bug.cgi?id=206283 v1 -> v2: removed missed increment at the end of the function Signed-off-by: Vasily Averin Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/eca84fdd-c374-a154-d874-6c7b55fc3bc4@virtuozzo.com Signed-off-by: Sasha Levin --- kernel/bpf/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index dc9d7ac8228d..c04815bb15cc 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -198,6 +198,7 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) void *key = map_iter(m)->key; void *prev_key; + (*pos)++; if (map_iter(m)->done) return NULL; @@ -210,8 +211,6 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) map_iter(m)->done = true; return NULL; } - - ++(*pos); return key; } From 9396de85674ab202a26bf259f7eb35df80b77358 Mon Sep 17 00:00:00 2001 From: Johannes Krude Date: Wed, 12 Feb 2020 20:32:27 +0100 Subject: [PATCH 0848/1640] UPSTREAM: bpf, offload: Replace bitwise AND by logical AND in bpf_prog_offload_info_fill commit e20d3a055a457a10a4c748ce5b7c2ed3173a1324 upstream. This if guards whether user-space wants a copy of the offload-jited bytecode and whether this bytecode exists. Erroneously doing a bitwise AND instead of a logical AND on the user- and kernel-space buffer sizes can lead to no data being copied to user space, especially when the user-space size is a power of two and bigger than the kernel-space buffer. Fixes: fcfb126defda ("bpf: add new jited info fields in bpf_dev_offload and bpf_prog_info") Signed-off-by: Johannes Krude Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20200212193227.GA3769@phlox.h.transitiv.net Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 86477f3894e5..66e13aace241 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -289,7 +289,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, ulen = info->jited_prog_len; info->jited_prog_len = aux->offload->jited_len; - if (info->jited_prog_len & ulen) { + if (info->jited_prog_len && ulen) { uinsns = u64_to_user_ptr(info->jited_prog_insns); ulen = min_t(u32, info->jited_prog_len, ulen); if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { From 3baf59f8cb261c9f134a017e285d327052b5900f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 20 Mar 2020 17:22:58 +0100 Subject: [PATCH 0849/1640] UPSTREAM: bpf: Explicitly memset some bpf info structures declared on the stack Trying to initialize a structure with "= {};" will not always clean out all padding locations in a structure. So be explicit and call memset to initialize everything for a number of bpf information structures that are then copied from userspace, sometimes from smaller memory locations than the size of the structure.
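The memset() change here rests on a subtle C rule worth a sketch: "= {}" zeroes the members, but padding bytes may legally remain undefined, so copying the whole struct out could leak stack bytes. The following is illustrative only, with invented names:

    #include <string.h>

    struct info {
            char type;  /* typically followed by 7 padding bytes on 64-bit */
            long value;
    };

    void fill_info(struct info *out)
    {
            struct info info;

            memset(&info, 0, sizeof(info)); /* clears members and padding alike */
            info.type = 1;
            info.value = 42;
            *out = info; /* stand-in for copy_to_user() of the full struct */
    }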
Reported-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200320162258.GA794295@kroah.com (cherry picked from commit 269efb7fc478563a7e7b22590d8076823f4ac82a) Signed-off-by: Greg Kroah-Hartman Change-Id: I52a2cab20aa310085ec104bd811ac4f2b83657b6 --- kernel/bpf/btf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3e2413345e71..ab1452d3d3d4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2387,7 +2387,7 @@ int btf_get_info_by_fd(const struct btf *btf, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo; - struct bpf_btf_info info = {}; + struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; u32 uinfo_len; @@ -2396,6 +2396,7 @@ int btf_get_info_by_fd(const struct btf *btf, uinfo_len = attr->info.info_len; info_copy = min_t(u32, uinfo_len, sizeof(info)); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_copy)) return -EFAULT; From 7ef9d9f415e6759da8838b296ced559908653b07 Mon Sep 17 00:00:00 2001 From: Yoshiki Komachi Date: Tue, 10 Mar 2020 16:32:29 +0900 Subject: [PATCH 0850/1640] UPSTREAM: bpf/btf: Fix BTF verification of enum members in struct/union commit da6c7faeb103c493e505e87643272f70be586635 upstream. btf_enum_check_member() always treated the size of "enum" type members in a struct/union as the size of "int", even when the enum was packed to a smaller size. This patch fixes BTF enum verification to use the correct member size in BPF programs. Fixes: 179cde8cef7e ("bpf: btf: Check members of struct/union") Signed-off-by: Yoshiki Komachi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1583825550-18606-2-git-send-email-komachi.yoshiki@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ab1452d3d3d4..471cc5c117a5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1763,7 +1763,7 @@ static int btf_enum_check_member(struct btf_verifier_env *env, struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); - if (struct_size - bytes_offset < sizeof(int)) { + if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; From f3f975f7dca6f4259f9b83d18672e6c1187eb978 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:29 +0100 Subject: [PATCH 0851/1640] BACKPORT: bpf, verifier: detect misconfigured mem, size argument pair I've seen two patch proposals now for helper additions that used ARG_PTR_TO_MEM or similar in reg_X but no corresponding ARG_CONST_SIZE in reg_X+1. The verifier won't complain in such a case, but it will omit verifying the memory passed to the helper, which ends badly. Detect such buggy helper function signatures and bail out during verification rather than finding them through review.
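The check added below boils down to a simple pairing rule over the five argument slots; here is a compact userspace sketch of the same logic (the enum values are local stand-ins for the kernel's bpf_arg_type):

    #include <stdbool.h>

    enum arg_type { ARG_DONTCARE, ARG_PTR_TO_MEM, ARG_CONST_SIZE };

    static bool is_mem_ptr(enum arg_type t)  { return t == ARG_PTR_TO_MEM; }
    static bool is_mem_size(enum arg_type t) { return t == ARG_CONST_SIZE; }

    /* a mem pointer in slot N is valid only if slot N+1 carries its size,
     * and a size in slot N+1 is valid only if slot N carries the pointer
     */
    static bool pair_invalid(enum arg_type curr, enum arg_type next)
    {
            return is_mem_ptr(curr) != is_mem_size(next);
    }

    static bool arg_pair_ok(const enum arg_type arg[5])
    {
            return !is_mem_size(arg[0]) && /* a size can never come first */
                   !is_mem_ptr(arg[4]) &&  /* a mem ptr can never come last */
                   !pair_invalid(arg[0], arg[1]) &&
                   !pair_invalid(arg[1], arg[2]) &&
                   !pair_invalid(arg[2], arg[3]) &&
                   !pair_invalid(arg[3], arg[4]);
    }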
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 79 ++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 24 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b9c1940a6d2d..0400cc2a694c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1951,6 +1951,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +static bool arg_type_is_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_MEM || + type == ARG_PTR_TO_MEM_OR_NULL || + type == ARG_PTR_TO_UNINIT_MEM; +} + +static bool arg_type_is_mem_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_SIZE || + type == ARG_CONST_SIZE_OR_ZERO; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -2003,9 +2016,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_ctx_reg(env, reg, regno); if (err < 0) return err; - } else if (arg_type == ARG_PTR_TO_MEM || - arg_type == ARG_PTR_TO_MEM_OR_NULL || - arg_type == ARG_PTR_TO_UNINIT_MEM) { + } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a SCALAR_VALUE type. Final test @@ -2066,25 +2077,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* bpf_xxx(..., buf, len) call will access 'len' bytes - * from stack pointer 'buf'. Check it - * note: regno == len, regno - 1 == buf - */ - if (regno == 0) { - /* kernel subsystem misconfigured verifier */ - verbose(env, - "ARG_CONST_SIZE cannot be first argument\n"); - return -EACCES; - } - /* The register is SCALAR_VALUE; the access check * happens using its boundaries. */ - if (!tnum_is_const(reg->var_off)) /* For unprivileged variable accesses, disable raw * mode so that the program is required to @@ -2260,7 +2258,7 @@ error: return -EINVAL; } -static int check_raw_mode(const struct bpf_func_proto *fn) +static bool check_raw_mode_ok(const struct bpf_func_proto *fn) { int count = 0; @@ -2275,7 +2273,44 @@ static int check_raw_mode(const struct bpf_func_proto *fn) if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; - return count > 1 ? -EINVAL : 0; + /* We only support one arg being in raw mode at the moment, + * which is sufficient for the helper functions we have + * right now. + */ + return count <= 1; +} + +static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, + enum bpf_arg_type arg_next) +{ + return (arg_type_is_mem_ptr(arg_curr) && + !arg_type_is_mem_size(arg_next)) || + (!arg_type_is_mem_ptr(arg_curr) && + arg_type_is_mem_size(arg_next)); +} + +static bool check_arg_pair_ok(const struct bpf_func_proto *fn) +{ + /* bpf_xxx(..., buf, len) call will access 'len' + * bytes from memory 'buf'. Both arg types need + * to be paired, so make sure there's no buggy + * helper function specification. 
+ */ + if (arg_type_is_mem_size(fn->arg1_type) || + arg_type_is_mem_ptr(fn->arg5_type) || + check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || + check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || + check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || + check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) + return false; + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn) +{ + return check_raw_mode_ok(fn) && + check_arg_pair_ok(fn) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2457,7 +2492,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id, env->prog); - if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); @@ -2481,10 +2515,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - /* We only support one arg being in raw mode at the moment, which - * is sufficient for the helper functions we have right now. - */ - err = check_raw_mode(fn); + err = check_func_proto(fn); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); From 169575aeb165bb8fa62ff0b60bf16f3748e008ef Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:09 -0700 Subject: [PATCH 0852/1640] BACKPORT: bpf/verifier: refine retval R0 state for bpf_get_stack helper The special property of return values for the helpers bpf_get_stack and bpf_probe_read_str is captured in the verifier. Both helpers return a negative error code or a length, which is equal to or smaller than the buffer size argument. This additional information in the verifier can avoid conditions such as "retval > bufsize" in the bpf program. For example, for the code below, usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK); if (usize < 0 || usize > max_len) return 0; The verifier may have the following errors: 52: (85) call bpf_get_stack#65 R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0) R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256 R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R9_w=inv800 R10=fp0,call_-1 53: (bf) r8 = r0 54: (bf) r1 = r8 55: (67) r1 <<= 32 56: (bf) r2 = r1 57: (77) r2 >>= 32 58: (25) if r2 > 0x31f goto pc+33 R0=inv(id=0) R1=inv(id=0,smax_value=9223372032559808512, umax_value=18446744069414584320, var_off=(0x0; 0xffffffff00000000)) R2=inv(id=0,umax_value=799,var_off=(0x0; 0x3ff)) R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R8=inv(id=0) R9=inv800 R10=fp0,call_-1 59: (1f) r9 -= r8 60: (c7) r1 s>>= 32 61: (bf) r2 = r7 62: (0f) r2 += r1 math between map_value pointer and register with unbounded min value is not allowed The failure is due to an llvm compiler optimization where register "r2", which is a copy of "r1", is tested for the condition while later on "r1" is used for the map_ptr operation. The verifier is not able to track such an inst sequence effectively.
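At the C level the failing pattern looks roughly like the following untested sketch in libbpf style (the section name, buffer setup, and max_len value are invented, following the example in the message above):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    #define MAX_LEN 800

    char raw_data[MAX_LEN]; /* stand-in for the map value buffer */

    SEC("kprobe/some_func")
    int get_stack_sketch(struct pt_regs *ctx)
    {
            int usize = bpf_get_stack(ctx, raw_data, MAX_LEN,
                                      BPF_F_USER_STACK);

            /* The "usize > MAX_LEN" half of the test is the part llvm
             * routes through a copied register; with the refined retval
             * range added below, checking "usize < 0" alone suffices.
             */
            if (usize < 0)
                    return 0;

            /* ... raw_data[0..usize) is now safely bounded ... */
            return 0;
    }

    char _license[] SEC("license") = "GPL";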
Without the "usize > max_len" condition, there is no llvm optimization and the below generated code passed the verifier: 52: (85) call bpf_get_stack#65 R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0) R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256 R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R9_w=inv800 R10=fp0,call_-1 53: (b7) r1 = 0 54: (bf) r8 = r0 55: (67) r8 <<= 32 56: (c7) r8 s>>= 32 57: (6d) if r1 s> r8 goto pc+24 R0=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R1=inv0 R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R8=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R9=inv800 R10=fp0,call_-1 58: (bf) r2 = r7 59: (0f) r2 += r8 60: (1f) r9 -= r8 61: (bf) r1 = r6 Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0400cc2a694c..3841590624b7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -188,6 +188,8 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; + s64 msize_smax_value; + u64 msize_umax_value; }; static DEFINE_MUTEX(bpf_verifier_lock); @@ -2080,6 +2082,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); + /* remember the mem_size which may be used later + * to refine return values. + */ + meta->msize_smax_value = reg->smax_value; + meta->msize_umax_value = reg->umax_value; + /* The register is SCALAR_VALUE; the access check * happens using its boundaries. */ @@ -2449,6 +2457,23 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return 0; } +static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, + int func_id, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *ret_reg = &regs[BPF_REG_0]; + + if (ret_type != RET_INTEGER || + (func_id != BPF_FUNC_get_stack && + func_id != BPF_FUNC_probe_read_str)) + return; + + ret_reg->smax_value = meta->msize_smax_value; + ret_reg->umax_value = meta->msize_umax_value; + __reg_deduce_bounds(ret_reg); + __reg_bound_offset(ret_reg); +} + static int record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) @@ -2601,6 +2626,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + do_refine_retval_range(regs, fn->ret_type, func_id, &meta); + err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; From 876824d3a4389a7fb8390fa18a9651f00cacdb42 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 21 Apr 2020 14:58:22 +0200 Subject: [PATCH 0853/1640] UPSTREAM: bpf: fix buggy r0 retval refinement for tracing helpers [ no upstream commit ] See the gory details in 100605035e15 ("bpf: Verifier, do_refine_retval_range may clamp umin to 0 incorrectly") for why 849fa50662fb ("bpf/verifier: refine retval R0 state for bpf_get_stack helper") is buggy. The whole series however is not suitable for stable since it adds a significant amount [0] of verifier complexity in order to add 32bit subreg tracking. Something simpler is needed.
Unfortunately, reverting 849fa50662fb ("bpf/verifier: refine retval R0 state for bpf_get_stack helper") or just cherry-picking 100605035e15 ("bpf: Verifier, do_refine_retval_range may clamp umin to 0 incorrectly") is not an option since it will break existing tracing programs badly (at least those that are using bpf_get_stack() and bpf_probe_read_str() helpers). Not fixing it in stable is also not an option since on 4.19 kernels an error will cause a soft-lockup due to hitting a dead-code sanitized branch since we don't hard-wire such branches in old kernels yet. But even then for 5.x 849fa50662fb ("bpf/verifier: refine retval R0 state for bpf_get_stack helper") would cause wrong bounds on the verifier simulation when an error is hit. In one of the earlier iterations of the mentioned patch series for upstream there was the concern that just using smax_value in do_refine_retval_range() would nuke bounds by subsequent <<32 >>32 shifts before the comparison against 0 [1] which eventually led to the 32bit subreg tracking in the first place. While I initially went for implementing the idea [1] to pattern match the two shift operations, it turned out to be more complex than actually needed, meaning, we could simply treat do_refine_retval_range() similarly to how we branch off verification for conditionals or under speculation, that is, pushing a new reg state to the stack for later verification. This means, instead of verifying the current path with the ret_reg in [S32MIN, msize_max_value] interval where later bounds would get nuked, we split this into two: i) for the success case where ret_reg can be in [0, msize_max_value], and ii) for the error case with ret_reg known to be in interval [S32MIN, -1]. The latter will preserve the bounds during these shift patterns and can match the reg < 0 test. test_progs also succeeds with this approach. [0] https://lore.kernel.org/bpf/158507130343.15666.8018068546764556975.stgit@john-Precision-5820-Tower/ [1] https://lore.kernel.org/bpf/158015334199.28573.4940395881683556537.stgit@john-XPS-13-9370/T/#m2e0ad1d5949131014748b6daa48a3495e7f0456d Fixes: 849fa50662fb ("bpf/verifier: refine retval R0 state for bpf_get_stack helper") Reported-by: Lorenzo Fontana Reported-by: Leonardo Di Donato Reported-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Tested-by: John Fastabend Tested-by: Lorenzo Fontana Tested-by: Leonardo Di Donato Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 45 ++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3841590624b7..5dd28d9c32cd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -188,8 +188,7 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; - s64 msize_smax_value; - u64 msize_umax_value; + u64 msize_max_value; }; static DEFINE_MUTEX(bpf_verifier_lock); @@ -2085,8 +2084,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, /* remember the mem_size which may be used later * to refine return values. */ - meta->msize_smax_value = reg->smax_value; - meta->msize_umax_value = reg->umax_value; + meta->msize_max_value = reg->umax_value; /* The register is SCALAR_VALUE; the access check * happens using its boundaries.
@@ -2457,21 +2455,44 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return 0; } -static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, - int func_id, - struct bpf_call_arg_meta *meta) +static int do_refine_retval_range(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, int ret_type, + int func_id, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *ret_reg = &regs[BPF_REG_0]; + struct bpf_reg_state tmp_reg = *ret_reg; + bool ret; if (ret_type != RET_INTEGER || (func_id != BPF_FUNC_get_stack && func_id != BPF_FUNC_probe_read_str)) - return; + return 0; + + /* Error case where ret is in interval [S32MIN, -1]. */ + ret_reg->smin_value = S32_MIN; + ret_reg->smax_value = -1; - ret_reg->smax_value = meta->msize_smax_value; - ret_reg->umax_value = meta->msize_umax_value; __reg_deduce_bounds(ret_reg); __reg_bound_offset(ret_reg); + __update_reg_bounds(ret_reg); + + ret = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (!ret) + return -EFAULT; + + *ret_reg = tmp_reg; + + /* Success case where ret is in range [0, msize_max_value]. */ + ret_reg->smin_value = 0; + ret_reg->smax_value = meta->msize_max_value; + ret_reg->umin_value = ret_reg->smin_value; + ret_reg->umax_value = ret_reg->smax_value; + + __reg_deduce_bounds(ret_reg); + __reg_bound_offset(ret_reg); + __update_reg_bounds(ret_reg); + + return 0; } static int @@ -2626,7 +2647,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } - do_refine_retval_range(regs, fn->ret_type, func_id, &meta); + err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); + if (err) + return err; err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) From 43250fc197726c403dfc1287adbbfcda2bbb1f07 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 29 Apr 2020 15:16:49 +0200 Subject: [PATCH 0854/1640] UPSTREAM: ANDROID: bpf: fix export symbol type In commit ff5bf35998cc ("ANDROID: bpf: validate bpf_func when BPF_JIT is enabled with CFI") a new symbol was exported, but it should have been set as a _GPL symbol. Fix this up properly. Bug: 145210207 Cc: Sami Tolvanen Signed-off-by: Greg Kroah-Hartman Change-Id: I7239bb8e0ef329cd7eac6afcd06c341b17ea680b --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 158e20212b4f..13c15398db05 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -634,7 +634,7 @@ bool __weak arch_bpf_jit_check_func(const struct bpf_prog *prog) { return true; } -EXPORT_SYMBOL(arch_bpf_jit_check_func); +EXPORT_SYMBOL_GPL(arch_bpf_jit_check_func); #endif struct bpf_binary_header * From 954d90fabe793f8cf8f8c69e3824f77e4aec3e1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 16 Apr 2020 10:31:20 +0200 Subject: [PATCH 0855/1640] UPSTREAM: cpumap: Avoid warning when CONFIG_DEBUG_PER_CPU_MAPS is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bc23d0e3f717ced21fbfacab3ab887d55e5ba367 upstream. When the kernel is built with CONFIG_DEBUG_PER_CPU_MAPS, the cpumap code can trigger a spurious warning if CONFIG_CPUMASK_OFFSTACK is also set. This happens because in this configuration, NR_CPUS can be larger than nr_cpumask_bits, so the initial check in cpu_map_alloc() is not sufficient to guard against hitting the warning in cpumask_check().
Fix this by explicitly checking the supplied key against the nr_cpumask_bits variable before calling cpu_possible(). Fixes: 6710e1126934 ("bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP") Reported-by: Xiumei Mu Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Tested-by: Xiumei Mu Acked-by: Jesper Dangaard Brouer Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200416083120.453718-1-toke@redhat.com Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/cpumap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3c18260403dd..61fbcae82f0a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -455,7 +455,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return -EOVERFLOW; /* Make sure CPU is a valid possible cpu */ - if (!cpu_possible(key_cpu)) + if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu)) return -ENODEV; if (qsize == 0) { From 7025d168aa64f8a5c9aa3630dd3617b6694c6b2e Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 10 Jun 2020 13:41:39 -0500 Subject: [PATCH 0856/1640] UPSTREAM: net/filter: Permit reading NET in load_bytes_relative when MAC not set [ Upstream commit 0f5d82f187e1beda3fe7295dfc500af266a5bd80 ] Added a check in the switch case on start_header that checks for the existence of the header; if MAC is not set and the caller requests MAC, return -EFAULT. If the caller requests NET, then MAC's existence is completely ignored. There is no function to check the NET header's existence, and as far as cgroup_skb/egress is concerned it should always be set. Removed the check for ptr >= the start of the header, since offset is a bounded unsigned value and the check would always be true. len <= end - mac is redundant to ptr + len <= end. Fixes: 3eee1f75f2b9 ("bpf: fix bpf_skb_load_bytes_relative pkt length check") Signed-off-by: YiFei Zhu Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/76bb820ddb6a95f59a772ecbd8c8a336f646b362.1591812755.git.zhuyifei@google.com Signed-off-by: Sasha Levin --- net/core/filter.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index c1f7f3ff7c22..5eabad0b7795 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1730,25 +1730,27 @@ BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { u8 *end = skb_tail_pointer(skb); - u8 *net = skb_network_header(skb); - u8 *mac = skb_mac_header(skb); - u8 *ptr; + u8 *start, *ptr; - if (unlikely(offset > 0xffff || len > (end - mac))) + if (unlikely(offset > 0xffff)) goto err_clear; switch (start_header) { case BPF_HDR_START_MAC: - ptr = mac + offset; + if (unlikely(!skb_mac_header_was_set(skb))) + goto err_clear; + start = skb_mac_header(skb); break; case BPF_HDR_START_NET: - ptr = net + offset; + start = skb_network_header(skb); break; default: goto err_clear; } - if (likely(ptr >= mac && ptr + len <= end)) { + ptr = start + offset; + + if (likely(ptr + len <= end)) { memcpy(to, ptr, len); return 0; } From d8a7110b8ba4b2725e132b81b94211fe7ba4d3e7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 1 Apr 2019 16:42:13 +0200 Subject: [PATCH 0857/1640] BACKPORT: net: place xmit recursion in softnet data commit 97cdcf37b57e3f204be3000b9eab9686f38b4356 upstream. This fills a hole in softnet data, so no change in structure size.
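For context, the recursion counter being relocated here is a simple depth guard; a userspace model (thread-local storage standing in for per-CPU data, names invented) of how the dev_xmit_recursion helpers in the diff below behave:

    #include <stdio.h>

    #define XMIT_RECURSION_LIMIT 8

    static _Thread_local int xmit_recursion; /* per-CPU in the kernel */

    static int queue_xmit_model(int depth)
    {
            int ret;

            if (xmit_recursion > XMIT_RECURSION_LIMIT)
                    return -1;      /* the kernel drops the skb, -ENETDOWN */

            xmit_recursion++;
            /* a redirecting BPF program may re-enter the xmit path here */
            ret = depth > 0 ? queue_xmit_model(depth - 1) : 0;
            xmit_recursion--;

            return ret;
    }

    int main(void)
    {
            printf("depth 3:  %d\n", queue_xmit_model(3));  /* 0: allowed  */
            printf("depth 50: %d\n", queue_xmit_model(50)); /* -1: refused */
            return 0;
    }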
Also prepares for xmit_more placement in the same spot; skb->xmit_more will be removed in followup patch. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/netdevice.h | 41 ++++++++++++++++++++++++++++++--------- net/core/dev.c | 10 +++------- net/core/filter.c | 6 +++--- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 27bbad0d5a12..dea63063f66e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2532,14 +2532,6 @@ void netdev_freemem(struct net_device *dev); void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); -DECLARE_PER_CPU(int, xmit_recursion); -#define XMIT_RECURSION_LIMIT 8 - -static inline int dev_recursion_level(void) -{ - return this_cpu_read(xmit_recursion); -} - struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); @@ -2887,7 +2879,11 @@ struct softnet_data { struct Qdisc *output_queue; struct Qdisc **output_queue_tailp; struct sk_buff *completion_queue; - + /* written and read only by owning cpu: */ + struct { + u16 recursion; + u8 more; + } xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, * and only read by other cpus. Worth using a cache line. @@ -2923,6 +2919,28 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd, DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +static inline int dev_recursion_level(void) +{ + return __this_cpu_read(softnet_data.xmit.recursion); +} + +#define XMIT_RECURSION_LIMIT 8 +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} + void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); @@ -4165,6 +4183,11 @@ static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, return ops->ndo_start_xmit(skb, dev); } +static inline bool netdev_xmit_more(void) +{ + return __this_cpu_read(softnet_data.xmit.more); +} + static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { diff --git a/net/core/dev.c b/net/core/dev.c index 1f949a72bb94..dc3348f185ed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3297,9 +3297,6 @@ static void skb_update_prio(struct sk_buff *skb) #define skb_update_prio(skb) #endif -DEFINE_PER_CPU(int, xmit_recursion); -EXPORT_SYMBOL(xmit_recursion); - /** * dev_loopback_xmit - loop back @skb * @net: network namespace this loopback is happening in @@ -3539,8 +3536,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) int cpu = smp_processor_id(); /* ok because BHs are off */ if (txq->xmit_lock_owner != cpu) { - if (unlikely(__this_cpu_read(xmit_recursion) > - XMIT_RECURSION_LIMIT)) + if (dev_xmit_recursion()) goto recursion_alert; skb = validate_xmit_skb(skb, dev); @@ -3550,9 +3546,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { - __this_cpu_inc(xmit_recursion); + dev_xmit_recursion_inc(); skb = dev_hard_start_xmit(skb, 
dev, txq, &rc); + dev_xmit_recursion_dec(); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; diff --git a/net/core/filter.c b/net/core/filter.c index 5eabad0b7795..d3180bf82dea 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2002,7 +2002,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) { int ret; - if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) { + if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); kfree_skb(skb); return -ENETDOWN; @@ -2011,9 +2011,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) skb->dev = dev; skb->tstamp = 0; - __this_cpu_inc(xmit_recursion); + dev_xmit_recursion_inc(); ret = dev_queue_xmit(skb); - __this_cpu_dec(xmit_recursion); + dev_xmit_recursion_dec(); return ret; } From ce261538842259110333d21ef904a8f071011ec9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Apr 2018 21:36:02 +0900 Subject: [PATCH 0858/1640] UPSTREAM: kprobes: Show address of kprobes if kallsyms does Show the probed address in the debugfs kprobe list file the same way kallsyms does. This information is used for checking that kprobes are placed at the expected addresses, so it should be comparable with the addresses in kallsyms. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: David Howells Cc: David S . Miller Cc: Heiko Carstens Cc: Jon Medhurst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Richter Cc: Tobin C . Harding Cc: Will Deacon Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: brueckner@linux.vnet.ibm.com Cc: linux-arch@vger.kernel.org Cc: rostedt@goodmis.org Cc: schwidefsky@de.ibm.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/152491896256.9916.1583733714492565296.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e86bbcb849ac..93a5e37a4664 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2375,6 +2375,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, const char *sym, int offset, char *modname, struct kprobe *pp) { char *kprobe_type; + void *addr = p->addr; if (p->pre_handler == pre_handler_kretprobe) kprobe_type = "r"; @@ -2383,13 +2384,16 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; + if (!kallsyms_show_value()) + addr = NULL; + if (sym) - seq_printf(pi, "%p %s %s+0x%x %s ", - p->addr, kprobe_type, sym, offset, + seq_printf(pi, "%px %s %s+0x%x %s ", + addr, kprobe_type, sym, offset, (modname ? modname : " ")); - else - seq_printf(pi, "%p %s %p ", - p->addr, kprobe_type, p->addr); + else /* try to use %pS */ + seq_printf(pi, "%px %s %pS ", + addr, kprobe_type, p->addr); if (!pp) pp = p; From 9bf4bf02d871f67d38e558b95e7c24fa5724fd52 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Apr 2018 21:35:32 +0900 Subject: [PATCH 0859/1640] UPSTREAM: kprobes: Show blacklist addresses as same as kallsyms does Show kprobes blacklist addresses under the same conditions as kallsyms addresses. Since there are several name conflicts for local symbols, the kprobe blacklist needs to show each address so that the user can identify what is on the blacklist by comparing with kallsyms.
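Both kprobes patches in this stretch follow one policy, which a short userspace model makes explicit (kallsyms_show_value() is modelled as a stub; in the kernel it consults kptr_restrict and credentials): print the raw address only when kallsyms itself would, otherwise print NULL:

    #include <stdio.h>
    #include <stdbool.h>

    static bool kallsyms_show_value_stub(void)
    {
            return false;   /* e.g. kptr_restrict forbids showing addresses */
    }

    static void report_probe_model(void *real_addr, const char *sym)
    {
            void *addr = kallsyms_show_value_stub() ? real_addr : NULL;

            /* the kernel uses %px (raw, unhashed) once the value is vetted */
            printf("%p k %s+0x0\n", addr, sym);
    }

    int main(void)
    {
            int dummy;

            report_probe_model(&dummy, "do_sys_open");
            return 0;
    }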
Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: David Howells Cc: David S . Miller Cc: Heiko Carstens Cc: Jon Medhurst Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Richter Cc: Tobin C . Harding Cc: Will Deacon Cc: acme@kernel.org Cc: akpm@linux-foundation.org Cc: brueckner@linux.vnet.ibm.com Cc: linux-arch@vger.kernel.org Cc: rostedt@goodmis.org Cc: schwidefsky@de.ibm.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/152491893217.9916.14760965896164273464.stgit@devbox Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 93a5e37a4664..73a9d3ce063d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2481,8 +2481,16 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) struct kprobe_blacklist_entry *ent = list_entry(v, struct kprobe_blacklist_entry, list); - seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr, - (void *)ent->end_addr, (void *)ent->start_addr); + /* + * If /proc/kallsyms is not showing kernel address, we won't + * show them here either. + */ + if (!kallsyms_show_value()) + seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, + (void *)ent->start_addr); + else + seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr, + (void *)ent->end_addr, (void *)ent->start_addr); return 0; } From 5f88ee41c15cd9318701ae7904e18f0df0625e03 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Nov 2017 17:51:00 +0100 Subject: [PATCH 0860/1640] UPSTREAM: /proc/module: fix building without kallsyms As reported by kernelci and other build bots, we now get a link failure without CONFIG_KALLSYMS: module.c:(.text+0xf2c): undefined reference to `kallsyms_show_value' This adds a dummy helper with the same name that can be used for compilation. It's not entirely clear to me what this should return for !CONFIG_KALLSYMS, I picked an unconditional 'false', which leads to the module address being unavailable to user space. Link: https://kernelci.org/build/mainline/branch/master/kernel/v4.14-5-g516fb7f2e73d/ Fixes: 516fb7f2e73d ("/proc/module: use the same logic as /proc/kallsyms for address exposure") Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- include/linux/kallsyms.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 0a777c5216b1..708f337d780b 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -14,8 +14,6 @@ #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) -/* How and when do we show kallsyms values? */ -extern int kallsyms_show_value(void); #ifndef CONFIG_64BIT # define KALLSYM_FMT "%08lx" #else @@ -54,6 +52,9 @@ extern void __print_symbol(const char *fmt, unsigned long address); int lookup_symbol_name(unsigned long addr, char *symname); int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); +/* How and when do we show kallsyms values? 
*/ +extern int kallsyms_show_value(void); + #else /* !CONFIG_KALLSYMS */ static inline unsigned long kallsyms_lookup_name(const char *name) @@ -112,6 +113,11 @@ static inline int lookup_symbol_attrs(unsigned long addr, unsigned long *size, u return -ERANGE; } +static inline int kallsyms_show_value(void) +{ + return false; +} + /* Stupid that this does nothing, but I didn't create this mess. */ #define __print_symbol(fmt, addr) #endif /*CONFIG_KALLSYMS*/ From e16567de3a2153555c56774d40b480ad1a268cf8 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 6 Dec 2017 13:36:49 +0900 Subject: [PATCH 0861/1640] BACKPORT: symbol lookup: introduce dereference_symbol_descriptor() dereference_symbol_descriptor() invokes appropriate ARCH specific function descriptor dereference callbacks: - dereference_kernel_function_descriptor() if the pointer is a kernel symbol; - dereference_module_function_descriptor() if the pointer is a module symbol. This is the last step needed to make '%pS/%ps' smart enough to handle function descriptor dereference on affected ARCHs and to retire '%pF/%pf'. To refresh it: Some architectures (ia64, ppc64, parisc64) use an indirect pointer for C function pointers - the function pointer points to a function descriptor and we need to dereference it to get the actual function pointer. Function descriptors live in .opd elf section and all affected ARCHs (ia64, ppc64, parisc64) handle it properly for kernel and modules. So we, technically, can decide if the dereference is needed by simply looking at the pointer: if it belongs to .opd section then we need to dereference it. The kernel and modules have their own .opd sections, obviously, that's why we need to split dereference_function_descriptor() and use separate kernel and module dereference arch callbacks. Link: http://lkml.kernel.org/r/20171206043649.GB15885@jagdpanzerIV Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: James Bottomley Cc: Andrew Morton Cc: Jessica Yu Cc: Steven Rostedt Cc: linux-ia64@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org Change-Id: Icb874edcfcae68f58a61cfe6561881db837b269e Signed-off-by: Sergey Senozhatsky Tested-by: Tony Luck #ia64 Tested-by: Santosh Sivaraj #powerpc Tested-by: Helge Deller #parisc64 Signed-off-by: Petr Mladek --- Documentation/printk-formats.txt | 34 +++++++------------- include/linux/kallsyms.h | 54 ++++++++++++++++++++++++++++++++ kernel/kallsyms.c | 35 --------------------- lib/vsprintf.c | 5 ++- 4 files changed, 68 insertions(+), 60 deletions(-) diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 25c8ad37d86c..ed4aa7f023df 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -61,41 +61,31 @@ Symbols/Function Pointers :: + %pS versatile_init+0x0/0x110 + %ps versatile_init %pF versatile_init+0x0/0x110 %pf versatile_init - %pS versatile_init+0x0/0x110 %pSR versatile_init+0x9/0x110 (with __builtin_extract_return_addr() translation) - %ps versatile_init %pB prev_fn_of_versatile_init+0x88/0x88 -The ``F`` and ``f`` specifiers are for printing function pointers, -for example, f->func, &gettimeofday. They have the same result as -``S`` and ``s`` specifiers. But they do an extra conversion on -ia64, ppc64 and parisc64 architectures where the function pointers -are actually function descriptors. +The ``S`` and ``s`` specifiers are used for printing a pointer in symbolic +format. 
They result in the symbol name with (``S``) or without (``s``) +offsets. If KALLSYMS are disabled then the symbol address is printed instead. -The ``S`` and ``s`` specifiers can be used for printing symbols -from direct addresses, for example, __builtin_return_address(0), -(void *)regs->ip. They result in the symbol name with (``S``) or -without (``s``) offsets. If KALLSYMS are disabled then the symbol -address is printed instead. +Note, that the ``F`` and ``f`` specifiers are identical to ``S`` (``s``) +and thus deprecated. We have ``F`` and ``f`` because on ia64, ppc64 and +parisc64 function pointers are indirect and, in fact, are function +descriptors, which require additional dereferencing before we can lookup +the symbol. As of now, ``S`` and ``s`` perform dereferencing on those +platforms (when needed), so ``F`` and ``f`` exist for compatibility +reasons only. The ``B`` specifier results in the symbol name with offsets and should be used when printing stack backtraces. The specifier takes into consideration the effect of compiler optimisations which may occur when tail-call``s are used and marked with the noreturn GCC attribute. -Examples:: - - printk("Going to call: %pF\n", gettimeofday); - printk("Going to call: %pF\n", p->func); - printk("%s: called from %pS\n", __func__, (void *)_RET_IP_); - printk("%s: called from %pS\n", __func__, - (void *)__builtin_return_address(0)); - printk("Faulted at %pS\n", (void *)regs->ip); - printk(" %s%pB\n", (reliable ? "" : "? "), (void *)*stack); - Kernel Pointers =============== diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 708f337d780b..e4f2e5a65f14 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -9,6 +9,10 @@ #include #include #include +#include +#include + +#include #define KSYM_NAME_LEN 128 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ @@ -22,6 +26,56 @@ struct module; +static inline int is_kernel_inittext(unsigned long addr) +{ + if (addr >= (unsigned long)_sinittext + && addr <= (unsigned long)_einittext) + return 1; + return 0; +} + +static inline int is_kernel_text(unsigned long addr) +{ + if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || + arch_is_kernel_text(addr)) + return 1; + return in_gate_area_no_mm(addr); +} + +static inline int is_kernel(unsigned long addr) +{ + if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) + return 1; + return in_gate_area_no_mm(addr); +} + +static inline int is_ksym_addr(unsigned long addr) +{ + if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) + return is_kernel(addr); + + return is_kernel_text(addr) || is_kernel_inittext(addr); +} + +static inline void *dereference_symbol_descriptor(void *ptr) +{ +#ifdef HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR + struct module *mod; + + ptr = dereference_kernel_function_descriptor(ptr); + if (is_ksym_addr((unsigned long)ptr)) + return ptr; + + preempt_disable(); + mod = __module_address((unsigned long)ptr); + preempt_enable(); + + if (mod) + ptr = dereference_module_function_descriptor(mod, ptr); +#endif + return ptr; +} + #ifdef CONFIG_KALLSYMS /* Lookup the address for a symbol. Returns 0 if not found. 
*/ unsigned long kallsyms_lookup_name(const char *name); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index e8b0f322323a..641fe6770fc5 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -12,7 +12,6 @@ * compression (see scripts/kallsyms.c for a more complete description) */ #include -#include #include #include #include @@ -20,14 +19,11 @@ #include #include #include /* for cond_resched */ -#include #include #include #include #include -#include - #include #ifdef CONFIG_KALLSYMS_ALL @@ -89,37 +85,6 @@ void sec_debug_summary_set_kallsyms_info( } #endif -static inline int is_kernel_inittext(unsigned long addr) -{ - if (addr >= (unsigned long)_sinittext - && addr <= (unsigned long)_einittext) - return 1; - return 0; -} - -static inline int is_kernel_text(unsigned long addr) -{ - if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || - arch_is_kernel_text(addr)) - return 1; - return in_gate_area_no_mm(addr); -} - -static inline int is_kernel(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) - return 1; - return in_gate_area_no_mm(addr); -} - -static int is_ksym_addr(unsigned long addr) -{ - if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) - return is_kernel(addr); - - return is_kernel_text(addr) || is_kernel_inittext(addr); -} - /* * Expand a compressed symbol data into the resulting uncompressed string, * if uncompressed string is too long (>= maxlen), it will be truncated, diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 60a95211efa6..7c6a8d6b777a 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -42,7 +42,6 @@ #include "../mm/internal.h" /* For the trace_print_flags arrays */ #include /* for PAGE_SIZE */ -#include /* for dereference_function_descriptor() */ #include /* cpu_to_le16 */ #include @@ -1891,10 +1890,10 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, switch (*fmt) { case 'F': case 'f': - ptr = dereference_function_descriptor(ptr); - /* Fallthrough */ case 'S': case 's': + ptr = dereference_symbol_descriptor(ptr); + /* Fallthrough */ case 'B': return symbol_string(buf, end, ptr, spec, fmt); case 'R': From 06f7b7f87a49caab5760b2b26c0f2b71db9df072 Mon Sep 17 00:00:00 2001 From: Tim Zimmermann Date: Mon, 11 Aug 2025 18:20:18 +0000 Subject: [PATCH 0862/1640] security: Fully revert ANDROID_PARANOID_NETWORK changes * This was missed in 140cda105bb80a40bde962208adb34f410ec501a, as can be seen in the original commit 2b02b4ab89b9cba5aec936046d8538962c5142fc --- security/commoncap.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index 776efd3c0b8c..90a252d68878 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -54,7 +54,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname) } /** - * __cap_capable - Determine whether a task has a particular effective capability + * cap_capable - Determine whether a task has a particular effective capability * @cred: The credentials to use * @ns: The user namespace in which we need the capability * @cap: The capability to check for @@ -68,7 +68,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname) * cap_has_capability() returns 0 when a task has a capability, but the * kernel's capable() and has_capability() returns 1 for this case. 
*/ -int __cap_capable(const struct cred *cred, struct user_namespace *targ_ns, +int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, int cap, int audit) { struct user_namespace *ns = targ_ns; @@ -113,11 +113,6 @@ int __cap_capable(const struct cred *cred, struct user_namespace *targ_ns, /* We never get here */ } -int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, - int cap, int audit) -{ - return __cap_capable(cred, targ_ns, cap, audit); -} /** * cap_settime - Determine whether the current process may set the system clock * @ts: The time to set From ec275f990bd3cbbbecf71b391535220919e1ef86 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Mon, 8 Jan 2018 10:25:32 -0800 Subject: [PATCH 0863/1640] UPSTREAM: Smack: Privilege check on key operations Smack: Privilege check on key operations Operations on key objects are subjected to Smack policy even if the process is privileged. This is inconsistent with the general behavior of Smack and may cause issues with authentication by privileged daemons. This patch allows processes with CAP_MAC_OVERRIDE to access keys even if the Smack rules indicate otherwise. Reported-by: Jose Bollo Signed-off-by: Casey Schaufler --- security/smack/smack.h | 1 + security/smack/smack_access.c | 40 +++++++++++++++++++++++++---------- security/smack/smack_lsm.c | 4 ++++ 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/security/smack/smack.h b/security/smack/smack.h index 6a71fc7831ab..f7db791fb566 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -321,6 +321,7 @@ struct smack_known *smk_import_entry(const char *, int); void smk_insert_entry(struct smack_known *skp); struct smack_known *smk_find_entry(const char *); bool smack_privileged(int cap); +bool smack_privileged_cred(int cap, const struct cred *cred); void smk_destroy_label_list(struct list_head *list); /* diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index c8e82d6a12b5..8f4ec04b5ca6 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -622,26 +622,24 @@ struct smack_known *smack_from_secid(const u32 secid) LIST_HEAD(smack_onlycap_list); DEFINE_MUTEX(smack_onlycap_lock); -/* +/** + * smack_privileged_cred - are all privilege requirements met by cred + * @cap: The requested capability + * @cred: the credential to use + * * Is the task privileged and allowed to be privileged * by the onlycap rule. * * Returns true if the task is allowed to be privileged, false if it's not. */ -bool smack_privileged(int cap) +bool smack_privileged_cred(int cap, const struct cred *cred) { - struct smack_known *skp = smk_of_current(); + struct task_smack *tsp = cred->security; + struct smack_known *skp = tsp->smk_task; struct smack_known_list_elem *sklep; int rc; - /* - * All kernel tasks are privileged - */ - if (unlikely(current->flags & PF_KTHREAD)) - return true; - - rc = cap_capable(current_cred(), &init_user_ns, cap, - SECURITY_CAP_AUDIT); + rc = cap_capable(cred, &init_user_ns, cap, SECURITY_CAP_AUDIT); if (rc) return false; @@ -661,3 +659,23 @@ bool smack_privileged(int cap) return false; } + +/** + * smack_privileged - are all privilege requirements met + * @cap: The requested capability + * + * Is the task privileged and allowed to be privileged + * by the onlycap rule. + * + * Returns true if the task is allowed to be privileged, false if it's not. 
+ */ +bool smack_privileged(int cap) +{ + /* + * All kernel tasks are privileged + */ + if (unlikely(current->flags & PF_KTHREAD)) + return true; + + return smack_privileged_cred(cap, current_cred()); +} diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 86d66d17f641..33095c32fa19 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4395,6 +4395,10 @@ static int smack_key_permission(key_ref_t key_ref, */ if (tkp == NULL) return -EACCES; + + if (smack_privileged_cred(CAP_MAC_OVERRIDE, cred)) + return 0; + #ifdef CONFIG_AUDIT smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY); ad.a.u.key_struct.key = keyp->serial; From 4fb5d562e3b9a6d146e35d27bcef154466557e0e Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 11 Apr 2018 02:03:26 -0700 Subject: [PATCH 0864/1640] UPSTREAM: apparmor: fix mediation of prlimit For prlimit, apparmor requires that if the target's confinement does not match the setting task's confinement, the setting task must have CAP_SYS_RESOURCE. Unfortunately this was broken when rlimit enforcement was reworked to support labels. Fixes: 86b92cb782b3 ("apparmor: move resource checks to using labels") Signed-off-by: John Johansen --- security/apparmor/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/resource.c b/security/apparmor/resource.c index d8bc842594ed..5d4219709d44 100644 --- a/security/apparmor/resource.c +++ b/security/apparmor/resource.c @@ -124,7 +124,7 @@ int aa_task_setrlimit(struct aa_label *label, struct task_struct *task, */ if (label != peer && - !aa_capable(label, CAP_SYS_RESOURCE, SECURITY_CAP_NOAUDIT)) + aa_capable(label, CAP_SYS_RESOURCE, SECURITY_CAP_NOAUDIT) != 0) error = fn_for_each(label, profile, audit_resource(profile, resource, new_rlim->rlim_max, peer, From d681f2fbb012e769ee3e5ade32e85186b55355f1 Mon Sep 17 00:00:00 2001 From: Micah Morton Date: Mon, 7 Jan 2019 16:10:53 -0800 Subject: [PATCH 0865/1640] BACKPORT: LSM: generalize flag passing to security_capable This patch provides a general mechanism for passing flags to the security_capable LSM hook. It replaces the specific 'audit' flag that is used to tell security_capable whether it should log an audit message for the given capability check. The reason for generalizing this flag passing is so we can add an additional flag that signifies whether security_capable is being called by a setid syscall (which is needed by the proposed SafeSetID LSM). Signed-off-by: Micah Morton Reviewed-by: Kees Cook Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 8 +++++--- include/linux/security.h | 28 +++++++++++++------------- kernel/capability.c | 22 +++++++++++--------- kernel/ptrace.c | 4 ++-- kernel/seccomp.c | 4 ++-- security/apparmor/capability.c | 14 ++++++------- security/apparmor/include/capability.h | 2 +- security/apparmor/ipc.c | 3 ++- security/apparmor/lsm.c | 4 ++-- security/apparmor/resource.c | 2 +- security/commoncap.c | 17 ++++++++-------- security/security.c | 14 +++++-------- security/selinux/hooks.c | 18 ++++++++--------- security/smack/smack_access.c | 2 +- 14 files changed, 73 insertions(+), 69 deletions(-) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 1db7dde2adfe..ecd852b557a0 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1235,7 +1235,7 @@ * @cred contains the credentials to use. * @ns contains the user namespace we want the capability in * @cap contains the capability . - * @audit contains whether to write an audit message or not + * @opts contains options for the capable check * Return 0 if the capability is granted for @tsk. * @syslog: * Check permission before accessing the kernel message ring or changing
- * @audit contains whether to write an audit message or not + * @opts contains options for the capable check * Return 0 if the capability is granted for @tsk. * @syslog: * Check permission before accessing the kernel message ring or changing @@ -1411,8 +1411,10 @@ union security_list_options { const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted); - int (*capable)(const struct cred *cred, struct user_namespace *ns, - int cap, int audit); + int (*capable)(const struct cred *cred, + struct user_namespace *ns, + int cap, + unsigned int opts); int (*quotactl)(int cmds, int type, int id, struct super_block *sb); int (*quota_on)(struct dentry *dentry); int (*syslog)(int type); diff --git a/include/linux/security.h b/include/linux/security.h index a1a407bc9e92..fa06673570b8 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -57,9 +57,12 @@ struct xattr; struct xfrm_sec_ctx; struct mm_struct; +/* Default (no) options for the capable function */ +#define CAP_OPT_NONE 0x0 /* If capable should audit the security request */ -#define SECURITY_CAP_NOAUDIT 0 -#define SECURITY_CAP_AUDIT 1 +#define CAP_OPT_NOAUDIT BIT(1) +/* If capable is being called by a setid function */ +#define CAP_OPT_INSETID BIT(2) /* LSM Agnostic defines for sb_set_mnt_opts */ #define SECURITY_LSM_NATIVE_LABELS 1 @@ -103,7 +106,7 @@ enum lsm_event { /* These functions are in security/commoncap.c */ extern int cap_capable(const struct cred *cred, struct user_namespace *ns, - int cap, int audit); + int cap, unsigned int opts); extern int cap_settime(const struct timespec64 *ts, const struct timezone *tz); extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode); extern int cap_ptrace_traceme(struct task_struct *parent); @@ -242,10 +245,10 @@ int security_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted); -int security_capable(const struct cred *cred, struct user_namespace *ns, - int cap); -int security_capable_noaudit(const struct cred *cred, struct user_namespace *ns, - int cap); +int security_capable(const struct cred *cred, + struct user_namespace *ns, + int cap, + unsigned int opts); int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); int security_syslog(int type); @@ -507,14 +510,11 @@ static inline int security_capset(struct cred *new, } static inline int security_capable(const struct cred *cred, - struct user_namespace *ns, int cap) + struct user_namespace *ns, + int cap, + unsigned int opts) { - return cap_capable(cred, ns, cap, SECURITY_CAP_AUDIT); -} - -static inline int security_capable_noaudit(const struct cred *cred, - struct user_namespace *ns, int cap) { - return cap_capable(cred, ns, cap, SECURITY_CAP_NOAUDIT); + return cap_capable(cred, ns, cap, opts); } static inline int security_quotactl(int cmds, int type, int id, diff --git a/kernel/capability.c b/kernel/capability.c index 1e1c0236f55b..7718d7dcadc7 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -299,7 +299,7 @@ bool has_ns_capability(struct task_struct *t, int ret; rcu_read_lock(); - ret = security_capable(__task_cred(t), ns, cap); + ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); @@ -340,7 +340,7 @@ bool has_ns_capability_noaudit(struct task_struct *t, int ret; rcu_read_lock(); - ret = security_capable_noaudit(__task_cred(t), ns, cap); + ret = 
security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); @@ -363,7 +363,9 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } -static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) +static bool ns_capable_common(struct user_namespace *ns, + int cap, + unsigned int opts) { int capable; @@ -372,8 +374,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) BUG(); } - capable = audit ? security_capable(current_cred(), ns, cap) : - security_capable_noaudit(current_cred(), ns, cap); + capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; @@ -394,7 +395,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) */ bool ns_capable(struct user_namespace *ns, int cap) { - return ns_capable_common(ns, cap, true); + return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); @@ -412,7 +413,7 @@ EXPORT_SYMBOL(ns_capable); */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { - return ns_capable_common(ns, cap, false); + return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); @@ -448,10 +449,11 @@ EXPORT_SYMBOL(capable); bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { + if (WARN_ON_ONCE(!cap_valid(cap))) return false; - if (security_capable(file->f_cred, ns, cap) == 0) + if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; @@ -500,10 +502,12 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; + rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) - ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, + CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index de0f4e93ce32..2154776ac1a0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -281,9 +281,9 @@ static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns, int ret; if (mode & PTRACE_MODE_NOAUDIT) - ret = security_capable(cred, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); else - ret = security_capable(cred, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE); return ret == 0; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 158ae456e8b1..da34e800a398 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -386,8 +386,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) * behavior of privileged children. 
*/ if (!task_no_new_privs(current) && - security_capable_noaudit(current_cred(), current_user_ns(), - CAP_SYS_ADMIN) != 0) + security_capable(current_cred(), current_user_ns(), + CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0) return ERR_PTR(-EACCES); /* Allocate a new seccomp_filter */ diff --git a/security/apparmor/capability.c b/security/apparmor/capability.c index 67e347192a55..4ee296ec2f11 100644 --- a/security/apparmor/capability.c +++ b/security/apparmor/capability.c @@ -110,13 +110,13 @@ static int audit_caps(struct common_audit_data *sa, struct aa_profile *profile, * profile_capable - test if profile allows use of capability @cap * @profile: profile being enforced (NOT NULL, NOT unconfined) * @cap: capability to test if allowed - * @audit: whether an audit record should be generated + * @opts: CAP_OPT_NOAUDIT bit determines whether audit record is generated * @sa: audit data (MAY BE NULL indicating no auditing) * * Returns: 0 if allowed else -EPERM */ -static int profile_capable(struct aa_profile *profile, int cap, int audit, - struct common_audit_data *sa) +static int profile_capable(struct aa_profile *profile, int cap, + unsigned int opts, struct common_audit_data *sa) { int error; @@ -126,7 +126,7 @@ static int profile_capable(struct aa_profile *profile, int cap, int audit, else error = -EPERM; - if (audit == SECURITY_CAP_NOAUDIT) { + if (opts & CAP_OPT_NOAUDIT) { if (!COMPLAIN_MODE(profile)) return error; /* audit the cap request in complain mode but note that it @@ -142,13 +142,13 @@ static int profile_capable(struct aa_profile *profile, int cap, int audit, * aa_capable - test permission to use capability * @label: label being tested for capability (NOT NULL) * @cap: capability to be tested - * @audit: whether an audit record should be generated + * @opts: CAP_OPT_NOAUDIT bit determines whether audit record is generated * * Look up capability in profile capability set. * * Returns: 0 on success, or else an error code. 
*/ -int aa_capable(struct aa_label *label, int cap, int audit) +int aa_capable(struct aa_label *label, int cap, unsigned int opts) { struct aa_profile *profile; int error = 0; @@ -156,7 +156,7 @@ int aa_capable(struct aa_label *label, int cap, int audit) sa.u.cap = cap; error = fn_for_each_confined(label, profile, - profile_capable(profile, cap, audit, &sa)); + profile_capable(profile, cap, opts, &sa)); return error; } diff --git a/security/apparmor/include/capability.h b/security/apparmor/include/capability.h index e0304e2aeb7f..1b3663b6ab12 100644 --- a/security/apparmor/include/capability.h +++ b/security/apparmor/include/capability.h @@ -40,7 +40,7 @@ struct aa_caps { extern struct aa_sfs_entry aa_sfs_entry_caps[]; -int aa_capable(struct aa_label *label, int cap, int audit); +int aa_capable(struct aa_label *label, int cap, unsigned int opts); static inline void aa_free_cap_rules(struct aa_caps *caps) { diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index 586facd35f7c..b4ed35233b1a 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -107,7 +107,8 @@ static int profile_tracer_perm(struct aa_profile *tracer, aad(sa)->label = &tracer->label; aad(sa)->peer = tracee; aad(sa)->request = 0; - aad(sa)->error = aa_capable(&tracer->label, CAP_SYS_PTRACE, 1); + aad(sa)->error = aa_capable(&tracer->label, CAP_SYS_PTRACE, + CAP_OPT_NONE); return aa_audit(AUDIT_APPARMOR_AUTO, tracer, sa, audit_ptrace_cb); } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index b9dcf7ec95a0..ccdc3d7b34ec 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -167,14 +167,14 @@ static int apparmor_capget(struct task_struct *target, kernel_cap_t *effective, } static int apparmor_capable(const struct cred *cred, struct user_namespace *ns, - int cap, int audit) + int cap, unsigned int opts) { struct aa_label *label; int error = 0; label = aa_get_newest_cred_label(cred); if (!unconfined(label)) - error = aa_capable(label, cap, audit); + error = aa_capable(label, cap, opts); aa_put_label(label); return error; diff --git a/security/apparmor/resource.c b/security/apparmor/resource.c index 5d4219709d44..e0268424e11c 100644 --- a/security/apparmor/resource.c +++ b/security/apparmor/resource.c @@ -124,7 +124,7 @@ int aa_task_setrlimit(struct aa_label *label, struct task_struct *task, */ if (label != peer && - aa_capable(label, CAP_SYS_RESOURCE, SECURITY_CAP_NOAUDIT) != 0) + aa_capable(label, CAP_SYS_RESOURCE, CAP_OPT_NOAUDIT) != 0) error = fn_for_each(label, profile, audit_resource(profile, resource, new_rlim->rlim_max, peer, diff --git a/security/commoncap.c b/security/commoncap.c index 90a252d68878..b0626bbf3e0b 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -69,7 +69,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname) * kernel's capable() and has_capability() returns 1 for this case. 
*/ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, - int cap, int audit) + int cap, unsigned int opts) { struct user_namespace *ns = targ_ns; @@ -230,12 +230,11 @@ int cap_capget(struct task_struct *target, kernel_cap_t *effective, */ static inline int cap_inh_is_capped(void) { - /* they are so limited unless the current task has the CAP_SETPCAP * capability */ if (cap_capable(current_cred(), current_cred()->user_ns, - CAP_SETPCAP, SECURITY_CAP_AUDIT) == 0) + CAP_SETPCAP, CAP_OPT_NONE) == 0) return 0; return 1; } @@ -1174,8 +1173,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, || ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ || (cap_capable(current_cred(), - current_cred()->user_ns, CAP_SETPCAP, - SECURITY_CAP_AUDIT) != 0) /*[4]*/ + current_cred()->user_ns, + CAP_SETPCAP, + CAP_OPT_NONE) != 0) /*[4]*/ /* * [1] no changing of bits that are locked * [2] no unlocking of locks @@ -1270,9 +1270,10 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages) { int cap_sys_admin = 0; - if (cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN, - SECURITY_CAP_NOAUDIT) == 0) + if (cap_capable(current_cred(), &init_user_ns, + CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0) cap_sys_admin = 1; + return cap_sys_admin; } @@ -1291,7 +1292,7 @@ int cap_mmap_addr(unsigned long addr) if (addr < dac_mmap_min_addr) { ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO, - SECURITY_CAP_AUDIT); + CAP_OPT_NONE); /* set PF_SUPERPRIV if it turns out we allow the low mmap */ if (ret == 0) current->flags |= PF_SUPERPRIV; diff --git a/security/security.c b/security/security.c index 960e25cc4983..4166afebbdb0 100644 --- a/security/security.c +++ b/security/security.c @@ -285,16 +285,12 @@ int security_capset(struct cred *new, const struct cred *old, effective, inheritable, permitted); } -int security_capable(const struct cred *cred, struct user_namespace *ns, - int cap) +int security_capable(const struct cred *cred, + struct user_namespace *ns, + int cap, + unsigned int opts) { - return call_int_hook(capable, 0, cred, ns, cap, SECURITY_CAP_AUDIT); -} - -int security_capable_noaudit(const struct cred *cred, struct user_namespace *ns, - int cap) -{ - return call_int_hook(capable, 0, cred, ns, cap, SECURITY_CAP_NOAUDIT); + return call_int_hook(capable, 0, cred, ns, cap, opts); } int security_quotactl(int cmds, int type, int id, struct super_block *sb) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 27551e16b75a..25e49e232f49 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1907,7 +1907,7 @@ static inline u32 signal_to_av(int sig) /* Check whether a task is allowed to use a capability. 
*/ static int cred_has_capability(const struct cred *cred, - int cap, int audit, bool initns) + int cap, unsigned int opts, bool initns) { struct common_audit_data ad; struct av_decision avd; @@ -1935,7 +1935,7 @@ static int cred_has_capability(const struct cred *cred, rc = avc_has_perm_noaudit(&selinux_state, sid, sid, sclass, av, 0, &avd); - if (audit == SECURITY_CAP_AUDIT) { + if (!(opts & CAP_OPT_NOAUDIT)) { int rc2 = avc_audit(&selinux_state, sid, sid, sclass, av, &avd, rc, &ad, 0); if (rc2) @@ -2449,9 +2449,9 @@ static int selinux_capset(struct cred *new, const struct cred *old, */ static int selinux_capable(const struct cred *cred, struct user_namespace *ns, - int cap, int audit) + int cap, unsigned int opts) { - return cred_has_capability(cred, cap, audit, ns == &init_user_ns); + return cred_has_capability(cred, cap, opts, ns == &init_user_ns); } static int selinux_quotactl(int cmds, int type, int id, struct super_block *sb) @@ -2525,7 +2525,7 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages) int rc, cap_sys_admin = 0; rc = cred_has_capability(current_cred(), CAP_SYS_ADMIN, - SECURITY_CAP_NOAUDIT, true); + CAP_OPT_NOAUDIT, true); if (rc == 0) cap_sys_admin = 1; @@ -3411,11 +3411,11 @@ static int selinux_inode_setotherxattr(struct dentry *dentry, const char *name) static bool has_cap_mac_admin(bool audit) { const struct cred *cred = current_cred(); - int cap_audit = audit ? SECURITY_CAP_AUDIT : SECURITY_CAP_NOAUDIT; + unsigned int opts = audit ? CAP_OPT_NONE : CAP_OPT_NOAUDIT; - if (cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, cap_audit)) + if (cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, opts)) return false; - if (cred_has_capability(cred, CAP_MAC_ADMIN, cap_audit, true)) + if (cred_has_capability(cred, CAP_MAC_ADMIN, opts, true)) return false; return true; } @@ -3805,7 +3805,7 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd, case KDSKBENT: case KDSKBSENT: error = cred_has_capability(cred, CAP_SYS_TTY_CONFIG, - SECURITY_CAP_AUDIT, true); + CAP_OPT_NONE, true); break; /* default case assumes that the command will go diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index 8f4ec04b5ca6..07d23b4f76f3 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -639,7 +639,7 @@ bool smack_privileged_cred(int cap, const struct cred *cred) struct smack_known_list_elem *sklep; int rc; - rc = cap_capable(cred, &init_user_ns, cap, SECURITY_CAP_AUDIT); + rc = cap_capable(cred, &init_user_ns, cap, CAP_OPT_NONE); if (rc) return false; From fd4ac8667b115fd8d54f86f564f4dc39a5129f75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Fri, 30 Oct 2020 13:38:48 +0100 Subject: [PATCH 0866/1640] UPSTREAM: ptrace: Set PF_SUPERPRIV when checking capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 69f594a38967 ("ptrace: do not audit capability check when outputing /proc/pid/stat") replaced the use of ns_capable() with has_ns_capability{,_noaudit}() which doesn't set PF_SUPERPRIV. Commit 6b3ad6649a4c ("ptrace: reintroduce usage of subjective credentials in ptrace_has_cap()") replaced has_ns_capability{,_noaudit}() with security_capable(), which doesn't set PF_SUPERPRIV either. Since commit 98f368e9e263 ("kernel: Add noaudit variant of ns_capable()"), a new ns_capable_noaudit() helper is available. Let's use it! As a result, the signature of ptrace_has_cap() is restored to its original one.
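For context, the ns_capable_common() helper (as reworked by the flag-passing patch earlier in this series) is, in sketch form:

	static bool ns_capable_common(struct user_namespace *ns, int cap,
				      unsigned int opts)
	{
		/* sketch only; the cap_valid()/BUG() guard is omitted */
		if (security_capable(current_cred(), ns, cap, opts) == 0) {
			current->flags |= PF_SUPERPRIV; /* record privilege use */
			return true;
		}
		return false;
	}

Routing the ptrace check back through ns_capable{,_noaudit}() therefore restores the PF_SUPERPRIV accounting that a bare security_capable() call skips.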
Cc: Christian Brauner Cc: Eric Paris Cc: Jann Horn Cc: Kees Cook Cc: Oleg Nesterov Cc: Serge E. Hallyn Cc: Tyler Hicks Cc: stable@vger.kernel.org Fixes: 6b3ad6649a4c ("ptrace: reintroduce usage of subjective credentials in ptrace_has_cap()") Fixes: 69f594a38967 ("ptrace: do not audit capability check when outputing /proc/pid/stat") Signed-off-by: Mickaël Salaün Reviewed-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201030123849.770769-2-mic@digikod.net --- kernel/ptrace.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2154776ac1a0..7269b0b1105b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -275,17 +275,11 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } -static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns, - unsigned int mode) +static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode) { - int ret; - if (mode & PTRACE_MODE_NOAUDIT) - ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); - else - ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE); - - return ret == 0; + if (mode & PTRACE_MODE_NOAUDIT) + return ns_capable_noaudit(ns, CAP_SYS_PTRACE); + return ns_capable(ns, CAP_SYS_PTRACE); } /* Returns 0 on success, -errno on denial. */ @@ -337,7 +331,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) gid_eq(caller_gid, tcred->sgid) && gid_eq(caller_gid, tcred->gid)) goto ok; - if (ptrace_has_cap(cred, tcred->user_ns, mode)) + if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -356,7 +350,7 @@ ok: mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(cred, mm->user_ns, mode))) + !ptrace_has_cap(mm->user_ns, mode))) return -EPERM; return security_ptrace_access_check(task, mode); From 65e1feb5933c1a61ba7a0f1576b85b8e4301cda6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 11:49:23 -0700 Subject: [PATCH 0867/1640] BACKPORT: kallsyms: Refactor kallsyms_show_value() to take cred In order to perform future tests against the cred saved during open(), switch kallsyms_show_value() to operate on a cred, and have all current callers pass current_cred(). This makes it very obvious where callers are checking the wrong credential in their "read" contexts. These will be fixed in the coming patches. Additionally switch return value to bool, since it is always used as a direct permission check, not a 0-on-success, negative-on-error style function return. Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- include/linux/filter.h | 2 +- include/linux/kallsyms.h | 5 +++-- kernel/kallsyms.c | 17 +++++++++++------ kernel/kprobes.c | 4 ++-- kernel/module.c | 2 +- 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index b4597fb3d4f1..b43ebff4f7d6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -829,7 +829,7 @@ static inline bool bpf_dump_raw_ok(void) /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction.
*/ - return kallsyms_show_value() == 1; + return kallsyms_show_value(current_cred()); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index e4f2e5a65f14..4d280c25d1d8 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -24,6 +24,7 @@ # define KALLSYM_FMT "%016lx" #endif +struct cred; struct module; static inline int is_kernel_inittext(unsigned long addr) @@ -107,7 +108,7 @@ int lookup_symbol_name(unsigned long addr, char *symname); int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); /* How and when do we show kallsyms values? */ -extern int kallsyms_show_value(void); +extern bool kallsyms_show_value(const struct cred *cred); #else /* !CONFIG_KALLSYMS */ @@ -167,7 +168,7 @@ static inline int lookup_symbol_attrs(unsigned long addr, unsigned long *size, u return -ERANGE; } -static inline int kallsyms_show_value(void) +static inline bool kallsyms_show_value(const struct cred *cred) { return false; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 641fe6770fc5..78de6486f191 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -681,19 +681,20 @@ static inline int kallsyms_for_perf(void) * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to * block even that). */ -int kallsyms_show_value(void) +bool kallsyms_show_value(const struct cred *cred) { switch (kptr_restrict) { case 0: if (kallsyms_for_perf()) - return 1; + return true; /* fallthrough */ case 1: - if (has_capability_noaudit(current, CAP_SYSLOG)) - return 1; + if (security_capable(cred, &init_user_ns, CAP_SYSLOG, + CAP_OPT_NOAUDIT) == 0) + return true; /* fallthrough */ default: - return 0; + return false; } } @@ -710,7 +711,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) return -ENOMEM; reset_iter(iter, 0); - iter->show_value = kallsyms_show_value(); + /* + * Instead of checking this on every s_show() call, cache + * the result here at open time. + */ + iter->show_value = kallsyms_show_value(file->f_cred); return 0; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 73a9d3ce063d..ee058734fb43 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2384,7 +2384,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; - if (!kallsyms_show_value()) + if (!kallsyms_show_value(current_cred())) addr = NULL; if (sym) @@ -2485,7 +2485,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) * If /proc/kallsyms is not showing kernel address, we won't * show them here either. */ - if (!kallsyms_show_value()) + if (!kallsyms_show_value(current_cred())) seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, (void *)ent->start_addr); else diff --git a/kernel/module.c b/kernel/module.c index 261c53757961..28948386223d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4427,7 +4427,7 @@ static int modules_open(struct inode *inode, struct file *file) if (!err) { struct seq_file *m = file->private_data; - m->private = kallsyms_show_value() ? NULL : (void *)8ul; + m->private = kallsyms_show_value(current_cred()) ? 
NULL : (void *)8ul; } return 0; From 795c7982b167b983f9b1c99c294893923bb80bd5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:34 +0100 Subject: [PATCH 0868/1640] BACKPORT: bpf: restrict access to core bpf sysctls Given BPF reaches far beyond just networking these days, it was never intended to allow setting and in some cases reading those knobs out of a user namespace root running without CAP_SYS_ADMIN, thus tighten such access. Also the bpf_jit_enable = 2 debugging mode should only be allowed if kptr_restrict is not set since it otherwise can leak addresses to the kernel log. Dump a note to the kernel log that this is for debugging JITs only when enabled. [Linux4: Apply the missing parts that weren't present in the 4.14 backport] Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/sysctl_net_core.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 1b5749f2ef9c..bea2b9a84157 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -270,9 +270,14 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, tmp.data = &jit_enable; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { - *(int *)table->data = jit_enable; - if (jit_enable == 2) - pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); + if (jit_enable < 2 || + (jit_enable == 2 && bpf_dump_raw_ok())) { + *(int *)table->data = jit_enable; + if (jit_enable == 2) + pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); + } else { + ret = -EPERM; + } } return ret; } From baa351a1387bf6c7fdda0ee78743ce014672ffc2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 15:45:23 -0700 Subject: [PATCH 0869/1640] UPSTREAM: bpf: Check correct cred for CAP_SYSLOG in bpf_dump_raw_ok() commit 63960260457a02af2a6cb35d75e6bdb17299c882 upstream. When evaluating access control over kallsyms visibility, credentials at open() time need to be used, not the "current" creds (though in BPF's case, this has likely always been the same). Plumb access to associated file->f_cred down through bpf_dump_raw_ok() and its callers now that kallsyms_show_value() has been refactored to take struct cred. Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: bpf@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 7105e828c087 ("bpf: allow for correlation of maps and helpers in dump") Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- include/linux/filter.h | 4 ++-- kernel/bpf/syscall.c | 32 ++++++++++++++++++-------------- net/core/sysctl_net_core.c | 2 +- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index b43ebff4f7d6..ff764bd0968e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -824,12 +824,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_helper_changes_pkt_data(void *func); -static inline bool bpf_dump_raw_ok(void) +static inline bool bpf_dump_raw_ok(const struct cred *cred) { /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction.
*/ - return kallsyms_show_value(current_cred()); + return kallsyms_show_value(cred); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f5e4d6fbdd95..c5c4a3c15b86 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1904,7 +1904,8 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, return NULL; } -static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, + const struct cred *f_cred) { const struct bpf_map *map; struct bpf_insn *insns; @@ -1926,7 +1927,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; - if (!bpf_dump_raw_ok()) + if (!bpf_dump_raw_ok(f_cred)) insns[i].imm = 0; continue; } @@ -1943,7 +1944,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) continue; } - if (!bpf_dump_raw_ok() && + if (!bpf_dump_raw_ok(f_cred) && imm == (unsigned long)prog->aux) { insns[i].imm = 0; insns[i + 1].imm = 0; @@ -1954,7 +1955,8 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) return insns; } -static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, +static int bpf_prog_get_info_by_fd(struct file *file, + struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -2011,11 +2013,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_insn *insns_sanitized; bool fault; - if (prog->blinded && !bpf_dump_raw_ok()) { + if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { info.xlated_prog_insns = 0; goto done; } - insns_sanitized = bpf_insn_prepare_dump(prog); + insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); if (!insns_sanitized) return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); @@ -2049,7 +2051,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); @@ -2084,7 +2086,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt; if (info.nr_jited_ksyms && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { u64 __user *user_ksyms; ulong ksym_addr; u32 i; @@ -2108,7 +2110,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt; if (info.nr_jited_func_lens && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; @@ -2133,7 +2135,8 @@ done: return 0; } -static int bpf_map_get_info_by_fd(struct bpf_map *map, +static int bpf_map_get_info_by_fd(struct file *file, + struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -2175,7 +2178,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return 0; } -static int bpf_btf_get_info_by_fd(struct btf *btf, +static int bpf_btf_get_info_by_fd(struct file *file, + struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -2207,13 +2211,13 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return -EBADFD; if (f.file->f_op == &bpf_prog_fops) - err = 
bpf_prog_get_info_by_fd(f.file->private_data, attr, + err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_map_fops) - err = bpf_map_get_info_by_fd(f.file->private_data, attr, + err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else err = -EINVAL; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index bea2b9a84157..3885c74f1110 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -271,7 +271,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { if (jit_enable < 2 || - (jit_enable == 2 && bpf_dump_raw_ok())) { + (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) { *(int *)table->data = jit_enable; if (jit_enable == 2) pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); From 6ea655f88998b0be0c79eb8c600f63ab74f60a19 Mon Sep 17 00:00:00 2001 From: Qiaobin Fu Date: Sun, 1 Jul 2018 15:16:27 -0400 Subject: [PATCH 0870/1640] UPSTREAM: net:sched: add action inheritdsfield to skbedit The new action inheritdsfield copies the field DS of IPv4 and IPv6 packets into skb->priority. This enables later classification of packets based on the DS field. v5: *Update the drop counter for TC_ACT_SHOT v4: *Not allow setting flags other than the expected ones. *Allow dumping the pure flags. v3: *Use optional flags, so that it won't break old versions of tc. *Allow users to set both SKBEDIT_F_PRIORITY and SKBEDIT_F_INHERITDSFIELD flags. v2: *Fix the style issue *Move the code from skbmod to skbedit Original idea by Jamal Hadi Salim Signed-off-by: Qiaobin Fu Reviewed-by: Michel Machado Acked-by: Jamal Hadi Salim Reviewed-by: Marcelo Ricardo Leitner Acked-by: Davide Caratti Signed-off-by: David S. 
Miller --- include/uapi/linux/tc_act/tc_skbedit.h | 2 ++ net/sched/act_skbedit.c | 41 ++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h index fbcfe27a4e6c..6de6071ebed6 100644 --- a/include/uapi/linux/tc_act/tc_skbedit.h +++ b/include/uapi/linux/tc_act/tc_skbedit.h @@ -30,6 +30,7 @@ #define SKBEDIT_F_MARK 0x4 #define SKBEDIT_F_PTYPE 0x8 #define SKBEDIT_F_MASK 0x10 +#define SKBEDIT_F_INHERITDSFIELD 0x20 struct tc_skbedit { tc_gen; @@ -45,6 +46,7 @@ enum { TCA_SKBEDIT_PAD, TCA_SKBEDIT_PTYPE, TCA_SKBEDIT_MASK, + TCA_SKBEDIT_FLAGS, __TCA_SKBEDIT_MAX }; #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 1a8a49e33320..b2ea8d768d14 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #include #include @@ -41,6 +44,25 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, if (d->flags & SKBEDIT_F_PRIORITY) skb->priority = d->priority; + if (d->flags & SKBEDIT_F_INHERITDSFIELD) { + int wlen = skb_network_offset(skb); + + switch (tc_skb_protocol(skb)) { + case htons(ETH_P_IP): + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen)) + goto err; + skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2; + break; + + case htons(ETH_P_IPV6): + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen)) + goto err; + skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; + break; + } + } if (d->flags & SKBEDIT_F_QUEUE_MAPPING && skb->dev->real_num_tx_queues > d->queue_mapping) skb_set_queue_mapping(skb, d->queue_mapping); @@ -53,6 +75,11 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, spin_unlock(&d->tcf_lock); return d->tcf_action; + +err: + d->tcf_qstats.drops++; + spin_unlock(&d->tcf_lock); + return TC_ACT_SHOT; } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { @@ -62,6 +89,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, @@ -114,6 +142,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, mask = nla_data(tb[TCA_SKBEDIT_MASK]); } + if (tb[TCA_SKBEDIT_FLAGS] != NULL) { + u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]); + + if (*pure_flags & SKBEDIT_F_INHERITDSFIELD) + flags |= SKBEDIT_F_INHERITDSFIELD; + } + parm = nla_data(tb[TCA_SKBEDIT_PARMS]); exists = tcf_idr_check(tn, parm->index, a, bind); @@ -178,6 +213,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, .action = d->tcf_action, }; struct tcf_t t; + u64 pure_flags = 0; if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -196,6 +232,11 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, if ((d->flags & SKBEDIT_F_MASK) && nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask)) goto nla_put_failure; + if (d->flags & SKBEDIT_F_INHERITDSFIELD) + pure_flags |= SKBEDIT_F_INHERITDSFIELD; + if (pure_flags != 0 && + nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) + goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) From 9255e4ec48e0040010a8b63699d60ab3a35ed3f5 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 3 Jul 2020 22:26:43 +0200 Subject: [PATCH 0871/1640] BACKPORT: sched: consistently handle layer3 header accesses in the presence of VLANs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit d7bf2ebebc2bd61ab95e2a8e33541ef282f303d4 ] There are a couple of places in net/sched/ that check skb->protocol and act on the value there. However, in the presence of VLAN tags, the value stored in skb->protocol can be inconsistent based on whether VLAN acceleration is enabled. The commit quoted in the Fixes tag below fixed the users of skb->protocol to use a helper that will always see the VLAN ethertype. However, most of the callers don't actually handle the VLAN ethertype, but expect to find the IP header type in the protocol field. This means that things like changing the ECN field, or parsing diffserv values, stops working if there's a VLAN tag, or if there are multiple nested VLAN tags (QinQ). To fix this, change the helper to take an argument that indicates whether the caller wants to skip the VLAN tags or not. When skipping VLAN tags, we make sure to skip all of them, so behaviour is consistent even in QinQ mode. To make the helper usable from the ECN code, move it to if_vlan.h instead of pkt_sched.h. v3: - Remove empty lines - Move vlan variable definitions inside loop in skb_protocol() - Also use skb_protocol() helper in IP{,6}_ECN_decapsulate() and bpf_skb_ecn_set_ce() v2: - Use eth_type_vlan() helper in skb_protocol() - Also fix code that reads skb->protocol directly - Change a couple of 'if/else if' statements to switch constructs to avoid calling the helper twice Reported-by: Ilya Ponetayev Fixes: d8b9605d2697 ("net: sched: fix skb->protocol use in case of accelerated vlan path") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- include/net/inet_ecn.h | 22 +++++++++++++++------- include/net/pkt_sched.h | 11 ----------- net/core/filter.c | 8 ++++---- net/sched/act_connmark.c | 9 ++++++--- net/sched/act_csum.c | 2 +- net/sched/act_skbedit.c | 2 +- net/sched/cls_api.c | 2 +- net/sched/cls_flow.c | 8 ++++---- net/sched/cls_flower.c | 2 +- net/sched/em_ipset.c | 2 +- net/sched/em_meta.c | 2 +- net/sched/sch_teql.c | 2 +- 12 files changed, 36 insertions(+), 36 deletions(-) diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index 09ed8a48b454..75568f090035 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -148,7 +148,7 @@ static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) static inline int INET_ECN_set_ce(struct sk_buff *skb) { - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) @@ -215,12 +215,16 @@ static inline int IP_ECN_decapsulate(const struct iphdr *oiph, { __u8 inner; - if (skb->protocol == htons(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + break; + case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); - else + break; + default: return 0; + } return INET_ECN_decapsulate(skb, oiph->tos, inner); } @@ -230,12 +234,16 @@ static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, { __u8 inner; - if (skb->protocol == htons(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + break; + case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); - else + break; + default: return 0; + } return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); } diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 8b3f0fccb8a0..02fd755e6627 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -115,17 +115,6 @@ static inline void qdisc_run(struct Qdisc *q) __qdisc_run(q); } -static inline __be16 tc_skb_protocol(const struct sk_buff *skb) -{ - /* We need to take extra care in case the skb came via - * vlan accelerated path. In that case, use skb->vlan_proto - * as the original vlan header was already stripped. 
- */ - if (skb_vlan_tag_present(skb)) - return skb->vlan_proto; - return skb->protocol; -} - extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; extern int tc_qdisc_flow_control(struct net_device *dev, u32 tcm_handle, diff --git a/net/core/filter.c b/net/core/filter.c index d3180bf82dea..e7e24fd7dadc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2698,7 +2698,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) { - __be16 from_proto = skb->protocol; + __be16 from_proto = skb_protocol(skb, true); if (from_proto == htons(ETH_P_IP) && to_proto == htons(ETH_P_IPV6)) @@ -2771,7 +2771,7 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = { static u32 bpf_skb_net_base_len(const struct sk_buff *skb) { - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return sizeof(struct iphdr); case htons(ETH_P_IPV6): @@ -2847,7 +2847,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); u32 len_max = BPF_SKB_MAX_LEN; - __be16 proto = skb->protocol; + __be16 proto = skb_protocol(skb, true); bool shrink = len_diff < 0; int ret; @@ -4552,7 +4552,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len switch (type) { case BPF_LWT_ENCAP_SEG6_INLINE: - if (skb->protocol != htons(ETH_P_IPV6)) + if (skb_protocol(skb, true) != htons(ETH_P_IPV6)) return -EBADMSG; err = seg6_do_srh_inline(skb, srh); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index de0cd73a5a5d..b0285f15b2ee 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -46,17 +46,20 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&ca->tcf_tm); bstats_update(&ca->tcf_bstats, skb); - if (skb->protocol == htons(ETH_P_IP)) { + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): if (skb->len < sizeof(struct iphdr)) goto out; proto = NFPROTO_IPV4; - } else if (skb->protocol == htons(ETH_P_IPV6)) { + break; + case htons(ETH_P_IPV6): if (skb->len < sizeof(struct ipv6hdr)) goto out; proto = NFPROTO_IPV6; - } else { + break; + default: goto out; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index a449594553d0..10587d0aac10 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -552,7 +552,7 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, if (unlikely(action == TC_ACT_SHOT)) goto drop; - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, false)) { case cpu_to_be16(ETH_P_IP): if (!tcf_csum_ipv4(skb, update_flags)) goto drop; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index b2ea8d768d14..d9a3f5d01da6 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -47,7 +47,7 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, if (d->flags & SKBEDIT_F_INHERITDSFIELD) { int wlen = skb_network_offset(skb); - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8808133e78a3..14d93ed68e21 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -325,7 +325,7 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, reclassify: #endif for (; tp; tp = rcu_dereference_bh(tp->next)) { - __be16 protocol = tc_skb_protocol(skb); + __be16 protocol = 
skb_protocol(skb, false); int err; if (tp->protocol != protocol && diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 0d364166ccc9..ef5af64d9452 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -87,7 +87,7 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) if (dst) return ntohl(dst); - return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); + return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); } static u32 flow_get_proto(const struct sk_buff *skb, @@ -111,7 +111,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb, if (flow->ports.ports) return ntohs(flow->ports.dst); - return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); + return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); } static u32 flow_get_iif(const struct sk_buff *skb) @@ -158,7 +158,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb) static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) { - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, src.u3.ip)); case htons(ETH_P_IPV6): @@ -171,7 +171,7 @@ fallback: static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, dst.u3.ip)); case htons(ETH_P_IPV6): diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 8974bd25c71e..3cc9dfe7fb02 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -191,7 +191,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* skb_flow_dissect() does not set n_proto in case an unknown protocol, * so do it rather here. 
*/ - skb_key.basic.n_proto = skb->protocol; + skb_key.basic.n_proto = skb_protocol(skb, false); skb_flow_dissect(skb, &head->dissector, &skb_key, 0); fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index c1b23e3060b8..ef3b6b66c26a 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -62,7 +62,7 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, }; int ret, network_offset; - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): state.pf = NFPROTO_IPV4; if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index d6e97115500b..e36fa9272259 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -199,7 +199,7 @@ META_COLLECTOR(int_priority) META_COLLECTOR(int_protocol) { /* Let userspace take care of the byte ordering */ - dst->value = tc_skb_protocol(skb); + dst->value = skb_protocol(skb, false); } META_COLLECTOR(int_pkttype) diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 5d8c095ec5ec..b55ddc3c8aa4 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -245,7 +245,7 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, n, dev); - err = dev_hard_header(skb, dev, ntohs(tc_skb_protocol(skb)), + err = dev_hard_header(skb, dev, ntohs(skb_protocol(skb, false)), haddr, NULL, skb->len); if (err < 0) From 582f1f5c626a41d9843b8b9f10222cfc72de713c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 8 Sep 2020 00:04:10 +0200 Subject: [PATCH 0872/1640] UPSTREAM: bpf: Fix clobbering of r2 in bpf_gen_ld_abs [ Upstream commit e6a18d36118bea3bf497c9df4d9988b6df120689 ] Bryce reported that he saw the following with: 0: r6 = r1 1: r1 = 12 2: r0 = *(u16 *)skb[r1] The xlated sequence was incorrectly clobbering r2 with pointer value of r6 ... 0: (bf) r6 = r1 1: (b7) r1 = 12 2: (bf) r1 = r6 3: (bf) r2 = r1 4: (85) call bpf_skb_load_helper_16_no_cache#7692160 ... and hence call to the load helper never succeeded given the offset was too high. Fix it by reordering the load of r6 to r1. Other than that the insn has similar calling convention than BPF helpers, that is, r0 - r5 are scratch regs, so nothing else affected after the insn. Fixes: e0cea7ce988c ("bpf: implement ld_abs/ld_ind in native bpf") Reported-by: Bryce Kahle Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/cace836e4d07bb63b1a53e49c5dfb238a040c298.1599512096.git.daniel@iogearbox.net Signed-off-by: Sasha Levin --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index e7e24fd7dadc..421d62516778 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5422,8 +5422,6 @@ static int bpf_gen_ld_abs(const struct bpf_insn *orig, bool indirect = BPF_MODE(orig->code) == BPF_IND; struct bpf_insn *insn = insn_buf; - /* We're guaranteed here that CTX is in R6. */ - *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); } else { @@ -5431,6 +5429,8 @@ static int bpf_gen_ld_abs(const struct bpf_insn *orig, if (orig->imm) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); } + /* We're guaranteed here that CTX is in R6. 
*/ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); switch (BPF_SIZE(orig->code)) { case BPF_B: From daa12ad87bcce2804ae961a27f17cf1b211fffc1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 15 Sep 2020 17:44:01 -0700 Subject: [PATCH 0873/1640] UPSTREAM: bpf: Fix a rcu warning for bpffs map pretty-print [ Upstream commit ce880cb825fcc22d4e39046a6c3a3a7f6603883d ] Running selftest ./btf_btf -p the kernel had the following warning: [ 51.528185] WARNING: CPU: 3 PID: 1756 at kernel/bpf/hashtab.c:717 htab_map_get_next_key+0x2eb/0x300 [ 51.529217] Modules linked in: [ 51.529583] CPU: 3 PID: 1756 Comm: test_btf Not tainted 5.9.0-rc1+ #878 [ 51.530346] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014 [ 51.531410] RIP: 0010:htab_map_get_next_key+0x2eb/0x300 ... [ 51.542826] Call Trace: [ 51.543119] map_seq_next+0x53/0x80 [ 51.543528] seq_read+0x263/0x400 [ 51.543932] vfs_read+0xad/0x1c0 [ 51.544311] ksys_read+0x5f/0xe0 [ 51.544689] do_syscall_64+0x33/0x40 [ 51.545116] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The related source code in kernel/bpf/hashtab.c: 709 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 710 { 711 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 712 struct hlist_nulls_head *head; 713 struct htab_elem *l, *next_l; 714 u32 hash, key_size; 715 int i = 0; 716 717 WARN_ON_ONCE(!rcu_read_lock_held()); In kernel/bpf/inode.c, bpffs map pretty print calls map->ops->map_get_next_key() without holding a rcu_read_lock(), hence causing the above warning. To fix the issue, just surrounding map->ops->map_get_next_key() with rcu read lock. Fixes: a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") Reported-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200916004401.146277-1-yhs@fb.com Signed-off-by: Sasha Levin --- kernel/bpf/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index c04815bb15cc..11fade89c1f3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -207,10 +207,12 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) else prev_key = key; + rcu_read_lock(); if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; - return NULL; + key = NULL; } + rcu_read_unlock(); return key; } From 19911d698eb5fc50c8b08e11fb2f05c721c0dbef Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 10 Sep 2020 13:33:14 -0700 Subject: [PATCH 0874/1640] BACKPORT: bpf: Fix comment for helper bpf_current_task_under_cgroup() commit 1aef5b4391f0c75c0a1523706a7b0311846ee12f upstream. This should be "current" not "skb". Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Cc: Link: https://lore.kernel.org/bpf/20200910203314.70018-1-songliubraving@fb.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8d75c1f73b11..7556bc317705 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1193,8 +1193,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if the *skb* task belongs to the cgroup2. - * * 1, if the *skb* task does not belong to the cgroup2. 
+ * * 0, if current task belongs to the cgroup2. + * * 1, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) From efa5428768dfc4c0678b5df3380cf724107e3b2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Sun, 26 Apr 2020 09:15:25 -0700 Subject: [PATCH 0875/1640] BACKPORT: bpf: add bpf_ktime_get_boot_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a device like a cellphone which is constantly suspending and resuming CLOCK_MONOTONIC is not particularly useful for keeping track of or reacting to external network events. Instead you want to use CLOCK_BOOTTIME. Hence add bpf_ktime_get_boot_ns() as a mirror of bpf_ktime_get_ns() based around CLOCK_BOOTTIME instead of CLOCK_MONOTONIC. Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov (cherry picked from commit 71d19214776e61b33da48f7c1b46e522c7f78221) Change-Id: Ifd62c410dcc5112fd1a473a7e1f70231ca514bc0 --- drivers/media/rc/bpf-lirc.c | 2 ++ include/uapi/linux/bpf.h | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 5a0e26e47f59..ca00c6cc110d 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -75,6 +75,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_delete_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_prandom_u32: diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7556bc317705..271f9df82274 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -471,6 +471,8 @@ union bpf_attr { * u64 bpf_ktime_get_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: clock_gettime(CLOCK_MONOTONIC) * Return * Current *ktime*. * @@ -2143,6 +2145,14 @@ union bpf_attr { * request in the skb. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: clock_gettime(CLOCK_BOOTTIME) + * Return + * Current *ktime*. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ From cdee54691b9196409b6868922cdb9dfa860867cd Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Feb 2021 14:38:14 +0100 Subject: [PATCH 0876/1640] UPSTREAM: bpf: Fix bpf_fib_lookup helper MTU check for SKB ctx [ Upstream commit 2c0a10af688c02adcf127aad29e923e0056c6b69 ] BPF end-user on Cilium slack-channel (Carlo Carraro) wants to use bpf_fib_lookup for doing MTU-check, but *prior* to extending packet size, by adjusting fib_params 'tot_len' with the packet length plus the expected encap size. (Just like the bpf_check_mtu helper supports). He discovered that for SKB ctx the param->tot_len was not used, instead skb->len was used (via MTU check in is_skb_forwardable() that checks against netdev MTU). Fix this by using fib_params 'tot_len' for MTU check. If not provided (e.g. zero) then keep existing TC behaviour intact. Notice that 'tot_len' for MTU check is done like XDP code-path, which checks against FIB-dst MTU. 
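As a rough illustration of the intended use (not part of this patch; ENCAP_BYTES and the parsed IPv4 header iph are hypothetical), a TC-BPF program can now pre-check the post-encap size against the FIB MTU:

	struct bpf_fib_lookup fib_params = {};

	fib_params.family   = AF_INET;
	fib_params.ifindex  = skb->ingress_ifindex;
	fib_params.ipv4_src = iph->saddr;
	fib_params.ipv4_dst = iph->daddr;
	/* packet length plus the encap we intend to add */
	fib_params.tot_len  = skb->len + ENCAP_BYTES;

	if (bpf_fib_lookup(skb, &fib_params, sizeof(fib_params), 0) ==
	    BPF_FIB_LKUP_RET_FRAG_NEEDED)
		return TC_ACT_SHOT;	/* would exceed MTU after encap */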
V16: - Revert V13 optimization, 2nd lookup is against egress/resulting netdev V13: - Only do ifindex lookup one time, calling dev_get_by_index_rcu(). V10: - Use same method as XDP for 'tot_len' MTU check Fixes: 4c79579b44b1 ("bpf: Change bpf_fib_lookup to return lookup status") Reported-by: Carlo Carraro Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/161287789444.790810.15247494756551413508.stgit@firesoul Signed-off-by: Sasha Levin --- net/core/filter.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 421d62516778..6291714ce960 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4500,6 +4500,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, { struct net *net = dev_net(skb->dev); int rc = -EAFNOSUPPORT; + bool check_mtu = false; if (plen < sizeof(*params)) return -EINVAL; @@ -4507,22 +4508,28 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) return -EINVAL; + if (params->tot_len) + check_mtu = true; + switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - rc = bpf_ipv4_fib_lookup(net, params, flags, false); + rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - rc = bpf_ipv6_fib_lookup(net, params, flags, false); + rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); break; #endif } - if (!rc) { + if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { struct net_device *dev; + /* When tot_len isn't provided by user, check skb + * against MTU of FIB lookup resulting net_device + */ dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; From b333119d9b3ba763160c20b83a1888ff6ef50628 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sat, 27 Mar 2021 18:27:53 -0400 Subject: [PATCH 0877/1640] UPSTREAM: bpf: Don't do bpf_cgroup_storage_set() for kuprobe/tp programs [ Upstream commit 05a68ce5fa51a83c360381630f823545c5757aa2 ] For kprobe and tracepoint bpf programs, the kernel calls trace_call_bpf() which calls BPF_PROG_RUN_ARRAY_CHECK() to run the program array. Currently, BPF_PROG_RUN_ARRAY_CHECK() also calls bpf_cgroup_storage_set() to set percpu cgroup local storage with NULL value. This is due to Commit 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage") which modified __BPF_PROG_RUN_ARRAY() to call bpf_cgroup_storage_set() and this macro is also used by BPF_PROG_RUN_ARRAY_CHECK(). kprobe and tracepoint programs are not allowed to call the bpf_get_local_storage() helper, hence do not access percpu cgroup local storage. Let us change BPF_PROG_RUN_ARRAY_CHECK() not to modify percpu cgroup local storage. The issue is observed when I tried to debug [1] where percpu data is overwritten due to preempt_disable -> migration_disable change. This patch does not completely fix the above issue, which will be addressed separately, e.g., multiple cgroup prog runs may preempt each other. But it does fix any potential issue caused by tracing program overwriting percpu cgroup storage: - in a busy system, a tracing program is to run between bpf_cgroup_storage_set() and the cgroup prog run. - a kprobe program is triggered by a helper in cgroup prog before bpf_get_local_storage() is called.
[1] https://lore.kernel.org/bpf/CAKH8qBuXCfUz=w8L+Fj74OaUpbosO29niYwTki7e3Ag044_aww@mail.gmail.com/T Fixes: 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Roman Gushchin Link: https://lore.kernel.org/bpf/20210309185028.3763817-1-yhs@fb.com Signed-off-by: Sasha Levin --- include/linux/bpf.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7402c10dabf2..9ed8d541f2a5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -386,7 +386,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); -#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ +#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \ ({ \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ @@ -399,7 +399,8 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, goto _out; \ _item = &_array->items[0]; \ while ((_prog = READ_ONCE(_item->prog))) { \ - bpf_cgroup_storage_set(_item->cgroup_storage); \ + if (set_cg_storage) \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ _ret &= func(_prog, ctx); \ _item++; \ } \ @@ -410,10 +411,10 @@ _out: \ }) #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ - __BPF_PROG_RUN_ARRAY(array, ctx, func, false) + __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true) #define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func) \ - __BPF_PROG_RUN_ARRAY(array, ctx, func, true) + __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false) #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); From f077e98c261761f04a957c56c9eb01ec58ddfaf7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 28 May 2021 13:37:56 +0300 Subject: [PATCH 0878/1640] UPSTREAM: bpf: extend is_branch_taken to registers commit fb8d251ee2a6bf4d7f4af5548e9c8f4fb5f90402 upstream This patch extends is_branch_taken() logic from JMP+K instructions to JMP+X instructions. Conditional branches are often done when src and dst registers contain known scalars. In such case the verifier can follow the branch that is going to be taken when program executes. That speeds up the verification and is essential feature to support bounded loops. 
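As an illustrative listing (same notation as the verifier log, not taken from the patch), with both operands known constants:

  0: (b7) r1 = 5
  1: (b7) r2 = 7
  2: (2d) if r1 > r2 goto pc+4

the verifier can now prove that the JMP+X branch at insn 2 is never taken and explore only the fall-through path, exactly as it already did for JMP+K comparisons against an immediate.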
Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
[OP: drop is_jmp32 parameter from is_branch_taken() calls and adjust context]
Signed-off-by: Ovidiu Panait
Signed-off-by: Greg Kroah-Hartman
---
 kernel/bpf/verifier.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5dd28d9c32cd..82d2a9f6cb7d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4238,8 +4238,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	struct bpf_verifier_state *this_branch = env->cur_state;
 	struct bpf_verifier_state *other_branch;
 	struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
-	struct bpf_reg_state *dst_reg, *other_branch_regs;
+	struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
 	u8 opcode = BPF_OP(insn->code);
+	int pred = -1;
 	int err;
 
 	if (opcode > BPF_JSLE) {
@@ -4263,6 +4264,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 				insn->src_reg);
 			return -EACCES;
 		}
+		src_reg = &regs[insn->src_reg];
 	} else {
 		if (insn->src_reg != BPF_REG_0) {
 			verbose(env, "BPF_JMP uses reserved fields\n");
@@ -4277,19 +4279,21 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 
 	dst_reg = &regs[insn->dst_reg];
 
-	if (BPF_SRC(insn->code) == BPF_K) {
-		int pred = is_branch_taken(dst_reg, insn->imm, opcode);
-
-		if (pred == 1) {
-			/* only follow the goto, ignore fall-through */
-			*insn_idx += insn->off;
-			return 0;
-		} else if (pred == 0) {
-			/* only follow fall-through branch, since
-			 * that's where the program will go
-			 */
-			return 0;
-		}
+	if (BPF_SRC(insn->code) == BPF_K)
+		pred = is_branch_taken(dst_reg, insn->imm, opcode);
+	else if (src_reg->type == SCALAR_VALUE &&
+		 tnum_is_const(src_reg->var_off))
+		pred = is_branch_taken(dst_reg, src_reg->var_off.value,
+				       opcode);
+	if (pred == 1) {
+		/* only follow the goto, ignore fall-through */
+		*insn_idx += insn->off;
+		return 0;
+	} else if (pred == 0) {
+		/* only follow fall-through branch, since
+		 * that's where the program will go
+		 */
+		return 0;
 	}
 
 	other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,

From e84ac8d2b5534ae665fe1cd79d842e747849a418 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Wed, 2 Jun 2021 11:27:48 +0800
Subject: [PATCH 0879/1640] BACKPORT: bpf: Add BPF_F_ANY_ALIGNMENT.

commit e9ee9efc0d176512cdce9d27ff8549d7ffa2bfcd upstream

Often we want to write test cases that check things like bad context
offset accesses. And one way to do this is to use an odd offset on,
for example, a 32-bit load. This unfortunately triggers the alignment
checks first on platforms that do not set
CONFIG_EFFICIENT_UNALIGNED_ACCESS. So the test case sees the alignment
failure rather than what it was testing for.

It is often not completely possible to respect the original intention
of the test, or even test the same exact thing, while solving the
alignment issue.

Another option could have been to check the alignment after the
context and other validations are performed by the verifier, but that
is a non-trivial change to the verifier.

Signed-off-by: David S. Miller
Signed-off-by: Alexei Starovoitov
Signed-off-by: Tiezhu Yang
Signed-off-by: Greg Kroah-Hartman
---
 include/uapi/linux/bpf.h | 14 ++++++++++++++
 kernel/bpf/syscall.c | 7 ++++++-
 kernel/bpf/verifier.c | 3 +++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 271f9df82274..c7f3bc4a0c7e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -228,6 +228,20 @@ enum bpf_attach_type {
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the
+ * verifier will allow any alignment whatsoever. On platforms
+ * with strict alignment requirements for loads ands stores (such
+ * as sparc and mips) the verifier validates that all loads and
+ * stores provably follow this requirement. This flag turns that
+ * checking and enforcement off.
+ *
+ * It is mostly used for testing when we want to validate the
+ * context and memory access aspects of the verifier, but because
+ * of an unaligned access the alignment check would trigger before
+ * the one we are interested in.
+ */
+#define BPF_F_ANY_ALIGNMENT	(1U << 1)
+
 /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
 #define BPF_PSEUDO_MAP_FD	1
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c5c4a3c15b86..e940c1f65938 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1368,9 +1368,14 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_LOAD))
 		return -EINVAL;
 
-	if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
+	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT))
 		return -EINVAL;
 
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	/* copy eBPF program license from user space */
 	if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
 			      sizeof(license) - 1) < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 82d2a9f6cb7d..621a4371dfdb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6451,6 +6451,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
 
+	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
+		env->strict_alignment = false;
+
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
 		goto skip_full_check;

From 3988cd785a8dfac2f28ba2400909518cd173eaff Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Thu, 12 Aug 2021 20:00:34 +0300
Subject: [PATCH 0880/1640] UPSTREAM: bpf: Inherit expanded/patched seen count
 from old aux data

commit d203b0fd863a2261e5d00b97f3d060c4c2a6db71 upstream.

Instead of relying on current env->pass_cnt, use the seen count from
the old aux data in adjust_insn_aux_data(), and expand it to the new
range of patched instructions. This change is valid given we always
expand 1:n with n>=1, so what applies to the old/original instruction
needs to apply for the replacement as well. Not relying on
env->pass_cnt is a prerequisite for a later change where we want to
avoid marking an instruction seen when verified under speculative
execution path.
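As a sketch of the intent (our illustration, not from the original
commit): a typical 1:n rewrite is the divide-by-zero guard, where the
verifier expands a single BPF_DIV/BPF_MOD instruction into a short
guarded sequence. Every instruction in the patched range must inherit
the original's seen mark, so an instruction that was never seen (e.g.
one only reachable under a speculative path) gets unseen replacements
that sanitize_dead_code() can still rewrite later:

	/* before: insn[off] is "r0 /= r1"        (seen == s)           */
	/* after:  guard + divide, cnt insns      (all inherit seen s)  */
	bool old_seen = old_data[off].seen;

	for (i = off; i < off + cnt - 1; i++)
		new_data[i].seen = old_seen;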
Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov [OP: - declare old_data as bool instead of u32 (struct bpf_insn_aux_data.seen is bool in 5.4) - adjusted context for 4.19] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 621a4371dfdb..ac94be1156d9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5700,6 +5700,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, u32 off, u32 cnt) { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + bool old_seen = old_data[off].seen; int i; if (cnt == 1) @@ -5711,8 +5712,10 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); - for (i = off; i < off + cnt - 1; i++) - new_data[i].seen = true; + for (i = off; i < off + cnt - 1; i++) { + /* Expand insni[off]'s seen count to the patched range. */ + new_data[i].seen = old_seen; + } env->insn_aux_data = new_data; vfree(old_data); return 0; From c0adb1217db727620fe6137dece06daa8a6b0871 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 12 Aug 2021 20:00:35 +0300 Subject: [PATCH 0881/1640] BACKPORT: bpf: Do not mark insn as seen under speculative path verification commit fe9a5ca7e370e613a9a75a13008a3845ea759d6e upstream. ... in such circumstances, we do not want to mark the instruction as seen given the goal is still to jmp-1 rewrite/sanitize dead code, if it is not reachable from the non-speculative path verification. We do however want to verify it for safety regardless. With the patch as-is all the insns that have been marked as seen before the patch will also be marked as seen after the patch (just with a potentially different non-zero count). An upcoming patch will also verify paths that are unreachable in the non-speculative domain, hence this extension is needed. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov [OP: - env->pass_cnt is not used in 4.19, so adjust sanitize_mark_insn_seen() to assign "true" instead - drop sanitize_insn_aux_data() comment changes, as the function is not present in 4.19] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ac94be1156d9..a0046cb3721d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2910,6 +2910,19 @@ do_sim: return !ret ? REASON_STACK : 0; } +static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *vstate = env->cur_state; + + /* If we simulate paths under speculation, we don't update the + * insn as 'seen' such that when we verify unreachable paths in + * the non-speculative domain, sanitize_dead_code() can still + * rewrite/sanitize them. 
+ */ + if (!vstate->speculative) + env->insn_aux_data[env->insn_idx].seen = true; +} + static int sanitize_err(struct bpf_verifier_env *env, const struct bpf_insn *insn, int reason, const struct bpf_reg_state *off_reg, @@ -5265,7 +5278,8 @@ static int do_check(struct bpf_verifier_env *env) } regs = cur_regs(env); - env->insn_aux_data[env->insn_idx].seen = true; + sanitize_mark_insn_seen(env); + if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -5482,7 +5496,7 @@ process_bpf_exit: return err; env->insn_idx++; - env->insn_aux_data[env->insn_idx].seen = true; + sanitize_mark_insn_seen(env); } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; From a826a1e451fcaf546b5e6440f469b6fd29f0a999 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 12 Aug 2021 20:00:36 +0300 Subject: [PATCH 0882/1640] UPSTREAM: bpf: Fix leakage under speculation on mispredicted branches commit 9183671af6dbf60a1219371d4ed73e23f43b49db upstream. The verifier only enumerates valid control-flow paths and skips paths that are unreachable in the non-speculative domain. And so it can miss issues under speculative execution on mispredicted branches. For example, a type confusion has been demonstrated with the following crafted program: // r0 = pointer to a map array entry // r6 = pointer to readable stack slot // r9 = scalar controlled by attacker 1: r0 = *(u64 *)(r0) // cache miss 2: if r0 != 0x0 goto line 4 3: r6 = r9 4: if r0 != 0x1 goto line 6 5: r9 = *(u8 *)(r6) 6: // leak r9 Since line 3 runs iff r0 == 0 and line 5 runs iff r0 == 1, the verifier concludes that the pointer dereference on line 5 is safe. But: if the attacker trains both the branches to fall-through, such that the following is speculatively executed ... r6 = r9 r9 = *(u8 *)(r6) // leak r9 ... then the program will dereference an attacker-controlled value and could leak its content under speculative execution via side-channel. This requires to mistrain the branch predictor, which can be rather tricky, because the branches are mutually exclusive. However such training can be done at congruent addresses in user space using different branches that are not mutually exclusive. That is, by training branches in user space ... A: if r0 != 0x0 goto line C B: ... C: if r0 != 0x0 goto line D D: ... ... such that addresses A and C collide to the same CPU branch prediction entries in the PHT (pattern history table) as those of the BPF program's lines 2 and 4, respectively. A non-privileged attacker could simply brute force such collisions in the PHT until observing the attack succeeding. Alternative methods to mistrain the branch predictor are also possible that avoid brute forcing the collisions in the PHT. A reliable attack has been demonstrated, for example, using the following crafted program: // r0 = pointer to a [control] map array entry // r7 = *(u64 *)(r0 + 0), training/attack phase // r8 = *(u64 *)(r0 + 8), oob address // [...] // r0 = pointer to a [data] map array entry 1: if r7 == 0x3 goto line 3 2: r8 = r0 // crafted sequence of conditional jumps to separate the conditional // branch in line 193 from the current execution flow 3: if r0 != 0x0 goto line 5 4: if r0 == 0x0 goto exit 5: if r0 != 0x0 goto line 7 6: if r0 == 0x0 goto exit [...] 187: if r0 != 0x0 goto line 189 188: if r0 == 0x0 goto exit // load any slowly-loaded value (due to cache miss in phase 3) ... 189: r3 = *(u64 *)(r0 + 0x1200) // ... 
and turn it into known zero for verifier, while preserving slowly- // loaded dependency when executing: 190: r3 &= 1 191: r3 &= 2 // speculatively bypassed phase dependency 192: r7 += r3 193: if r7 == 0x3 goto exit 194: r4 = *(u8 *)(r8 + 0) // leak r4 As can be seen, in training phase (phase != 0x3), the condition in line 1 turns into false and therefore r8 with the oob address is overridden with the valid map value address, which in line 194 we can read out without issues. However, in attack phase, line 2 is skipped, and due to the cache miss in line 189 where the map value is (zeroed and later) added to the phase register, the condition in line 193 takes the fall-through path due to prior branch predictor training, where under speculation, it'll load the byte at oob address r8 (unknown scalar type at that point) which could then be leaked via side-channel. One way to mitigate these is to 'branch off' an unreachable path, meaning, the current verification path keeps following the is_branch_taken() path and we push the other branch to the verification stack. Given this is unreachable from the non-speculative domain, this branch's vstate is explicitly marked as speculative. This is needed for two reasons: i) if this path is solely seen from speculative execution, then we later on still want the dead code elimination to kick in in order to sanitize these instructions with jmp-1s, and ii) to ensure that paths walked in the non-speculative domain are not pruned from earlier walks of paths walked in the speculative domain. Additionally, for robustness, we mark the registers which have been part of the conditional as unknown in the speculative path given there should be no assumptions made on their content. The fix in here mitigates type confusion attacks described earlier due to i) all code paths in the BPF program being explored and ii) existing verifier logic already ensuring that given memory access instruction references one specific data structure. An alternative to this fix that has also been looked at in this scope was to mark aux->alu_state at the jump instruction with a BPF_JMP_TAKEN state as well as direction encoding (always-goto, always-fallthrough, unknown), such that mixing of different always-* directions themselves as well as mixing of always-* with unknown directions would cause a program rejection by the verifier, e.g. programs with constructs like 'if ([...]) { x = 0; } else { x = 1; }' with subsequent 'if (x == 1) { [...] }'. For unprivileged, this would result in only single direction always-* taken paths, and unknown taken paths being allowed, such that the former could be patched from a conditional jump to an unconditional jump (ja). Compared to this approach here, it would have two downsides: i) valid programs that otherwise are not performing any pointer arithmetic, etc, would potentially be rejected/broken, and ii) we are required to turn off path pruning for unprivileged, where both can be avoided in this work through pushing the invalid branch to the verification stack. The issue was originally discovered by Adam and Ofek, and later independently discovered and reported as a result of Benedict and Piotr's research work. 
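To make the mitigation concrete (our annotation of the first crafted
program above, not text from the original commit), for an unprivileged
program the verifier now explores both outcomes of every branch it can
predict, simulating the "impossible" one as a speculative state:

	1: r0 = *(u64 *)(r0)
	2: if r0 != 0x0 goto line 4
	   // on the r0 == 0 path, pred == 0: the fall-through is
	   // verified normally and the goto target is pushed as a
	   // speculative state with r0 marked as an unknown scalar
	3: r6 = r9
	4: if r0 != 0x1 goto line 6
	   // still on the r0 == 0 path, pred == 1: the goto is
	   // followed, and line 5 is pushed as a speculative state in
	   // which r6 holds the attacker-controlled scalar from line 3
	5: r9 = *(u8 *)(r6)
	   // in that speculative state r6 is an unknown scalar, so this
	   // load fails verification and the program is rejected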
Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: Adam Morrison Reported-by: Ofek Kirzner Reported-by: Benedict Schlueter Reported-by: Piotr Krysiuk Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov [OP: use allow_ptr_leaks instead of bypass_spec_v1] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a0046cb3721d..079bef78bdc7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2821,6 +2821,27 @@ struct bpf_sanitize_info { bool mask_to_left; }; +static struct bpf_verifier_state * +sanitize_speculative_path(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + u32 next_idx, u32 curr_idx) +{ + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + + branch = push_stack(env, next_idx, curr_idx, true); + if (branch && insn) { + regs = branch->frame[branch->curframe]->regs; + if (BPF_SRC(insn->code) == BPF_K) { + mark_reg_unknown(env, regs, insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + mark_reg_unknown(env, regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->src_reg); + } + } + return branch; +} + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, @@ -2904,7 +2925,8 @@ do_sim: tmp = *dst_reg; *dst_reg = *ptr_reg; } - ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, + env->insn_idx); if (!ptr_is_dst_reg && ret) *dst_reg = tmp; return !ret ? REASON_STACK : 0; @@ -4298,14 +4320,28 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, tnum_is_const(src_reg->var_off)) pred = is_branch_taken(dst_reg, src_reg->var_off.value, opcode); + if (pred == 1) { - /* only follow the goto, ignore fall-through */ + /* Only follow the goto, ignore fall-through. If needed, push + * the fall-through branch for simulation under speculative + * execution. + */ + if (!env->allow_ptr_leaks && + !sanitize_speculative_path(env, insn, *insn_idx + 1, + *insn_idx)) + return -EFAULT; *insn_idx += insn->off; return 0; } else if (pred == 0) { - /* only follow fall-through branch, since - * that's where the program will go + /* Only follow the fall-through branch, since that's where the + * program will go. If needed, push the goto branch for + * simulation under speculative execution. */ + if (!env->allow_ptr_leaks && + !sanitize_speculative_path(env, insn, + *insn_idx + insn->off + 1, + *insn_idx)) + return -EFAULT; return 0; } From 401e4faae3e11b2725b1cc40cdc042aeaea7dede Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 27 Aug 2021 10:55:31 -0300 Subject: [PATCH 0883/1640] UPSTREAM: bpf: Do not use ax register in interpreter on div/mod Partially undo old commit 144cd91c4c2b ("bpf: move tmp variable into ax register in interpreter"). The reason we need this here is because ax register will be used for holding temporary state for div/mod instruction which otherwise interpreter would corrupt. This will cause a small +8 byte stack increase for interpreter, but with the gain that we can use it from verifier rewrites as scratch register. 
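For context (the semantics of the kernel's do_div(), stated from its
definition rather than from this commit): do_div(n, base) divides in
place, leaving the quotient in n and returning the remainder. The
dividend therefore has to be copied into a scratch variable first, and
that scratch state is exactly what the interpreter must not have
corrupted mid-instruction:

	/* 32-bit modulo as the interpreter does it: copy DST into a
	 * scratch temporary, let do_div() turn it into the quotient,
	 * and keep the returned remainder as the result
	 */
	u64 tmp = (u32) DST;

	DST = do_div(tmp, (u32) SRC);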
Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend [cascardo: This partial revert is needed in order to support using AX for the following two commits, as there is no JMP32 on 4.19.y] Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/core.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 13c15398db05..87f361cf3847 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1074,6 +1074,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; + u64 tmp; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) @@ -1134,36 +1135,36 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - div64_u64_rem(DST, SRC, &AX); - DST = AX; + div64_u64_rem(DST, SRC, &tmp); + DST = tmp; CONT; ALU_MOD_X: - AX = (u32) DST; - DST = do_div(AX, (u32) SRC); + tmp = (u32) DST; + DST = do_div(tmp, (u32) SRC); CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &AX); - DST = AX; + div64_u64_rem(DST, IMM, &tmp); + DST = tmp; CONT; ALU_MOD_K: - AX = (u32) DST; - DST = do_div(AX, (u32) IMM); + tmp = (u32) DST; + DST = do_div(tmp, (u32) IMM); CONT; ALU64_DIV_X: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - AX = (u32) DST; - do_div(AX, (u32) SRC); - DST = (u32) AX; + tmp = (u32) DST; + do_div(tmp, (u32) SRC); + DST = (u32) tmp; CONT; ALU64_DIV_K: DST = div64_u64(DST, IMM); CONT; ALU_DIV_K: - AX = (u32) DST; - do_div(AX, (u32) IMM); - DST = (u32) AX; + tmp = (u32) DST; + do_div(tmp, (u32) IMM); + DST = (u32) tmp; CONT; ALU_END_TO_BE: switch (IMM) { From 67e307a631d4f969047c4a803cf3476134490f2f Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 13 Sep 2021 18:35:25 +0300 Subject: [PATCH 0884/1640] UPSTREAM: bpf/verifier: per-register parent pointers commit 679c782de14bd48c19dd74cd1af20a2bc05dd936 upstream. By giving each register its own liveness chain, we elide the skip_callee() logic. Instead, each register's parent is the state it inherits from; both check_func_call() and prepare_func_exit() automatically connect reg states to the correct chain since when they copy the reg state across (r1-r5 into the callee as args, and r0 out as the return value) they also copy the parent pointer. Signed-off-by: Edward Cree Signed-off-by: Alexei Starovoitov [OP: adjusted context for 4.19] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf_verifier.h | 8 +- kernel/bpf/verifier.c | 184 +++++++++-------------------------- 2 files changed, 47 insertions(+), 145 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1c8517320ea6..daab0960c054 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -41,6 +41,7 @@ enum bpf_reg_liveness { }; struct bpf_reg_state { + /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; union { /* valid when type == PTR_TO_PACKET */ @@ -62,7 +63,6 @@ struct bpf_reg_state { * came from, when one is tested for != NULL. */ u32 id; - /* Ordering of fields matters. See states_equal() */ /* For scalar types (SCALAR_VALUE), this represents our knowledge of * the actual value. 
* For pointer types, this represents the variable part of the offset @@ -79,15 +79,15 @@ struct bpf_reg_state { s64 smax_value; /* maximum possible (s64)value */ u64 umin_value; /* minimum possible (u64)value */ u64 umax_value; /* maximum possible (u64)value */ + /* parentage chain for liveness checking */ + struct bpf_reg_state *parent; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' * is used which is an index in bpf_verifier_state->frame[] array * pointing to bpf_func_state. - * This field must be second to last, for states_equal() reasons. */ u32 frameno; - /* This field must be last, for states_equal() reasons. */ enum bpf_reg_liveness live; }; @@ -110,7 +110,6 @@ struct bpf_stack_state { */ struct bpf_func_state { struct bpf_reg_state regs[MAX_BPF_REG]; - struct bpf_verifier_state *parent; /* index of call instruction that called into this func */ int callsite; /* stack frame number of this function state from pov of @@ -132,7 +131,6 @@ struct bpf_func_state { struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; - struct bpf_verifier_state *parent; u32 curframe; bool speculative; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 079bef78bdc7..90ed73a9d022 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -380,9 +380,9 @@ static int copy_stack_state(struct bpf_func_state *dst, /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from * the program calls into realloc_func_state() to grow the stack size. - * Note there is a non-zero 'parent' pointer inside bpf_verifier_state - * which this function copies over. It points to previous bpf_verifier_state - * which is never reallocated + * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state + * which this function copies over. It points to corresponding reg in previous + * bpf_verifier_state which is never reallocated */ static int realloc_func_state(struct bpf_func_state *state, int size, bool copy_old) @@ -467,7 +467,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; - dst_state->parent = src->parent; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -739,6 +738,7 @@ static void init_reg_state(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) { mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; + regs[i].parent = NULL; } /* frame pointer */ @@ -883,74 +883,21 @@ next: return 0; } -static -struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - u32 regno) -{ - struct bpf_verifier_state *tmp = NULL; - - /* 'parent' could be a state of caller and - * 'state' could be a state of callee. In such case - * parent->curframe < state->curframe - * and it's ok for r1 - r5 registers - * - * 'parent' could be a callee's state after it bpf_exit-ed. 
- * In such case parent->curframe > state->curframe - * and it's ok for r0 only - */ - if (parent->curframe == state->curframe || - (parent->curframe < state->curframe && - regno >= BPF_REG_1 && regno <= BPF_REG_5) || - (parent->curframe > state->curframe && - regno == BPF_REG_0)) - return parent; - - if (parent->curframe > state->curframe && - regno >= BPF_REG_6) { - /* for callee saved regs we have to skip the whole chain - * of states that belong to callee and mark as LIVE_READ - * the registers before the call - */ - tmp = parent; - while (tmp && tmp->curframe != state->curframe) { - tmp = tmp->parent; - } - if (!tmp) - goto bug; - parent = tmp; - } else { - goto bug; - } - return parent; -bug: - verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); - verbose(env, "regno %d parent frame %d current frame %d\n", - regno, parent->curframe, state->curframe); - return NULL; -} - +/* Parentage chain of this register (or stack slot) should take care of all + * issues like callee-saved registers, stack slot allocation time, etc. + */ static int mark_reg_read(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - u32 regno) + const struct bpf_reg_state *state, + struct bpf_reg_state *parent) { bool writes = parent == state->parent; /* Observe write marks */ - if (regno == BPF_REG_FP) - /* We don't need to worry about FP liveness because it's read-only */ - return 0; - while (parent) { /* if read wasn't screened by an earlier write ... */ - if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) + if (writes && state->live & REG_LIVE_WRITTEN) break; - parent = skip_callee(env, state, parent, regno); - if (!parent) - return -EFAULT; /* ... then we depend on parent's value */ - parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; + parent->live |= REG_LIVE_READ; state = parent; parent = state->parent; writes = true; @@ -976,7 +923,10 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - return mark_reg_read(env, vstate, vstate->parent, regno); + /* We don't need to worry about FP liveness because it's read-only */ + if (regno != BPF_REG_FP) + return mark_reg_read(env, ®s[regno], + regs[regno].parent); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -1087,8 +1037,8 @@ static int check_stack_write(struct bpf_verifier_env *env, } else { u8 type = STACK_MISC; - /* regular write of data into stack */ - state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; + /* regular write of data into stack destroys any spilled ptr */ + state->stack[spi].spilled_ptr.type = NOT_INIT; /* only mark the slot as written if all 8 bytes were written * otherwise read propagation may incorrectly stop too soon @@ -1113,61 +1063,6 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -/* registers of every function are unique and mark_reg_read() propagates - * the liveness in the following cases: - * - from callee into caller for R1 - R5 that were used as arguments - * - from caller into callee for R0 that used as result of the call - * - from caller to the same caller skipping states of the callee for R6 - R9, - * since R6 - R9 are callee saved by implicit function prologue and - * caller's R6 != callee's R6, so when we propagate liveness up to - * parent states we need to skip callee states for R6 - R9. 
- * - * stack slot marking is different, since stacks of caller and callee are - * accessible in both (since caller can pass a pointer to caller's stack to - * callee which can pass it to another function), hence mark_stack_slot_read() - * has to propagate the stack liveness to all parent states at given frame number. - * Consider code: - * f1() { - * ptr = fp - 8; - * *ptr = ctx; - * call f2 { - * .. = *ptr; - * } - * .. = *ptr; - * } - * First *ptr is reading from f1's stack and mark_stack_slot_read() has - * to mark liveness at the f1's frame and not f2's frame. - * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has - * to propagate liveness to f2 states at f1's frame level and further into - * f1 states at f1's frame level until write into that stack slot - */ -static void mark_stack_slot_read(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - int slot, int frameno) -{ - bool writes = parent == state->parent; /* Observe write marks */ - - while (parent) { - if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) - /* since LIVE_WRITTEN mark is only done for full 8-byte - * write the read marks are conservative and parent - * state may not even have the stack allocated. In such case - * end the propagation, since the loop reached beginning - * of the function - */ - break; - /* if read wasn't screened by an earlier write ... */ - if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) - break; - /* ... then we depend on parent's value */ - parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; - state = parent; - parent = state->parent; - writes = true; - } -} - static int check_stack_read(struct bpf_verifier_env *env, struct bpf_func_state *reg_state /* func where register points to */, int off, int size, int value_regno) @@ -1205,8 +1100,8 @@ static int check_stack_read(struct bpf_verifier_env *env, */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); + mark_reg_read(env, ®_state->stack[spi].spilled_ptr, + reg_state->stack[spi].spilled_ptr.parent); return 0; } else { int zeros = 0; @@ -1222,8 +1117,8 @@ static int check_stack_read(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); + mark_reg_read(env, ®_state->stack[spi].spilled_ptr, + reg_state->stack[spi].spilled_ptr.parent); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, @@ -1926,8 +1821,8 @@ mark: /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read' */ - mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, - spi, state->frameno); + mark_reg_read(env, &state->stack[spi].spilled_ptr, + state->stack[spi].spilled_ptr.parent); } return update_stack_depth(env, state, off); } @@ -2393,11 +2288,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, state->curframe + 1 /* frameno within this callchain */, subprog /* subprog number within this prog */); - /* copy r1 - r5 args that callee can access */ + /* copy r1 - r5 args that callee can access. 
The copy includes parent + * pointers, which connects us up to the liveness chain + */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) callee->regs[i] = caller->regs[i]; - /* after the call regsiters r0 - r5 were scratched */ + /* after the call registers r0 - r5 were scratched */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, caller->regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); @@ -4854,7 +4751,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, /* explored state didn't use this */ return true; - equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; + equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; if (rold->type == PTR_TO_STACK) /* two stack pointers are equal only if they're pointing to @@ -5093,7 +4990,7 @@ static bool states_equal(struct bpf_verifier_env *env, * equivalent state (jump target or such) we didn't arrive by the straight-line * code, so read marks in the state must propagate to the parent regardless * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() and mark_stack_slot_read() is for. + * in mark_reg_read() is for. */ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, @@ -5114,7 +5011,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) continue; if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, vstate, vparent, i); + err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], + &vparent->frame[vstate->curframe]->regs[i]); if (err) return err; } @@ -5129,7 +5027,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) - mark_stack_slot_read(env, vstate, vparent, i, frame); + mark_reg_read(env, &state->stack[i].spilled_ptr, + &parent->stack[i].spilled_ptr); } } return err; @@ -5139,7 +5038,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; - struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; sl = env->explored_states[insn_idx]; @@ -5185,16 +5084,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; /* add new state to the head of linked list */ - err = copy_verifier_state(&new_sl->state, cur); + new = &new_sl->state; + err = copy_verifier_state(new, cur); if (err) { - free_verifier_state(&new_sl->state, false); + free_verifier_state(new, false); kfree(new_sl); return err; } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ - cur->parent = &new_sl->state; + for (i = 0; i < BPF_REG_FP; i++) + cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i]; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. 
* (There are no read marks in current state, because reads always mark @@ -5207,9 +5108,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* all stack frames are accessible from callee, clear them all */ for (j = 0; j <= cur->curframe; j++) { struct bpf_func_state *frame = cur->frame[j]; + struct bpf_func_state *newframe = new->frame[j]; - for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) + for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) { frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; + frame->stack[i].spilled_ptr.parent = + &newframe->stack[i].spilled_ptr; + } } return 0; } @@ -5228,7 +5133,6 @@ static int do_check(struct bpf_verifier_env *env) return -ENOMEM; state->curframe = 0; state->speculative = false; - state->parent = NULL; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); From 8b160e8f152ecda5ac4daf3d556dacdbdddc3f6d Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Mon, 13 Sep 2021 18:35:26 +0300 Subject: [PATCH 0885/1640] BACKPORT: bpf: correct slot_type marking logic to allow more stack slot sharing commit 0bae2d4d62d523f06ff1a8e88ce38b45400acd28 upstream. Verifier is supposed to support sharing stack slot allocated to ptr with SCALAR_VALUE for privileged program. However this doesn't happen for some cases. The reason is verifier is not clearing slot_type STACK_SPILL for all bytes, it only clears part of them, while verifier is using: slot_type[0] == STACK_SPILL as a convention to check one slot is ptr type. So, the consequence of partial clearing slot_type is verifier could treat a partially overridden ptr slot, which should now be a SCALAR_VALUE slot, still as ptr slot, and rejects some valid programs. Before this patch, test_xdp_noinline.o under bpf selftests, bpf_lxc.o and bpf_netdev.o under Cilium bpf repo, when built with -mattr=+alu32 are rejected due to this issue. After this patch, they all accepted. There is no processed insn number change before and after this patch on Cilium bpf programs. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Reviewed-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov [OP: adjusted context for 4.19] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 90ed73a9d022..770b273fa022 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1039,6 +1039,10 @@ static int check_stack_write(struct bpf_verifier_env *env, /* regular write of data into stack destroys any spilled ptr */ state->stack[spi].spilled_ptr.type = NOT_INIT; + /* Mark slots as STACK_MISC if they belonged to spilled ptr. */ + if (state->stack[spi].slot_type[0] == STACK_SPILL) + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack[spi].slot_type[i] = STACK_MISC; /* only mark the slot as written if all 8 bytes were written * otherwise read propagation may incorrectly stop too soon @@ -1056,6 +1060,7 @@ static int check_stack_write(struct bpf_verifier_env *env, register_is_null(&cur->regs[value_regno])) type = STACK_ZERO; + /* Mark slots affected by this stack write. 
*/
 		for (i = 0; i < size; i++)
 			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
 				type;

From 845118aebc74a0d729eab85f972ded9acbce3899 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov
Date: Mon, 13 Sep 2021 18:35:27 +0300
Subject: [PATCH 0886/1640] UPSTREAM: bpf: Support variable offset stack
 access from helpers

commit 2011fccfb61bbd1d7c8864b2b3ed7012342e9ba3 upstream.

Currently there is a difference in how the verifier checks memory
access for helper arguments for PTR_TO_MAP_VALUE and PTR_TO_STACK with
regard to the variable part of the offset.

check_map_access(), which is used for PTR_TO_MAP_VALUE, can handle
variable offsets just fine, so that a BPF program can call a helper
like this:

  some_helper(map_value_ptr + off, size);

where the offset is unknown at load time, but is checked by the program
to be in a safe range (off >= 0 && off + size < map_value_size).

But it's not the case for check_stack_boundary(), which is used for
PTR_TO_STACK, and the same code with a pointer to stack is rejected by
the verifier:

  some_helper(stack_value_ptr + off, size);

For example:

  0: (7a) *(u64 *)(r10 -16) = 0
  1: (7a) *(u64 *)(r10 -8) = 0
  2: (61) r2 = *(u32 *)(r1 +0)
  3: (57) r2 &= 4
  4: (17) r2 -= 16
  5: (0f) r2 += r10
  6: (18) r1 = 0xffff888111343a80
  8: (85) call bpf_map_lookup_elem#1
  invalid variable stack read R2 var_off=(0xfffffffffffffff0; 0x4)

Add support for variable offset access to check_stack_boundary() so
that if the offset is checked by the program to be in a safe range it's
accepted by the verifier.

Signed-off-by: Andrey Ignatov
Signed-off-by: Alexei Starovoitov
[OP: replace reg_state(env, regno) helper with "cur_regs(env) + regno"]
Signed-off-by: Ovidiu Panait
Signed-off-by: Greg Kroah-Hartman
---
 kernel/bpf/verifier.c | 75 +++++++++++++++++++++++++++++++------------
 1 file changed, 54 insertions(+), 21 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 770b273fa022..b69ebe988e5c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1754,6 +1754,29 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 				BPF_SIZE(insn->code), BPF_WRITE, -1, true);
 }
 
+static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
+				  int off, int access_size,
+				  bool zero_size_allowed)
+{
+	struct bpf_reg_state *reg = cur_regs(env) + regno;
+
+	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+	    access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
+		if (tnum_is_const(reg->var_off)) {
+			verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
+				regno, off, access_size);
+		} else {
+			char tn_buf[48];
+
+			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+			verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
+				regno, tn_buf, access_size);
+		}
+		return -EACCES;
+	}
+	return 0;
+}
+
 /* when register 'regno' is passed into function that will read 'access_size'
  * bytes from that pointer, make sure that it's within stack boundary
  * and all elements of stack are initialized.
@@ -1766,7 +1789,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *reg = cur_regs(env) + regno; struct bpf_func_state *state = func(env, reg); - int off, i, slot, spi; + int err, min_off, max_off, i, slot, spi; if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -1780,21 +1803,23 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - /* Only allow fixed-offset stack reads */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable stack read R%d var_off=%s\n", - regno, tn_buf); - return -EACCES; - } - off = reg->off + reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - return -EACCES; + if (tnum_is_const(reg->var_off)) { + min_off = max_off = reg->var_off.value + reg->off; + err = __check_stack_boundary(env, regno, min_off, access_size, + zero_size_allowed); + if (err) + return err; + } else { + min_off = reg->smin_value + reg->off; + max_off = reg->umax_value + reg->off; + err = __check_stack_boundary(env, regno, min_off, access_size, + zero_size_allowed); + if (err) + return err; + err = __check_stack_boundary(env, regno, max_off, access_size, + zero_size_allowed); + if (err) + return err; } if (meta && meta->raw_mode) { @@ -1803,10 +1828,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return 0; } - for (i = 0; i < access_size; i++) { + for (i = min_off; i < max_off + access_size; i++) { u8 *stype; - slot = -(off + i) - 1; + slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) goto err; @@ -1819,8 +1844,16 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, goto mark; } err: - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - off, i, access_size); + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", + min_off, i - min_off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", + tn_buf, i - min_off, access_size); + } return -EACCES; mark: /* reading any byte out of 8-byte 'spill_slot' will cause @@ -1829,7 +1862,7 @@ mark: mark_reg_read(env, &state->stack[spi].spilled_ptr, state->stack[spi].spilled_ptr.parent); } - return update_stack_depth(env, state, off); + return update_stack_depth(env, state, min_off); } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, From 15283a468063f54ce43a96968cc599e7df8af935 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 13 Sep 2021 18:35:28 +0300 Subject: [PATCH 0887/1640] UPSTREAM: bpf: Reject indirect var_off stack access in raw mode commit f2bcd05ec7b839ff826d2008506ad2d2dff46a59 upstream. It's hard to guarantee that whole memory is marked as initialized on helper return if uninitialized stack is accessed with variable offset since specific bounds are unknown to verifier. This may cause uninitialized stack leaking. Reject such an access in check_stack_boundary to prevent possible leaking. There are no known use-cases for indirect uninitialized stack access with variable offset so it shouldn't break anything. 
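As an illustrative example (constructed by us in the style of the
previous commit message, not taken from this one), the now-rejected
pattern is a raw_mode helper such as bpf_probe_read(), whose destination
argument (ARG_PTR_TO_UNINIT_MEM) is an uninitialized stack buffer at a
variable offset:

	0: (61) r2 = *(u32 *)(r1 +0)
	1: (57) r2 &= 4
	2: (17) r2 -= 16
	3: (0f) r2 += r10          /* dst = fp + variable offset */
	4: (bf) r1 = r2
	5: (b7) r2 = 8             /* size */
	6: (b7) r3 = 0             /* src */
	7: (85) call bpf_probe_read#4

Since the helper cannot prove that the whole destination range ends up
initialized when the bounds are not exact, the access is rejected
instead of being treated as initializing the memory.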
Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers")
Reported-by: Daniel Borkmann
Signed-off-by: Andrey Ignatov
Signed-off-by: Daniel Borkmann
Signed-off-by: Ovidiu Panait
Signed-off-by: Greg Kroah-Hartman
---
 kernel/bpf/verifier.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b69ebe988e5c..6d8071c77fd5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1810,6 +1810,15 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		if (err)
 			return err;
 	} else {
+		/* Only initialized buffer on stack is allowed to be accessed
+		 * with variable offset. With uninitialized buffer it's hard to
+		 * guarantee that whole memory is marked as initialized on
+		 * helper return since specific bounds are unknown what may
+		 * cause uninitialized stack leaking.
+		 */
+		if (meta && meta->raw_mode)
+			meta = NULL;
+
 		min_off = reg->smin_value + reg->off;
 		max_off = reg->umax_value + reg->off;
 		err = __check_stack_boundary(env, regno, min_off, access_size,

From 5e2a2ae0bfb49f1431853b663b435f026e0c875f Mon Sep 17 00:00:00 2001
From: Andrey Ignatov
Date: Mon, 13 Sep 2021 18:35:29 +0300
Subject: [PATCH 0888/1640] UPSTREAM: bpf: Reject indirect var_off stack
 access in unpriv mode

commit 088ec26d9c2da9d879ab73e3f4117f9df6c566ee upstream.

Proper support of indirect stack access with variable offset in
unprivileged mode (!root) requires corresponding support in Spectre
masking for stack ALU in retrieve_ptr_limit().

There are no use-cases for variable offset in unprivileged mode though,
so make the verifier reject such accesses for simplicity.

Pointer arithmetic is one (and only?) way to cause a variable offset
and it's already rejected in unpriv mode, so the verifier won't even
get to a helper function whose argument contains a variable offset,
e.g.:

  0: (7a) *(u64 *)(r10 -16) = 0
  1: (7a) *(u64 *)(r10 -8) = 0
  2: (61) r2 = *(u32 *)(r1 +0)
  3: (57) r2 &= 4
  4: (17) r2 -= 16
  5: (0f) r2 += r10
  variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1
  R2 stack pointer arithmetic goes out of range, prohibited for !root

Still it looks like a good idea to reject variable offset indirect
stack access for unprivileged mode in check_stack_boundary()
explicitly.

Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers")
Reported-by: Daniel Borkmann
Signed-off-by: Andrey Ignatov
Signed-off-by: Daniel Borkmann
[OP: drop comment in retrieve_ptr_limit()]
Signed-off-by: Ovidiu Panait
Signed-off-by: Greg Kroah-Hartman
---
 kernel/bpf/verifier.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6d8071c77fd5..223dd5f6352d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1810,6 +1810,19 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		if (err)
 			return err;
 	} else {
+		/* Variable offset is prohibited for unprivileged mode for
+		 * simplicity since it requires corresponding support in
+		 * Spectre masking for stack ALU.
+		 * See also retrieve_ptr_limit().
+		 */
+		if (!env->allow_ptr_leaks) {
+			char tn_buf[48];
+
+			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+			verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
+				regno, tn_buf);
+			return -EACCES;
+		}
 		/* Only initialized buffer on stack is allowed to be accessed
 		 * with variable offset.
 		 * With uninitialized buffer it's hard to
 		 * guarantee that whole memory is marked as initialized on

From d46c8ab7cf731f3b5ffdac6035bff5a513d4ef44 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov
Date: Mon, 13 Sep 2021 18:35:30 +0300
Subject: [PATCH 0889/1640] UPSTREAM: bpf: Sanity check max value for var_off
 stack access

commit 107c26a70ca81bfc33657366ad69d02fdc9efc9d upstream.

As discussed in [1], the max value of a variable offset has to be
checked for overflow on stack access, otherwise the verifier would
accept code like this:

  0: (b7) r2 = 6
  1: (b7) r3 = 28
  2: (7a) *(u64 *)(r10 -16) = 0
  3: (7a) *(u64 *)(r10 -8) = 0
  4: (79) r4 = *(u64 *)(r1 +168)
  5: (c5) if r4 s< 0x0 goto pc+4
  R1=ctx(id=0,off=0,imm=0) R2=inv6 R3=inv28
  R4=inv(id=0,umax_value=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))
  R10=fp0,call_-1 fp-8=mmmmmmmm fp-16=mmmmmmmm
  6: (17) r4 -= 16
  7: (0f) r4 += r10
  8: (b7) r5 = 8
  9: (85) call bpf_getsockopt#57
  10: (b7) r0 = 0
  11: (95) exit

where R4 obviously has an unbounded max value.

Fix it by checking that reg->smax_value is inside the
(-BPF_MAX_VAR_OFF; BPF_MAX_VAR_OFF) range.

reg->smax_value is used instead of reg->umax_value because stack
pointers are calculated using a negative offset from fp. This is the
opposite of e.g. map access, where the offset must be non-negative and
where umax_value is used.

Also dedicated verbose logs are added for both min and max bound check
failures to have diagnostics consistent with variable offset handling
in check_map_access().

[1] https://marc.info/?l=linux-netdev&m=155433357510597&w=2

Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers")
Reported-by: Daniel Borkmann
Signed-off-by: Andrey Ignatov
Signed-off-by: Daniel Borkmann
Signed-off-by: Ovidiu Panait
Signed-off-by: Greg Kroah-Hartman
---
 kernel/bpf/verifier.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 223dd5f6352d..de4beb10edca 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1832,16 +1832,28 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		if (meta && meta->raw_mode)
 			meta = NULL;
 
+		if (reg->smax_value >= BPF_MAX_VAR_OFF ||
+		    reg->smax_value <= -BPF_MAX_VAR_OFF) {
+			verbose(env, "R%d unbounded indirect variable offset stack access\n",
+				regno);
+			return -EACCES;
+		}
 		min_off = reg->smin_value + reg->off;
-		max_off = reg->umax_value + reg->off;
+		max_off = reg->smax_value + reg->off;
 		err = __check_stack_boundary(env, regno, min_off, access_size,
 					     zero_size_allowed);
-		if (err)
+		if (err) {
+			verbose(env, "R%d min value is outside of stack bound\n",
+				regno);
 			return err;
+		}
 		err = __check_stack_boundary(env, regno, max_off, access_size,
 					     zero_size_allowed);
-		if (err)
+		if (err) {
+			verbose(env, "R%d max value is outside of stack bound\n",
+				regno);
 			return err;
+		}
 	}
 
 	if (meta && meta->raw_mode) {

From 130adc1549fa5d56e1cccbe0f5a13283ae8d34a0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 13 Sep 2021 18:35:32 +0300
Subject: [PATCH 0890/1640] UPSTREAM: bpf: track spill/fill of constants

commit f7cf25b2026dc8441e0fa3a202c2aa8a56211e30 upstream.

Compilers often spill induction variables into the stack, hence it is
necessary for the verifier to track scalar values of the registers
through stack slots.

Also a few bpf programs were incorrectly rejected in the past, since
the verifier was not able to track such constants while they were used
to compute offsets into packet headers.
Tracking constants through the stack significantly decreases the
chances of state pruning, since two different constants are considered
to be different by state equivalency. The end result is that cilium
tests suffer serious degradation in the number of states processed and
the corresponding verification time:

                       before  after
  bpf_lb-DLB_L3.o      1838    6441
  bpf_lb-DLB_L4.o      3218    5908
  bpf_lb-DUNKNOWN.o    1064    1064
  bpf_lxc-DDROP_ALL.o  26935   93790
  bpf_lxc-DUNKNOWN.o   34439   123886
  bpf_netdev.o         9721    31413
  bpf_overlay.o        6184    18561
  bpf_lxc_jit.o        39389   359445

After further debugging it turned out that the cilium progs are getting
hurt by clang due to the same constant tracking issue. Newer clang
generates better code by spilling less to the stack. Instead it keeps
more constants in the registers, which hurts state pruning since the
verifier already tracks constants in the registers:

                       old clang  new clang
                                  (no spill/fill tracking introduced by this patch)
  bpf_lb-DLB_L3.o      1838       1923
  bpf_lb-DLB_L4.o      3218       3077
  bpf_lb-DUNKNOWN.o    1064       1062
  bpf_lxc-DDROP_ALL.o  26935      166729
  bpf_lxc-DUNKNOWN.o   34439      174607
  bpf_netdev.o         9721       8407
  bpf_overlay.o        6184       5420
  bpf_lxc_jit.o        39389      39389

The final table is depressing:

                       old clang  old clang         new clang  new clang
                                  const spill/fill             const spill/fill
  bpf_lb-DLB_L3.o      1838       6441              1923       8128
  bpf_lb-DLB_L4.o      3218       5908              3077       6707
  bpf_lb-DUNKNOWN.o    1064       1064              1062       1062
  bpf_lxc-DDROP_ALL.o  26935      93790             166729     380712
  bpf_lxc-DUNKNOWN.o   34439      123886            174607     440652
  bpf_netdev.o         9721       31413             8407       31904
  bpf_overlay.o        6184       18561             5420       23569
  bpf_lxc_jit.o        39389      359445            39389      359445

Tracking constants in the registers hurts state pruning already.
Adding tracking of constants through the stack hurts pruning even more.
A later patch addresses this general constant tracking issue with
coarse/precise logic.
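For a concrete picture (our example, not from the commit), this is the
kind of spill/fill pattern that is now tracked exactly for privileged
programs (note the env->allow_ptr_leaks condition in the diff below):

	0: (b7) r1 = 20
	1: (7b) *(u64 *)(r10 -8) = r1   /* spill: the slot records the
	                                 * known scalar 20, not just
	                                 * STACK_MISC
	                                 */
	   ...
	5: (79) r2 = *(u64 *)(r10 -8)   /* fill: r2 is known to be 20
	                                 * again, e.g. usable as a packet
	                                 * offset the verifier can bound
	                                 */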
Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann [OP: - drop verbose_linfo() calls, as the function is not implemented in 4.19 - adjust mark_reg_read() calls to match the prototype in 4.19] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 86 +++++++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 23 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index de4beb10edca..11f6f7c4f091 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -963,6 +963,23 @@ static bool register_is_null(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); } +static bool register_is_const(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); +} + +static void save_register_state(struct bpf_func_state *state, + int spi, struct bpf_reg_state *reg) +{ + int i; + + state->stack[spi].spilled_ptr = *reg; + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack[spi].slot_type[i] = STACK_SPILL; +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -972,7 +989,7 @@ static int check_stack_write(struct bpf_verifier_env *env, { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; - enum bpf_reg_type type; + struct bpf_reg_state *reg = NULL; err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), true); @@ -989,27 +1006,36 @@ static int check_stack_write(struct bpf_verifier_env *env, } cur = env->cur_state->frame[env->cur_state->curframe]; - if (value_regno >= 0 && - is_spillable_regtype((type = cur->regs[value_regno].type))) { + if (value_regno >= 0) + reg = &cur->regs[value_regno]; + if (reg && size == BPF_REG_SIZE && register_is_const(reg) && + !register_is_null(reg) && env->allow_ptr_leaks) { + save_register_state(state, spi, reg); + } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { verbose(env, "invalid size of register spill\n"); return -EACCES; } - if (state != cur && type == PTR_TO_STACK) { + if (state != cur && reg->type == PTR_TO_STACK) { verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - /* save register state */ - state->stack[spi].spilled_ptr = cur->regs[value_regno]; - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + if (!env->allow_ptr_leaks) { + bool sanitize = false; - for (i = 0; i < BPF_REG_SIZE; i++) { - if (state->stack[spi].slot_type[i] == STACK_MISC && - !env->allow_ptr_leaks) { + if (state->stack[spi].slot_type[0] == STACK_SPILL && + register_is_const(&state->stack[spi].spilled_ptr)) + sanitize = true; + for (i = 0; i < BPF_REG_SIZE; i++) + if (state->stack[spi].slot_type[i] == STACK_MISC) { + sanitize = true; + break; + } + if (sanitize) { int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; int soff = (-spi - 1) * BPF_REG_SIZE; @@ -1032,8 +1058,8 @@ static int check_stack_write(struct bpf_verifier_env *env, } *poff = soff; } - state->stack[spi].slot_type[i] = STACK_SPILL; } + save_register_state(state, spi, reg); } else { u8 type = STACK_MISC; @@ -1056,8 +1082,7 @@ static int check_stack_write(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; /* when we zero 
initialize stack slots mark them as such */ - if (value_regno >= 0 && - register_is_null(&cur->regs[value_regno])) + if (reg && register_is_null(reg)) type = STACK_ZERO; /* Mark slots affected by this stack write. */ @@ -1075,6 +1100,7 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; + struct bpf_reg_state *reg; u8 *stype; if (reg_state->allocated_stack <= slot) { @@ -1083,11 +1109,20 @@ static int check_stack_read(struct bpf_verifier_env *env, return -EACCES; } stype = reg_state->stack[spi].slot_type; + reg = ®_state->stack[spi].spilled_ptr; if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose(env, "invalid size of register spill\n"); - return -EACCES; + if (reg->type != SCALAR_VALUE) { + verbose(env, "invalid size of register fill\n"); + return -EACCES; + } + if (value_regno >= 0) { + mark_reg_unknown(env, state->regs, value_regno); + state->regs[value_regno].live |= REG_LIVE_WRITTEN; + } + mark_reg_read(env, reg, reg->parent); + return 0; } for (i = 1; i < BPF_REG_SIZE; i++) { if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { @@ -1098,16 +1133,14 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; + state->regs[value_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent); - return 0; + mark_reg_read(env, reg, reg->parent); } else { int zeros = 0; @@ -1122,8 +1155,7 @@ static int check_stack_read(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent); + mark_reg_read(env, reg, reg->parent); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, @@ -1136,8 +1168,8 @@ static int check_stack_read(struct bpf_verifier_env *env, } state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - return 0; } + return 0; } static int check_stack_access(struct bpf_verifier_env *env, @@ -1789,7 +1821,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *reg = cur_regs(env) + regno; struct bpf_func_state *state = func(env, reg); - int err, min_off, max_off, i, slot, spi; + int err, min_off, max_off, i, j, slot, spi; if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -1877,6 +1909,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, *stype = STACK_MISC; goto mark; } + if (state->stack[spi].slot_type[0] == STACK_SPILL && + state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { + __mark_reg_unknown(&state->stack[spi].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + state->stack[spi].slot_type[j] = STACK_MISC; + goto mark; + } + err: if (tnum_is_const(reg->var_off)) { verbose(env, "invalid indirect read from stack off %d+%d size %d\n", From aea6f9ac167cc1652db2439ab15848310b0139fe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:16 -0800 Subject: [PATCH 0891/1640] UPSTREAM: bpf: arm64: add JIT support for 
multi-function programs similar to x64 add support for bpf-to-bpf calls. When program has calls to in-kernel helpers the target call offset is known at JIT time and arm64 architecture needs 2 passes. With bpf-to-bpf calls the dynamically allocated function start is unknown until all functions of the program are JITed. Therefore (just like x64) arm64 JIT needs one extra pass over the program to emit correct call offsets. Implementation detail: Avoid being too clever in 64-bit immediate moves and always use 4 instructions (instead of 3-4 depending on the address) to make sure only one extra pass is needed. If some future optimization would make it worth while to optimize 'call 64-bit imm' further, the JIT would need to do 4 passes over the program instead of 3 as in this patch. For typical bpf program address the mov needs 3 or 4 insns, so unconditional 4 insns to save extra pass is a worthy trade off at this state of JIT. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- arch/arm64/net/bpf_jit_comp.c | 68 ++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 7b1b649da4b9..41b483953c6c 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -99,6 +99,20 @@ static inline void emit_a64_mov_i64(const int reg, const u64 val, } } +static inline void emit_addr_mov_i64(const int reg, const u64 val, + struct jit_ctx *ctx) +{ + u64 tmp = val; + int shift = 0; + + emit(A64_MOVZ(1, reg, tmp & 0xffff, shift), ctx); + for (;shift < 48;) { + tmp >>= 16; + shift += 16; + emit(A64_MOVK(1, reg, tmp & 0xffff, shift), ctx); + } +} + static inline void emit_a64_mov_i(const int is64, const int reg, const s32 val, struct jit_ctx *ctx) { @@ -606,7 +620,10 @@ emit_cond_jmp: const u8 r0 = bpf2a64[BPF_REG_0]; const u64 func = (u64)__bpf_call_base + imm; - emit_a64_mov_i64(tmp, func, ctx); + if (ctx->prog->is_func) + emit_addr_mov_i64(tmp, func, ctx); + else + emit_a64_mov_i64(tmp, func, ctx); emit(A64_BLR(tmp), ctx); emit(A64_MOV(1, r0, A64_R(0)), ctx); break; @@ -847,11 +864,19 @@ static inline void bpf_flush_icache(void *start, void *end) flush_icache_range((unsigned long)start, (unsigned long)end); } +struct arm64_jit_data { + struct bpf_binary_header *header; + u8 *image; + struct jit_ctx ctx; +}; + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_prog *tmp, *orig_prog = prog; struct bpf_binary_header *header; + struct arm64_jit_data *jit_data; bool tmp_blinded = false; + bool extra_pass = false; struct jit_ctx ctx; int image_size; u8 *image_ptr; @@ -870,13 +895,29 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog = tmp; } + jit_data = prog->aux->jit_data; + if (!jit_data) { + jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); + if (!jit_data) { + prog = orig_prog; + goto out; + } + prog->aux->jit_data = jit_data; + } + if (jit_data->ctx.offset) { + ctx = jit_data->ctx; + image_ptr = jit_data->image; + header = jit_data->header; + extra_pass = true; + goto skip_init_ctx; + } memset(&ctx, 0, sizeof(ctx)); ctx.prog = prog; ctx.offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL); if (ctx.offset == NULL) { prog = orig_prog; - goto out; + goto out_off; } /* 1. Initial fake pass to compute ctx->idx. */ @@ -907,6 +948,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* 2. Now, the actual pass. 
*/ ctx.image = (__le32 *)image_ptr; +skip_init_ctx: ctx.idx = 0; build_prologue(&ctx); @@ -932,7 +974,21 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bpf_flush_icache(header, ctx.image + ctx.idx); - bpf_jit_binary_lock_ro(header); + if (!prog->is_func || extra_pass) { + if (extra_pass && ctx.idx != jit_data->ctx.idx) { + pr_err_once("multi-func JIT bug %d != %d\n", + ctx.idx, jit_data->ctx.idx); + bpf_jit_binary_free(header); + prog->bpf_func = NULL; + prog->jited = 0; + goto out_off; + } + bpf_jit_binary_lock_ro(header); + } else { + jit_data->ctx = ctx; + jit_data->image = image_ptr; + jit_data->header = header; + } prog->bpf_func = (void *)ctx.image; prog->jited = 1; prog->jited_len = image_size; @@ -940,8 +996,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) uh_call(UH_APP_RKP, RKP_BFP_LOAD, (u64)header, (u64)(header->pages * PAGE_SIZE), RKP_BPF_JIT_LOAD, 0); #endif + if (!prog->is_func || extra_pass) { out_off: - kfree(ctx.offset); + kfree(ctx.offset); + kfree(jit_data); + prog->aux->jit_data = NULL; + } out: if (tmp_blinded) bpf_jit_prog_release_other(prog, prog == orig_prog ? From ae9829f4f5eaf4e1f65dbc93632bb676d213ec6d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 18 Dec 2017 10:09:44 -0800 Subject: [PATCH 0892/1640] UPSTREAM: bpf: arm64: fix uninitialized variable fix the following issue: arch/arm64/net/bpf_jit_comp.c: In function 'bpf_int_jit_compile': arch/arm64/net/bpf_jit_comp.c:982:18: error: 'image_size' may be used uninitialized in this function [-Werror=maybe-uninitialized] Fixes: db496944fdaa ("bpf: arm64: add JIT support for multi-function programs") Reported-by: Arnd Bergmann Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- arch/arm64/net/bpf_jit_comp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 41b483953c6c..f3eacd400a4a 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -909,6 +909,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) image_ptr = jit_data->image; header = jit_data->header; extra_pass = true; + image_size = sizeof(u32) * ctx.idx; goto skip_init_ctx; } memset(&ctx, 0, sizeof(ctx)); From 124cb2a17bc0c845bc3c55d2a8f2bbc67998be66 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 13 Sep 2021 18:35:34 +0300 Subject: [PATCH 0893/1640] BACKPORT: bpf: Introduce BPF nospec instruction for mitigating Spectre v4 commit f5e81d1117501546b7be050c5fbafa6efd2c722c upstream. In case of JITs, each of the JIT backends compiles the BPF nospec instruction /either/ to a machine instruction which emits a speculation barrier /or/ to /no/ machine instruction in case the underlying architecture is not affected by Speculative Store Bypass or has different mitigations in place already. This covers both x86 and (implicitly) arm64: In case of x86, we use 'lfence' instruction for mitigation. In case of arm64, we rely on the firmware mitigation as controlled via the ssbd kernel parameter. Whenever the mitigation is enabled, it works for all of the kernel code with no need to provide any additional instructions here (hence only comment in arm64 JIT). Other archs can follow as needed. The BPF nospec instruction is specifically targeting Spectre v4 since i) we don't use a serialization barrier for the Spectre v1 case, and ii) mitigation instructions for v1 and v4 might be different on some archs. 
The BPF nospec is required for a future commit, where the BPF verifier does annotate intermediate BPF programs with speculation barriers. Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Change-Id: Id2f6f6b1b9ce1502c678bd79b1da1e07dce99982 Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov [OP: adjusted context for 4.19, drop riscv and ppc32 changes] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- arch/arm/net/bpf_jit_32.c | 3 +++ arch/arm64/net/bpf_jit_comp.c | 13 +++++++++++++ arch/mips/net/ebpf_jit.c | 3 +++ arch/powerpc/net/bpf_jit_comp64.c | 6 ++++++ arch/s390/net/bpf_jit_comp.c | 5 +++++ arch/sparc/net/bpf_jit_comp_64.c | 3 +++ arch/x86/net/bpf_jit_comp.c | 7 +++++++ include/linux/filter.h | 15 +++++++++++++++ kernel/bpf/core.c | 18 +++++++++++++++++- kernel/bpf/disasm.c | 16 +++++++++------- 10 files changed, 81 insertions(+), 8 deletions(-) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 68aa2f6d9f83..009d8e97d70a 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1545,6 +1545,9 @@ exit: } break; } + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index f3eacd400a4a..93586c84fc07 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -678,6 +678,19 @@ emit_cond_jmp: } break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + /* + * Nothing required here. + * + * In case of arm64, we rely on the firmware mitigation of + * Speculative Store Bypass as controlled via the ssbd kernel + * parameter. Whenever the mitigation is enabled, it works + * for all of the kernel code with no need to provide any + * additional instructions. 
+ */ + break; + /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 1186e481a083..9e01a4fbc41d 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1433,6 +1433,9 @@ ld_skb_common: } break; + case BPF_ST | BPF_NOSPEC: /* speculation barrier */ + break; + case BPF_ST | BPF_B | BPF_MEM: case BPF_ST | BPF_H | BPF_MEM: case BPF_ST | BPF_W | BPF_MEM: diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 4c58af3d300c..bfbac413c5ae 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -652,6 +652,12 @@ emit_clear: } break; + /* + * BPF_ST NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* * BPF_ST(X) */ diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 6b1003fdd05d..fbe672551993 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -931,6 +931,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i break; } break; + /* + * BPF_NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; /* * BPF_ST(X) */ diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 85ae4b0d5fbc..1b2971e2c6ff 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1317,6 +1317,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) emit(opcode | RS1(src) | rs2 | RD(dst), ctx); break; } + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index bcb23d13f6f5..a2a10a66070e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -750,6 +750,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, } break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + if (boot_cpu_has(X86_FEATURE_XMM2)) + /* Emit 'lfence' */ + EMIT3(0x0F, 0xAE, 0xE8); + break; + /* ST: *(u8*)(dst_reg + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: if (is_ereg(dst_reg)) diff --git a/include/linux/filter.h b/include/linux/filter.h index ff764bd0968e..9800cd0a21d3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -64,6 +64,11 @@ struct sock_reuseport; /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 +/* unused opcode to mark speculation barrier for mitigating + * Speculative Store Bypass + */ +#define BPF_NOSPEC 0xc0 + /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. @@ -344,6 +349,16 @@ struct sock_reuseport; .off = 0, \ .imm = 0 }) +/* Speculation barrier */ + +#define BPF_ST_NOSPEC() \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_NOSPEC, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 87f361cf3847..ff9424e399af 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -37,6 +37,7 @@ #include #endif +#include #include /* Registers */ @@ -1070,6 +1071,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) /* Non-UAPI available opcodes. 
*/ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, + [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, }; #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL @@ -1376,7 +1378,21 @@ out: JMP_EXIT: return BPF_R0; - /* STX and ST and LDX*/ + /* ST, STX and LDX*/ + ST_NOSPEC: + /* Speculation barrier for mitigating Speculative Store Bypass. + * In case of arm64, we rely on the firmware mitigation as + * controlled via the ssbd kernel parameter. Whenever the + * mitigation is enabled, it works for all of the kernel code + * with no need to provide any additional instructions here. + * In case of x86, we use 'lfence' insn for mitigation. We + * reuse preexisting logic from Spectre v1 mitigation that + * happens to produce the required code on x86 for v4 as well. + */ +#ifdef CONFIG_X86 + barrier_nospec(); +#endif + CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d6b76377cb6e..cbd75dd5992e 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -171,15 +171,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, else verbose(cbs->private_data, "BUG_%02x\n", insn->code); } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { + if (BPF_MODE(insn->code) == BPF_MEM) { + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { + verbose(cbs->private_data, "(%02x) nospec\n", insn->code); + } else { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); - return; } - verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); From 00a287b227b13a3193895287b1cf51c866a98987 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 13 Sep 2021 18:35:35 +0300 Subject: [PATCH 0894/1640] UPSTREAM: bpf: Fix leakage due to insufficient speculative store bypass mitigation commit 2039f26f3aca5b0e419b98f65dd36481337b86ee upstream. Spectre v4 gadgets make use of memory disambiguation, which is a set of techniques that execute memory access instructions, that is, loads and stores, out of program order; Intel's optimization manual, section 2.4.4.5: A load instruction micro-op may depend on a preceding store. Many microarchitectures block loads until all preceding store addresses are known. The memory disambiguator predicts which loads will not depend on any previous stores. When the disambiguator predicts that a load does not have such a dependency, the load takes its data from the L1 data cache. Eventually, the prediction is verified. If an actual conflict is detected, the load and all succeeding instructions are re-executed. 
af86ca4e3088 ("bpf: Prevent memory disambiguation attack") tried to mitigate this attack by sanitizing the memory locations through preemptive "fast" (low latency) stores of zero prior to the actual "slow" (high latency) store of a pointer value, such that upon dependency misprediction the CPU then speculatively executes the load of the pointer value and retrieves the zero value instead of the attacker controlled scalar value previously stored at that location, meaning, subsequent access in the speculative domain is then redirected to the "zero page".

The sanitized preemptive store of zero prior to the actual "slow" store is done through a simple ST instruction based on r10 (frame pointer) with relative offset to the stack location that the verifier has been tracking on the original used register for STX, which does not have to be r10. Thus, there are no memory dependencies for this store, since it's only using r10 and an immediate constant of zero; hence af86ca4e3088 /assumed/ a low latency operation.

However, a recent attack demonstrated that this mitigation is not sufficient since the preemptive store of zero could also be turned into a "slow" store and is thus bypassed as well:

  [...]
  // r2 = oob address (e.g. scalar)
  // r7 = pointer to map value
  31: (7b) *(u64 *)(r10 -16) = r2
  // r9 will remain "fast" register, r10 will become "slow" register below
  32: (bf) r9 = r10
  // JIT maps BPF reg to x86 reg:
  //   r9  -> r15 (callee saved)
  //   r10 -> rbp
  // train store forward prediction to break dependency link between both r9
  // and r10 by evicting them from the predictor's LRU table.
  33: (61) r0 = *(u32 *)(r7 +24576)
  34: (63) *(u32 *)(r7 +29696) = r0
  35: (61) r0 = *(u32 *)(r7 +24580)
  36: (63) *(u32 *)(r7 +29700) = r0
  37: (61) r0 = *(u32 *)(r7 +24584)
  38: (63) *(u32 *)(r7 +29704) = r0
  39: (61) r0 = *(u32 *)(r7 +24588)
  40: (63) *(u32 *)(r7 +29708) = r0
  [...]
  543: (61) r0 = *(u32 *)(r7 +25596)
  544: (63) *(u32 *)(r7 +30716) = r0
  // prepare call to bpf_ringbuf_output() helper. the latter will cause rbp
  // to spill to stack memory while r13/r14/r15 (all callee saved regs) remain
  // in hardware registers. rbp becomes slow due to push/pop latency. below is
  // disasm of bpf_ringbuf_output() helper for better visual context:
  //
  // ffffffff8117ee20: 41 54                 push   r12
  // ffffffff8117ee22: 55                    push   rbp
  // ffffffff8117ee23: 53                    push   rbx
  // ffffffff8117ee24: 48 f7 c1 fc ff ff ff  test   rcx,0xfffffffffffffffc
  // ffffffff8117ee2b: 0f 85 af 00 00 00     jne    ffffffff8117eee0 <-- jump taken
  // [...]
  // ffffffff8117eee0: 49 c7 c4 ea ff ff ff  mov    r12,0xffffffffffffffea
  // ffffffff8117eee7: 5b                    pop    rbx
  // ffffffff8117eee8: 5d                    pop    rbp
  // ffffffff8117eee9: 4c 89 e0              mov    rax,r12
  // ffffffff8117eeec: 41 5c                 pop    r12
  // ffffffff8117eeee: c3                    ret
  545: (18) r1 = map[id:4]
  547: (bf) r2 = r7
  548: (b7) r3 = 0
  549: (b7) r4 = 4
  550: (85) call bpf_ringbuf_output#194288
  // instruction 551 inserted by verifier    \
  551: (7a) *(u64 *)(r10 -16) = 0            | /both/ are now slow stores here
  // storing map value pointer r7 at fp-16   | since value of r10 is "slow".
  552: (7b) *(u64 *)(r10 -16) = r7           /
  // following "fast" read to the same memory location, but due to dependency
  // misprediction it will speculatively execute before insn 551/552 completes.
  553: (79) r2 = *(u64 *)(r9 -16)
  // in speculative domain contains attacker controlled r2. in non-speculative
  // domain this contains r7, and thus accesses r7 +0 below.
  554: (71) r3 = *(u8 *)(r2 +0) // leak r3

As can be seen, the current speculative store bypass mitigation which the verifier inserts at line 551 is insufficient, since /both/ the write of the zero sanitation and the store of the map value pointer are high latency instructions due to prior memory access via push/pop of r10 (rbp), in contrast to the low latency read in line 553 via r9 (r15), which stays in hardware registers. Thus, architecturally, fp-16 is r7; microarchitecturally, fp-16 can still be r2.

Initial thoughts to address this issue were to track spilled pointer loads from stack and enforce their load via LDX through r10 as well, so that /both/ the preemptive store of zero /as well as/ the load use the /same/ register, such that a dependency is created between the store and load. However, this option is not sufficient either, since it can be bypassed as well under speculation. An updated attack with pointer spill/fills now _all_ based on r10 would look as follows:

  [...]
  // r2 = oob address (e.g. scalar)
  // r7 = pointer to map value
  [...]
  // longer store forward prediction training sequence than before.
  2062: (61) r0 = *(u32 *)(r7 +25588)
  2063: (63) *(u32 *)(r7 +30708) = r0
  2064: (61) r0 = *(u32 *)(r7 +25592)
  2065: (63) *(u32 *)(r7 +30712) = r0
  2066: (61) r0 = *(u32 *)(r7 +25596)
  2067: (63) *(u32 *)(r7 +30716) = r0
  // store the speculative load address (scalar) this time after the store
  // forward prediction training.
  2068: (7b) *(u64 *)(r10 -16) = r2
  // preoccupy the CPU store port by running sequence of dummy stores.
  2069: (63) *(u32 *)(r7 +29696) = r0
  2070: (63) *(u32 *)(r7 +29700) = r0
  2071: (63) *(u32 *)(r7 +29704) = r0
  2072: (63) *(u32 *)(r7 +29708) = r0
  2073: (63) *(u32 *)(r7 +29712) = r0
  2074: (63) *(u32 *)(r7 +29716) = r0
  2075: (63) *(u32 *)(r7 +29720) = r0
  2076: (63) *(u32 *)(r7 +29724) = r0
  2077: (63) *(u32 *)(r7 +29728) = r0
  2078: (63) *(u32 *)(r7 +29732) = r0
  2079: (63) *(u32 *)(r7 +29736) = r0
  2080: (63) *(u32 *)(r7 +29740) = r0
  2081: (63) *(u32 *)(r7 +29744) = r0
  2082: (63) *(u32 *)(r7 +29748) = r0
  2083: (63) *(u32 *)(r7 +29752) = r0
  2084: (63) *(u32 *)(r7 +29756) = r0
  2085: (63) *(u32 *)(r7 +29760) = r0
  2086: (63) *(u32 *)(r7 +29764) = r0
  2087: (63) *(u32 *)(r7 +29768) = r0
  2088: (63) *(u32 *)(r7 +29772) = r0
  2089: (63) *(u32 *)(r7 +29776) = r0
  2090: (63) *(u32 *)(r7 +29780) = r0
  2091: (63) *(u32 *)(r7 +29784) = r0
  2092: (63) *(u32 *)(r7 +29788) = r0
  2093: (63) *(u32 *)(r7 +29792) = r0
  2094: (63) *(u32 *)(r7 +29796) = r0
  2095: (63) *(u32 *)(r7 +29800) = r0
  2096: (63) *(u32 *)(r7 +29804) = r0
  2097: (63) *(u32 *)(r7 +29808) = r0
  2098: (63) *(u32 *)(r7 +29812) = r0
  // overwrite scalar with dummy pointer; same as before, also including the
  // sanitation store with 0 from the current mitigation by the verifier.
  2099: (7a) *(u64 *)(r10 -16) = 0    | /both/ are now slow stores here
  2100: (7b) *(u64 *)(r10 -16) = r7   | since store unit is still busy.
  // load from stack intended to bypass stores.
  2101: (79) r2 = *(u64 *)(r10 -16)
  2102: (71) r3 = *(u8 *)(r2 +0) // leak r3
  [...]

Looking at the CPU microarchitecture, the scheduler might issue loads (such as seen in line 2101) before stores (line 2099,2100) because the load execution units become available while the store execution unit is still busy with the sequence of dummy stores (line 2069-2098). And so the load may use the prior stored scalar from r2 at address r10 -16 for speculation. The updated attack may work less reliably on CPU microarchitectures where loads and stores share execution resources.
This concludes that the sanitizing with zero stores from af86ca4e3088 ("bpf: Prevent memory disambiguation attack") is insufficient. Moreover, the detection of stack reuse from af86ca4e3088, where previously data (STACK_MISC) has been written to a given stack slot where a pointer value is now to be stored, does not have sufficient coverage as a precondition for the mitigation either, for several reasons outlined as follows:

 1) Stack content from prior program runs could still be preserved and is therefore not "random"; the best example is to split a speculative store bypass attack between tail calls: program A would prepare and store the oob address at a given stack slot and then tail call into program B, which does the "slow" store of a pointer to the stack with a subsequent "fast" read. From program B's PoV such a stack slot type is STACK_INVALID, and therefore it also must be subject to mitigation.

 2) The STACK_SPILL must not be coupled to the register_is_const(&stack->spilled_ptr) condition; for example, the previous content of that memory location could also be a pointer to map or map value. Without the fix, a speculative store bypass is not mitigated in such a precondition and can then lead to a type confusion in the speculative domain leaking kernel memory near these pointer types.

While brainstorming on various alternative mitigation possibilities, we also stumbled upon a retrospective from Chrome developers [0]:

  [...] For variant 4, we implemented a mitigation to zero the unused memory
  of the heap prior to allocation, which cost about 1% when done concurrently
  and 4% for scavenging. Variant 4 defeats everything we could think of. We
  explored more mitigations for variant 4 but the threat proved to be more
  pervasive and dangerous than we anticipated. For example, stack slots used
  by the register allocator in the optimizing compiler could be subject to
  type confusion, leading to pointer crafting. Mitigating type confusion for
  stack slots alone would have required a complete redesign of the backend of
  the optimizing compiler, perhaps man years of work, without a guarantee of
  completeness. [...]

From the BPF side, the problem space is reduced; however, options are rather limited. One idea that has been explored was to xor-obfuscate pointer spills to the BPF stack:

  [...]
  // preoccupy the CPU store port by running sequence of dummy stores.
  [...]
  2106: (63) *(u32 *)(r7 +29796) = r0
  2107: (63) *(u32 *)(r7 +29800) = r0
  2108: (63) *(u32 *)(r7 +29804) = r0
  2109: (63) *(u32 *)(r7 +29808) = r0
  2110: (63) *(u32 *)(r7 +29812) = r0
  // overwrite scalar with dummy pointer; xored with random 'secret' value
  // of 943576462 before store ...
  2111: (b4) w11 = 943576462
  2112: (af) r11 ^= r7
  2113: (7b) *(u64 *)(r10 -16) = r11
  2114: (79) r11 = *(u64 *)(r10 -16)
  2115: (b4) w2 = 943576462
  2116: (af) r2 ^= r11
  // ... and restored with the same 'secret' value with the help of AX reg.
  2117: (71) r3 = *(u8 *)(r2 +0)
  [...]

While the above would not prevent speculation, it would make data leakage infeasible by directing it to random locations. In order to be effective and prevent type confusion under speculation, such a random secret would have to be regenerated for each store. The additional complexity involved for a tracking mechanism that prevents jumps such that restoring spilled pointers would not get corrupted is not worth the gain for unprivileged. Hence, the fix here eventually opted for emitting a non-public BPF_ST | BPF_NOSPEC instruction which the x86 JIT translates into an lfence opcode.

Inserting the latter in between the store and load instruction is one of the mitigation options [1]. The x86 instruction manual notes:

  [...] An LFENCE that follows an instruction that stores to memory might
  complete before the data being stored have become globally visible. [...]

The latter meaning that the preceding store instruction finished execution and the store is at minimum guaranteed to be in the CPU's store queue, but it's not guaranteed to be in that CPU's L1 cache at that point (globally visible). The latter would only be guaranteed via sfence. So the load which is guaranteed to execute after the lfence for that local CPU would have to rely on store-to-load forwarding. [2], in section 2.3 on store buffers, says:

  [...] For every store operation that is added to the ROB, an entry is
  allocated in the store buffer. This entry requires both the virtual and
  physical address of the target. Only if there is no free entry in the store
  buffer, the frontend stalls until there is an empty slot available in the
  store buffer again. Otherwise, the CPU can immediately continue adding
  subsequent instructions to the ROB and execute them out of order. On Intel
  CPUs, the store buffer has up to 56 entries. [...]

One small upside of the fix is that it lifts constraints from af86ca4e3088 where the sanitize_stack_off relative to r10 must be the same when coming from different paths. The BPF_ST | BPF_NOSPEC gets emitted after a BPF_STX or BPF_ST instruction. This happens either when we store a pointer or data value to the BPF stack for the first time, or upon later pointer spills. The former needs to be enforced since otherwise stale stack data could be leaked under speculation as outlined earlier. For non-x86 JITs the BPF_ST | BPF_NOSPEC mapping is currently optimized away, but others could emit a speculation barrier as well if necessary. For real-world unprivileged programs, e.g. generated by LLVM, pointer spill/fill is only generated upon register pressure, and LLVM only tries to do that for pointers which are not used often. The program's main impact will be the initial BPF_ST | BPF_NOSPEC sanitation for the STACK_INVALID case when the first write to a stack slot occurs, e.g. upon map lookup. In future we might refine ways to mitigate the latter cost.
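[ED: a minimal sketch of the patching shape described above, mirroring the convert_ctx_accesses() hunk further below; illustrative, not a complete function:

  /* insn points at the store flagged with sanitize_stack_spill */
  struct bpf_insn patch[] = {
          *insn,           /* the original BPF_ST/BPF_STX store runs first */
          BPF_ST_NOSPEC(), /* the speculation barrier lands right after it */
  };

In contrast to the old scheme, no per-slot offset bookkeeping is needed: the barrier is attached to the instruction rather than to a particular fp-relative stack slot.]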
[0] https://arxiv.org/pdf/1902.05178.pdf [1] https://msrc-blog.microsoft.com/2018/05/21/analysis-and-mitigation-of-speculative-store-bypass-cve-2018-3639/ [2] https://arxiv.org/pdf/1905.05725.pdf Fixes: af86ca4e3088 ("bpf: Prevent memory disambiguation attack") Fixes: f7cf25b2026d ("bpf: track spill/fill of constants") Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov [OP: - apply check_stack_write_fixed_off() changes in check_stack_write() - replace env->bypass_spec_v4 -> env->allow_ptr_leaks] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 88 ++++++++++++++---------------------- 2 files changed, 34 insertions(+), 56 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index daab0960c054..e64ac93f7f4c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -158,8 +158,8 @@ struct bpf_insn_aux_data { u32 alu_limit; /* limit for add/sub register with pointer */ }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ - int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ + bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ u8 alu_state; /* used in combination with alu_limit */ }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 11f6f7c4f091..1d82d01fd230 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1008,6 +1008,19 @@ static int check_stack_write(struct bpf_verifier_env *env, cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0) reg = &cur->regs[value_regno]; + if (!env->allow_ptr_leaks) { + bool sanitize = reg && is_spillable_regtype(reg->type); + + for (i = 0; i < size; i++) { + if (state->stack[spi].slot_type[i] == STACK_INVALID) { + sanitize = true; + break; + } + } + + if (sanitize) + env->insn_aux_data[insn_idx].sanitize_stack_spill = true; + } if (reg && size == BPF_REG_SIZE && register_is_const(reg) && !register_is_null(reg) && env->allow_ptr_leaks) { @@ -1018,47 +1031,10 @@ static int check_stack_write(struct bpf_verifier_env *env, verbose(env, "invalid size of register spill\n"); return -EACCES; } - if (state != cur && reg->type == PTR_TO_STACK) { verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - - if (!env->allow_ptr_leaks) { - bool sanitize = false; - - if (state->stack[spi].slot_type[0] == STACK_SPILL && - register_is_const(&state->stack[spi].spilled_ptr)) - sanitize = true; - for (i = 0; i < BPF_REG_SIZE; i++) - if (state->stack[spi].slot_type[i] == STACK_MISC) { - sanitize = true; - break; - } - if (sanitize) { - int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; - int soff = (-spi - 1) * BPF_REG_SIZE; - - /* detected reuse of integer stack slot with a pointer - * which means either llvm is reusing stack slot or - * an attacker is trying to exploit CVE-2018-3639 - * (speculative store bypass) - * Have to sanitize that slot with preemptive - * store of zero. 
- */ - if (*poff && *poff != soff) { - /* disallow programs where single insn stores - * into two different stack slots, since verifier - * cannot sanitize them - */ - verbose(env, - "insn %d cannot access two stack slots fp%d and fp%d", - insn_idx, *poff, soff); - return -EINVAL; - } - *poff = soff; - } - } save_register_state(state, spi, reg); } else { u8 type = STACK_MISC; @@ -5877,34 +5853,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { + bool ctx_access; + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || - insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) + insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) { type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || - insn->code == (BPF_STX | BPF_MEM | BPF_H) || - insn->code == (BPF_STX | BPF_MEM | BPF_W) || - insn->code == (BPF_STX | BPF_MEM | BPF_DW)) + ctx_access = true; + } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || + insn->code == (BPF_STX | BPF_MEM | BPF_H) || + insn->code == (BPF_STX | BPF_MEM | BPF_W) || + insn->code == (BPF_STX | BPF_MEM | BPF_DW) || + insn->code == (BPF_ST | BPF_MEM | BPF_B) || + insn->code == (BPF_ST | BPF_MEM | BPF_H) || + insn->code == (BPF_ST | BPF_MEM | BPF_W) || + insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - else + ctx_access = BPF_CLASS(insn->code) == BPF_STX; + } else { continue; + } if (type == BPF_WRITE && - env->insn_aux_data[i + delta].sanitize_stack_off) { + env->insn_aux_data[i + delta].sanitize_stack_spill) { struct bpf_insn patch[] = { - /* Sanitize suspicious stack slot with zero. - * There are no memory dependencies for this store, - * since it's only using frame pointer and immediate - * constant of zero - */ - BPF_ST_MEM(BPF_DW, BPF_REG_FP, - env->insn_aux_data[i + delta].sanitize_stack_off, - 0), - /* the original STX instruction will immediately - * overwrite the same stack slot with appropriate value - */ *insn, + BPF_ST_NOSPEC(), }; cnt = ARRAY_SIZE(patch); @@ -5918,6 +5893,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; } + if (!ctx_access) + continue; + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; From ee2d3d14cc6e7333517626ae0904955e74625a1e Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 13 Sep 2021 18:35:36 +0300 Subject: [PATCH 0895/1640] BACKPORT: bpf: verifier: Allocate idmap scratch in verifier env commit c9e73e3d2b1eb1ea7ff068e05007eec3bd8ef1c9 upstream. func_states_equal makes a very short lived allocation for idmap, probably because it's too large to fit on the stack. However the function is called quite often, leading to a lot of alloc / free churn. Replace the temporary allocation with dedicated scratch space in struct bpf_verifier_env. 
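[ED: a stand-alone user-space sketch of the pattern applied here, with illustrative names: a short-lived per-call heap allocation on a hot path becomes caller-owned scratch storage that is cheaply reset per use:

  #include <stdbool.h>
  #include <stdint.h>
  #include <string.h>

  struct id_pair { uint32_t old, cur; };
  #define ID_MAP_SIZE 75 /* upper bound on live register states; value illustrative */

  struct checker_env {
          /* lives for the whole verification run */
          struct id_pair idmap_scratch[ID_MAP_SIZE];
  };

  static bool states_equal(struct checker_env *env)
  {
          /* memset() replaces the kcalloc()/kfree() pair on every call */
          memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
          /* ... compare register/stack states via env->idmap_scratch ... */
          return true;
  }

The trade-off is a fixed size cost in the env object against the alloc/free churn, and the allocation-failure path disappears entirely.]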
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Edward Cree Link: https://lore.kernel.org/bpf/20210429134656.122225-4-lmb@cloudflare.com [OP: adjusted context for 4.19] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf_verifier.h | 8 +++++++ kernel/bpf/verifier.c | 42 +++++++++++------------------------- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e64ac93f7f4c..729c65b320d4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -127,6 +127,13 @@ struct bpf_func_state { struct bpf_stack_state *stack; }; +struct bpf_id_pair { + u32 old; + u32 cur; +}; + +/* Maximum number of register states that can exist at once */ +#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) #define MAX_CALL_FRAMES 8 struct bpf_verifier_state { /* call stack tracking */ @@ -213,6 +220,7 @@ struct bpf_verifier_env { struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; + struct bpf_id_pair idmap_scratch[BPF_ID_MAP_SIZE]; u32 subprog_cnt; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1d82d01fd230..6bea0cb3f18f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4793,13 +4793,6 @@ static bool range_within(struct bpf_reg_state *old, old->smax_value >= cur->smax_value; } -/* Maximum number of register states that can exist at once */ -#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) -struct idpair { - u32 old; - u32 cur; -}; - /* If in the old state two registers had the same id, then they need to have * the same id in the new state as well. But that id could be different from * the old state, so we need to track the mapping from old to new ids. @@ -4810,11 +4803,11 @@ struct idpair { * So we look through our idmap to see if this old id has been seen before. If * so, we require the new id to match; otherwise, we add the id pair to the map. 
*/ -static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) +static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) { unsigned int i; - for (i = 0; i < ID_MAP_SIZE; i++) { + for (i = 0; i < BPF_ID_MAP_SIZE; i++) { if (!idmap[i].old) { /* Reached an empty slot; haven't seen this id before */ idmap[i].old = old_id; @@ -4831,7 +4824,7 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct idpair *idmap) + struct bpf_id_pair *idmap) { bool equal; @@ -4935,7 +4928,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, static bool stacksafe(struct bpf_func_state *old, struct bpf_func_state *cur, - struct idpair *idmap) + struct bpf_id_pair *idmap) { int i, spi; @@ -5021,29 +5014,20 @@ static bool stacksafe(struct bpf_func_state *old, * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool func_states_equal(struct bpf_func_state *old, +static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, struct bpf_func_state *cur) { - struct idpair *idmap; - bool ret = false; int i; - idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); - /* If we failed to allocate the idmap, just say it's not safe */ - if (!idmap) + memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + for (i = 0; i < MAX_BPF_REG; i++) + if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) + return false; + + if (!stacksafe(old, cur, env->idmap_scratch)) return false; - for (i = 0; i < MAX_BPF_REG; i++) { - if (!regsafe(&old->regs[i], &cur->regs[i], idmap)) - goto out_free; - } - - if (!stacksafe(old, cur, idmap)) - goto out_free; - ret = true; -out_free: - kfree(idmap); - return ret; + return true; } static bool states_equal(struct bpf_verifier_env *env, @@ -5067,7 +5051,7 @@ static bool states_equal(struct bpf_verifier_env *env, for (i = 0; i <= old->curframe; i++) { if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(old->frame[i], cur->frame[i])) + if (!func_states_equal(env, old->frame[i], cur->frame[i])) return false; } return true; From 0114554bd9494197f3f9047cba701d5580da9228 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 13 Sep 2021 18:35:37 +0300 Subject: [PATCH 0896/1640] UPSTREAM: bpf: Fix pointer arithmetic mask tightening under state pruning commit e042aa532c84d18ff13291d00620502ce7a38dda upstream. In 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask") we narrowed the offset mask for unprivileged pointer arithmetic in order to mitigate a corner case where in the speculative domain it is possible to advance, for example, the map value pointer by up to value_size-1 out-of- bounds in order to leak kernel memory via side-channel to user space. The verifier's state pruning for scalars leaves one corner case open where in the first verification path R_x holds an unknown scalar with an aux->alu_limit of e.g. 7, and in a second verification path that same register R_x, here denoted as R_x', holds an unknown scalar which has tighter bounds and would thus satisfy range_within(R_x, R_x') as well as tnum_in(R_x, R_x') for state pruning, yielding an aux->alu_limit of 3: Given the second path fits the register constraints for pruning, the final generated mask from aux->alu_limit will remain at 7. 
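[ED: a stand-alone numeric illustration of the hazard described above; editorial and heavily simplified, the real alu_limit computation is more subtle:

  #include <stdint.h>
  #include <stdio.h>

  /* derive a power-of-two-minus-one mask covering offsets 0..limit */
  static uint64_t mask_for_limit(uint64_t limit)
  {
          uint64_t m = 1;

          while (m <= limit)
                  m <<= 1;
          return m - 1;
  }

  int main(void)
  {
          /* the first-explored path fixed the rewrite with alu_limit 7 ... */
          uint64_t kept = mask_for_limit(7);
          /* ... the pruned path would only have allowed alu_limit 3 */
          uint64_t needed = mask_for_limit(3);

          printf("mask kept 0x%llx vs mask needed 0x%llx\n",
                 (unsigned long long)kept, (unsigned long long)needed);
          return 0;
  }

The pruned path inherits the wider mask, which is exactly the mismatch the explore_alu_limits flag below avoids by forcing a deeper search on unknown scalars.]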
While technically not wrong for the non-speculative domain, it would however be possible to craft similar cases where the mask would be too wide as in 7fedb63a8307. One way to fix it is to detect the presence of unknown scalar map pointer arithmetic and force a deeper search on unknown scalars to ensure that we do not run into a masking mismatch. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov [OP: adjusted context for 4.19] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 729c65b320d4..4acd06cca703 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -215,6 +215,7 @@ struct bpf_verifier_env { struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ + bool explore_alu_limits; bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6bea0cb3f18f..81aa127a469d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2880,6 +2880,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + /* Limit pruning on unknown scalars to enable deep search for + * potential masking differences from other program paths. + */ + if (!off_is_imm) + env->explore_alu_limits = true; } err = update_alu_sanitation_state(aux, alu_state, alu_limit); @@ -4823,8 +4829,8 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) } /* Returns true if (rold safe implies rcur safe) */ -static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct bpf_id_pair *idmap) +static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, + struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) { bool equal; @@ -4850,6 +4856,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; switch (rold->type) { case SCALAR_VALUE: + if (env->explore_alu_limits) + return false; if (rcur->type == SCALAR_VALUE) { /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && @@ -4926,9 +4934,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } -static bool stacksafe(struct bpf_func_state *old, - struct bpf_func_state *cur, - struct bpf_id_pair *idmap) +static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur, struct bpf_id_pair *idmap) { int i, spi; @@ -4970,9 +4977,8 @@ static bool stacksafe(struct bpf_func_state *old, continue; if (old->stack[spi].slot_type[0] != STACK_SPILL) continue; - if (!regsafe(&old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, - idmap)) + if (!regsafe(env, &old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, idmap)) /* when explored and current stack slot are both storing * spilled registers, check that stored pointers types * are the same as well. 
@@ -5021,10 +5027,11 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) + if (!regsafe(env, &old->regs[i], &cur->regs[i], + env->idmap_scratch)) return false; - if (!stacksafe(old, cur, env->idmap_scratch)) + if (!stacksafe(env, old, cur, env->idmap_scratch)) return false; return true; From db31721e5eaa130930fa2c1274f233983733c755 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Thu, 10 Mar 2022 23:53:35 +0800 Subject: [PATCH 0897/1640] BACKPORT: bpf: Fix comment for helper bpf_current_task_under_cgroup() commit 58617014405ad5c9f94f464444f4972dabb71ca7 upstream. Fix the descriptions of the return values of helper bpf_current_task_under_cgroup(). Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Hengqi Chen Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220310155335.1278783-1-hengqi.chen@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c7f3bc4a0c7e..8b9fb412fbe5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1209,8 +1209,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if current task belongs to the cgroup2. - * * 1, if current task does not belong to the cgroup2. + * * 1, if current task belongs to the cgroup2. + * * 0, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) From 9406b9ce137bf1472816f74beb26345a3d906c06 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 25 Apr 2022 16:40:02 -0700 Subject: [PATCH 0898/1640] UPSTREAM: x86/speculation: Add missing prototype for unpriv_ebpf_notify() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 2147c438fde135d6c145a96e373d9348e7076f7f ] Fix the following warnings seen with "make W=1": kernel/sysctl.c:183:13: warning: no previous prototype for ‘unpriv_ebpf_notify’ [-Wmissing-prototypes] 183 | void __weak unpriv_ebpf_notify(int new_state) | ^~~~~~~~~~~~~~~~~~ arch/x86/kernel/cpu/bugs.c:659:6: warning: no previous prototype for ‘unpriv_ebpf_notify’ [-Wmissing-prototypes] 659 | void unpriv_ebpf_notify(int new_state) | ^~~~~~~~~~~~~~~~~~ Fixes: 44a3918c8245 ("x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting") Reported-by: kernel test robot Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/5689d065f739602ececaee1e05e68b8644009608.1650930000.git.jpoimboe@redhat.com Signed-off-by: Sasha Levin --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9ed8d541f2a5..d2b881e5d834 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -693,6 +693,8 @@ void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev); bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev); +void unpriv_ebpf_notify(int new_state); + #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); From 
7b5318306da04fd7cca3f1e7030d02ef43141c5c Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Tue, 12 Jul 2022 19:58:37 +0200 Subject: [PATCH 0899/1640] UPSTREAM: seg6: bpf: fix skb checksum in bpf_push_seg6_encap() [ Upstream commit 4889fbd98deaf243c3baadc54e296d71c6af1eb0 ] Both helper functions bpf_lwt_seg6_action() and bpf_lwt_push_encap() use the bpf_push_seg6_encap() to encapsulate the packet in an IPv6 with Segment Routing Header (SRH) or insert an SRH between the IPv6 header and the payload. To achieve this result, such helper functions rely on bpf_push_seg6_encap() which, in turn, leverages seg6_do_srh_{encap,inline}() to perform the required operation (i.e. encap/inline). This patch removes the initialization of the IPv6 header payload length from bpf_push_seg6_encap(), as it is now handled properly by seg6_do_srh_{encap,inline}() to prevent corruption of the skb checksum. Fixes: fe94cc290f53 ("bpf: Add IPv6 Segment Routing helpers") Signed-off-by: Andrea Mayer Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/core/filter.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 6291714ce960..aa3e5f7ced39 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4577,7 +4577,6 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len if (err) return err; - ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); return seg6_lookup_nexthop(skb, NULL, 0); From 54554d8e63b1128fd8247309032bca6f73690570 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Sat, 10 Sep 2022 11:01:20 +0000 Subject: [PATCH 0900/1640] UPSTREAM: bpf: btf: fix truncated last_member_type_id in btf_struct_resolve [ Upstream commit a37a32583e282d8d815e22add29bc1e91e19951a ] When trying to finish resolving a struct member, btf_struct_resolve saves the member type id in a u16 temporary variable. This truncates the 32 bit type id value if it exceeds UINT16_MAX. As a result, structs that have members with type ids > UINT16_MAX and which need resolution will fail with a message like this: [67414] STRUCT ff_device size=120 vlen=12 effect_owners type_id=67434 bits_offset=960 Member exceeds struct_size Fix this by changing the type of last_member_type_id to u32. Fixes: a0791f0df7d2 ("bpf: fix BTF limits") Reviewed-by: Stanislav Fomichev Signed-off-by: Lorenz Bauer Link: https://lore.kernel.org/r/20220910110120.339242-1-oss@lmb.io Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 471cc5c117a5..62e05b6283c0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1660,7 +1660,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, if (v->next_member) { const struct btf_type *last_member_type; const struct btf_member *last_member; - u16 last_member_type_id; + u32 last_member_type_id; last_member = btf_type_member(v->t) + v->next_member - 1; last_member_type_id = last_member->type; From a0b84982e7f6a1c3438b772d22f95ad63768637a Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 12 Sep 2022 14:38:55 +0100 Subject: [PATCH 0901/1640] UPSTREAM: bpf: Ensure correct locking around vulnerable function find_vpid() [ Upstream commit 83c10cc362d91c0d8d25e60779ee52fdbbf3894d ] The documentation for find_vpid() clearly states: "Must be called with the tasklist_lock or rcu_read_lock() held." 
Presently we do neither for the find_vpid() instance in bpf_task_fd_query(). Add proper rcu_read_lock/unlock() to fix the issue.

Fixes: 41bdc4b40ed6f ("bpf: introduce bpf subcommand BPF_TASK_FD_QUERY")
Signed-off-by: Lee Jones
Signed-off-by: Daniel Borkmann
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/20220912133855.1218900-1-lee@kernel.org
Signed-off-by: Sasha Levin
---
 kernel/bpf/syscall.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e940c1f65938..02e5bdb82a9a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2325,7 +2325,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
 	if (attr->task_fd_query.flags != 0)
 		return -EINVAL;
 
+	rcu_read_lock();
 	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+	rcu_read_unlock();
 	if (!task)
 		return -ENOENT;

From 62d84cac8136aa96bba46537dad1e167e8307e15 Mon Sep 17 00:00:00 2001
From: Luis Gerhorst
Date: Mon, 9 Jan 2023 16:05:46 +0100
Subject: [PATCH 0902/1640] UPSTREAM: bpf: Fix pointer-leak due to insufficient speculative store bypass mitigation

[ Upstream commit e4f4db47794c9f474b184ee1418f42e6a07412b6 ]

To mitigate Spectre v4, 2039f26f3aca ("bpf: Fix leakage due to insufficient speculative store bypass mitigation") inserts lfence instructions after 1) initializing a stack slot and 2) spilling a pointer to the stack.

However, this does not cover cases where a stack slot is first initialized with a pointer (subject to sanitization) but then overwritten with a scalar (not subject to sanitization because the slot was already initialized). In this case, the second write may be subject to speculative store bypass (SSB), creating a speculative pointer-as-scalar type confusion. This allows the program to subsequently leak the numerical pointer value using, for example, a branch-based cache side channel.

To fix this, also sanitize scalars if they write a stack slot that previously contained a pointer. Assuming that pointer-spills are only generated by LLVM on register-pressure, the performance impact on most real-world BPF programs should be small.

The following unprivileged BPF bytecode drafts a minimal exploit and the mitigation:

  [...]
  // r6 = 0 or 1 (scalar, unknown user input)
  // r7 = accessible ptr for side channel
  // r10 = frame pointer (fp), to be leaked
  //
  r9 = r10 # fp alias to encourage ssb
  *(u64 *)(r9 - 8) = r10 // fp[-8] = ptr, to be leaked
  // lfence added here because of pointer spill to stack.
  //
  // Omitted: Dummy bpf_ringbuf_output() here to train alias predictor
  // for no r9-r10 dependency.
  //
  *(u64 *)(r10 - 8) = r6 // fp[-8] = scalar, overwrites ptr
  // 2039f26f3aca: no lfence added because stack slot was not STACK_INVALID,
  // store may be subject to SSB
  //
  // fix: also add an lfence when the slot contained a ptr
  //
  r8 = *(u64 *)(r9 - 8)
  // r8 = architecturally a scalar, speculatively a ptr
  //
  // leak ptr using branch-based cache side channel:
  r8 &= 1 // choose bit to leak
  if r8 == 0 goto SLOW // no mispredict
  // architecturally dead code if input r6 is 0,
  // only executes speculatively iff ptr bit is 1
  r8 = *(u64 *)(r7 + 0) # encode bit in cache (0: slow, 1: fast)
  SLOW:
  [...]

After running this, the program can time the access to *(r7 + 0) to determine whether the chosen pointer bit was 0 or 1. Repeat this 64 times to recover the whole address on amd64.

In summary, sanitization can only be skipped if one scalar is overwritten with another scalar.
Scalar-confusion due to speculative store bypass cannot lead to invalid accesses because the pointer bounds deduced during verification are enforced using branchless logic. See 979d63d50c0c ("bpf: prevent out of bounds speculation on pointer arithmetic") for details. Do not make the mitigation depend on !env->allow_{uninit_stack,ptr_leaks} because speculative leaks are likely unexpected if these were enabled. For example, leaking the address to a protected log file may be acceptable while disabling the mitigation might unintentionally leak the address into the cached-state of a map that is accessible to unprivileged processes. Fixes: 2039f26f3aca ("bpf: Fix leakage due to insufficient speculative store bypass mitigation") Signed-off-by: Luis Gerhorst Signed-off-by: Daniel Borkmann Acked-by: Henriette Hofmeier Link: https://lore.kernel.org/bpf/edc95bad-aada-9cfc-ffe2-fa9bb206583c@cs.fau.de Link: https://lore.kernel.org/bpf/20230109150544.41465-1-gerhorst@cs.fau.de Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 81aa127a469d..e6cdf91cecab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1012,7 +1012,9 @@ static int check_stack_write(struct bpf_verifier_env *env, bool sanitize = reg && is_spillable_regtype(reg->type); for (i = 0; i < size; i++) { - if (state->stack[spi].slot_type[i] == STACK_INVALID) { + u8 type = state->stack[spi].slot_type[i]; + + if (type != STACK_MISC && type != STACK_ZERO) { sanitize = true; break; } From cc1fc432be36c897f0fa09ffa944dba2cf1bf3ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 9 Oct 2020 20:42:34 +0200 Subject: [PATCH 0903/1640] UPSTREAM: bpf: Always return target ifindex in bpf_fib_lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d1c362e1dd68a421cf9033404cf141a4ab734a5d upstream. The bpf_fib_lookup() helper performs a neighbour lookup for the destination IP and returns BPF_FIB_LKUP_RET_NO_NEIGH if this fails, with the expectation that the BPF program will pass the packet up the stack in this case. However, with the addition of bpf_redirect_neigh(), the neighbour lookup can instead be performed by the BPF program itself, at the cost of a bit of duplicated work. For that we still need the target ifindex, and since bpf_fib_lookup() already has that at the time it performs the neighbour lookup, there is really no reason why it can't just return it in any case. So let's just always return the ifindex if the FIB lookup itself succeeds.
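For illustration only (this sketch is not part of the patch): a tc program consuming the now always-returned ifindex could look roughly like the following, assuming the original two-argument bpf_redirect_neigh(ifindex, flags) form from the same development cycle, with header parsing and MAC rewriting elided:

	SEC("tc")
	int fwd(struct __sk_buff *skb)
	{
		struct bpf_fib_lookup params = {};
		int ret;

		/* params.family and addresses filled from the packet (elided) */
		ret = bpf_fib_lookup(skb, &params, sizeof(params), 0);
		if (ret == BPF_FIB_LKUP_RET_SUCCESS)
			/* neighbour resolved: params.smac/dmac are valid */
			return bpf_redirect(params.ifindex, 0);
		if (ret == BPF_FIB_LKUP_RET_NO_NEIGH)
			/* FIB entry found but no neighbour entry: let the
			 * helper resolve it via the returned ifindex. */
			return bpf_redirect_neigh(params.ifindex, 0);
		return TC_ACT_OK; /* otherwise pass up the stack */
	}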
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Cc: David Ahern Link: https://lore.kernel.org/bpf/20201009184234.134214-1-toke@redhat.com Signed-off-by: Greg Kroah-Hartman --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index aa3e5f7ced39..46fc2daf9564 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4235,7 +4235,6 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; - params->ifindex = dev->ifindex; return 0; } @@ -4333,6 +4332,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, params->ipv4_dst = nh->nh_gw; params->rt_metric = res.fi->fib_priority; + params->ifindex = dev->ifindex; /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here @@ -4447,6 +4447,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, dev = f6i->fib6_nh.nh_dev; params->rt_metric = f6i->fib6_metric; + params->ifindex = dev->ifindex; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. Can not use __ipv6_neigh_lookup_noref here From 6cfb3d550c53e07531bef3062a6f208f75a693a3 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 21 Feb 2023 12:30:15 -0800 Subject: [PATCH 0904/1640] UPSTREAM: uaccess: Add speculation barrier to copy_from_user() commit 74e19ef0ff8061ef55957c3abd71614ef0f42f47 upstream. The results of "access_ok()" can be mis-speculated. The result is that you can end up speculatively executing past: if (access_ok(from, size)) // Right here even for bad from/size combinations. On first glance, it would be ideal to just add a speculation barrier to "access_ok()" so that its results can never be mis-speculated. But there are lots of system calls just doing access_ok() via "copy_to_user()" and friends (example: fstat() and friends). Those are generally not problematic because they do not _consume_ data from userspace other than the pointer. They are also very quick and common system calls that should not be needlessly slowed down. "copy_from_user()" on the other hand uses a user-controlled pointer and is frequently followed up with code that might affect caches. Take something like this: if (!copy_from_user(&kernelvar, uptr, size)) do_something_with(kernelvar); If userspace passes in an evil 'uptr' that *actually* points to a kernel address, and then do_something_with() has cache (or other) side-effects, it could allow userspace to infer kernel data values. Add a barrier to the common copy_from_user() code to prevent mis-speculated values which happen after the copy. Also add a stub for architectures that do not define barrier_nospec(). This makes the macro usable in generic code. Since the barrier is now usable in generic code, the x86 #ifdef in the BPF code can also go away. Reported-by: Jordy Zomer Suggested-by: Linus Torvalds Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Acked-by: Daniel Borkmann # BPF bits Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ff9424e399af..e632e451d530 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1389,9 +1389,7 @@ out: * reuse preexisting logic from Spectre v1 mitigation that * happens to produce the required code on x86 for v4 as well.
*/ -#ifdef CONFIG_X86 barrier_nospec(); -#endif CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ From bd0b35010632ed8705c87a40b7bb657fc6951a47 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 22 Feb 2023 09:52:32 -0800 Subject: [PATCH 0905/1640] UPSTREAM: bpf: add missing header file include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f3dd0c53370e70c0f9b7e931bbec12916f3bb8cc upstream. Commit 74e19ef0ff80 ("uaccess: Add speculation barrier to copy_from_user()") built fine on x86-64 and arm64, and that's the extent of my local build testing. It turns out those got the <linux/nospec.h> include incidentally through other header files (<linux/kvm_host.h> in particular), but that was not true of other architectures, resulting in build errors kernel/bpf/core.c: In function ‘___bpf_prog_run’: kernel/bpf/core.c:1913:3: error: implicit declaration of function ‘barrier_nospec’ so just make sure to explicitly include the proper <linux/nospec.h> header file to make everybody see it. Fixes: 74e19ef0ff80 ("uaccess: Add speculation barrier to copy_from_user()") Reported-by: kernel test robot Reported-by: Viresh Kumar Reported-by: Huacai Chen Tested-by: Geert Uytterhoeven Tested-by: Dave Hansen Acked-by: Alexei Starovoitov Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e632e451d530..604cdf006f66 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -32,6 +32,7 @@ #include #include #include +#include <linux/nospec.h> #ifdef CONFIG_RKP_MODULE_SUPPORT #include From f6dc08b77289f80d38f546abdcd9e07d0621c146 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 18 May 2023 11:25:28 +0100 Subject: [PATCH 0906/1640] UPSTREAM: bpf: Fix mask generation for 32-bit narrow loads of 64-bit fields commit 0613d8ca9ab382caabe9ed2dceb429e9781e443f upstream. A narrow load from a 64-bit context field results in a 64-bit load followed potentially by a 64-bit right-shift and then a bitwise AND operation to extract the relevant data. In the case of a 32-bit access, an immediate mask of 0xffffffff is used to construct a 64-bit BPF_AND operation which then sign-extends the mask value and effectively acts as a glorified no-op.
For example: 0: 61 10 00 00 00 00 00 00 r0 = *(u32 *)(r1 + 0) results in the following code generation for a 64-bit field: ldr x7, [x7] // 64-bit load mov x10, #0xffffffffffffffff and x7, x7, x10 Fix the mask generation so that narrow loads always perform a 32-bit AND operation: ldr x7, [x7] // 64-bit load mov w10, #0xffffffff and w7, w7, w10 Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: John Fastabend Cc: Krzesimir Nowak Cc: Andrey Ignatov Acked-by: Yonghong Song Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20230518102528.1341-1-will@kernel.org Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e6cdf91cecab..2c1689c1132b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5945,7 +5945,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, insn->dst_reg, shift); - insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, (1ULL << size * 8) - 1); } } From d7d63adfaac8338eaa64c6b6f6f230928b4bcc82 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 11 Sep 2023 12:47:30 -0700 Subject: [PATCH 0907/1640] BACKPORT: bpf: Clarify error expectations from bpf_clone_redirect [ Upstream commit 7cb779a6867fea00b4209bcf6de2f178a743247d ] Commit 151e887d8ff9 ("veth: Fixing transmit return status for dropped packets") exposed the fact that bpf_clone_redirect is capable of returning raw NET_XMIT_XXX return codes. This conflicts with its UAPI doc, which says the following: "0 on success, or a negative error in case of failure." Update the UAPI to reflect the fact that bpf_clone_redirect can return positive error numbers, but don't explicitly define their meaning. Reported-by: Daniel Borkmann Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230911194731.286342-1-sdf@google.com Signed-off-by: Sasha Levin --- include/uapi/linux/bpf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8b9fb412fbe5..17cff15efcdf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -695,7 +695,9 @@ union bpf_attr { * performed again, if the helper is used in combination with * direct packet access. * Return - * 0 on success, or a negative error in case of failure. + * 0 on success, or a negative error in case of failure. Positive + * error indicates a potential drop or congestion in the target + * device. The particular positive error codes are not defined. * * u64 bpf_get_current_pid_tgid(void) * Return From aac2e51438dd80e32f6f16f43362e5e9c4ff18de Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Tue, 10 Sep 2024 22:41:10 +0800 Subject: [PATCH 0908/1640] UPSTREAM: bpf: Check percpu map value size first [ Upstream commit 1d244784be6b01162b732a5a7d637dfc024c3203 ] Percpu maps are widely used, but the limit on their value size is often overlooked, as in this issue: https://github.com/iovisor/bcc/issues/2519. The percpu map value size is bounded by PCPU_MIN_UNIT_SIZE, so we can check up front whether the value size exceeds PCPU_MIN_UNIT_SIZE, as the percpu local_storage map already does. The resulting error message is also clearer than "cannot allocate memory".
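As a rough illustration of the new behaviour (a hypothetical userspace sketch, not part of this patch, using libbpf's bpf_map_create(); PCPU_MIN_UNIT_SIZE is 32KB on common 4KB-page configs):

	#include <errno.h>
	#include <bpf/bpf.h>

	/* A 64KB value_size exceeds PCPU_MIN_UNIT_SIZE, so map creation now
	 * fails up front with E2BIG instead of a later "cannot allocate
	 * memory" (ENOMEM) from the percpu allocator. */
	int fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_ARRAY, "big_percpu",
				sizeof(__u32), 64 * 1024, 1, NULL);
	if (fd < 0 && errno == E2BIG)
		/* rejected by the new alloc_check */;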
Signed-off-by: Jinke Han Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240910144111.1464912-2-chen.dylane@gmail.com Signed-off-by: Sasha Levin --- kernel/bpf/arraymap.c | 3 +++ kernel/bpf/hashtab.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 44f53c06629e..03e244b11f5a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -71,6 +71,9 @@ int array_map_alloc_check(union bpf_attr *attr) * access the elements. */ return -E2BIG; + /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */ + if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE) + return -E2BIG; return 0; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 16081d8384bf..bca328703046 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -291,6 +291,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) * kmalloc-able later in htab_map_update_elem() */ return -E2BIG; + /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */ + if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE) + return -E2BIG; return 0; } From 000cd427e986dab99e82073a3201990f11ca2b5d Mon Sep 17 00:00:00 2001 From: Byeonguk Jeong Date: Sat, 26 Oct 2024 14:02:43 +0900 Subject: [PATCH 0909/1640] UPSTREAM: bpf: Fix out-of-bounds write in trie_get_next_key() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 13400ac8fb80c57c2bfb12ebd35ee121ce9b4d21 ] trie_get_next_key() allocates a node stack with size trie->max_prefixlen, while it writes (trie->max_prefixlen + 1) nodes to the stack when it has full paths from the root to leaves. For example, consider a trie whose max_prefixlen is 8, with nodes keyed 0x00/0, 0x00/1, 0x00/2, ... 0x00/8 inserted. A subsequent call to trie_get_next_key() with a _key whose .prefixlen is 8 then writes 9 nodes onto the node stack of size 8. Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map") Signed-off-by: Byeonguk Jeong Reviewed-by: Toke Høiland-Jørgensen Tested-by: Hou Tao Acked-by: Hou Tao Link: https://lore.kernel.org/r/Zxx384ZfdlFYnz6J@localhost.localdomain Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/lpm_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index fcd3a15add41..a929ee0e86b1 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -629,7 +629,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - node_stack = kmalloc_array(trie->max_prefixlen, + node_stack = kmalloc_array(trie->max_prefixlen + 1, sizeof(struct lpm_trie_node *), GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) From ac606334e7e73f183c77140c774d59337b53e2cc Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 8 Oct 2024 17:07:35 -0400 Subject: [PATCH 0910/1640] UPSTREAM: bpf: use kvzmalloc to allocate BPF verifier environment [ Upstream commit 434247637c66e1be2bc71a9987d4c3f0d8672387 ] The kzalloc call in bpf_check() can fail when memory is very fragmented, which in turn can lead to an OOM kill. Use kvzalloc to fall back to vmalloc when memory is too fragmented to allocate an order 3 sized bpf verifier environment.
Admittedly this is not a very common case, and only happens on systems where memory has already been squeezed close to the limit, but this does not seem like much of a hot path, and it's a simple enough fix. Signed-off-by: Rik van Riel Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/r/20241008170735.16766766@imladris.surriel.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2c1689c1132b..078c71d13dc3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6455,7 +6455,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; log = &env->log; @@ -6582,6 +6582,6 @@ err_unlock: mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: - kfree(env); + kvfree(env); return ret; } From 6a193821d385a29d09c2501505503976fe921247 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Tue, 24 Apr 2018 15:07:54 +0200 Subject: [PATCH 0911/1640] UPSTREAM: bpf: allow map helpers access to map values directly Helpers that expect ARG_PTR_TO_MAP_KEY and ARG_PTR_TO_MAP_VALUE can only access stack and packet memory. Allow these helpers to directly access map values by passing registers of type PTR_TO_MAP_VALUE. This change removes the need for an extra copy to the stack when using a map value to perform a second map lookup, as in the following (NULL checks added so the example passes the verifier): struct bpf_map_def SEC("maps") infobyreq = { .type = BPF_MAP_TYPE_HASH, .key_size = sizeof(struct request *), .value_size = sizeof(struct info_t), .max_entries = 1024, }; struct bpf_map_def SEC("maps") counts = { .type = BPF_MAP_TYPE_HASH, .key_size = sizeof(struct info_t), .value_size = sizeof(u64), .max_entries = 1024, }; SEC("kprobe/blk_account_io_start") int bpf_blk_account_io_start(struct pt_regs *ctx) { struct info_t *info = bpf_map_lookup_elem(&infobyreq, &ctx->di); u64 *count; if (!info) return 0; /* map value used directly as the second lookup's key */ count = bpf_map_lookup_elem(&counts, info); if (!count) return 0; (*count)++; return 0; } Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 078c71d13dc3..1a410e1753c4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1983,7 +1983,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && + if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || @@ -2038,14 +2038,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } - if (type_is_pkt_pointer(type)) - err = check_packet_access(env, regno, reg->off, - meta->map_ptr->key_size, - false); - else - err = check_stack_boundary(env, regno, - meta->map_ptr->key_size, - false, NULL); + err = check_helper_mem_access(env, regno, + meta->map_ptr->key_size, false, + NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value +
map->value_size) validity @@ -2055,14 +2050,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } - if (type_is_pkt_pointer(type)) - err = check_packet_access(env, regno, reg->off, - meta->map_ptr->value_size, - false); - else - err = check_stack_boundary(env, regno, - meta->map_ptr->value_size, - false, NULL); + err = check_helper_mem_access(env, regno, + meta->map_ptr->value_size, false, + NULL); } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); From 0072cacddfb8e98a24e84655ebd50523b5197efb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 3 Apr 2019 18:39:01 +0000 Subject: [PATCH 0912/1640] BACKPORT: bpf: reduce verifier memory consumption commit 638f5b90d46016372a8e3e0a434f199cc5e12b8c upstream. The verifier got progressively smarter over time and the size of its internal state grew as well. Time to reduce the memory consumption. Before: sizeof(struct bpf_verifier_state) = 6520 After: sizeof(struct bpf_verifier_state) = 896 This is done by observing that the majority of BPF programs use little to no stack, whereas the verifier always kept all 512 stack slots ready. Instead, dynamically reallocate the struct verifier state when stack access is detected. The runtime difference before vs after is within the noise. The number of processed instructions stays the same. [Linux4: Reapply to fix variable order] Cc: jakub.kicinski@netronome.com Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller [Backported to 4.14 by sblbir] Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1a410e1753c4..8cf52b387199 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -513,8 +513,8 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, bool speculative) { - struct bpf_verifier_stack_elem *elem; struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_stack_elem *elem; int err; elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); From bf739364c890dfa23b5a5436fffcc35df9a02b75 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 3 Apr 2019 18:39:12 +0000 Subject: [PATCH 0913/1640] BACKPORT: bpf: prevent out of bounds speculation on pointer arithmetic commit 979d63d50c0c0f7bc537bf821e056cc9fe5abd38 upstream. Jann reported that the original commit back in b2157399cc98 ("bpf: prevent out-of-bounds speculation") was not sufficient to stop the CPU from speculating an out-of-bounds memory access: While b2157399cc98 only focussed on masking array map access for unprivileged users for tail calls and data access such that the user-provided index gets sanitized from the BPF program and syscall side, there is still a more generic form affected from BPF programs that applies to most maps that hold user data in relation to dynamic map access when dealing with unknown scalars or "slow" known scalars as access offset, for example: - Load a map value pointer into R6 - Load an index into R7 - Do a slow computation (e.g. with a memory dependency) that loads a limit into R8 (e.g.
load the limit from a map for high latency, then mask it to make the verifier happy) - Exit if R7 >= R8 (mispredicted branch) - Load R0 = R6[R7] - Load R0 = R6[R0] For unknown scalars there are two options in the BPF verifier where we could derive knowledge from in order to guarantee safe access to the memory: i) While the </>/<=/>= variants won't allow us to derive any lower or upper bounds from the unknown scalar where it would be safe to add it to the map value pointer, it is possible through the ==/!= test however. ii) Another option is to transform the unknown scalar into a known scalar, for example, through an ALU ops combination such as R &= <imm> followed by R |= <imm> or any similar combination where the original information from the unknown scalar would be destroyed entirely, leaving R with a constant. The initial slow load still precedes the latter ALU ops on that register, so the CPU executes speculatively from that point. Once we have the known scalar, any compare operation would work then. A third option only involving registers with known scalars could be crafted as described in [0] where a CPU port (e.g. Slow Int unit) would be filled with many dependent computations such that the subsequent condition depending on its outcome has to wait for evaluation on its execution port and thereby executing speculatively if the speculated code can be scheduled on a different execution port, or any other form of mistraining as described in [1], for example. Given this is not limited to only unknown scalars, not only map but also stack access is affected since both are accessible to unprivileged users and could potentially be used for out of bounds access under speculation. In order to prevent any of these cases, the verifier is now sanitizing pointer arithmetic on the offset such that any out of bounds speculation would be masked in a way where the pointer arithmetic result in the destination register will stay unchanged, meaning the offset is masked to zero, similar to the array_index_nospec() case. With regards to implementation, there are three options that were considered: i) a new insn for sanitation, ii) a push/pop insn and sanitation as inlined BPF, iii) reuse of the ax register and sanitation as inlined BPF. Option i) has the downside that we end up using reserved bits in the opcode space, but also that we would require each JIT to emit masking as native arch opcodes, meaning the mitigation would see slow adoption until everyone implements it eventually, which is counter-productive. Options ii) and iii) both have in common that a temporary register is needed in order to implement the sanitation as inlined BPF since we are not allowed to modify the source register. While a push / pop insn in ii) would be useful to have in any case, it requires once again that every JIT needs to implement it first. While possible, the amount of changes needed would also be unsuitable for a -stable patch. Therefore, the path which has fewer changes, fewer BPF instructions for the mitigation and does not require anything to be changed in the JITs is option iii), which this work is pursuing. The ax register is already mapped to a register in all JITs (modulo arm32 where it's mapped to stack as various other BPF registers there) and used in constant blinding for JITs-only so far. It can be reused for verifier rewrites under certain constraints.
The interpreter's tmp "register" has therefore been remapped into extending the register set with hidden ax register and reusing that for a number of instructions that needed the prior temporary variable internally (e.g. div, mod). This allows for zero increase in stack space usage in the interpreter, and enables (restricted) generic use in rewrites otherwise as long as such a patchlet does not make use of these instructions. The sanitation mask is dynamic and relative to the offset the map value or stack pointer currently holds. There are various cases that need to be taken under consideration for the masking, e.g. such operation could look as follows: ptr += val or val += ptr or ptr -= val. Thus, the value to be sanitized could reside either in source or in destination register, and the limit is different depending on whether the ALU op is addition or subtraction and depending on the current known and bounded offset. The limit is derived as follows: limit := max_value_size - (smin_value + off). For subtraction: limit := umax_value + off. This holds because we do not allow any pointer arithmetic that would temporarily go out of bounds or would have an unknown value with mixed signed bounds where it is unclear at verification time whether the actual runtime value would be either negative or positive. For example, we have a derived map pointer value with constant offset and bounded one, so limit based on smin_value works because the verifier requires that statically analyzed arithmetic on the pointer must be in bounds, and thus it checks if resulting smin_value + off and umax_value + off is still within map value bounds at time of arithmetic in addition to time of access. Similarly, for the case of stack access we derive the limit as follows: MAX_BPF_STACK + off for subtraction and -off for the case of addition where off := ptr_reg->off + ptr_reg->var_off.value. Subtraction is a special case for the masking which can be in form of ptr += -val, ptr -= -val, or ptr -= val. In the first two cases where we know that the value is negative, we need to temporarily negate the value in order to do the sanitation on a positive value where we later swap the ALU op, and restore original source register if the value was in source. The sanitation of pointer arithmetic alone is still not fully sufficient as is, since a scenario like the following could happen ... PTR += 0x1000 (e.g. K-based imm) PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON PTR += 0x1000 PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON [...] ... which under speculation could end up as ... PTR += 0x1000 PTR -= 0 [ truncated by mitigation ] PTR += 0x1000 PTR -= 0 [ truncated by mitigation ] [...] ... and therefore still access out of bounds. To prevent such case, the verifier is also analyzing safety for potential out of bounds access under speculative execution. Meaning, it is also simulating pointer access under truncation. We therefore "branch off" and push the current verification state after the ALU operation with known 0 to the verification stack for later analysis. Given the current path analysis succeeded it is likely that the one under speculation can be pruned. In any case, it is also subject to existing complexity limits and therefore anything beyond this point will be rejected. In terms of pruning, it needs to be ensured that the verification state from speculative execution simulation must never prune a non-speculative execution path, therefore, we mark verifier state accordingly at the time of push_stack(). 
If verifier detects out of bounds access under speculative execution from one of the possible paths that includes a truncation, it will reject such program. Given we mask every reg-based pointer arithmetic for unprivileged programs, we've been looking into how it could affect real-world programs in terms of size increase. As the majority of programs are targeted for privileged-only use case, we've unconditionally enabled masking (with its alu restrictions on top of it) for privileged programs for the sake of testing in order to check i) whether they get rejected in its current form, and ii) by how much the number of instructions and size will increase. We've tested this by using Katran, Cilium and test_l4lb from the kernel selftests. For Katran we've evaluated balancer_kern.o, Cilium bpf_lxc.o and an older test object bpf_lxc_opt_-DUNKNOWN.o and l4lb we've used test_l4lb.o as well as test_l4lb_noinline.o. We found that none of the programs got rejected by the verifier with this change, and that impact is rather minimal to none. balancer_kern.o had 13,904 bytes (1,738 insns) xlated and 7,797 bytes JITed before and after the change. Most complex program in bpf_lxc.o had 30,544 bytes (3,817 insns) xlated and 18,538 bytes JITed before and after and none of the other tail call programs in bpf_lxc.o had any changes either. For the older bpf_lxc_opt_-DUNKNOWN.o object we found a small increase from 20,616 bytes (2,576 insns) and 12,536 bytes JITed before to 20,664 bytes (2,582 insns) and 12,558 bytes JITed after the change. Other programs from that object file had similar small increase. Both test_l4lb.o had no change and remained at 6,544 bytes (817 insns) xlated and 3,401 bytes JITed and for test_l4lb_noinline.o constant at 5,080 bytes (634 insns) xlated and 3,313 bytes JITed. This can be explained in that LLVM typically optimizes stack based pointer arithmetic by using K-based operations and that use of dynamic map access is not overly frequent. However, in future we may decide to optimize the algorithm further under known guarantees from branch and value speculation. Latter seems also unclear in terms of prediction heuristics that today's CPUs apply as well as whether there could be collisions in e.g. the predictor's Value History/Pattern Table for triggering out of bounds access, thus masking is performed unconditionally at this point but could be subject to relaxation later on. We were generally also brainstorming various other approaches for mitigation, but the blocker was always lack of available registers at runtime and/or overhead for runtime tracking of limits belonging to a specific pointer. Thus, we found this to be minimally intrusive under given constraints. With that in place, a simple example with sanitized access on unprivileged load at post-verification time looks as follows: # bpftool prog dump xlated id 282 [...] 28: (79) r1 = *(u64 *)(r7 +0) 29: (79) r2 = *(u64 *)(r7 +8) 30: (57) r1 &= 15 31: (79) r3 = *(u64 *)(r0 +4608) 32: (57) r3 &= 1 33: (47) r3 |= 1 34: (2d) if r2 > r3 goto pc+19 35: (b4) (u32) r11 = (u32) 20479 | 36: (1f) r11 -= r2 | Dynamic sanitation for pointer 37: (4f) r11 |= r2 | arithmetic with registers 38: (87) r11 = -r11 | containing bounded or known 39: (c7) r11 s>>= 63 | scalars in order to prevent 40: (5f) r11 &= r2 | out of bounds speculation. 41: (0f) r4 += r11 | 42: (71) r4 = *(u8 *)(r4 +0) 43: (6f) r4 <<= r1 [...] 
For the case where the scalar sits in the destination register as opposed to the source register, the following code is emitted for the above example: [...] 16: (b4) (u32) r11 = (u32) 20479 17: (1f) r11 -= r2 18: (4f) r11 |= r2 19: (87) r11 = -r11 20: (c7) r11 s>>= 63 21: (5f) r2 &= r11 22: (0f) r2 += r0 23: (61) r0 = *(u32 *)(r2 +0) [...] JIT blinding example with non-conflicting use of r10: [...] d5: je 0x0000000000000106 _ d7: mov 0x0(%rax),%edi | da: mov $0xf153246,%r10d | Index load from map value and e0: xor $0xf153259,%r10 | (const blinded) mask with 0x1f. e7: and %r10,%rdi |_ ea: mov $0x2f,%r10d | f0: sub %rdi,%r10 | Sanitized addition. Both use r10 f3: or %rdi,%r10 | but do not interfere with each f6: neg %r10 | other. (Neither do these instructions f9: sar $0x3f,%r10 | interfere with the use of ax as temp fd: and %r10,%rdi | in interpreter.) 100: add %rax,%rdi |_ 103: mov 0x0(%rdi),%eax [...] Tested that it fixes Jann's reproducer, and also checked that the test_verifier and test_progs suites with interpreter, JIT and JIT with hardening enabled on x86-64 and arm64 run successfully. [0] Speculose: Analyzing the Security Implications of Speculative Execution in CPUs, Giorgi Maisuradze and Christian Rossow, https://arxiv.org/pdf/1801.04084.pdf [1] A Systematic Evaluation of Transient Execution Attacks and Defenses, Claudio Canella, Jo Van Bulck, Michael Schwarz, Moritz Lipp, Benjamin von Berg, Philipp Ortner, Frank Piessens, Dmitry Evtyushkin, Daniel Gruss, https://arxiv.org/pdf/1811.05441.pdf [Linux4: Reapply to fix order] Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov Signed-off-by: Vallish Vaidyeshwara [some checkpatch cleanups and backported to 4.14 by sblbir] Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8cf52b387199..67943bf49339 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -524,12 +524,12 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; - elem->st.speculative |= speculative; env->head = elem; env->stack_size++; err = copy_verifier_state(&elem->st, cur); if (err) goto err; + elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose(env, "BPF program is too complex\n"); goto err; From 41f243145aabd2182b4ca4387f53512d9f7764bb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 31 May 2021 18:25:49 +0000 Subject: [PATCH 0914/1640] BACKPORT: bpf: do not allow root to mangle valid pointers commit 82abbf8d2fc46d79611ab58daa7c608df14bb3ee upstream. Do not allow root to convert valid pointers into unknown scalars. In particular disallow: ptr &= reg ptr <<= reg ptr += ptr and explicitly allow: ptr -= ptr since pkt_end - pkt == length. 1. This minimizes the amount of address leaks root can do. In the future we may need to further tighten the leaks with kptr_restrict. 2. If a program has such pointer math, it's likely a user mistake, and when the verifier complains about it right away instead of many instructions later on an invalid memory access, it's easier for users to fix their progs. 3. When a register holding a pointer cannot change to a scalar, it allows JITs to optimize better.
For example, 32-bit archs could use a single register for pointers instead of the pair required to hold 64-bit scalars. 4. It reduces architecture-dependent behavior, since code like: r1 = r10; r1 &= 0xff; if (r1 ...) will behave differently on arm64 vs x64 and offloaded vs native. A significant chunk of ptr mangling was allowed by commit f1174f77b50c ("bpf/verifier: rework value tracking") yet some of it was allowed even earlier. [Linux4: Reapply to fix comment] Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann [fllinden@amazon.com: backport to 4.14] Signed-off-by: Frank van der Linden Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 67943bf49339..7d0417ca98ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3191,7 +3191,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_AND: case BPF_OR: case BPF_XOR: - /* bitwise ops on pointers are troublesome. */ + /* bitwise ops on pointers are troublesome, prohibit. */ verbose(env, "R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; From f3c86541947bc6cc7931a34db6bb09d58b95cb49 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 7 Jun 2018 17:40:03 +0200 Subject: [PATCH 0915/1640] BACKPORT: bpf: reject passing modified ctx to helper functions commit 58990d1ff3f7896ee341030e9a7c2e4002570683 upstream. As commit 28e33f9d78ee ("bpf: disallow arithmetic operations on context pointer") already describes, f1174f77b50c ("bpf/verifier: rework value tracking") removed the specific white-listed cases we had previously where we would allow for pointer arithmetic in order to further generalize it, and allow e.g. context access via modified registers. While the dereferencing of modified context pointers had been forbidden through 28e33f9d78ee, syzkaller did recently manage to trigger several KASAN splats for slab out of bounds access and use after frees by simply passing a modified context pointer to a helper function which would then do the bad access, since the verifier allowed it in adjust_ptr_min_max_vals(). Rejecting arithmetic on the ctx pointer in adjust_ptr_min_max_vals() generally could break existing programs, as there's a valid use case in tracing in combination with passing the ctx to helpers such as bpf_probe_read(), where the register then becomes unknown at verification time due to adding a non-constant offset to it. An access sequence may look like the following: offset = args->filename; /* field __data_loc filename */ bpf_probe_read(&dst, len, (char *)args + offset); // args is ctx There are two options: i) we could special case the ctx and as soon as we add a constant or bounded offset to it (hence the ctx type wouldn't change) we could turn the ctx into an unknown scalar, or ii) we generalize the sanity test for ctx member access into a small helper and assert it on the ctx register that was passed as a function argument. Fwiw, the latter is more obvious and less complex at the same time, and one case that may potentially be legitimate in the future for ctx member access at least would be for ctx to carry a const offset. Therefore, the fix follows approach ii) and adds test cases to BPF kselftests.
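To make the two cases concrete, in the style of the sequence above (an illustrative sketch only):

	// now rejected: the helper argument is expected to be ctx, but
	// the ctx pointer carries a non-zero offset
	r1 = ctx
	r1 += 8
	call bpf_perf_event_output	// check_ctx_reg(): ctx reg must have off 0

	// still fine: ctx used as a plain memory source rather than as a
	// ctx-typed argument
	offset = args->filename;
	bpf_probe_read(&dst, len, (char *)args + offset);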
[Linux4: Reapply to fix missing newline] Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Reported-by: syzbot+3d0b2441dbb71751615e@syzkaller.appspotmail.com Reported-by: syzbot+c8504affd4fdd0c1b626@syzkaller.appspotmail.com Reported-by: syzbot+e5190cb881d8660fb1a3@syzkaller.appspotmail.com Reported-by: syzbot+efae31b384d5badbd620@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Edward Cree Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7d0417ca98ce..de0926cff835 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1657,6 +1657,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } + err = check_ctx_reg(env, reg, regno); if (err < 0) return err; From 1011830d2933f81d2f319006dd436bf0248bdfff Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 28 Aug 2018 07:42:32 +0000 Subject: [PATCH 0916/1640] UPSTREAM: bpf: remove duplicated include from syscall.c Remove duplicated include. Change-Id: I364d76cce6b82335ebe10986dc0a47b99c1465db Signed-off-by: YueHaibing Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 02e5bdb82a9a..dcef8e5c2c65 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ From 60a412bd2222b312b80db1dccee8493af3f90515 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 22 Aug 2018 20:02:44 +0100 Subject: [PATCH 0917/1640] UPSTREAM: bpf/verifier: display non-spill stack slot types in print_verifier_state If a stack slot does not hold a spilled register (STACK_SPILL), then each of its eight bytes could potentially have a different slot_type. This information can be important for debugging, and previously we either did not print anything for the stack slot, or just printed fp-X=0 in the case where its first byte was STACK_ZERO. Instead, print eight characters with either 0 (STACK_ZERO), m (STACK_MISC) or ? (STACK_INVALID) for any stack slot which is neither STACK_SPILL nor entirely STACK_INVALID. 
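For instance, a slot whose first byte was zeroed, whose next three bytes hold miscellaneous scalar data, and whose remaining four bytes were never written might now be rendered as (hypothetical output derived from the mapping above): fp-8=0mmm????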
Change-Id: I1e0dd51823b87b981be1d0bbb96b382e07666fc3 Signed-off-by: Edward Cree Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index de0926cff835..096bd4689549 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -263,6 +263,13 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +static char slot_type_char[] = { + [STACK_INVALID] = '?', + [STACK_SPILL] = 'r', + [STACK_MISC] = 'm', + [STACK_ZERO] = '0', +}; + static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { @@ -349,15 +356,26 @@ static void print_verifier_state(struct bpf_verifier_env *env, } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) { - verbose(env, " fp%d", - (-i - 1) * BPF_REG_SIZE); - print_liveness(env, state->stack[i].spilled_ptr.live); + char types_buf[BPF_REG_SIZE + 1]; + bool valid = false; + int j; + + for (j = 0; j < BPF_REG_SIZE; j++) { + if (state->stack[i].slot_type[j] != STACK_INVALID) + valid = true; + types_buf[j] = slot_type_char[ + state->stack[i].slot_type[j]]; + } + types_buf[BPF_REG_SIZE] = 0; + if (!valid) + continue; + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, state->stack[i].spilled_ptr.live); + if (state->stack[i].slot_type[0] == STACK_SPILL) verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); - } - if (state->stack[i].slot_type[0] == STACK_ZERO) - verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); + else + verbose(env, "=%s", types_buf); } verbose(env, "\n"); } From c71bbb5912eec8db3139b09b96211e7911f70945 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 29 Aug 2018 14:43:13 -0700 Subject: [PATCH 0918/1640] UPSTREAM: bpf: add bpffs pretty print for percpu arraymap/hash/lru_hash Added bpffs pretty print for percpu arraymap, percpu hashmap and percpu lru hashmap. For each map <key, value> pair, the format is: <key_value>: { cpu0: <value_on_cpu0> cpu1: <value_on_cpu1> ... cpun: <value_on_cpun> } For example, on my VM, there are 4 cpus, and for the test_btf test in the next patch: cat /sys/fs/bpf/pprint_test_percpu_hash You may get: ... 43602: { cpu0: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO} cpu1: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO} cpu2: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO} cpu3: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO} } 72847: { cpu0: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE} cpu1: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE} cpu2: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE} cpu3: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE} } ...
Change-Id: I286e7505765aa92ea9a8919ddecf8434a24fc187 Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 24 ++++++++++++++++++++++++ kernel/bpf/hashtab.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 03e244b11f5a..a0ef2006bba0 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -361,6 +361,29 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu; + + rcu_read_lock(); + + seq_printf(m, "%u: {\n", *(u32 *)key); + pptr = array->pptrs[index & array->index_mask]; + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(pptr, cpu), m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + + rcu_read_unlock(); +} + static int array_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) @@ -401,6 +424,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index bca328703046..ae6ec91035cc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1303,6 +1303,35 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, return ret; } +static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + struct htab_elem *l; + void __percpu *pptr; + int cpu; + + rcu_read_lock(); + + l = __htab_map_lookup_elem(map, key); + if (!l) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": {\n"); + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(pptr, cpu), m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + + rcu_read_unlock(); +} + const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1311,6 +1340,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_lookup_elem = htab_percpu_map_lookup_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_seq_show_elem = htab_percpu_map_seq_show_elem, }; const struct bpf_map_ops htab_lru_percpu_map_ops = { @@ -1321,6 +1351,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_lookup_elem = htab_lru_percpu_map_lookup_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_seq_show_elem = htab_percpu_map_seq_show_elem, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) From 84ace6305a5adb1a95982a3d784a27286870f2b5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 6 Sep 2018 17:26:04 -0700 Subject: [PATCH 0919/1640] UPSTREAM: bpf: add bpffs pretty print for program array map Added bpffs pretty print for program array map. 
For a particular array index, if the program array points to a valid program, the "<index>: <prog_id>" pair will be printed out, like 0: 6 which means the bpf program with id "6" is installed at index "0". Change-Id: Ibfeac1777df6dc8742debe574ba259d212e7ecea Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index a0ef2006bba0..2a9b4ca27a34 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -557,6 +557,29 @@ static void bpf_fd_array_map_clear(struct bpf_map *map) fd_array_map_delete_elem(map, &i); } +static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void **elem, *ptr; + u32 prog_id; + + rcu_read_lock(); + + elem = array_map_lookup_elem(map, key); + if (elem) { + ptr = READ_ONCE(*elem); + if (ptr) { + seq_printf(m, "%u: ", *(u32 *)key); + prog_id = prog_fd_array_sys_lookup_elem(ptr); + btf_type_seq_show(map->btf, map->btf_value_type_id, + &prog_id, m); + seq_puts(m, "\n"); + } + } + + rcu_read_unlock(); +} + const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -568,7 +591,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, - .map_check_btf = map_check_no_btf, + .map_seq_show_elem = prog_array_map_seq_show_elem, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, From 44c6a4a1dd0fd46bf4d91b84ab0d00b8e41d5951 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 14 Sep 2018 07:46:18 -0700 Subject: [PATCH 0920/1640] BACKPORT: flow_dissector: implements flow dissector BPF hook Adds a hook for programs of type BPF_PROG_TYPE_FLOW_DISSECTOR and attach type BPF_FLOW_DISSECTOR that is executed in the flow dissector path. The BPF program is per-network namespace.
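For context, a dissector of this type might look roughly like the following (an illustrative IPv4-only sketch, not taken from this patch: it uses bpf_skb_load_bytes() as permitted by the flow dissector func_proto below, assumes keys->nhoff is pre-initialized by the kernel before the program runs, follows the BPF_OK/BPF_DROP return convention used by the selftests, and elides includes and non-IPv4 handling):

	SEC("flow_dissector")
	int dissect(struct __sk_buff *skb)
	{
		struct bpf_flow_keys *keys = skb->flow_keys;
		struct iphdr iph;

		if (bpf_skb_load_bytes(skb, keys->nhoff, &iph, sizeof(iph)) < 0)
			return BPF_DROP;

		keys->addr_proto = ETH_P_IP;
		keys->n_proto = bpf_htons(ETH_P_IP);
		keys->ip_proto = iph.protocol;
		keys->ipv4_src = iph.saddr;
		keys->ipv4_dst = iph.daddr;
		keys->thoff = keys->nhoff + (iph.ihl << 2);
		return BPF_OK;
	}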
Change-Id: I08d14153138c67d7affbddf241171dc8c036279d Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/linux/skbuff.h | 7 ++ include/net/net_namespace.h | 3 + include/net/sch_generic.h | 12 +++- include/uapi/linux/bpf.h | 26 +++++++ kernel/bpf/syscall.c | 8 +++ kernel/bpf/verifier.c | 32 +++++++++ net/core/filter.c | 70 +++++++++++++++++++ net/core/flow_dissector.c | 134 ++++++++++++++++++++++++++++++++++++ 10 files changed, 291 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d2b881e5d834..bb1954f724a1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -217,6 +217,7 @@ enum bpf_reg_type { PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ + PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ }; /* The information passed from prog-specific *_is_valid_access diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index cd26c090e7c0..22083712dd18 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -32,6 +32,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #ifdef CONFIG_INET BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) #endif +BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector) BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 66d0d75bd03b..005f1b732504 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -243,6 +243,8 @@ struct scatterlist; struct pipe_inode_info; struct iov_iter; struct napi_struct; +struct bpf_prog; +union bpf_attr; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -1189,6 +1191,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog); + +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); + bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index a1fc638aee47..276bd1c3e0c7 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -42,6 +42,7 @@ struct ctl_table_header; struct net_generic; struct sock; struct netns_ipvs; +struct bpf_prog; #define NETDEV_HASHBITS 8 @@ -137,6 +138,8 @@ struct net { #endif struct net_generic __rcu *gen; + struct bpf_prog __rcu *flow_dissector_prog; + /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM struct netns_xfrm xfrm; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c4ab9934b41d..2e5fe6a6eb60 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -19,6 +19,7 @@ struct Qdisc_ops; struct qdisc_walker; struct tcf_walker; struct module; +struct bpf_flow_keys; struct qdisc_rate_table { struct tc_ratespec rate; @@ -255,9 +256,14 @@ struct tcf_proto { }; struct qdisc_skb_cb { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; + union { + struct { + unsigned int pkt_len; + u16 slave_dev_queue_mapping; + u16 tc_classid; + }; + struct bpf_flow_keys *flow_keys; + }; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; 
}; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 17cff15efcdf..266a52e8feac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -152,6 +152,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, }; enum bpf_attach_type { @@ -172,6 +173,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, + BPF_FLOW_DISSECTOR, BPF_CGROUP_UDP4_RECVMSG = 19, BPF_CGROUP_UDP6_RECVMSG, __MAX_BPF_ATTACH_TYPE @@ -2403,6 +2405,7 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; + struct bpf_flow_keys *flow_keys; }; struct bpf_tunnel_key { @@ -2849,4 +2852,27 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +struct bpf_flow_keys { + __u16 nhoff; + __u16 thoff; + __u16 addr_proto; /* ETH_P_* of valid addrs */ + __u8 is_frag; + __u8 is_first_frag; + __u8 is_encap; + __u8 ip_proto; + __be16 n_proto; + __be16 sport; + __be16 dport; + union { + struct { + __be32 ipv4_src; + __be32 ipv4_dst; + }; + struct { + __u32 ipv6_src[4]; /* in6_addr; network order */ + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + }; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dcef8e5c2c65..de5324fc5c33 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1655,6 +1655,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_LIRC_MODE2: ptype = BPF_PROG_TYPE_LIRC_MODE2; break; + case BPF_FLOW_DISSECTOR: + ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; + break; default: return -EINVAL; } @@ -1676,6 +1679,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + break; default: ret = cgroup_bpf_prog_attach(attr, ptype, prog); } @@ -1730,6 +1736,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); + case BPF_FLOW_DISSECTOR: + return skb_flow_dissector_bpf_prog_detach(attr); default: return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 096bd4689549..32b43f491380 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -261,6 +261,7 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET] = "pkt", [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", }; static char slot_type_char[] = { @@ -968,6 +969,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: + case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: return true; default: @@ -1285,6 +1287,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_FLOW_DISSECTOR: if (meta) return meta->pkt_access; @@ -1368,6 +1371,18 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } +static int check_flow_keys_access(struct bpf_verifier_env *env, int off, + int size) +{ + if (size < 0 || off < 0 || + (u64)off + size > sizeof(struct bpf_flow_keys)) { + verbose(env, "invalid access to flow keys off=%d size=%d\n", + off, size); + return -EACCES; + } + return 0; +} + static bool __is_pointer_value(bool allow_ptr_leaks, 
const struct bpf_reg_state *reg) { @@ -1469,6 +1484,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, * right in front, treat it the very same way. */ return check_pkt_ptr_alignment(env, reg, off, size, strict); + case PTR_TO_FLOW_KEYS: + pointer_desc = "flow keys "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1725,6 +1743,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_FLOW_KEYS) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into flow keys\n", + value_regno); + return -EACCES; + } + + err = check_flow_keys_access(env, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -1947,6 +1976,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_FLOW_KEYS: + return check_flow_keys_access(env, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); @@ -4932,6 +4963,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case PTR_TO_CTX: case CONST_PTR_TO_MAP: case PTR_TO_PACKET_END: + case PTR_TO_FLOW_KEYS: /* Only valid matches are exact, which memcmp() above * would have accepted */ diff --git a/net/core/filter.c b/net/core/filter.c index 46fc2daf9564..805ac851a58d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5114,6 +5114,17 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto * lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5232,6 +5243,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != size_default) return false; break; + case bpf_ctx_range(struct __sk_buff, flow_keys): + if (size != sizeof(struct bpf_flow_keys *)) + return false; + break; default: /* Only narrow read access allowed for now. 
*/ if (type == BPF_WRITE) { @@ -5257,6 +5272,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): + case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5282,6 +5298,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, flow_keys): return false; } @@ -5492,6 +5509,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; + case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5695,6 +5713,7 @@ static bool sk_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, flow_keys): return false; } @@ -5754,6 +5773,39 @@ static bool sk_msg_is_valid_access(int off, int size, return true; } +static bool flow_dissector_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + case bpf_ctx_range(struct __sk_buff, flow_keys): + info->reg_type = PTR_TO_FLOW_KEYS; + break; + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + return false; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -6048,6 +6100,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct sock_common, skc_num, 2, target_size)); break; + + case offsetof(struct __sk_buff, flow_keys): + off = si->off; + off -= offsetof(struct __sk_buff, flow_keys); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, flow_keys); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; } return insn - insn_buf; @@ -7011,6 +7072,15 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = { const struct bpf_prog_ops sk_msg_prog_ops = { }; +const struct bpf_verifier_ops flow_dissector_verifier_ops = { + .get_func_proto = flow_dissector_func_proto, + .is_valid_access = flow_dissector_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops flow_dissector_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b4dddb685fc2..974993f0c533 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -22,6 +22,9 @@ #include #include #include +#include + +static DEFINE_MUTEX(flow_dissector_mutex); static void dissector_set_key(struct 
flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -59,6 +62,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + struct bpf_prog *attached; + struct net *net; + + net = current->nsproxy->net_ns; + mutex_lock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->flow_dissector_prog, + lockdep_is_held(&flow_dissector_mutex)); + if (attached) { + /* Only one BPF program can be attached at a time */ + mutex_unlock(&flow_dissector_mutex); + return -EEXIST; + } + rcu_assign_pointer(net->flow_dissector_prog, prog); + mutex_unlock(&flow_dissector_mutex); + return 0; +} + +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) +{ + struct bpf_prog *attached; + struct net *net; + + net = current->nsproxy->net_ns; + mutex_lock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->flow_dissector_prog, + lockdep_is_held(&flow_dissector_mutex)); + if (!attached) { + mutex_unlock(&flow_dissector_mutex); + return -ENOENT; + } + bpf_prog_put(attached); + RCU_INIT_POINTER(net->flow_dissector_prog, NULL); + mutex_unlock(&flow_dissector_mutex); + return 0; +} /** * skb_flow_get_be16 - extract be16 entity * @skb: sk_buff to extract from @@ -408,6 +449,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs) return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); } +static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_control *key_control; + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_ports *key_ports; + + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + key_control->thoff = flow_keys->thoff; + if (flow_keys->is_frag) + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + if (flow_keys->is_first_frag) + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (flow_keys->is_encap) + key_control->flags |= FLOW_DIS_ENCAPSULATION; + + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + key_basic->n_proto = flow_keys->n_proto; + key_basic->ip_proto = flow_keys->ip_proto; + + if (flow_keys->addr_proto == ETH_P_IP && + dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); + key_addrs->v4addrs.src = flow_keys->ipv4_src; + key_addrs->v4addrs.dst = flow_keys->ipv4_dst; + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } else if (flow_keys->addr_proto == ETH_P_IPV6 && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); + memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src, + sizeof(key_addrs->v6addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->src = flow_keys->sport; + key_ports->dst = flow_keys->dport; + } +} + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ 
-439,6 +534,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; bool skip_vlan = false; + struct bpf_prog *attached; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -479,6 +575,44 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); + rcu_read_lock(); + attached = skb ? rcu_dereference(dev_net(skb->dev)->flow_dissector_prog) + : NULL; + if (attached) { + /* Note that even though the const qualifier is discarded + * throughout the execution of the BPF program, all changes(the + * control block) are reverted after the BPF program returns. + * Therefore, __skb_flow_dissect does not alter the skb. + */ + struct bpf_flow_keys flow_keys = {}; + struct bpf_skb_data_end cb_saved; + struct bpf_skb_data_end *cb; + u32 result; + + cb = (struct bpf_skb_data_end *)skb->cb; + + /* Save Control Block */ + memcpy(&cb_saved, cb, sizeof(cb_saved)); + memset(cb, 0, sizeof(cb_saved)); + + /* Pass parameters to the BPF program */ + cb->qdisc_cb.flow_keys = &flow_keys; + flow_keys.nhoff = nhoff; + + bpf_compute_data_pointers((struct sk_buff *)skb); + result = BPF_PROG_RUN(attached, skb); + + /* Restore state */ + memcpy(cb, &cb_saved, sizeof(cb_saved)); + + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + key_control->thoff = min_t(u16, key_control->thoff, skb->len); + rcu_read_unlock(); + return result == BPF_OK; + } + rcu_read_unlock(); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); From 463a07c281f51611e4741f179fc0f996052c0c71 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Thu, 20 Sep 2018 17:46:12 +0800 Subject: [PATCH 0921/1640] UPSTREAM: bpf: remove redundant null pointer check before consume_skb consume_skb() already takes a null pointer into account, hence it is safe to remove the redundant null pointer check before consume_skb(). Change-Id: I19d42a3082f6a260b8fb978e622494b48dca23e8 Signed-off-by: zhong jiang Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0a0f2ec75370..d37a1a0a6e1e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -612,8 +612,7 @@ static int free_sg(struct sock *sk, int start, if (i == MAX_SKB_FRAGS) i = 0; } - if (md->skb) - consume_skb(md->skb); + consume_skb(md->skb); return free; } @@ -995,8 +994,7 @@ bytes_ready: if (!sg->length && md->sg_start == md->sg_end) { list_del(&md->list); - if (md->skb) - consume_skb(md->skb); + consume_skb(md->skb); kfree(md); } } From 010f9b2bfaeb1ab7951ad4bd9d094d99d125d1c5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 27 Sep 2018 14:37:30 -0700 Subject: [PATCH 0922/1640] UPSTREAM: bpf: permit CGROUP_DEVICE programs accessing helper bpf_get_current_cgroup_id() Currently, the helper bpf_get_current_cgroup_id() is not permitted for CGROUP_DEVICE programs. If the helper is used in such cases, the verifier will log the following error: 0: (bf) r6 = r1 1: (69) r7 = *(u16 *)(r6 +0) 2: (85) call bpf_get_current_cgroup_id#80 unknown func bpf_get_current_cgroup_id#80 bpf_get_current_cgroup_id() is useful for CGROUP_DEVICE programs in order to customize their action based on the cgroup id. This patch adds such support.
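For illustration, a minimal device-cgroup program using the now-permitted helper could look like the sketch below; the section name, header paths, and target cgroup id follow the kernel selftests conventions and are assumptions, not code from this series:

/* Sketch of a CGROUP_DEVICE program that gates device access on the
 * calling task's cgroup id. TARGET_CGROUP_ID is a made-up value; real
 * ids come from the cgroup filesystem (e.g. via name_to_handle_at()).
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"

#define TARGET_CGROUP_ID 0x100000001ULL /* illustrative only */

SEC("cgroup/dev")
int allow_known_cgroup(struct bpf_cgroup_dev_ctx *ctx)
{
	/* The helper call the verifier used to reject for this program type. */
	__u64 cg_id = bpf_get_current_cgroup_id();

	/* Return 1 to allow the device operation, 0 to deny it. */
	return cg_id == TARGET_CGROUP_ID;
}

char _license[] SEC("license") = "GPL";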
Cc: Roman Gushchin Change-Id: I4beda5d3fb26c8f1ce3b1e995d90173e44992c58 Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6a7d931bbc55..549f6fbcc461 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -677,6 +677,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); From cc1e692a27eb51acd6791052dd7c1a8cbf96ff8f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:45:36 +0000 Subject: [PATCH 0923/1640] UPSTREAM: bpf: extend cgroup bpf core to allow multiple cgroup storage types In order to introduce per-cpu cgroup storage, let's generalize bpf cgroup core to support multiple cgroup storage types. Potentially, per-node cgroup storage can be added later. This commit is mostly a formal change that replaces the cgroup_storage pointer with an array of cgroup_storage pointers. It doesn't actually introduce a new storage type; that will be done later. Each bpf program is now able to have one cgroup storage of each type. Change-Id: Ib2f694b5d4af80d27e18eb7001ad078f1f9956c9 Signed-off-by: Roman Gushchin Acked-by: Song Liu Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 44 ++++++++++++++++------- include/linux/bpf.h | 11 ++++-- kernel/bpf/cgroup.c | 74 ++++++++++++++++++++++++++------------ kernel/bpf/helpers.c | 15 ++++---- kernel/bpf/local_storage.c | 18 ++++++---- kernel/bpf/syscall.c | 9 +++-- kernel/bpf/verifier.c | 8 +++-- net/bpf/test_run.c | 20 +++++++---- 8 files changed, 139 insertions(+), 60 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index ad6b30137ac2..65e19fe60a49 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -2,6 +2,7 @@ #ifndef _BPF_CGROUP_H #define _BPF_CGROUP_H +#include #include #include #include @@ -22,7 +23,10 @@ struct bpf_cgroup_storage; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) -DECLARE_PER_CPU(void*, bpf_cgroup_storage); +DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); + +#define for_each_cgroup_storage_type(stype) \ + for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) struct bpf_cgroup_storage_map; @@ -43,7 +47,7 @@ struct bpf_cgroup_storage { struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; - struct bpf_cgroup_storage *storage; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; }; struct bpf_prog_array; @@ -101,18 +105,29 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); -static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) +static inline enum bpf_cgroup_storage_type cgroup_storage_type( + struct bpf_map *map) { - struct bpf_storage_buffer *buf; - - if (!storage) - return; - - buf = READ_ONCE(storage->buf); - this_cpu_write(bpf_cgroup_storage, &buf->data[0]); + return BPF_CGROUP_STORAGE_SHARED; } -struct bpf_cgroup_storage
*bpf_cgroup_storage_alloc(struct bpf_prog *prog); +static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage + *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) +{ + enum bpf_cgroup_storage_type stype; + struct bpf_storage_buffer *buf; + + for_each_cgroup_storage_type(stype) { + if (!storage[stype]) + continue; + + buf = READ_ONCE(storage[stype]->buf); + this_cpu_write(bpf_cgroup_storage[stype], &buf->data[0]); + } +} + +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, + enum bpf_cgroup_storage_type stype); void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, struct cgroup *cgroup, @@ -271,13 +286,14 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } -static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {} +static inline void bpf_cgroup_storage_set( + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map) { return 0; } static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map) {} static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( - struct bpf_prog *prog) { return 0; } + struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; } static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} @@ -301,6 +317,8 @@ static inline void bpf_cgroup_storage_free( #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) +#define for_each_cgroup_storage_type(stype) for (; false; ) + #endif /* CONFIG_CGROUP_BPF */ #endif /* _BPF_CGROUP_H */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bb1954f724a1..4bb7a3b567fe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -277,6 +277,13 @@ struct bpf_prog_offload { u32 jited_len; }; +enum bpf_cgroup_storage_type { + BPF_CGROUP_STORAGE_SHARED, + __BPF_CGROUP_STORAGE_MAX +}; + +#define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -294,7 +301,7 @@ struct bpf_prog_aux { struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ - struct bpf_map *cgroup_storage; + struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY void *security; @@ -363,7 +370,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, */ struct bpf_prog_array_item { struct bpf_prog *prog; - struct bpf_cgroup_storage *cgroup_storage; + struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; }; struct bpf_prog_array { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 549f6fbcc461..00f6ed2e4f9a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -25,6 +25,7 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key); */ void cgroup_bpf_put(struct cgroup *cgrp) { + enum bpf_cgroup_storage_type stype; unsigned int type; for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { @@ -34,8 +35,10 @@ void cgroup_bpf_put(struct cgroup *cgrp) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); bpf_prog_put(pl->prog); - bpf_cgroup_storage_unlink(pl->storage); - bpf_cgroup_storage_free(pl->storage); + for_each_cgroup_storage_type(stype) { + bpf_cgroup_storage_unlink(pl->storage[stype]); + bpf_cgroup_storage_free(pl->storage[stype]); + } kfree(pl); 
static_branch_dec(&cgroup_bpf_enabled_key); } @@ -97,6 +100,7 @@ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, struct bpf_prog_array __rcu **array) { + enum bpf_cgroup_storage_type stype; struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; @@ -125,7 +129,9 @@ static int compute_effective_progs(struct cgroup *cgrp, continue; progs->items[cnt].prog = pl->prog; - progs->items[cnt].cgroup_storage = pl->storage; + for_each_cgroup_storage_type(stype) + progs->items[cnt].cgroup_storage[stype] = + pl->storage[stype]; cnt++; } } while ((p = cgroup_parent(p))); @@ -232,7 +238,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct bpf_cgroup_storage *storage, *old_storage = NULL; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], + *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + enum bpf_cgroup_storage_type stype; struct bpf_prog_list *pl; bool pl_was_allocated; int err; @@ -254,34 +262,44 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; - storage = bpf_cgroup_storage_alloc(prog); - if (IS_ERR(storage)) - return -ENOMEM; + for_each_cgroup_storage_type(stype) { + storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(storage[stype])) { + storage[stype] = NULL; + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); + return -ENOMEM; + } + } if (flags & BPF_F_ALLOW_MULTI) { list_for_each_entry(pl, progs, node) { if (pl->prog == prog) { /* disallow attaching the same prog twice */ - bpf_cgroup_storage_free(storage); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); return -EINVAL; } } pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - bpf_cgroup_storage_free(storage); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); return -ENOMEM; } pl_was_allocated = true; pl->prog = prog; - pl->storage = storage; + for_each_cgroup_storage_type(stype) + pl->storage[stype] = storage[stype]; list_add_tail(&pl->node, progs); } else { if (list_empty(progs)) { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - bpf_cgroup_storage_free(storage); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); return -ENOMEM; } pl_was_allocated = true; @@ -289,12 +307,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } else { pl = list_first_entry(progs, typeof(*pl), node); old_prog = pl->prog; - old_storage = pl->storage; - bpf_cgroup_storage_unlink(old_storage); + for_each_cgroup_storage_type(stype) { + old_storage[stype] = pl->storage[stype]; + bpf_cgroup_storage_unlink(old_storage[stype]); + } pl_was_allocated = false; } pl->prog = prog; - pl->storage = storage; + for_each_cgroup_storage_type(stype) + pl->storage[stype] = storage[stype]; } cgrp->bpf.flags[type] = flags; @@ -304,21 +325,27 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, goto cleanup; static_branch_inc(&cgroup_bpf_enabled_key); - if (old_storage) - bpf_cgroup_storage_free(old_storage); + for_each_cgroup_storage_type(stype) { + if (!old_storage[stype]) + continue; + bpf_cgroup_storage_free(old_storage[stype]); + } if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } - bpf_cgroup_storage_link(storage, cgrp, type); + for_each_cgroup_storage_type(stype) + 
bpf_cgroup_storage_link(storage[stype], cgrp, type); return 0; cleanup: /* and cleanup the prog list */ pl->prog = old_prog; - bpf_cgroup_storage_free(pl->storage); - pl->storage = old_storage; - bpf_cgroup_storage_link(old_storage, cgrp, type); + for_each_cgroup_storage_type(stype) { + bpf_cgroup_storage_free(pl->storage[stype]); + pl->storage[stype] = old_storage[stype]; + bpf_cgroup_storage_link(old_storage[stype], cgrp, type); + } if (pl_was_allocated) { list_del(&pl->node); kfree(pl); @@ -339,6 +366,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 unused_flags) { struct list_head *progs = &cgrp->bpf.progs[type]; + enum bpf_cgroup_storage_type stype; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog *old_prog = NULL; struct bpf_prog_list *pl; @@ -385,8 +413,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); - bpf_cgroup_storage_unlink(pl->storage); - bpf_cgroup_storage_free(pl->storage); + for_each_cgroup_storage_type(stype) { + bpf_cgroup_storage_unlink(pl->storage[stype]); + bpf_cgroup_storage_free(pl->storage[stype]); + } kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c54e3ac03389..65e2978090f7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -206,16 +206,18 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .ret_type = RET_INTEGER, }; -DECLARE_PER_CPU(void*, bpf_cgroup_storage); +#ifdef CONFIG_CGROUP_BPF +DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { - /* map and flags arguments are not used now, - * but provide an ability to extend the API - * for other types of local storages. - * verifier checks that their values are correct. + /* flags argument is not used now, + * but provides an ability to extend the API. + * verifier checks that its value is correct. 
*/ - return (unsigned long) this_cpu_read(bpf_cgroup_storage); + enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + + return (unsigned long) this_cpu_read(bpf_cgroup_storage[stype]); } const struct bpf_func_proto bpf_get_local_storage_proto = { @@ -226,3 +228,4 @@ const struct bpf_func_proto bpf_get_local_storage_proto = { .arg2_type = ARG_ANYTHING, }; #endif +#endif diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index fc1605aee5ea..fc9835e69b83 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,7 +7,7 @@ #include #include -DEFINE_PER_CPU(void*, bpf_cgroup_storage); +DEFINE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF @@ -255,6 +255,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) { + enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); int ret = -EBUSY; @@ -262,11 +263,12 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) if (map->prog && map->prog != prog) goto unlock; - if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map) + if (prog->aux->cgroup_storage[stype] && + prog->aux->cgroup_storage[stype] != _map) goto unlock; map->prog = prog; - prog->aux->cgroup_storage = _map; + prog->aux->cgroup_storage[stype] = _map; ret = 0; unlock: spin_unlock_bh(&map->lock); @@ -276,24 +278,26 @@ unlock: void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) { + enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); spin_lock_bh(&map->lock); if (map->prog == prog) { - WARN_ON(prog->aux->cgroup_storage != _map); + WARN_ON(prog->aux->cgroup_storage[stype] != _map); map->prog = NULL; - prog->aux->cgroup_storage = NULL; + prog->aux->cgroup_storage[stype] = NULL; } spin_unlock_bh(&map->lock); } -struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog) +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, + enum bpf_cgroup_storage_type stype) { struct bpf_cgroup_storage *storage; struct bpf_map *map; u32 pages; - map = prog->aux->cgroup_storage; + map = prog->aux->cgroup_storage[stype]; if (!map) return NULL; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index de5324fc5c33..3206f3847045 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1013,10 +1013,15 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) /* drop refcnt on maps used by eBPF program and free auxilary data */ static void free_used_maps(struct bpf_prog_aux *aux) { + enum bpf_cgroup_storage_type stype; int i; - if (aux->cgroup_storage) - bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage); + for_each_cgroup_storage_type(stype) { + if (!aux->cgroup_storage[stype]) + continue; + bpf_cgroup_storage_release(aux->prog, + aux->cgroup_storage[stype]); + } for (i = 0; i < aux->used_map_cnt; i++) bpf_map_put(aux->used_maps[i]); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 32b43f491380..866b5f7c4463 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5746,11 +5746,15 @@ next_insn: /* drop refcnt of maps used by the rejected program */ static void release_maps(struct bpf_verifier_env *env) { + enum bpf_cgroup_storage_type stype; int i; - if (env->prog->aux->cgroup_storage) + for_each_cgroup_storage_type(stype) { + if 
(!env->prog->aux->cgroup_storage[stype]) + continue; bpf_cgroup_storage_release(env->prog, - env->prog->aux->cgroup_storage); + env->prog->aux->cgroup_storage[stype]); + } for (i = 0; i < env->used_map_cnt; i++) bpf_map_put(env->used_maps[i]); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index e0c6dfae42d8..e5510dc51a8c 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -12,7 +12,7 @@ #include static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, - struct bpf_cgroup_storage *storage) + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { u32 ret; @@ -28,13 +28,20 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) { - struct bpf_cgroup_storage *storage = NULL; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 }; + enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; u32 ret = 0, i; - storage = bpf_cgroup_storage_alloc(prog); - if (IS_ERR(storage)) - return PTR_ERR(storage); + for_each_cgroup_storage_type(stype) { + storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(storage[stype])) { + storage[stype] = NULL; + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); + return -ENOMEM; + } + } if (!repeat) repeat = 1; @@ -53,7 +60,8 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) do_div(time_spent, repeat); *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; - bpf_cgroup_storage_free(storage); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); return ret; } From f3a772caa49faab40c7b894ec67bf6f37f6bbd73 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:45:40 +0000 Subject: [PATCH 0924/1640] UPSTREAM: bpf: rework cgroup storage pointer passing To simplify the following introduction of per-cpu cgroup storage, let's rework the mechanism for passing a cgroup storage pointer into bpf_get_local_storage(). Let's save a pointer to the corresponding bpf_cgroup_storage structure, instead of a pointer to the actual buffer. This will help us handle per-cpu storage later, which has a different way of accessing the actual data.
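For context, a program consumes the published pointer through bpf_get_local_storage(); the sketch below shows the typical usage pattern. The map definition style and names follow the selftests conventions of this era and are assumptions, not code from this patch:

/* Count packets in a shared (per-cgroup, per-attach-type) storage.
 * With shared storage, updates must be atomic because all CPUs see
 * the same buffer.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") packet_cnt = {
	.type = BPF_MAP_TYPE_CGROUP_STORAGE,
	.key_size = sizeof(struct bpf_cgroup_storage_key),
	.value_size = sizeof(__u64),
};

SEC("cgroup/skb")
int count_packets(struct __sk_buff *skb)
{
	/* Resolves to this program's storage for the current cgroup. */
	__u64 *cnt = bpf_get_local_storage(&packet_cnt, 0);

	__sync_fetch_and_add(cnt, 1);
	return 1; /* allow the packet */
}

char _license[] SEC("license") = "GPL";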
Change-Id: Idd1d15d93fc01175eb9d355eb98adef33627aa83 Signed-off-by: Roman Gushchin Acked-by: Song Liu Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 13 ++++--------- kernel/bpf/helpers.c | 8 ++++++-- kernel/bpf/local_storage.c | 3 ++- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 65e19fe60a49..8af9b0053d7a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,7 +23,8 @@ struct bpf_cgroup_storage; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) -DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +DECLARE_PER_CPU(struct bpf_cgroup_storage*, + bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -115,15 +116,9 @@ static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { enum bpf_cgroup_storage_type stype; - struct bpf_storage_buffer *buf; - for_each_cgroup_storage_type(stype) { - if (!storage[stype]) - continue; - - buf = READ_ONCE(storage[stype]->buf); - this_cpu_write(bpf_cgroup_storage[stype], &buf->data[0]); - } + for_each_cgroup_storage_type(stype) + this_cpu_write(bpf_cgroup_storage[stype], storage[stype]); } struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 65e2978090f7..b31e613cc1e0 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -207,7 +207,8 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { }; #ifdef CONFIG_CGROUP_BPF -DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +DECLARE_PER_CPU(struct bpf_cgroup_storage*, + bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { @@ -216,8 +217,11 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) * verifier checks that its value is correct. */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + struct bpf_cgroup_storage *storage; - return (unsigned long) this_cpu_read(bpf_cgroup_storage[stype]); + storage = this_cpu_read(bpf_cgroup_storage[stype]); + + return (unsigned long)&READ_ONCE(storage->buf)->data[0]; } const struct bpf_func_proto bpf_get_local_storage_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index fc9835e69b83..e0b1d9ccde39 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,7 +7,8 @@ #include #include -DEFINE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +DEFINE_PER_CPU(struct bpf_cgroup_storage*, + bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF From 9290fc0909ac5227a77cd9b4e5a58e6de2155416 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:45:43 +0000 Subject: [PATCH 0925/1640] UPSTREAM: bpf: introduce per-cpu cgroup local storage This commit introduces per-cpu cgroup local storage. Per-cpu cgroup local storage is very similar to simple cgroup storage (let's call it shared), except all the data is per-cpu. The main goal of the per-cpu variant is to implement super fast counters (e.g. packet counters), which require neither lookups nor atomic operations.
From userspace's point of view, accessing a per-cpu cgroup storage is similar to other per-cpu map types (e.g. per-cpu hashmaps and arrays). Writing to a per-cpu cgroup storage is not atomic, but is performed by copying longs, so some minimal atomicity is preserved, exactly as with other per-cpu maps. Change-Id: Icc47681bca5e17f1325d2c856f7da5a9d8d0f12d Signed-off-by: Roman Gushchin Cc: Daniel Borkmann Cc: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 20 ++++- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/helpers.c | 8 +- kernel/bpf/local_storage.c | 150 ++++++++++++++++++++++++++++++++----- kernel/bpf/syscall.c | 11 ++- kernel/bpf/verifier.c | 15 +++- 8 files changed, 179 insertions(+), 28 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8af9b0053d7a..96f5ae7b8f41 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -37,7 +37,10 @@ struct bpf_storage_buffer { }; struct bpf_cgroup_storage { - struct bpf_storage_buffer *buf; + union { + struct bpf_storage_buffer *buf; + void __percpu *percpu_buf; + }; struct bpf_cgroup_storage_map *map; struct bpf_cgroup_storage_key key; struct list_head list; @@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + return BPF_CGROUP_STORAGE_PERCPU; + return BPF_CGROUP_STORAGE_SHARED; } @@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, + void *value, u64 flags); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled.
*/ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -291,6 +301,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; } static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} +static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, + void *value) { + return 0; +} +static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, + void *key, void *value, u64 flags) { + return 0; +} #define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4bb7a3b567fe..ebe244113194 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -279,6 +279,7 @@ struct bpf_prog_offload { enum bpf_cgroup_storage_type { BPF_CGROUP_STORAGE_SHARED, + BPF_CGROUP_STORAGE_PERCPU, __BPF_CGROUP_STORAGE_MAX }; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 22083712dd18..9d5f77e2d8dd 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) #endif #ifdef CONFIG_CGROUP_BPF BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 266a52e8feac..95c8230fe2dd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -127,6 +127,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, }; enum bpf_prog_type { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b31e613cc1e0..f3a8b3475d43 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -218,10 +218,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); struct bpf_cgroup_storage *storage; + void *ptr; storage = this_cpu_read(bpf_cgroup_storage[stype]); - return (unsigned long)&READ_ONCE(storage->buf)->data[0]; + if (stype == BPF_CGROUP_STORAGE_SHARED) + ptr = &READ_ONCE(storage->buf)->data[0]; + else + ptr = this_cpu_ptr(storage->percpu_buf); + + return (unsigned long)ptr; } const struct bpf_func_proto bpf_get_local_storage_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index e0b1d9ccde39..bed9d48a7ae9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -153,6 +153,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return 0; } +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, + void *value) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + int cpu, off = 0; + u32 size; + + rcu_read_lock(); + storage = cgroup_storage_lookup(map, key, false); + if (!storage) { + rcu_read_unlock(); + return -ENOENT; + } + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(_map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(storage->percpu_buf, cpu), size); + off += size; + } + rcu_read_unlock(); + return 
0; +} + +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, + void *value, u64 map_flags) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + int cpu, off = 0; + u32 size; + + if (map_flags != BPF_ANY && map_flags != BPF_EXIST) + return -EINVAL; + + rcu_read_lock(); + storage = cgroup_storage_lookup(map, key, false); + if (!storage) { + rcu_read_unlock(); + return -ENOENT; + } + + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(_map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), + value + off, size); + off += size; + } + rcu_read_unlock(); + return 0; +} + static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, void *_next_key) { @@ -291,60 +356,105 @@ void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) spin_unlock_bh(&map->lock); } +static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) +{ + size_t size; + + if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { + size = sizeof(struct bpf_storage_buffer) + map->value_size; + *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, + PAGE_SIZE) >> PAGE_SHIFT; + } else { + size = map->value_size; + *pages = round_up(round_up(size, 8) * num_possible_cpus(), + PAGE_SIZE) >> PAGE_SHIFT; + } + + return size; +} + struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { struct bpf_cgroup_storage *storage; struct bpf_map *map; + gfp_t flags; + size_t size; u32 pages; map = prog->aux->cgroup_storage[stype]; if (!map) return NULL; - pages = round_up(sizeof(struct bpf_cgroup_storage) + - sizeof(struct bpf_storage_buffer) + - map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + size = bpf_cgroup_storage_calculate_size(map, &pages); + if (bpf_map_charge_memlock(map, pages)) return ERR_PTR(-EPERM); storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), __GFP_ZERO | GFP_USER, map->numa_node); - if (!storage) { - bpf_map_uncharge_memlock(map, pages); - return ERR_PTR(-ENOMEM); - } + if (!storage) + goto enomem; - storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, __GFP_ZERO | GFP_USER, - map->numa_node); - if (!storage->buf) { - bpf_map_uncharge_memlock(map, pages); - kfree(storage); - return ERR_PTR(-ENOMEM); + flags = __GFP_ZERO | GFP_USER; + + if (stype == BPF_CGROUP_STORAGE_SHARED) { + storage->buf = kmalloc_node(size, flags, map->numa_node); + if (!storage->buf) + goto enomem; + } else { + storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); + if (!storage->percpu_buf) + goto enomem; } storage->map = (struct bpf_cgroup_storage_map *)map; return storage; + +enomem: + bpf_map_uncharge_memlock(map, pages); + kfree(storage); + return ERR_PTR(-ENOMEM); +} + +static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) +{ + struct bpf_cgroup_storage *storage = + container_of(rcu, struct bpf_cgroup_storage, rcu); + + kfree(storage->buf); + kfree(storage); +} + +static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) +{ + struct bpf_cgroup_storage *storage = + container_of(rcu, struct bpf_cgroup_storage, rcu); + + 
free_percpu(storage->percpu_buf); + kfree(storage); } void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) { - u32 pages; + enum bpf_cgroup_storage_type stype; struct bpf_map *map; + u32 pages; if (!storage) return; map = &storage->map->map; - pages = round_up(sizeof(struct bpf_cgroup_storage) + - sizeof(struct bpf_storage_buffer) + - map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + + bpf_cgroup_storage_calculate_size(map, &pages); bpf_map_uncharge_memlock(map, pages); - kfree_rcu(storage->buf, rcu); - kfree_rcu(storage, rcu); + stype = cgroup_storage_type(map); + if (stype == BPF_CGROUP_STORAGE_SHARED) + call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); + else + call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); } void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3206f3847045..598d1d560aa2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -687,7 +687,8 @@ static int map_lookup_elem(union bpf_attr *attr) if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) value_size = sizeof(u32); @@ -711,6 +712,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); } else if (IS_FD_ARRAY(map)) { @@ -797,7 +800,8 @@ static int map_update_elem(union bpf_attr *attr) if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else value_size = map->value_size; @@ -832,6 +836,9 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_update(map, key, value, + attr->flags); } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 866b5f7c4463..2a2747a8419b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2181,6 +2181,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_CGROUP_STORAGE: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: if (func_id != BPF_FUNC_get_local_storage) goto error; break; @@ -2271,7 +2272,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_get_local_storage: - if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) goto error; break; case BPF_FUNC_sk_select_reuseport: @@ -5624,6 
+5626,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return 0; } +static bool bpf_map_is_cgroup_storage(struct bpf_map *map) +{ + return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ @@ -5714,10 +5722,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } env->used_maps[env->used_map_cnt++] = map; - if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && + if (bpf_map_is_cgroup_storage(map) && bpf_cgroup_storage_assign(env->prog, map)) { - verbose(env, - "only one cgroup storage is allowed\n"); + verbose(env, "only one cgroup storage of each type is allowed\n"); fdput(f); return -EBUSY; } From f517946ef26f22448bc2d67aea511d973f10c50a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:45:46 +0000 Subject: [PATCH 0926/1640] UPSTREAM: bpf: don't allow create maps of per-cpu cgroup local storages Explicitly forbid creating maps of per-cpu cgroup local storages. This behavior matches the behavior of shared cgroup storages. Change-Id: If0e1d7437bf0fd4bbbd89d2979c61b4107ba8908 Signed-off-by: Roman Gushchin Acked-by: Song Liu Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/map_in_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 051c5e40792c..c65fdc06600c 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -25,7 +25,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) * in the verifier is not enough. */ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) { + inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || + inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { fdput(f); return ERR_PTR(-ENOTSUPP); } From 953b5556c33cee13ecec9241977685b2630cb27f Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:29 -0700 Subject: [PATCH 0927/1640] UPSTREAM: bpf: Add iterator for spilled registers Add this iterator for spilled registers. It concentrates the details of how to get the current frame's spilled registers into a single macro while clarifying the intention of the code that calls the macro. Change-Id: I5e2f5d852d33fe9009cf870d36e978dea6d4b460 Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 11 +++++++++++ kernel/bpf/verifier.c | 16 +++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4acd06cca703..f9fb477b5add 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -142,6 +142,17 @@ struct bpf_verifier_state { bool speculative; }; +#define bpf_get_spilled_reg(slot, frame) \ + (((slot < frame->allocated_stack / BPF_REG_SIZE) && \ + (frame->stack[slot].slot_type[0] == STACK_SPILL)) \ + ? &frame->stack[slot].spilled_ptr : NULL) + +/* Iterate over 'frame', setting 'reg' to either NULL or a spilled register.
*/ +#define bpf_for_each_spilled_reg(iter, frame, reg) \ + for (iter = 0, reg = bpf_get_spilled_reg(iter, frame); \ + iter < frame->allocated_stack / BPF_REG_SIZE; \ + iter++, reg = bpf_get_spilled_reg(iter, frame)) + /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2a2747a8419b..b7b5467d0d73 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2359,10 +2359,9 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, if (reg_is_pkt_pointer_any(&regs[i])) mark_reg_unknown(env, regs, i); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) continue; - reg = &state->stack[i].spilled_ptr; if (reg_is_pkt_pointer_any(reg)) __mark_reg_unknown(reg); } @@ -3871,10 +3870,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) continue; - reg = &state->stack[i].spilled_ptr; if (reg->type == type && reg->id == dst_reg->id) reg->range = max(reg->range, new_range); } @@ -4192,7 +4190,7 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg, *regs = state->regs; u32 id = regs[regno].id; int i, j; @@ -4201,8 +4199,8 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) continue; mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); } From 4ce83021ac9dd0fdc9869ab2c7dcc22d6bf3b17c Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:30 -0700 Subject: [PATCH 0928/1640] UPSTREAM: bpf: Simplify ptr_min_max_vals adjustment An upcoming commit will add another two pointer types that need very similar behaviour, so generalise this function now.
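To make the intent concrete, here is an illustrative sketch of the shape this refactor gives the check inside adjust_ptr_min_max_vals(); the helper name is hypothetical and the code is not part of the patch, but it shows why a later patch only needs a new case label to prohibit arithmetic on a new pointer type:

/* Illustrative only: reject pointer arithmetic on selected types. */
static int reject_ptr_arith(struct bpf_verifier_env *env,
			    const struct bpf_reg_state *ptr_reg, u32 dst)
{
	switch (ptr_reg->type) {
	case PTR_TO_MAP_VALUE_OR_NULL:
		verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
			dst, reg_type_str[ptr_reg->type]);
		return -EACCES;
	case CONST_PTR_TO_MAP:
	case PTR_TO_PACKET_END:
		/* new pointer types become one-line additions here */
		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
			dst, reg_type_str[ptr_reg->type]);
		return -EACCES;
	default:
		return 0;
	}
}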
Change-Id: Ib8b4c1dab67194cc39b3acc7de8bacb6b4c4277b Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 22 ++++++++++----------- tools/testing/selftests/bpf/test_verifier.c | 14 ++++++------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b7b5467d0d73..ebe862595bd4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3095,20 +3095,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", - dst); + switch (ptr_reg->type) { + case PTR_TO_MAP_VALUE_OR_NULL: + verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", + dst, reg_type_str[ptr_reg->type]); return -EACCES; - } - if (ptr_reg->type == CONST_PTR_TO_MAP) { - verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", - dst); - return -EACCES; - } - if (ptr_reg->type == PTR_TO_PACKET_END) { - verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", - dst); + case CONST_PTR_TO_MAP: + case PTR_TO_PACKET_END: + verbose(env, "R%d pointer arithmetic on %s prohibited\n", + dst, reg_type_str[ptr_reg->type]); return -EACCES; + default: + break; } /* In case of 'scalar += pointer', dst_reg inherits pointer type and id. diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 97fafe5cdc55..0440948d4579 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -3012,7 +3012,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END", + .errstr = "R3 pointer arithmetic on pkt_end", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, @@ -4040,7 +4040,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .fixup_map1 = { 4 }, - .errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL", + .errstr = "R4 pointer arithmetic on map_value_or_null", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, @@ -4061,7 +4061,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .fixup_map1 = { 4 }, - .errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL", + .errstr = "R4 pointer arithmetic on map_value_or_null", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, @@ -4082,7 +4082,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .fixup_map1 = { 4 }, - .errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL", + .errstr = "R4 pointer arithmetic on map_value_or_null", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, @@ -5959,7 +5959,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .fixup_map_in_map = { 3 }, - .errstr = "R1 pointer arithmetic on CONST_PTR_TO_MAP prohibited", + .errstr = "R1 pointer arithmetic on map_ptr prohibited", .result = REJECT, }, { @@ -7347,7 +7347,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END", + .errstr = "R3 pointer arithmetic on pkt_end", .result = REJECT, .prog_type = BPF_PROG_TYPE_XDP, }, @@ -7366,7 +7366,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END", + .errstr = "R3 pointer arithmetic on pkt_end", .result = 
REJECT, .prog_type = BPF_PROG_TYPE_XDP, }, From deacc64270f78c71a5f03e75e8e0113e015e3a75 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:31 -0700 Subject: [PATCH 0929/1640] BACKPORT: bpf: Reuse canonical string formatter for ctx errs The array "reg_type_str" provides canonical formatting of register types, however a couple of places would previously check whether a register represented the context and write the name "context" directly. An upcoming commit will add another pointer type to these statements, so to provide more accurate error messages in the verifier, update these error messages to use "reg_type_str" instead. Change-Id: If9e9f3db1b2adc837597828dcece94293853a0a6 Signed-off-by: Joe Stringer Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ebe862595bd4..4c792c37725d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1796,8 +1796,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", - insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ? - "context" : "packet"); + insn->dst_reg, reg_type_str[cur_regs(env)[insn->dst_reg].type]); return -EACCES; } @@ -5445,8 +5444,8 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_ctx_reg(env, insn->dst_reg)) { - verbose(env, "BPF_ST stores into R%d context is not allowed\n", - insn->dst_reg); + verbose(env, "BPF_ST stores into R%d %s is not allowed\n", + insn->dst_reg, reg_type_str[cur_regs(env)[insn->dst_reg].type]); return -EACCES; } From 541fca04a27ca184c1237fd32a16a9280cf7ff67 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:32 -0700 Subject: [PATCH 0930/1640] UPSTREAM: bpf: Generalize ptr_or_null regs check This check will be reused by an upcoming commit for conditional jump checks for sockets. Refactor it a bit to simplify the later commit. Change-Id: Ib52c3c0636977c2c947580a3612f06b38093dac8 Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 43 +++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4c792c37725d..baf383e23ca6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -249,6 +249,11 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) type == PTR_TO_PACKET_META; } +static bool reg_type_may_be_null(enum bpf_reg_type type) +{ + return type == PTR_TO_MAP_VALUE_OR_NULL; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -4148,12 +4153,10 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src, } } -static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, - bool is_null) +static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id, + bool is_null) { - struct bpf_reg_state *reg = &regs[regno]; - - if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { + if (reg_type_may_be_null(reg->type) && reg->id == id) { /* Old offset (both fixed and variable parts) should * have been known-zero, because we don't allow pointer * arithmetic on pointers that might be NULL.
@@ -4166,11 +4169,13 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->map_ptr->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else { - reg->type = PTR_TO_MAP_VALUE; + } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { + if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = PTR_TO_MAP_VALUE; + } } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances @@ -4183,8 +4188,8 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, } /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ -static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, - bool is_null) +static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, + bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg, *regs = state->regs; @@ -4192,14 +4197,14 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, int i, j; for (i = 0; i < MAX_BPF_REG; i++) - mark_map_reg(regs, i, id, is_null); + mark_ptr_or_null_reg(&regs[i], id, is_null); for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; bpf_for_each_spilled_reg(i, state, reg) { if (!reg) continue; - mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + mark_ptr_or_null_reg(reg, id, is_null); } } } @@ -4414,12 +4419,14 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - /* Mark all identical map registers in each branch as either + reg_type_may_be_null(dst_reg->type)) { + /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ - mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); - mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); + mark_ptr_or_null_regs(this_branch, insn->dst_reg, + opcode == BPF_JNE); + mark_ptr_or_null_regs(other_branch, insn->dst_reg, + opcode == BPF_JEQ); } else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg], this_branch, other_branch) && is_pointer_value(env, insn->dst_reg)) { From 0f484da6234d60b413c02301a72ef177531a5b64 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:33 -0700 Subject: [PATCH 0931/1640] BACKPORT: bpf: Add PTR_TO_SOCKET verifier type Teach the verifier a little bit about a new type of pointer, a PTR_TO_SOCKET. This pointer type is accessed from BPF through the 'struct bpf_sock' structure.
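As a rough sketch of what the new type means for a BPF program (illustrative only; bpf_sk_lookup_tcp() and bpf_sk_release() are only added by later patches in this series, and the tuple setup is elided):

    struct bpf_sock *sk;

    sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
    if (sk) {
            // After the NULL check the verifier retypes the register from
            // PTR_TO_SOCKET_OR_NULL to PTR_TO_SOCKET, so reads of
            // 'struct bpf_sock' fields are allowed (and rewritten into
            // 'struct sock' accesses by bpf_sock_convert_ctx_access());
            // writes into the socket and pointer arithmetic on 'sk' are
            // rejected.
            __u32 family = sk->family;

            bpf_sk_release(sk);
    }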
Change-Id: Id23e6b19e576df0f65fbc0310278247754ff04a3 Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 34 ++++++++++ include/linux/bpf_verifier.h | 2 + kernel/bpf/verifier.c | 119 ++++++++++++++++++++++++++++++----- net/core/filter.c | 30 +++++---- 4 files changed, 159 insertions(+), 26 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ebe244113194..a141f6798b16 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -159,6 +159,7 @@ enum bpf_arg_type { ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ + ARG_PTR_TO_SOCKET, /* pointer to bpf_sock */ }; /* type of values returned from helper functions */ @@ -167,6 +168,7 @@ enum bpf_return_type { RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ + RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -218,6 +220,8 @@ enum bpf_reg_type { PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ + PTR_TO_SOCKET, /* reg points to struct bpf_sock */ + PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -348,6 +352,11 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void); typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, unsigned long off, unsigned long len); +typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type, + const struct bpf_insn *src, + struct bpf_insn *dst, + struct bpf_prog *prog, + u32 *target_size); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); @@ -856,4 +865,29 @@ extern const struct bpf_func_proto bpf_get_local_storage_proto; void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +#if defined(CONFIG_NET) +bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info); +u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size); +#else +static inline bool bpf_sock_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + return false; +} +static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + return 0; +} +#endif + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f9fb477b5add..26abc94cd916 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -61,6 +61,8 @@ struct bpf_reg_state { * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * came from, when one is tested for != NULL. + * For PTR_TO_SOCKET this is used to share which pointers retain the + * same reference to the socket, to determine proper reference freeing. 
*/ u32 id; /* For scalar types (SCALAR_VALUE), this represents our knowledge of diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index baf383e23ca6..4cdc9deeaff6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -80,8 +80,8 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * (like pointer plus pointer becomes SCALAR_VALUE type) * * When verifier sees load or store instructions the type of base register - * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer - * types recognized by check_mem_access() function. + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are + * four pointer types recognized by check_mem_access() function. * * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' * and the range of [ptr, ptr + map's value_size) is accessible. @@ -267,6 +267,8 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", }; static char slot_type_char[] = { @@ -976,6 +978,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: + case PTR_TO_SOCKET: + case PTR_TO_SOCKET_OR_NULL: return true; default: return false; @@ -1388,6 +1392,28 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, return 0; } +static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, + int size, enum bpf_access_type t) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = &regs[regno]; + struct bpf_insn_access_aux info; + + if (reg->smin_value < 0) { + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + + if (!bpf_sock_is_valid_access(off, size, t, &info)) { + verbose(env, "invalid bpf_sock access off=%d size=%d\n", + off, size); + return -EACCES; + } + + return 0; +} + static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -1506,6 +1532,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, */ strict = true; break; + case PTR_TO_SOCKET: + pointer_desc = "sock "; + break; default: break; } @@ -1759,6 +1788,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_flow_keys_access(env, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_SOCKET) { + if (t == BPF_WRITE) { + verbose(env, "cannot write into socket\n"); + return -EACCES; + } + err = check_sock_access(env, regno, off, size, t); + if (!err && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -2056,6 +2093,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_ctx_reg(env, reg, regno); if (err < 0) return err; + } else if (arg_type == ARG_PTR_TO_SOCKET) { + expected_type = PTR_TO_SOCKET; + if (type != expected_type) + goto err_type; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here.
In case function allows for NULL to be @@ -2673,6 +2714,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -3106,6 +3151,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; case CONST_PTR_TO_MAP: case PTR_TO_PACKET_END: + case PTR_TO_SOCKET: + case PTR_TO_SOCKET_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -4176,6 +4223,8 @@ static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id, } else { reg->type = PTR_TO_MAP_VALUE; } + } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { + reg->type = PTR_TO_SOCKET; } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances @@ -4968,6 +5017,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case CONST_PTR_TO_MAP: case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: + case PTR_TO_SOCKET: + case PTR_TO_SOCKET_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -5245,6 +5296,37 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; } +/* Return true if it's OK to have the same insn return a different type. */ +static bool reg_type_mismatch_ok(enum bpf_reg_type type) +{ + switch (type) { + case PTR_TO_CTX: + case PTR_TO_SOCKET: + case PTR_TO_SOCKET_OR_NULL: + return false; + default: + return true; + } +} + +/* If an instruction was previously used with particular pointer types, then we + * need to be careful to avoid cases such as the below, where it may be ok + * for one branch accessing the pointer, but not ok for the other branch: + * + * R1 = sock_ptr + * goto X; + * ... + * R1 = some_other_valid_ptr; + * goto X; + * ... 
+ * R2 = *(u32 *)(R1 + 0); + */ +static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) +{ + return src != prev && (!reg_type_mismatch_ok(src) || + !reg_type_mismatch_ok(prev)); +} + static int do_check(struct bpf_verifier_env *env) { struct bpf_verifier_state *state; @@ -5385,9 +5467,7 @@ static int do_check(struct bpf_verifier_env *env) */ *prev_src_type = src_reg_type; - } else if (src_reg_type != *prev_src_type && - (src_reg_type == PTR_TO_CTX || - *prev_src_type == PTR_TO_CTX)) { + } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) { /* ABuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: @@ -5432,9 +5512,7 @@ static int do_check(struct bpf_verifier_env *env) if (*prev_dst_type == NOT_INIT) { *prev_dst_type = dst_reg_type; - } else if (dst_reg_type != *prev_dst_type && - (dst_reg_type == PTR_TO_CTX || - *prev_dst_type == PTR_TO_CTX)) { + } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) { verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -5864,8 +5942,10 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) } } -/* convert load instructions that access fields of 'struct __sk_buff' - * into sequence of instructions that access fields of 'struct sk_buff' +/* convert load instructions that access fields of a context type into a + * sequence of instructions that access fields of the underlying structure: + * struct __sk_buff -> struct sk_buff + * struct bpf_sock_ops -> struct sock */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { @@ -5894,12 +5974,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux)) + if (bpf_prog_is_dev_bound(env->prog->aux)) return 0; insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { + bpf_convert_ctx_access_t convert_ctx_access; bool ctx_access; if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || @@ -5943,8 +6024,18 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (!ctx_access) continue; - if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) + switch (env->insn_aux_data[i + delta].ptr_type) { + case PTR_TO_CTX: + if (!ops->convert_ctx_access) + continue; + convert_ctx_access = ops->convert_ctx_access; + break; + case PTR_TO_SOCKET: + convert_ctx_access = bpf_sock_convert_ctx_access; + break; + default: continue; + } ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; size = BPF_LDST_BYTES(insn); @@ -5976,8 +6067,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } target_size = 0; - cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog, - &target_size); + cnt = convert_ctx_access(type, insn, insn_buf, env->prog, + &target_size); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || (ctx_field_size && !target_size)) { verbose(env, "bpf verifier is misconfigured\n"); diff --git a/net/core/filter.c b/net/core/filter.c index 805ac851a58d..1fdf0f5a9818 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5384,23 +5384,29 @@ static bool __sock_filter_check_size(int off, int size, return size == size_default; } -static bool sock_filter_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) +bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if 
(off % size != 0) return false; - if (!__sock_filter_check_attach_type(off, type, - prog->expected_attach_type)) - return false; if (!__sock_filter_check_size(off, size, info)) return false; return true; } +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (!bpf_sock_is_valid_access(off, size, type, info)) + return false; + return __sock_filter_check_attach_type(off, type, + prog->expected_attach_type); +} + static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { @@ -6114,10 +6120,10 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, u32 *target_size) +u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; @@ -7029,7 +7035,7 @@ const struct bpf_prog_ops lwt_seg6local_prog_ops = { const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, - .convert_ctx_access = sock_filter_convert_ctx_access, + .convert_ctx_access = bpf_sock_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_prog_ops = { From cdb6e3061e62498d1f37befcf3f13ed8231c4567 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:34 -0700 Subject: [PATCH 0932/1640] UPSTREAM: bpf: Macrofy stack state copy An upcoming commit will need very similar copy/realloc boilerplate, so refactor the existing stack copy/realloc functions into macros to simplify it. 
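To see what the macros buy, it helps to expand the stack instantiation by hand; COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) in the diff below generates exactly the copy helper it replaces:

    static int copy_stack_state(struct bpf_func_state *dst,
                                const struct bpf_func_state *src)
    {
            if (!src->stack)
                    return 0;
            if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
                    /* internal bug, make state invalid to reject the program */
                    memset(dst, 0, sizeof(*dst));
                    return -EFAULT;
            }
            memcpy(dst->stack, src->stack,
                   sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
            return 0;
    }

The upcoming reference-state copy then comes down to a second one-line instantiation of the same macro with a different field and element size.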
Change-Id: Ia36d56792281241440ffa1082cb0248067c9652f Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 104 ++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 45 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4cdc9deeaff6..78d26341976c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -388,60 +388,74 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "\n"); } -static int copy_stack_state(struct bpf_func_state *dst, - const struct bpf_func_state *src) -{ - if (!src->stack) - return 0; - if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { - /* internal bug, make state invalid to reject the program */ - memset(dst, 0, sizeof(*dst)); - return -EFAULT; - } - memcpy(dst->stack, src->stack, - sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); - return 0; +#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE) \ +static int copy_##NAME##_state(struct bpf_func_state *dst, \ + const struct bpf_func_state *src) \ +{ \ + if (!src->FIELD) \ + return 0; \ + if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) { \ + /* internal bug, make state invalid to reject the program */ \ + memset(dst, 0, sizeof(*dst)); \ + return -EFAULT; \ + } \ + memcpy(dst->FIELD, src->FIELD, \ + sizeof(*src->FIELD) * (src->COUNT / SIZE)); \ + return 0; \ } +/* copy_stack_state() */ +COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) +#undef COPY_STATE_FN + +#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \ +static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \ + bool copy_old) \ +{ \ + u32 old_size = state->COUNT; \ + struct bpf_##NAME##_state *new_##FIELD; \ + int slot = size / SIZE; \ + \ + if (size <= old_size || !size) { \ + if (copy_old) \ + return 0; \ + state->COUNT = slot * SIZE; \ + if (!size && old_size) { \ + kfree(state->FIELD); \ + state->FIELD = NULL; \ + } \ + return 0; \ + } \ + new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \ + GFP_KERNEL); \ + if (!new_##FIELD) \ + return -ENOMEM; \ + if (copy_old) { \ + if (state->FIELD) \ + memcpy(new_##FIELD, state->FIELD, \ + sizeof(*new_##FIELD) * (old_size / SIZE)); \ + memset(new_##FIELD + old_size / SIZE, 0, \ + sizeof(*new_##FIELD) * (size - old_size) / SIZE); \ + } \ + state->COUNT = slot * SIZE; \ + kfree(state->FIELD); \ + state->FIELD = new_##FIELD; \ + return 0; \ +} +/* realloc_stack_state() */ +REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) +#undef REALLOC_STATE_FN /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from * the program calls into realloc_func_state() to grow the stack size. - * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state - * which this function copies over. It points to corresponding reg in previous - * bpf_verifier_state which is never reallocated + * Note there is a non-zero 'parent' pointer inside bpf_verifier_state + * which realloc_stack_state() copies over. It points to previous + * bpf_verifier_state which is never reallocated. 
*/ static int realloc_func_state(struct bpf_func_state *state, int size, bool copy_old) { - u32 old_size = state->allocated_stack; - struct bpf_stack_state *new_stack; - int slot = size / BPF_REG_SIZE; - - if (size <= old_size || !size) { - if (copy_old) - return 0; - state->allocated_stack = slot * BPF_REG_SIZE; - if (!size && old_size) { - kfree(state->stack); - state->stack = NULL; - } - return 0; - } - new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), - GFP_KERNEL); - if (!new_stack) - return -ENOMEM; - if (copy_old) { - if (state->stack) - memcpy(new_stack, state->stack, - sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); - memset(new_stack + old_size / BPF_REG_SIZE, 0, - sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); - } - state->allocated_stack = slot * BPF_REG_SIZE; - kfree(state->stack); - state->stack = new_stack; - return 0; + return realloc_stack_state(state, size, copy_old); } static void free_func_state(struct bpf_func_state *state) From a994400839876af8f54044680447130933ffd73a Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:35 -0700 Subject: [PATCH 0933/1640] BACKPORT: bpf: Add reference tracking to verifier Allow helper functions to acquire a reference and return it into a register. Specific pointer types such as the PTR_TO_SOCKET will implicitly represent such a reference. The verifier must ensure that these references are released exactly once in each path through the program. To achieve this, this commit assigns an id to the pointer and tracks it in the 'bpf_func_state', then when the function or program exits, verifies that all of the acquired references have been freed. When the pointer is passed to a function that frees the reference, it is removed from the 'bpf_func_state` and all existing copies of the pointer in registers are marked invalid. Change-Id: I2f03a4b8d98af0b85d24a348d9109dfb83ffaa86 Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 24 ++- kernel/bpf/verifier.c | 307 ++++++++++++++++++++++++++++++++--- 2 files changed, 308 insertions(+), 23 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 26abc94cd916..996f5e6a3a65 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -107,6 +107,17 @@ struct bpf_stack_state { u8 slot_type[BPF_REG_SIZE]; }; +struct bpf_reference_state { + /* Track each reference created with a unique id, even if the same + * instruction creates the reference multiple times (eg, via CALL). + */ + int id; + /* Instruction where the allocation of this reference occurred. This + * is used purely to inform the user of a reference leak. + */ + int insn_idx; +}; + /* state of the program: * type of all registers and stack info */ @@ -124,7 +135,9 @@ struct bpf_func_state { */ u32 subprogno; - /* should be second to last. See copy_func_state() */ + /* The following fields should be last. 
See copy_func_state() */ + int acquired_refs; + struct bpf_reference_state *refs; int allocated_stack; struct bpf_stack_state *stack; }; @@ -243,11 +256,16 @@ __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); -static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) +static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; - return cur->frame[cur->curframe]->regs; + return cur->frame[cur->curframe]; +} + +static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) +{ + return cur_func(env)->regs; } int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 78d26341976c..e5198f5832b9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,5 +1,6 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook + * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -140,6 +141,18 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * * After the call R0 is set to return type of the function and registers R1-R5 * are set to NOT_INIT to indicate that they are no longer readable. + * + * The following reference types represent a potential reference to a kernel + * resource which, after first being allocated, must be checked and freed by + * the BPF program: + * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET + * + * When the verifier sees a helper call return a reference type, it allocates a + * pointer id for the reference and stores it in the current function state. + * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into + * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type + * passes through a NULL-check conditional. For the branch wherein the state is + * changed to CONST_IMM, the verifier releases the reference. */ /* verifier_state + insn_idx are pushed to stack when branch is encountered */ @@ -189,6 +202,7 @@ struct bpf_call_arg_meta { int regno; int access_size; u64 msize_max_value; + int ptr_id; }; static DEFINE_MUTEX(bpf_verifier_lock); @@ -251,7 +265,42 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) static bool reg_type_may_be_null(enum bpf_reg_type type) { - return type == PTR_TO_MAP_VALUE_OR_NULL; + return type == PTR_TO_MAP_VALUE_OR_NULL || + type == PTR_TO_SOCKET_OR_NULL; +} + +static bool type_is_refcounted(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET; +} + +static bool type_is_refcounted_or_null(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL; +} + +static bool reg_is_refcounted(const struct bpf_reg_state *reg) +{ + return type_is_refcounted(reg->type); +} + +static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) +{ + return type_is_refcounted_or_null(reg->type); +} + +static bool arg_type_is_refcounted(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_SOCKET; +} + +/* Determine whether the function releases some resources allocated by another + * function call. The first reference type argument will be assumed to be + * released by release_reference(). 
+ */ +static bool is_release_function(enum bpf_func_id func_id) +{ + return false; } /* string representation of 'enum bpf_reg_type' */ @@ -385,6 +434,12 @@ static void print_verifier_state(struct bpf_verifier_env *env, else verbose(env, "=%s", types_buf); } + if (state->acquired_refs && state->refs[0].id) { + verbose(env, " refs=%d", state->refs[0].id); + for (i = 1; i < state->acquired_refs; i++) + if (state->refs[i].id) + verbose(env, ",%d", state->refs[i].id); + } verbose(env, "\n"); } @@ -403,6 +458,8 @@ static int copy_##NAME##_state(struct bpf_func_state *dst, \ sizeof(*src->FIELD) * (src->COUNT / SIZE)); \ return 0; \ } +/* copy_reference_state() */ +COPY_STATE_FN(reference, acquired_refs, refs, 1) /* copy_stack_state() */ COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) #undef COPY_STATE_FN @@ -441,6 +498,8 @@ static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \ state->FIELD = new_##FIELD; \ return 0; \ } +/* realloc_reference_state() */ +REALLOC_STATE_FN(reference, acquired_refs, refs, 1) /* realloc_stack_state() */ REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) #undef REALLOC_STATE_FN @@ -452,16 +511,89 @@ REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) * which realloc_stack_state() copies over. It points to previous * bpf_verifier_state which is never reallocated. */ -static int realloc_func_state(struct bpf_func_state *state, int size, - bool copy_old) +static int realloc_func_state(struct bpf_func_state *state, int stack_size, + int refs_size, bool copy_old) { - return realloc_stack_state(state, size, copy_old); + int err = realloc_reference_state(state, refs_size, copy_old); + if (err) + return err; + return realloc_stack_state(state, stack_size, copy_old); +} + +/* Acquire a pointer id from the env and update the state->refs to include + * this new pointer reference. + * On success, returns a valid pointer id to associate with the register + * On failure, returns a negative errno. + */ +static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) +{ + struct bpf_func_state *state = cur_func(env); + int new_ofs = state->acquired_refs; + int id, err; + + err = realloc_reference_state(state, state->acquired_refs + 1, true); + if (err) + return err; + id = ++env->id_gen; + state->refs[new_ofs].id = id; + state->refs[new_ofs].insn_idx = insn_idx; + + return id; +} + +/* release function corresponding to acquire_reference_state(). Idempotent. */ +static int __release_reference_state(struct bpf_func_state *state, int ptr_id) +{ + int i, last_idx; + + if (!ptr_id) + return -EFAULT; + + last_idx = state->acquired_refs - 1; + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].id == ptr_id) { + if (last_idx && i != last_idx) + memcpy(&state->refs[i], &state->refs[last_idx], + sizeof(*state->refs)); + memset(&state->refs[last_idx], 0, sizeof(*state->refs)); + state->acquired_refs--; + return 0; + } + } + return -EFAULT; +} + +/* variation on the above for cases where we expect that there must be an + * outstanding reference for the specified ptr_id. 
+ */ +static int release_reference_state(struct bpf_verifier_env *env, int ptr_id) +{ + struct bpf_func_state *state = cur_func(env); + int err; + + err = __release_reference_state(state, ptr_id); + if (WARN_ON_ONCE(err != 0)) + verbose(env, "verifier internal error: can't release reference\n"); + return err; +} + +static int transfer_reference_state(struct bpf_func_state *dst, + struct bpf_func_state *src) +{ + int err = realloc_reference_state(dst, src->acquired_refs, false); + if (err) + return err; + err = copy_reference_state(dst, src); + if (err) + return err; + return 0; } static void free_func_state(struct bpf_func_state *state) { if (!state) return; + kfree(state->refs); kfree(state->stack); kfree(state); } @@ -487,10 +619,14 @@ static int copy_func_state(struct bpf_func_state *dst, { int err; - err = realloc_func_state(dst, src->allocated_stack, false); + err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs, + false); + if (err) + return err; + memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs)); + err = copy_reference_state(dst, src); if (err) return err; - memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack)); return copy_stack_state(dst, src); } @@ -1035,7 +1171,7 @@ static int check_stack_write(struct bpf_verifier_env *env, struct bpf_reg_state *reg = NULL; err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), - true); + state->acquired_refs, true); if (err) return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -1446,7 +1582,8 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = cur_regs(env) + regno; - return reg->type == PTR_TO_CTX; + return reg->type == PTR_TO_CTX || + reg->type == PTR_TO_SOCKET; } static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) @@ -2111,6 +2248,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_SOCKET; if (type != expected_type) goto err_type; + if (meta->ptr_id || !reg->id) { + verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n", + meta->ptr_id, reg->id); + return -EFAULT; + } + meta->ptr_id = reg->id; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -2399,10 +2542,32 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) return true; } +static bool check_refcount_ok(const struct bpf_func_proto *fn) +{ + int count = 0; + + if (arg_type_is_refcounted(fn->arg1_type)) + count++; + if (arg_type_is_refcounted(fn->arg2_type)) + count++; + if (arg_type_is_refcounted(fn->arg3_type)) + count++; + if (arg_type_is_refcounted(fn->arg4_type)) + count++; + if (arg_type_is_refcounted(fn->arg5_type)) + count++; + + /* We only support one arg being unreferenced at the moment, + * which is sufficient for the helper functions we have right now. + */ + return count <= 1; +} + static int check_func_proto(const struct bpf_func_proto *fn) { return check_raw_mode_ok(fn) && - check_arg_pair_ok(fn) ? 0 : -EINVAL; + check_arg_pair_ok(fn) && + check_refcount_ok(fn) ? 
0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2435,12 +2600,45 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) __clear_all_pkt_pointers(env, vstate->frame[i]); } +static void release_reg_references(struct bpf_verifier_env *env, + struct bpf_func_state *state, int id) +{ + struct bpf_reg_state *regs = state->regs, *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + if (regs[i].id == id) + mark_reg_unknown(env, regs, i); + + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) + continue; + if (reg_is_refcounted(reg) && reg->id == id) + __mark_reg_unknown(reg); + } +} + +/* The pointer with the specified id has released its reference to kernel + * resources. Identify all copies of the same pointer and clear the reference. + */ +static int release_reference(struct bpf_verifier_env *env, + struct bpf_call_arg_meta *meta) +{ + struct bpf_verifier_state *vstate = env->cur_state; + int i; + + for (i = 0; i <= vstate->curframe; i++) + release_reg_references(env, vstate->frame[i], meta->ptr_id); + + return release_reference_state(env, meta->ptr_id); +} + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *caller, *callee; - int i, subprog, target_insn; + int i, err, subprog, target_insn; if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", @@ -2478,6 +2676,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, state->curframe + 1 /* frameno within this callchain */, subprog /* subprog number within this prog */); + /* Transfer references to the callee */ + err = transfer_reference_state(callee, caller); + if (err) + return err; + /* copy r1 - r5 args that callee can access. The copy includes parent * pointers, which connects us up to the liveness chain */ @@ -2510,6 +2713,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *caller, *callee; struct bpf_reg_state *r0; + int err; callee = state->frame[state->curframe]; r0 = &callee->regs[BPF_REG_0]; @@ -2529,6 +2733,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) /* return to the caller whatever r0 had in the callee */ caller->regs[BPF_REG_0] = *r0; + /* Transfer references to the caller */ + err = transfer_reference_state(caller, callee); + if (err) + return err; + *insn_idx = callee->callsite + 1; if (env->log.level) { verbose(env, "returning from callee:\n"); @@ -2608,6 +2817,18 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; } +static int check_reference_leak(struct bpf_verifier_env *env) +{ + struct bpf_func_state *state = cur_func(env); + int i; + + for (i = 0; i < state->acquired_refs; i++) { + verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", + state->refs[i].id, state->refs[i].insn_idx); + } + return state->acquired_refs ? 
-EINVAL : 0; +} + static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; @@ -2686,6 +2907,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return err; } + if (func_id == BPF_FUNC_tail_call) { + err = check_reference_leak(env); + if (err) { + verbose(env, "tail_call would lead to reference leak\n"); + return err; + } + } else if (is_release_function(func_id)) { + err = release_reference(env, &meta); + if (err) + return err; + } + regs = cur_regs(env); /* check that flags argument in get_local_storage(map, flags) is 0, @@ -2729,9 +2962,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { + int id = acquire_reference_state(env, insn_idx); + if (id < 0) + return id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].id = id; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -4214,7 +4450,8 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src, } } -static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id, +static void mark_ptr_or_null_reg(struct bpf_func_state *state, + struct bpf_reg_state *reg, u32 id, bool is_null) { if (reg_type_may_be_null(reg->type) && reg->id == id) { @@ -4240,11 +4477,13 @@ static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id, } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { reg->type = PTR_TO_SOCKET; } - /* We don't need id from this point onwards anymore, thus we - * should better reset it, so that state pruning has chances - * to take effect. - */ - reg->id = 0; + if (is_null || !reg_is_refcounted(reg)) { + /* We don't need id from this point onwards anymore, + * thus we should better reset it, so that state + * pruning has chances to take effect. + */ + reg->id = 0; + } } } @@ -4259,15 +4498,18 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, u32 id = regs[regno].id; int i, j; + if (reg_is_refcounted_or_null(&regs[regno]) && is_null) + __release_reference_state(state, id); + for (i = 0; i < MAX_BPF_REG; i++) - mark_ptr_or_null_reg(&regs[i], id, is_null); + mark_ptr_or_null_reg(state, &regs[i], id, is_null); for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; bpf_for_each_spilled_reg(i, state, reg) { if (!reg) continue; - mark_ptr_or_null_reg(reg, id, is_null); + mark_ptr_or_null_reg(state, reg, id, is_null); } } } @@ -4613,7 +4855,17 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; - if (regs[ctx_reg].type != PTR_TO_CTX) { + /* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as + * gen_ld_abs() may terminate the program at runtime, leading to + * reference leak.
+ */ + err = check_reference_leak(env); + if (err) { + verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n"); + return err; + } + + if (regs[BPF_REG_6].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; @@ -5106,6 +5358,14 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, return true; } +static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur) +{ + if (old->acquired_refs != cur->acquired_refs) + return false; + return !memcmp(old->refs, cur->refs, + sizeof(*old->refs) * old->acquired_refs); +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -5146,6 +5406,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat if (!stacksafe(env, old, cur, env->idmap_scratch)) return false; + if (!refsafe(old, cur)) + return false; + return true; } @@ -5606,6 +5869,10 @@ static int do_check(struct bpf_verifier_env *env) continue; } + err = check_reference_leak(env); + if (err) + return err; + /* eBPF calling convetion is such that R0 is used * to return the value from eBPF program. * Make sure that it's readable at this time From 8d77ec6fb2d21648cd7db5c9635e62abb7ca6c71 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:36 -0700 Subject: [PATCH 0934/1640] BACKPORT: bpf: Add helper to retrieve socket in BPF This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and bpf_sk_lookup_udp(), which allow BPF programs to find out if there is a socket listening on this host, and return a socket pointer which the BPF program can then access to determine, for instance, whether to forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the socket, so when a BPF program makes use of this function, it must subsequently pass the returned pointer into the newly added bpf_sk_release() to release the reference. By way of example, the following pseudocode would filter inbound connections at the TC hook if there is no corresponding service listening for the traffic: struct bpf_sock_tuple tuple; struct bpf_sock *sk; populate_tuple(ctx, &tuple); // Extract the 5-tuple from the packet sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0); if (!sk) { // Couldn't find a socket listening for this traffic. Drop. return TC_ACT_SHOT; } bpf_sk_release(sk); return TC_ACT_OK; Change-Id: I534d92a4cc11df0c6157f1c3282e0cf262e95a7c Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 88 +++++++++++++++++++++++ kernel/bpf/verifier.c | 8 ++- net/core/filter.c | 151 +++++++++++++++++++++++++++++++++ 3 files changed, 246 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 95c8230fe2dd..204a78da938d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2165,6 +2165,77 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup.
+ * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this is the netns of the device in the skb. For socket hooks, + * this is the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this is the netns of the device in the skb. For socket hooks, + * this is the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * int bpf_sk_release(struct bpf_sock *sk) + * Description + * Release the reference held by *sock*. *sock* must be a non-NULL + * pointer that was returned from bpf_sk_lookup_xxx\ (). + * Return + * 0 on success, or a negative error in case of failure. + * * u64 bpf_ktime_get_boot_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds. @@ -2469,6 +2540,23 @@ struct bpf_sock { */ }; +struct bpf_sock_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e5198f5832b9..5ad73dc5dc80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type * passes through a NULL-check conditional. For the branch wherein the state is * changed to CONST_IMM, the verifier releases the reference. + * + * For each helper function that allocates a reference, such as + * bpf_sk_lookup_tcp(), there is a corresponding release function, such as + * bpf_sk_release().
When a reference type passes into the release function, + * the verifier also releases the reference. If any unchecked or unreleased + * reference remains at the end of the program, the verifier rejects it. */ /* verifier_state + insn_idx are pushed to stack when branch is encountered */ @@ -300,7 +306,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return false; + return func_id == BPF_FUNC_sk_release; } /* string representation of 'enum bpf_reg_type' */ diff --git a/net/core/filter.c b/net/core/filter.c index 1fdf0f5a9818..49cf7744fdfd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -58,13 +58,17 @@ #include #include #include +#include #include #include #include +#include +#include #include #include #include #include +#include #include #include #include @@ -4799,6 +4803,141 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ +struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, + struct sk_buff *skb, u8 family, u8 proto) +{ + int dif = skb->dev->ifindex; + bool refcounted = false; + struct sock *sk = NULL; + + if (family == AF_INET) { + __be32 src4 = tuple->ipv4.saddr; + __be32 dst4 = tuple->ipv4.daddr; + int sdif = inet_sdif(skb); + + if (proto == IPPROTO_TCP) + sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &refcounted); + else + sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &udp_table, skb); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; + struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; + int sdif = inet6_sdif(skb); + + if (proto == IPPROTO_TCP) + sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + src6, tuple->ipv6.sport, + dst6, tuple->ipv6.dport, + dif, sdif, &refcounted); + else + sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport, + dst6, tuple->ipv6.dport, + dif, sdif, &udp_table, skb); +#endif + } + + if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + sk = NULL; + } + return sk; +} + +/* bpf_sk_lookup performs the core lookup for different types of sockets, + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. + * Returns the socket as an 'unsigned long' to simplify the casting in the + * callers to satisfy BPF_CALL declarations. + */ +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + struct sock *sk = NULL; + u8 family = AF_UNSPEC; + struct net *net; + + family = len == sizeof(tuple->ipv4) ? 
AF_INET : AF_INET6; + if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) + goto out; + + if (skb->dev) + caller_net = dev_net(skb->dev); + else + caller_net = sock_net(skb->sk); + if (netns_id) { + net = get_net_ns_by_id(caller_net, netns_id); + if (unlikely(!net)) + goto out; + sk = sk_lookup(net, tuple, skb, family, proto); + put_net(net); + } else { + net = caller_net; + sk = sk_lookup(net, tuple, skb, family, proto); + } + + if (sk) + sk = sk_to_full_sk(sk); +out: + return (unsigned long) sk; +} + +BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { + .func = bpf_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { + .func = bpf_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_sk_release, struct sock *, sk) +{ + if (!sock_flag(sk, SOCK_RCU_FREE)) + sock_gen_put(sk); + return 0; +} + +static const struct bpf_func_proto bpf_sk_release_proto = { + .func = bpf_sk_release, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; + bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || @@ -5009,6 +5148,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; default: return bpf_base_func_proto(func_id); } @@ -5109,6 +5254,12 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_hash_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; default: return bpf_base_func_proto(func_id); } From 5c0dfcf929cf4f55bf3bda94db4feed57ef1b0e6 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sun, 7 Oct 2018 12:56:47 +0100 Subject: [PATCH 0935/1640] BACKPORT: bpf: add verifier callback to get stack usage info for offloaded progs In preparation for BPF-to-BPF calls in offloaded programs, add a new function attribute to the struct bpf_prog_offload_ops so that drivers supporting eBPF offload can hook at the end of program verification, and potentially extract information collected by the verifier. Implement a minimal callback (returning 0) in the drivers providing the structs, namely netdevsim and nfp. 
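The minimal callback amounts to a few lines per driver; a sketch of its shape (the function and ops-struct names here are illustrative, not the drivers' actual identifiers):

    static int nsim_bpf_finalize(struct bpf_verifier_env *env)
    {
            /* Nothing to collect yet; later nfp commits will walk the
             * verifier state here to record subprogram stack depths.
             */
            return 0;
    }

    static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = {
            .insn_hook = nsim_bpf_verify_insn, /* existing per-insn hook */
            .finalize  = nsim_bpf_finalize,
    };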
This will be useful in the nfp driver, in later commits, to extract the number of subprograms as well as the stack depth for those subprograms. Change-Id: I57ebef18255dc7a974078c828415bc384955b952 Signed-off-by: Quentin Monnet Reviewed-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + include/linux/bpf_verifier.h | 1 + kernel/bpf/offload.c | 18 ++++++++++++++++++ kernel/bpf/verifier.c | 3 +++ 4 files changed, 23 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a141f6798b16..b7dac171766b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -268,6 +268,7 @@ struct bpf_verifier_ops { struct bpf_prog_offload_ops { int (*insn_hook)(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); + int (*finalize)(struct bpf_verifier_env *env); }; struct bpf_prog_offload { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 996f5e6a3a65..1c169ab2f3be 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -271,5 +271,6 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); +int bpf_prog_offload_finalize(struct bpf_verifier_env *env); #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 66e13aace241..e82645e53953 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -172,6 +172,24 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, return ret; } +int bpf_prog_offload_finalize(struct bpf_verifier_env *env) +{ + struct bpf_prog_offload *offload; + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) { + if (offload->dev_ops->finalize) + ret = offload->dev_ops->finalize(env); + else + ret = 0; + } + up_read(&bpf_devs_lock); + + return ret; +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_prog_offload *offload = prog->aux->offload; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5ad73dc5dc80..c63e4e4a5462 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6955,6 +6955,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->cur_state = NULL; } + if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux)) + ret = bpf_prog_offload_finalize(env); + skip_full_check: while (!pop_stack(env, NULL, NULL)); free_states(env); From 0f8abb4f21e1ebb1570a154039776affa433a199 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sun, 7 Oct 2018 12:56:58 +0100 Subject: [PATCH 0936/1640] UPSTREAM: bpf: allow offload of programs with BPF-to-BPF function calls Now that there is at least one driver supporting BPF-to-BPF function calls, lift the restriction, in the verifier, on hardware offload of eBPF programs containing such calls. But prevent jit_subprogs(), still in the verifier, from being run for offloaded programs. 
Change-Id: I7befe1f24dd07148d54fe1121686a61bc5f31d7d Signed-off-by: Quentin Monnet Reviewed-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c63e4e4a5462..997cf8e99ff4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1012,10 +1012,6 @@ static int check_subprogs(struct bpf_verifier_env *env) verbose(env, "function calls to other bpf functions are allowed for root only\n"); return -EPERM; } - if (bpf_prog_is_dev_bound(env->prog->aux)) { - verbose(env, "function calls in offloaded programs are not supported yet\n"); - return -EINVAL; - } ret = add_subprog(env, i + insn[i].imm + 1); if (ret < 0) return ret; @@ -6557,10 +6553,10 @@ static int fixup_call_args(struct bpf_verifier_env *env) struct bpf_insn *insn = prog->insnsi; int i, depth; #endif - int err; + int err = 0; - err = 0; - if (env->prog->jit_requested) { + if (env->prog->jit_requested && + !bpf_prog_is_dev_bound(env->prog->aux)) { err = jit_subprogs(env); if (err == 0) return 0; From 3f1bd7f928d59bfe484f4a45b96484adafcf7a1e Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Tue, 9 Oct 2018 10:04:49 +0900 Subject: [PATCH 0937/1640] BACKPORT: bpf: error handling when map_lookup_elem isn't supported The error value returned by map_lookup_elem doesn't differentiate whether the lookup failed because of an invalid key or because lookup is not supported. Let's add handling for the -EOPNOTSUPP return value of a map's map_lookup_elem() method, with the expectation that the map implementation returns -EOPNOTSUPP when lookup is not supported. The errno of the bpf syscall for the BPF_MAP_LOOKUP_ELEM command will then be set to EOPNOTSUPP when map lookup is not supported. Change-Id: I8b0d49378a13f7cef5c3bad86c27120ad9b9abe5 Signed-off-by: Prashant Bhole Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 598d1d560aa2..9d7499817719 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -728,10 +728,15 @@ static int map_lookup_elem(union bpf_attr *attr) ptr = map->ops->map_lookup_elem_sys_only(map, key); else ptr = map->ops->map_lookup_elem(map, key); - if (ptr) + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + } else if (!ptr) { + err = -ENOENT; + } else { + err = 0; memcpy(value, ptr, value_size); + } rcu_read_unlock(); - err = ptr ?
0 : -ENOENT; } this_cpu_dec(bpf_prog_active); preempt_enable(); From b61bfd459347201d3d373046f2464622510d9998 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Tue, 9 Oct 2018 10:04:50 +0900 Subject: [PATCH 0938/1640] UPSTREAM: bpf: return EOPNOTSUPP when map lookup isn't supported Return ERR_PTR(-EOPNOTSUPP) from the map_lookup_elem() methods of the following map types: - BPF_MAP_TYPE_PROG_ARRAY - BPF_MAP_TYPE_STACK_TRACE - BPF_MAP_TYPE_XSKMAP - BPF_MAP_TYPE_SOCKMAP/BPF_MAP_TYPE_SOCKHASH Change-Id: I13937c36055b419f4446d8bfa06f139c757480c9 Signed-off-by: Prashant Bhole Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/sockmap.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/bpf/xskmap.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2a9b4ca27a34..8eac775ab709 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -452,7 +452,7 @@ static void fd_array_map_free(struct bpf_map *map) static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } /* only called from syscall */ diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index d37a1a0a6e1e..5d0677d808ae 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2096,7 +2096,7 @@ int sockmap_get_from_fd(const union bpf_attr *attr, int type, static void *sock_map_lookup(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } static int sock_map_update_elem(struct bpf_map *map, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a41858db1441..887698ed9256 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -524,7 +524,7 @@ const struct bpf_func_proto bpf_get_stack_proto = { /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } /* Called from syscall */ diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 47147c9e184d..686d244e798d 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -154,7 +154,7 @@ void __xsk_map_flush(struct bpf_map *map) static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, From 505170009f68af2f512a1463ee4d87d9121097bf Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 13 Oct 2018 02:45:57 +0200 Subject: [PATCH 0939/1640] BACKPORT: tcp, ulp: remove ulp bits from sockmap In order to prepare the sockmap logic to be used in combination with kTLS, we need to detangle it from ULP, and further split it in later commits into a generic API. Joint work with John.
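The user-visible effect on sockmap's socket add path boils down to the following (condensed from the kernel/bpf/sockmap.c hunks below):

	/* before: indirection through the TCP ULP registry */
	err = tcp_set_ulp_id(sock, TCP_ULP_BPF);

	/* after: direct call, no ULP registration involved */
	err = bpf_tcp_init(sock);

With that, tcp_set_ulp_id() and the non-user-visible ULP lookup by id can be removed from the ULP layer entirely.
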
Change-Id: I743b8fe0add82442f692857c05c362824a1f86f4 Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 1 - kernel/bpf/sockmap.c | 39 +++++-------------- net/ipv4/tcp_ulp.c | 89 ++++++++++++-------------------------------- 3 files changed, 33 insertions(+), 96 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 8bbdcf82494a..1049bc73055a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2141,7 +2141,6 @@ struct tcp_ulp_ops { int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); -int tcp_set_ulp_id(struct sock *sk, const int ulp); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5d0677d808ae..de6f7a65c72b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -182,6 +182,7 @@ enum { static struct proto *saved_tcpv6_prot __read_mostly; static DEFINE_SPINLOCK(tcpv6_prot_lock); static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; + static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], struct proto *base) { @@ -239,6 +240,13 @@ static int bpf_tcp_init(struct sock *sk) return 0; } +static int __init bpf_sock_init(void) +{ + build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); + return 0; +} +core_initcall(bpf_sock_init); + static void smap_release_sock(struct smap_psock *psock, struct sock *sock); static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); @@ -413,15 +421,6 @@ enum __sk_action { __SK_NONE, }; -static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { - .name = "bpf_tcp", - .uid = TCP_ULP_BPF, - .user_visible = false, - .owner = NULL, - .init = bpf_tcp_init, - .release = bpf_tcp_release, -}; - static int memcopy_from_iter(struct sock *sk, struct sk_msg_buff *md, struct iov_iter *from, int bytes) @@ -1236,16 +1235,6 @@ static void bpf_tcp_msg_add(struct smap_psock *psock, bpf_prog_put(orig_tx_msg); } -static int bpf_tcp_ulp_register(void) -{ - build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); - /* Once BPF TX ULP is registered it is never unregistered. It - * will be in the ULP list for the lifetime of the system. Doing - * duplicate registers is not a problem. 
- */ - return tcp_register_ulp(&bpf_tcp_ulp_ops); -} - static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); @@ -1491,7 +1480,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { if (psock_is_smap_sk(sock)) - tcp_cleanup_ulp(sock); + bpf_tcp_release(sock); write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); write_unlock_bh(&sock->sk_callback_lock); @@ -1666,10 +1655,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - err = bpf_tcp_ulp_register(); - if (err && err != -EEXIST) - return ERR_PTR(err); - stab = kzalloc(sizeof(*stab), GFP_USER); if (!stab) return ERR_PTR(-ENOMEM); @@ -1951,7 +1936,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, if (tx_msg) bpf_tcp_msg_add(psock, sock, tx_msg); if (new) { - err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + err = bpf_tcp_init(sock); if (err) goto out_free; } @@ -2187,10 +2172,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) */ return ERR_PTR(-E2BIG); - err = bpf_tcp_ulp_register(); - if (err && err != -EEXIST) - return ERR_PTR(err); - htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 7dd44b6156c7..07bf9e02df13 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -6,7 +6,7 @@ * */ -#include +#include #include #include #include @@ -29,18 +29,6 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name) return NULL; } -static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp) -{ - struct tcp_ulp_ops *e; - - list_for_each_entry_rcu(e, &tcp_ulp_list, list) { - if (e->uid == ulp) - return e; - } - - return NULL; -} - static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) { const struct tcp_ulp_ops *ulp = NULL; @@ -63,18 +51,6 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) return ulp; } -static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid) -{ - const struct tcp_ulp_ops *ulp; - - rcu_read_lock(); - ulp = tcp_ulp_find_id(uid); - if (!ulp || !try_module_get(ulp->owner)) - ulp = NULL; - rcu_read_unlock(); - return ulp; -} - /* Attach new upper layer protocol to the list * of available protocols. 
*/ @@ -131,54 +107,35 @@ void tcp_cleanup_ulp(struct sock *sk) module_put(icsk->icsk_ulp_ops->owner); } -/* Change upper layer protocol for socket */ -int tcp_set_ulp(struct sock *sk, const char *name) +static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops) { struct inet_connection_sock *icsk = inet_csk(sk); - const struct tcp_ulp_ops *ulp_ops; - int err = 0; + int err; + err = -EEXIST; if (icsk->icsk_ulp_ops) - return -EEXIST; + goto out_err; + + err = ulp_ops->init(sk); + if (err) + goto out_err; + + icsk->icsk_ulp_ops = ulp_ops; + return 0; +out_err: + module_put(ulp_ops->owner); + return err; +} + +int tcp_set_ulp(struct sock *sk, const char *name) +{ + const struct tcp_ulp_ops *ulp_ops; + + sock_owned_by_me(sk); ulp_ops = __tcp_ulp_find_autoload(name); if (!ulp_ops) return -ENOENT; - if (!ulp_ops->user_visible) { - module_put(ulp_ops->owner); - return -ENOENT; - } - - err = ulp_ops->init(sk); - if (err) { - module_put(ulp_ops->owner); - return err; - } - - icsk->icsk_ulp_ops = ulp_ops; - return 0; -} - -int tcp_set_ulp_id(struct sock *sk, int ulp) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - const struct tcp_ulp_ops *ulp_ops; - int err; - - if (icsk->icsk_ulp_ops) - return -EEXIST; - - ulp_ops = __tcp_ulp_lookup(ulp); - if (!ulp_ops) - return -ENOENT; - - err = ulp_ops->init(sk); - if (err) { - module_put(ulp_ops->owner); - return err; - } - - icsk->icsk_ulp_ops = ulp_ops; - return 0; + return __tcp_set_ulp(sk, ulp_ops); } From 4da7ff1932df5464d5863ad3266f84ed516a8f81 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 13 Oct 2018 02:45:58 +0200 Subject: [PATCH 0940/1640] BACKPORT: bpf, sockmap: convert to generic sk_msg interface Add a generic sk_msg layer, and convert the current sockmap and, later, kTLS over to make use of it. While sk_buff handles network packet representation from the netdevice up to the socket, sk_msg handles data representation from the application to the socket layer. This means that the sk_msg framework spans across ULP users in the kernel, and enables features such as introspection or filtering of data with the help of BPF programs that operate on this data structure. The latter becomes particularly useful for kTLS, where data encryption is deferred into the kernel; this enables the kernel to perform L7 introspection and apply BPF-based policy to TLS connections, since the record is only encrypted after BPF has run and come to a verdict. In order to get there, the first step is to transform the open-coded scatter-gather list handling into a common core framework that subsystems can use. The code itself has been split and refactored into three bigger pieces: i) the generic sk_msg API, which deals with managing the scatter-gather ring, providing helpers for walking and mangling, transferring application data from user space into it, and preparing it for BPF pre/post-processing, ii) the plain sock map itself, to which sockets can be attached or from which they can be detached; these bits are independent of i), which can now also be used without sock map, and iii) the integration with plain TCP as one protocol to be used for processing L7 application data (later this could e.g. also be extended to other protocols like UDP). The semantics are the same as with the old sock map code, so there is no change in user-facing behavior or APIs. Pursuing this work also helped uncover a number of bugs in the old sockmap code, which we've already fixed in earlier commits. The test_sockmap kselftest suite passes fine as well. Joint work with John.
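To give a feel for the new API, the sketch below (illustrative only, written against the helpers added in include/linux/skmsg.h further down) sums up the bytes held in the scatter-gather ring of a struct sk_msg:

	static u32 msg_ring_bytes(struct sk_msg *msg)
	{
		u32 i = msg->sg.start, total = 0;

		/* start == end with size == 0 means the ring is empty */
		if (i == msg->sg.end && !msg->sg.size)
			return 0;

		do {
			total += sk_msg_elem(msg, i)->length;
			/* advances with wrap-around at MAX_MSG_FRAGS */
			sk_msg_iter_var_next(i);
		} while (i != msg->sg.end);

		return total;	/* matches the running msg->sg.size */
	}

This wrap-around walk is the same pattern the old code open-coded with sg_start/sg_end and MAX_SKB_FRAGS, as can be seen in the removed kernel/bpf/sockmap.c below.
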
Change-Id: Iacc1636681a70825399a0f63890a65e1fc95e674 Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 33 +- include/linux/bpf_types.h | 2 +- include/linux/filter.h | 21 - include/linux/skmsg.h | 371 ++++++ include/net/tcp.h | 27 + kernel/bpf/Makefile | 5 - kernel/bpf/core.c | 2 - kernel/bpf/sockmap.c | 2610 ------------------------------------- kernel/bpf/syscall.c | 6 +- net/Kconfig | 11 + net/core/Makefile | 2 + net/core/filter.c | 272 ++-- net/core/skmsg.c | 763 +++++++++++ net/core/sock_map.c | 1002 ++++++++++++++ net/ipv4/Makefile | 1 + net/ipv4/tcp_bpf.c | 655 ++++++++++ net/strparser/Kconfig | 4 +- 17 files changed, 2926 insertions(+), 2861 deletions(-) create mode 100644 include/linux/skmsg.h delete mode 100644 kernel/bpf/sockmap.c create mode 100644 net/core/skmsg.c create mode 100644 net/core/sock_map.c create mode 100644 net/ipv4/tcp_bpf.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b7dac171766b..439d486e5dfd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -756,33 +756,18 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) } #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ -#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) -struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); -struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key); -int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); -int sockmap_get_from_fd(const union bpf_attr *attr, int type, - struct bpf_prog *prog); +#if defined(CONFIG_BPF_STREAM_PARSER) +int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which); +int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); #else -static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) -{ - return NULL; -} - -static inline struct sock *__sock_hash_lookup_elem(struct bpf_map *map, - void *key) -{ - return NULL; -} - -static inline int sock_map_prog(struct bpf_map *map, - struct bpf_prog *prog, - u32 type) +static inline int sock_map_prog_update(struct bpf_map *map, + struct bpf_prog *prog, u32 which) { return -EOPNOTSUPP; } -static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type, - struct bpf_prog *prog) +static inline int sock_map_get_from_fd(const union bpf_attr *attr, + struct bpf_prog *prog) { return -EINVAL; } @@ -859,6 +844,10 @@ extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; +extern const struct bpf_func_proto bpf_msg_redirect_map_proto; +extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; +extern const struct bpf_func_proto bpf_sk_redirect_map_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 9d5f77e2d8dd..e40d23a8e8a2 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -57,7 +57,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) -#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET) +#if defined(CONFIG_BPF_STREAM_PARSER) 
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif diff --git a/include/linux/filter.h b/include/linux/filter.h index 9800cd0a21d3..ccd2f16de004 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -604,24 +604,6 @@ struct bpf_skb_data_end { void *data_end; }; -struct sk_msg_buff { - void *data; - void *data_end; - __u32 apply_bytes; - __u32 cork_bytes; - int sg_copybreak; - int sg_start; - int sg_curr; - int sg_end; - struct scatterlist sg_data[MAX_SKB_FRAGS]; - bool sg_copy[MAX_SKB_FRAGS]; - __u32 flags; - struct sock *sk_redir; - struct sock *sk; - struct sk_buff *skb; - struct list_head list; -}; - struct bpf_redirect_info { u32 ifindex; u32 flags; @@ -903,9 +885,6 @@ void xdp_do_flush_map(void); void bpf_warn_invalid_xdp_action(u32 act); -struct sock *do_sk_redirect_map(struct sk_buff *skb); -struct sock *do_msg_redirect_map(struct sk_msg_buff *md); - #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h new file mode 100644 index 000000000000..95678103c4a0 --- /dev/null +++ b/include/linux/skmsg.h @@ -0,0 +1,371 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#ifndef _LINUX_SKMSG_H +#define _LINUX_SKMSG_H + +#include +#include +#include +#include + +#include +#include +#include + +#define MAX_MSG_FRAGS MAX_SKB_FRAGS + +enum __sk_action { + __SK_DROP = 0, + __SK_PASS, + __SK_REDIRECT, + __SK_NONE, +}; + +struct sk_msg_sg { + u32 start; + u32 curr; + u32 end; + u32 size; + u32 copybreak; + bool copy[MAX_MSG_FRAGS]; + struct scatterlist data[MAX_MSG_FRAGS]; +}; + +struct sk_msg { + struct sk_msg_sg sg; + void *data; + void *data_end; + u32 apply_bytes; + u32 cork_bytes; + u32 flags; + struct sk_buff *skb; + struct sock *sk_redir; + struct sock *sk; + struct list_head list; +}; + +struct sk_psock_progs { + struct bpf_prog *msg_parser; + struct bpf_prog *skb_parser; + struct bpf_prog *skb_verdict; +}; + +enum sk_psock_state_bits { + SK_PSOCK_TX_ENABLED, +}; + +struct sk_psock_link { + struct list_head list; + struct bpf_map *map; + void *link_raw; +}; + +struct sk_psock_parser { + struct strparser strp; + bool enabled; + void (*saved_data_ready)(struct sock *sk); +}; + +struct sk_psock_work_state { + struct sk_buff *skb; + u32 len; + u32 off; +}; + +struct sk_psock { + struct sock *sk; + struct sock *sk_redir; + u32 apply_bytes; + u32 cork_bytes; + u32 eval; + struct sk_msg *cork; + struct sk_psock_progs progs; + struct sk_psock_parser parser; + struct sk_buff_head ingress_skb; + struct list_head ingress_msg; + unsigned long state; + struct list_head link; + spinlock_t link_lock; + refcount_t refcnt; + void (*saved_unhash)(struct sock *sk); + void (*saved_close)(struct sock *sk, long timeout); + void (*saved_write_space)(struct sock *sk); + struct proto *sk_proto; + struct sk_psock_work_state work_state; + struct work_struct work; + union { + struct rcu_head rcu; + struct work_struct gc; + }; +}; + +int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, + int elem_first_coalesce); +void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); +int sk_msg_free(struct sock *sk, struct sk_msg *msg); +int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); +void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes); +void sk_msg_free_partial_nocharge(struct sock *sk, struct 
sk_msg *msg, + u32 bytes); + +void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); + +int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes); +int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes); + +static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) +{ + WARN_ON(i == msg->sg.end && bytes); +} + +static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) +{ + if (psock->apply_bytes) { + if (psock->apply_bytes < bytes) + psock->apply_bytes = 0; + else + psock->apply_bytes -= bytes; + } +} + +#define sk_msg_iter_var_prev(var) \ + do { \ + if (var == 0) \ + var = MAX_MSG_FRAGS - 1; \ + else \ + var--; \ + } while (0) + +#define sk_msg_iter_var_next(var) \ + do { \ + var++; \ + if (var == MAX_MSG_FRAGS) \ + var = 0; \ + } while (0) + +#define sk_msg_iter_prev(msg, which) \ + sk_msg_iter_var_prev(msg->sg.which) + +#define sk_msg_iter_next(msg, which) \ + sk_msg_iter_var_next(msg->sg.which) + +static inline void sk_msg_clear_meta(struct sk_msg *msg) +{ + memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy)); +} + +static inline void sk_msg_init(struct sk_msg *msg) +{ + memset(msg, 0, sizeof(*msg)); + sg_init_marker(msg->sg.data, ARRAY_SIZE(msg->sg.data)); +} + +static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, + int which, u32 size) +{ + dst->sg.data[which] = src->sg.data[which]; + dst->sg.data[which].length = size; + src->sg.data[which].length -= size; + src->sg.data[which].offset += size; +} + +static inline u32 sk_msg_elem_used(const struct sk_msg *msg) +{ + return msg->sg.end >= msg->sg.start ? + msg->sg.end - msg->sg.start : + msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start); +} + +static inline bool sk_msg_full(const struct sk_msg *msg) +{ + return (msg->sg.end == msg->sg.start) && msg->sg.size; +} + +static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) +{ + return &msg->sg.data[which]; +} + +static inline struct page *sk_msg_page(struct sk_msg *msg, int which) +{ + return sg_page(sk_msg_elem(msg, which)); +} + +static inline bool sk_msg_to_ingress(const struct sk_msg *msg) +{ + return msg->flags & BPF_F_INGRESS; +} + +static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) +{ + struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); + + if (msg->sg.copy[msg->sg.start]) { + msg->data = NULL; + msg->data_end = NULL; + } else { + msg->data = sg_virt(sge); + msg->data_end = msg->data + sge->length; + } +} + +static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, + u32 len, u32 offset) +{ + struct scatterlist *sge; + + get_page(page); + sge = sk_msg_elem(msg, msg->sg.end); + sg_set_page(sge, page, len, offset); + sg_unmark_end(sge); + + msg->sg.copy[msg->sg.end] = true; + msg->sg.size += len; + sk_msg_iter_next(msg, end); +} + +static inline struct sk_psock *sk_psock(const struct sock *sk) +{ + return rcu_dereference_sk_user_data(sk); +} + +static inline bool sk_has_psock(struct sock *sk) +{ + return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg; +} + +static inline void sk_psock_queue_msg(struct sk_psock *psock, + struct sk_msg *msg) +{ + list_add_tail(&msg->list, &psock->ingress_msg); +} + +static inline void sk_psock_report_error(struct sk_psock *psock, int err) +{ + struct sock *sk = psock->sk; + + sk->sk_err = err; + sk->sk_error_report(sk); +} + +struct sk_psock *sk_psock_init(struct sock *sk, int node); + +int sk_psock_init_strp(struct sock 
*sk, struct sk_psock *psock); +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); + +int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg); + +static inline struct sk_psock_link *sk_psock_init_link(void) +{ + return kzalloc(sizeof(struct sk_psock_link), + GFP_ATOMIC | __GFP_NOWARN); +} + +static inline void sk_psock_free_link(struct sk_psock_link *link) +{ + kfree(link); +} + +struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); +#if defined(CONFIG_BPF_STREAM_PARSER) +void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link); +#else +static inline void sk_psock_unlink(struct sock *sk, + struct sk_psock_link *link) +{ +} +#endif + +void __sk_psock_purge_ingress_msg(struct sk_psock *psock); + +static inline void sk_psock_cork_free(struct sk_psock *psock) +{ + if (psock->cork) { + sk_msg_free(psock->sk, psock->cork); + kfree(psock->cork); + psock->cork = NULL; + } +} + +static inline void sk_psock_update_proto(struct sock *sk, + struct sk_psock *psock, + struct proto *ops) +{ + psock->saved_unhash = sk->sk_prot->unhash; + psock->saved_close = sk->sk_prot->close; + psock->saved_write_space = sk->sk_write_space; + + psock->sk_proto = sk->sk_prot; + sk->sk_prot = ops; +} + +static inline void sk_psock_restore_proto(struct sock *sk, + struct sk_psock *psock) +{ + if (psock->sk_proto) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; + } +} + +static inline void sk_psock_set_state(struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + set_bit(bit, &psock->state); +} + +static inline void sk_psock_clear_state(struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + clear_bit(bit, &psock->state); +} + +static inline bool sk_psock_test_state(const struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + return test_bit(bit, &psock->state); +} + +static inline struct sk_psock *sk_psock_get(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock && !refcount_inc_not_zero(&psock->refcnt)) + psock = NULL; + rcu_read_unlock(); + return psock; +} + +void sk_psock_stop(struct sock *sk, struct sk_psock *psock); +void sk_psock_destroy(struct rcu_head *rcu); +void sk_psock_drop(struct sock *sk, struct sk_psock *psock); + +static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) +{ + if (refcount_dec_and_test(&psock->refcnt)) + sk_psock_drop(sk, psock); +} + +static inline void psock_set_prog(struct bpf_prog **pprog, + struct bpf_prog *prog) +{ + prog = xchg(pprog, prog); + if (prog) + bpf_prog_put(prog); +} + +static inline void psock_progs_drop(struct sk_psock_progs *progs) +{ + psock_set_prog(&progs->msg_parser, NULL); + psock_set_prog(&progs->skb_parser, NULL); + psock_set_prog(&progs->skb_verdict, NULL); +} + +#endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 1049bc73055a..2c8f18224acc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -897,6 +897,21 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); } +static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; +} + +static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->bpf.sk_redir; +} + +static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->bpf.sk_redir = NULL; +} + 
#if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, * as TCP moves IP6CB into a different location in skb->cb[] @@ -2148,6 +2163,18 @@ void tcp_cleanup_ulp(struct sock *sk); __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) +struct sk_msg; +struct sk_psock; + +int tcp_bpf_init(struct sock *sk); +void tcp_bpf_reinit(struct sock *sk); +int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, + int flags); +int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len); +int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, + struct msghdr *msg, int len); + /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index ffc39a7e028d..47bbc691c983 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -14,11 +14,6 @@ ifeq ($(CONFIG_XDP_SOCKETS),y) obj-$(CONFIG_BPF_SYSCALL) += xskmap.o endif obj-$(CONFIG_BPF_SYSCALL) += offload.o -ifeq ($(CONFIG_STREAM_PARSER),y) -ifeq ($(CONFIG_INET),y) -obj-$(CONFIG_BPF_SYSCALL) += sockmap.o -endif -endif endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 604cdf006f66..426ce68a203d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1898,8 +1898,6 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; -const struct bpf_func_proto bpf_sock_map_update_proto __weak; -const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c deleted file mode 100644 index de6f7a65c72b..000000000000 --- a/kernel/bpf/sockmap.c +++ /dev/null @@ -1,2610 +0,0 @@ -/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -/* A BPF sock_map is used to store sock objects. This is primarly used - * for doing socket redirect with BPF helper routines. - * - * A sock map may have BPF programs attached to it, currently a program - * used to parse packets and a program to provide a verdict and redirect - * decision on the packet are supported. Any programs attached to a sock - * map are inherited by sock objects when they are added to the map. If - * no BPF programs are attached the sock object may only be used for sock - * redirect. - * - * A sock object may be in multiple maps, but can only inherit a single - * parse or verdict program. If adding a sock object to a map would result - * in having multiple parsing programs the update will return an EBUSY error. 
- * - * For reference this program is similar to devmap used in XDP context - * reviewing these together may be useful. For an example please review - * ./samples/bpf/sockmap/. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define SOCK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - -struct bpf_sock_progs { - struct bpf_prog *bpf_tx_msg; - struct bpf_prog *bpf_parse; - struct bpf_prog *bpf_verdict; -}; - -struct bpf_stab { - struct bpf_map map; - struct sock **sock_map; - struct bpf_sock_progs progs; - raw_spinlock_t lock; -}; - -struct bucket { - struct hlist_head head; - raw_spinlock_t lock; -}; - -struct bpf_htab { - struct bpf_map map; - struct bucket *buckets; - atomic_t count; - u32 n_buckets; - u32 elem_size; - struct bpf_sock_progs progs; - struct rcu_head rcu; -}; - -struct htab_elem { - struct rcu_head rcu; - struct hlist_node hash_node; - u32 hash; - struct sock *sk; - char key[0]; -}; - -enum smap_psock_state { - SMAP_TX_RUNNING, -}; - -struct smap_psock_map_entry { - struct list_head list; - struct bpf_map *map; - struct sock **entry; - struct htab_elem __rcu *hash_link; -}; - -struct smap_psock { - struct rcu_head rcu; - refcount_t refcnt; - - /* datapath variables */ - struct sk_buff_head rxqueue; - bool strp_enabled; - - /* datapath error path cache across tx work invocations */ - int save_rem; - int save_off; - struct sk_buff *save_skb; - - /* datapath variables for tx_msg ULP */ - struct sock *sk_redir; - int apply_bytes; - int cork_bytes; - int sg_size; - int eval; - struct sk_msg_buff *cork; - struct list_head ingress; - - struct strparser strp; - struct bpf_prog *bpf_tx_msg; - struct bpf_prog *bpf_parse; - struct bpf_prog *bpf_verdict; - struct list_head maps; - spinlock_t maps_lock; - - /* Back reference used when sock callback trigger sockmap operations */ - struct sock *sock; - unsigned long state; - - struct work_struct tx_work; - struct work_struct gc_work; - - struct proto *sk_proto; - void (*save_unhash)(struct sock *sk); - void (*save_close)(struct sock *sk, long timeout); - void (*save_data_ready)(struct sock *sk); - void (*save_write_space)(struct sock *sk); -}; - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len); -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); -static void bpf_tcp_unhash(struct sock *sk); -static void bpf_tcp_close(struct sock *sk, long timeout); - -static inline struct smap_psock *smap_psock_sk(const struct sock *sk) -{ - return rcu_dereference_sk_user_data(sk); -} - -static bool bpf_tcp_stream_read(const struct sock *sk) -{ - struct smap_psock *psock; - bool empty = true; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - empty = list_empty(&psock->ingress); -out: - rcu_read_unlock(); - return !empty; -} - -enum { - SOCKMAP_IPV4, - SOCKMAP_IPV6, - SOCKMAP_NUM_PROTS, -}; - -enum { - SOCKMAP_BASE, - SOCKMAP_TX, - SOCKMAP_NUM_CONFIGS, -}; - -static struct proto *saved_tcpv6_prot __read_mostly; -static DEFINE_SPINLOCK(tcpv6_prot_lock); -static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; - -static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], - struct 
proto *base) -{ - prot[SOCKMAP_BASE] = *base; - prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; - prot[SOCKMAP_BASE].close = bpf_tcp_close; - prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; - prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; - - prot[SOCKMAP_TX] = prot[SOCKMAP_BASE]; - prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg; - prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage; -} - -static void update_sk_prot(struct sock *sk, struct smap_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4; - int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE; - - sk->sk_prot = &bpf_tcp_prots[family][conf]; -} - -static int bpf_tcp_init(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return -EINVAL; - } - - if (unlikely(psock->sk_proto)) { - rcu_read_unlock(); - return -EBUSY; - } - - psock->save_unhash = sk->sk_prot->unhash; - psock->save_close = sk->sk_prot->close; - psock->sk_proto = sk->sk_prot; - - /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ - if (sk->sk_family == AF_INET6 && - unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { - spin_lock_bh(&tcpv6_prot_lock); - if (likely(sk->sk_prot != saved_tcpv6_prot)) { - build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot); - smp_store_release(&saved_tcpv6_prot, sk->sk_prot); - } - spin_unlock_bh(&tcpv6_prot_lock); - } - update_sk_prot(sk, psock); - rcu_read_unlock(); - return 0; -} - -static int __init bpf_sock_init(void) -{ - build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); - return 0; -} -core_initcall(bpf_sock_init); - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); - -static void bpf_tcp_release(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - psock->cork = NULL; - } - - if (psock->sk_proto) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; - } -out: - rcu_read_unlock(); -} - -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, - u32 hash, void *key, u32 key_size) -{ - struct htab_elem *l; - - hlist_for_each_entry_rcu(l, head, hash_node) { - if (l->hash == hash && !memcmp(&l->key, key, key_size)) - return l; - } - - return NULL; -} - -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &htab->buckets[hash & (htab->n_buckets - 1)]; -} - -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &__select_bucket(htab, hash)->head; -} - -static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) -{ - atomic_dec(&htab->count); - kfree_rcu(l, rcu); -} - -static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, - struct smap_psock *psock) -{ - struct smap_psock_map_entry *e; - - spin_lock_bh(&psock->maps_lock); - e = list_first_entry_or_null(&psock->maps, - struct smap_psock_map_entry, - list); - if (e) - list_del(&e->list); - spin_unlock_bh(&psock->maps_lock); - return e; -} - -static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) -{ - struct smap_psock_map_entry *e; - struct sk_msg_buff *md, *mtmp; - struct sock *osk; - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - psock->cork = NULL; - } - - 
list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { - list_del(&md->list); - free_start_sg(psock->sock, md, true); - kfree(md); - } - - e = psock_map_pop(sk, psock); - while (e) { - if (e->entry) { - struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map); - - raw_spin_lock_bh(&stab->lock); - osk = *e->entry; - if (osk == sk) { - *e->entry = NULL; - smap_release_sock(psock, sk); - } - raw_spin_unlock_bh(&stab->lock); - } else { - struct htab_elem *link = rcu_dereference(e->hash_link); - struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map); - struct hlist_head *head; - struct htab_elem *l; - struct bucket *b; - - b = __select_bucket(htab, link->hash); - head = &b->head; - raw_spin_lock_bh(&b->lock); - l = lookup_elem_raw(head, - link->hash, link->key, - htab->map.key_size); - /* If another thread deleted this object skip deletion. - * The refcnt on psock may or may not be zero. - */ - if (l && l == link) { - hlist_del_rcu(&link->hash_node); - smap_release_sock(psock, link->sk); - free_htab_elem(htab, link); - } - raw_spin_unlock_bh(&b->lock); - } - kfree(e); - e = psock_map_pop(sk, psock); - } -} - -static void bpf_tcp_unhash(struct sock *sk) -{ - void (*unhash_fun)(struct sock *sk); - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; - } - unhash_fun = psock->save_unhash; - bpf_tcp_remove(sk, psock); - rcu_read_unlock(); - unhash_fun(sk); -} - -static void bpf_tcp_close(struct sock *sk, long timeout) -{ - void (*close_fun)(struct sock *sk, long timeout); - struct smap_psock *psock; - - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - close_fun = psock->save_close; - bpf_tcp_remove(sk, psock); - rcu_read_unlock(); - release_sock(sk); - close_fun(sk, timeout); -} - -enum __sk_action { - __SK_DROP = 0, - __SK_PASS, - __SK_REDIRECT, - __SK_NONE, -}; - -static int memcopy_from_iter(struct sock *sk, - struct sk_msg_buff *md, - struct iov_iter *from, int bytes) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_curr, rc = -ENOSPC; - - do { - int copy; - char *to; - - if (md->sg_copybreak >= sg[i].length) { - md->sg_copybreak = 0; - - if (++i == MAX_SKB_FRAGS) - i = 0; - - if (i == md->sg_end) - break; - } - - copy = sg[i].length - md->sg_copybreak; - to = sg_virt(&sg[i]) + md->sg_copybreak; - md->sg_copybreak += copy; - - if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) - rc = copy_from_iter_nocache(to, copy, from); - else - rc = copy_from_iter(to, copy, from); - - if (rc != copy) { - rc = -EFAULT; - goto out; - } - - bytes -= copy; - if (!bytes) - break; - - md->sg_copybreak = 0; - if (++i == MAX_SKB_FRAGS) - i = 0; - } while (i != md->sg_end); -out: - md->sg_curr = i; - return rc; -} - -static int bpf_tcp_push(struct sock *sk, int apply_bytes, - struct sk_msg_buff *md, - int flags, bool uncharge) -{ - bool apply = apply_bytes; - struct scatterlist *sg; - int offset, ret = 0; - struct page *p; - size_t size; - - while (1) { - sg = md->sg_data + md->sg_start; - size = (apply && apply_bytes < sg->length) ? 
- apply_bytes : sg->length; - offset = sg->offset; - - tcp_rate_check_app_limited(sk); - p = sg_page(sg); -retry: - ret = do_tcp_sendpages(sk, p, offset, size, flags); - if (ret != size) { - if (ret > 0) { - if (apply) - apply_bytes -= ret; - - sg->offset += ret; - sg->length -= ret; - size -= ret; - offset += ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - goto retry; - } - - return ret; - } - - if (apply) - apply_bytes -= ret; - sg->offset += ret; - sg->length -= ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - - if (!sg->length) { - put_page(p); - md->sg_start++; - if (md->sg_start == MAX_SKB_FRAGS) - md->sg_start = 0; - sg_init_table(sg, 1); - - if (md->sg_start == md->sg_end) - break; - } - - if (apply && !apply_bytes) - break; - } - return 0; -} - -static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) -{ - struct scatterlist *sg = md->sg_data + md->sg_start; - - if (md->sg_copy[md->sg_start]) { - md->data = md->data_end = 0; - } else { - md->data = sg_virt(sg); - md->data_end = md->data + sg->length; - } -} - -static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_start; - - do { - int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length; - - sk_mem_uncharge(sk, uncharge); - bytes -= uncharge; - if (!bytes) - break; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - } while (i != md->sg_end); -} - -static void free_bytes_sg(struct sock *sk, int bytes, - struct sk_msg_buff *md, bool charge) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_start, free; - - while (bytes && sg[i].length) { - free = sg[i].length; - if (bytes < free) { - sg[i].length -= bytes; - sg[i].offset += bytes; - if (charge) - sk_mem_uncharge(sk, bytes); - break; - } - - if (charge) - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - bytes -= sg[i].length; - sg[i].length = 0; - sg[i].page_link = 0; - sg[i].offset = 0; - i++; - - if (i == MAX_SKB_FRAGS) - i = 0; - } - md->sg_start = i; -} - -static int free_sg(struct sock *sk, int start, - struct sk_msg_buff *md, bool charge) -{ - struct scatterlist *sg = md->sg_data; - int i = start, free = 0; - - while (sg[i].length) { - free += sg[i].length; - if (charge) - sk_mem_uncharge(sk, sg[i].length); - if (!md->skb) - put_page(sg_page(&sg[i])); - sg[i].length = 0; - sg[i].page_link = 0; - sg[i].offset = 0; - i++; - - if (i == MAX_SKB_FRAGS) - i = 0; - } - consume_skb(md->skb); - - return free; -} - -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) -{ - int free = free_sg(sk, md->sg_start, md, charge); - - md->sg_start = md->sg_end; - return free; -} - -static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) -{ - return free_sg(sk, md->sg_curr, md, true); -} - -static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) -{ - return ((_rc == SK_PASS) ? - (md->sk_redir ? 
__SK_REDIRECT : __SK_PASS) : - __SK_DROP); -} - -static unsigned int smap_do_tx_msg(struct sock *sk, - struct smap_psock *psock, - struct sk_msg_buff *md) -{ - struct bpf_prog *prog; - unsigned int rc, _rc; - - preempt_disable(); - rcu_read_lock(); - - /* If the policy was removed mid-send then default to 'accept' */ - prog = READ_ONCE(psock->bpf_tx_msg); - if (unlikely(!prog)) { - _rc = SK_PASS; - goto verdict; - } - - bpf_compute_data_pointers_sg(md); - md->sk = sk; - rc = (*prog->bpf_func)(md, prog->insnsi); - psock->apply_bytes = md->apply_bytes; - - /* Moving return codes from UAPI namespace into internal namespace */ - _rc = bpf_map_msg_verdict(rc, md); - - /* The psock has a refcount on the sock but not on the map and because - * we need to drop rcu read lock here its possible the map could be - * removed between here and when we need it to execute the sock - * redirect. So do the map lookup now for future use. - */ - if (_rc == __SK_REDIRECT) { - if (psock->sk_redir) - sock_put(psock->sk_redir); - psock->sk_redir = do_msg_redirect_map(md); - if (!psock->sk_redir) { - _rc = __SK_DROP; - goto verdict; - } - sock_hold(psock->sk_redir); - } -verdict: - rcu_read_unlock(); - preempt_enable(); - - return _rc; -} - -static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, - struct smap_psock *psock, - struct sk_msg_buff *md, int flags) -{ - bool apply = apply_bytes; - size_t size, copied = 0; - struct sk_msg_buff *r; - int err = 0, i; - - r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); - if (unlikely(!r)) - return -ENOMEM; - - lock_sock(sk); - r->sg_start = md->sg_start; - i = md->sg_start; - - do { - size = (apply && apply_bytes < md->sg_data[i].length) ? - apply_bytes : md->sg_data[i].length; - - if (!sk_wmem_schedule(sk, size)) { - if (!copied) - err = -ENOMEM; - break; - } - - sk_mem_charge(sk, size); - r->sg_data[i] = md->sg_data[i]; - r->sg_data[i].length = size; - md->sg_data[i].length -= size; - md->sg_data[i].offset += size; - copied += size; - - if (md->sg_data[i].length) { - get_page(sg_page(&r->sg_data[i])); - r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 
0 : i + 1; - } else { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - r->sg_end = i; - } - - if (apply) { - apply_bytes -= size; - if (!apply_bytes) - break; - } - } while (i != md->sg_end); - - md->sg_start = i; - - if (!err) { - list_add_tail(&r->list, &psock->ingress); - sk->sk_data_ready(sk); - } else { - free_start_sg(sk, r, true); - kfree(r); - } - - release_sock(sk); - return err; -} - -static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, - struct sk_msg_buff *md, - int flags) -{ - bool ingress = !!(md->flags & BPF_F_INGRESS); - struct smap_psock *psock; - int err = 0; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out_rcu; - - if (!refcount_inc_not_zero(&psock->refcnt)) - goto out_rcu; - - rcu_read_unlock(); - - if (ingress) { - err = bpf_tcp_ingress(sk, send, psock, md, flags); - } else { - lock_sock(sk); - err = bpf_tcp_push(sk, send, md, flags, false); - release_sock(sk); - } - smap_release_sock(psock, sk); - return err; -out_rcu: - rcu_read_unlock(); - return 0; -} - -static inline void bpf_md_init(struct smap_psock *psock) -{ - if (!psock->apply_bytes) { - psock->eval = __SK_NONE; - if (psock->sk_redir) { - sock_put(psock->sk_redir); - psock->sk_redir = NULL; - } - } -} - -static void apply_bytes_dec(struct smap_psock *psock, int i) -{ - if (psock->apply_bytes) { - if (psock->apply_bytes < i) - psock->apply_bytes = 0; - else - psock->apply_bytes -= i; - } -} - -static int bpf_exec_tx_verdict(struct smap_psock *psock, - struct sk_msg_buff *m, - struct sock *sk, - int *copied, int flags) -{ - bool cork = false, enospc = (m->sg_start == m->sg_end); - struct sock *redir; - int err = 0; - int send; - -more_data: - if (psock->eval == __SK_NONE) - psock->eval = smap_do_tx_msg(sk, psock, m); - - if (m->cork_bytes && - m->cork_bytes > psock->sg_size && !enospc) { - psock->cork_bytes = m->cork_bytes - psock->sg_size; - if (!psock->cork) { - psock->cork = kcalloc(1, - sizeof(struct sk_msg_buff), - GFP_ATOMIC | __GFP_NOWARN); - - if (!psock->cork) { - err = -ENOMEM; - goto out_err; - } - } - memcpy(psock->cork, m, sizeof(*m)); - goto out_err; - } - - send = psock->sg_size; - if (psock->apply_bytes && psock->apply_bytes < send) - send = psock->apply_bytes; - - switch (psock->eval) { - case __SK_PASS: - err = bpf_tcp_push(sk, send, m, flags, true); - if (unlikely(err)) { - *copied -= free_start_sg(sk, m, true); - break; - } - - apply_bytes_dec(psock, send); - psock->sg_size -= send; - break; - case __SK_REDIRECT: - redir = psock->sk_redir; - apply_bytes_dec(psock, send); - - if (psock->cork) { - cork = true; - psock->cork = NULL; - } - - return_mem_sg(sk, send, m); - release_sock(sk); - - err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); - lock_sock(sk); - - if (unlikely(err < 0)) { - int free = free_start_sg(sk, m, false); - - psock->sg_size = 0; - if (!cork) - *copied -= free; - } else { - psock->sg_size -= send; - } - - if (cork) { - free_start_sg(sk, m, true); - psock->sg_size = 0; - kfree(m); - m = NULL; - err = 0; - } - break; - case __SK_DROP: - default: - free_bytes_sg(sk, send, m, true); - apply_bytes_dec(psock, send); - *copied -= send; - psock->sg_size -= send; - err = -EACCES; - break; - } - - if (likely(!err)) { - bpf_md_init(psock); - if (m && - m->sg_data[m->sg_start].page_link && - m->sg_data[m->sg_start].length) - goto more_data; - } - -out_err: - return err; -} - -static int bpf_wait_data(struct sock *sk, - struct smap_psock *psk, int flags, - long timeo, int *err) -{ - int rc; - - DEFINE_WAIT_FUNC(wait, 
woken_wake_function); - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - rc = sk_wait_event(sk, &timeo, - !list_empty(&psk->ingress) || - !skb_queue_empty(&sk->sk_receive_queue), - &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - - return rc; -} - -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len) -{ - struct iov_iter *iter = &msg->msg_iter; - struct smap_psock *psock; - int copied = 0; - - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - - if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) - goto out; - rcu_read_unlock(); - - lock_sock(sk); -bytes_ready: - while (copied != len) { - struct scatterlist *sg; - struct sk_msg_buff *md; - int i; - - md = list_first_entry_or_null(&psock->ingress, - struct sk_msg_buff, list); - if (unlikely(!md)) - break; - i = md->sg_start; - do { - struct page *page; - int n, copy; - - sg = &md->sg_data[i]; - copy = sg->length; - page = sg_page(sg); - - if (copied + copy > len) - copy = len - copied; - - n = copy_page_to_iter(page, sg->offset, copy, iter); - if (n != copy) { - md->sg_start = i; - release_sock(sk); - smap_release_sock(psock, sk); - return -EFAULT; - } - - copied += copy; - sg->offset += copy; - sg->length -= copy; - sk_mem_uncharge(sk, copy); - - if (!sg->length) { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - if (!md->skb) - put_page(page); - } - if (copied == len) - break; - } while (i != md->sg_end); - md->sg_start = i; - - if (!sg->length && md->sg_start == md->sg_end) { - list_del(&md->list); - consume_skb(md->skb); - kfree(md); - } - } - - if (!copied) { - long timeo; - int data; - int err = 0; - - timeo = sock_rcvtimeo(sk, nonblock); - data = bpf_wait_data(sk, psock, flags, timeo, &err); - - if (data) { - if (!skb_queue_empty(&sk->sk_receive_queue)) { - release_sock(sk); - smap_release_sock(psock, sk); - copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - return copied; - } - goto bytes_ready; - } - - if (err) - copied = err; - } - - release_sock(sk); - smap_release_sock(psock, sk); - return copied; -out: - rcu_read_unlock(); - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); -} - - -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) -{ - int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; - struct sk_msg_buff md = {0}; - unsigned int sg_copy = 0; - struct smap_psock *psock; - int copied = 0, err = 0; - struct scatterlist *sg; - long timeo; - - /* Its possible a sock event or user removed the psock _but_ the ops - * have not been reprogrammed yet so we get here. In this case fallback - * to tcp_sendmsg. Note this only works because we _only_ ever allow - * a single ULP there is no hierarchy here. - */ - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return tcp_sendmsg(sk, msg, size); - } - - /* Increment the psock refcnt to ensure its not released while sending a - * message. Required because sk lookup and bpf programs are used in - * separate rcu critical sections. Its OK if we lose the map entry - * but we can't lose the sock reference. 
- */ - if (!refcount_inc_not_zero(&psock->refcnt)) { - rcu_read_unlock(); - return tcp_sendmsg(sk, msg, size); - } - - sg = md.sg_data; - sg_init_marker(sg, MAX_SKB_FRAGS); - rcu_read_unlock(); - - lock_sock(sk); - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - - while (msg_data_left(msg)) { - struct sk_msg_buff *m = NULL; - bool enospc = false; - int copy; - - if (sk->sk_err) { - err = -sk->sk_err; - goto out_err; - } - - copy = msg_data_left(msg); - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; - - m = psock->cork_bytes ? psock->cork : &md; - m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; - err = sk_alloc_sg(sk, copy, m->sg_data, - m->sg_start, &m->sg_end, &sg_copy, - m->sg_end - 1); - if (err) { - if (err != -ENOSPC) - goto wait_for_memory; - enospc = true; - copy = sg_copy; - } - - err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); - if (err < 0) { - free_curr_sg(sk, m); - goto out_err; - } - - psock->sg_size += copy; - copied += copy; - sg_copy = 0; - - /* When bytes are being corked skip running BPF program and - * applying verdict unless there is no more buffer space. In - * the ENOSPC case simply run BPF prorgram with currently - * accumulated data. We don't have much choice at this point - * we could try extending the page frags or chaining complex - * frags but even in these cases _eventually_ we will hit an - * OOM scenario. More complex recovery schemes may be - * implemented in the future, but BPF programs must handle - * the case where apply_cork requests are not honored. The - * canonical method to verify this is to check data length. - */ - if (psock->cork_bytes) { - if (copy > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= copy; - - if (psock->cork_bytes && !enospc) - goto out_cork; - - /* All cork bytes accounted for re-run filter */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); - if (unlikely(err < 0)) - goto out_err; - continue; -wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: - err = sk_stream_wait_memory(sk, &timeo); - if (err) { - if (m && m != psock->cork) - free_start_sg(sk, m, true); - goto out_err; - } - } -out_err: - if (err < 0) - err = sk_stream_error(sk, msg->msg_flags, err); -out_cork: - release_sock(sk); - smap_release_sock(psock, sk); - return copied ? copied : err; -} - -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct sk_msg_buff md = {0}, *m = NULL; - int err = 0, copied = 0; - struct smap_psock *psock; - struct scatterlist *sg; - bool enospc = false; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto accept; - - if (!refcount_inc_not_zero(&psock->refcnt)) - goto accept; - rcu_read_unlock(); - - lock_sock(sk); - - if (psock->cork_bytes) { - m = psock->cork; - sg = &m->sg_data[m->sg_end]; - } else { - m = &md; - sg = m->sg_data; - sg_init_marker(sg, MAX_SKB_FRAGS); - } - - /* Catch case where ring is full and sendpage is stalled. 
*/ - if (unlikely(m->sg_end == m->sg_start && - m->sg_data[m->sg_end].length)) - goto out_err; - - psock->sg_size += size; - sg_set_page(sg, page, size, offset); - get_page(page); - m->sg_copy[m->sg_end] = true; - sk_mem_charge(sk, size); - m->sg_end++; - copied = size; - - if (m->sg_end == MAX_SKB_FRAGS) - m->sg_end = 0; - - if (m->sg_end == m->sg_start) - enospc = true; - - if (psock->cork_bytes) { - if (size > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= size; - - if (psock->cork_bytes && !enospc) - goto out_err; - - /* All cork bytes accounted for re-run filter */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); -out_err: - release_sock(sk); - smap_release_sock(psock, sk); - return copied ? copied : err; -accept: - rcu_read_unlock(); - return tcp_sendpage(sk, page, offset, size, flags); -} - -static void bpf_tcp_msg_add(struct smap_psock *psock, - struct sock *sk, - struct bpf_prog *tx_msg) -{ - struct bpf_prog *orig_tx_msg; - - orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); - if (orig_tx_msg) - bpf_prog_put(orig_tx_msg); -} - -static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) -{ - struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); - int rc; - - if (unlikely(!prog)) - return __SK_DROP; - - skb_orphan(skb); - /* We need to ensure that BPF metadata for maps is also cleared - * when we orphan the skb so that we don't have the possibility - * to reference a stale map. - */ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; - skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); - preempt_disable(); - rc = (*prog->bpf_func)(skb, prog->insnsi); - preempt_enable(); - skb->sk = NULL; - - /* Moving return codes from UAPI namespace into internal namespace */ - return rc == SK_PASS ? - (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) : - __SK_DROP; -} - -static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb) -{ - struct sock *sk = psock->sock; - int copied = 0, num_sg; - struct sk_msg_buff *r; - - r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC); - if (unlikely(!r)) - return -EAGAIN; - - if (!sk_rmem_schedule(sk, skb, skb->len)) { - kfree(r); - return -EAGAIN; - } - - sg_init_table(r->sg_data, MAX_SKB_FRAGS); - num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len); - if (unlikely(num_sg < 0)) { - kfree(r); - return num_sg; - } - sk_mem_charge(sk, skb->len); - copied = skb->len; - r->sg_start = 0; - r->sg_end = num_sg == MAX_SKB_FRAGS ? 
0 : num_sg; - r->skb = skb; - list_add_tail(&r->list, &psock->ingress); - sk->sk_data_ready(sk); - return copied; -} - -static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) -{ - struct smap_psock *peer; - struct sock *sk; - __u32 in; - int rc; - - rc = smap_verdict_func(psock, skb); - switch (rc) { - case __SK_REDIRECT: - sk = do_sk_redirect_map(skb); - if (!sk) { - kfree_skb(skb); - break; - } - - peer = smap_psock_sk(sk); - in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; - - if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) || - !test_bit(SMAP_TX_RUNNING, &peer->state))) { - kfree_skb(skb); - break; - } - - if (!in && sock_writeable(sk)) { - skb_set_owner_w(skb, sk); - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } else if (in && - atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } - /* Fall through and free skb otherwise */ - case __SK_DROP: - default: - kfree_skb(skb); - } -} - -static void smap_report_sk_error(struct smap_psock *psock, int err) -{ - struct sock *sk = psock->sock; - - sk->sk_err = err; - sk->sk_error_report(sk); -} - -static void smap_read_sock_strparser(struct strparser *strp, - struct sk_buff *skb) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = container_of(strp, struct smap_psock, strp); - smap_do_verdict(psock, skb); - rcu_read_unlock(); -} - -/* Called with lock held on socket */ -static void smap_data_ready(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->strp); - write_unlock_bh(&sk->sk_callback_lock); - } - rcu_read_unlock(); -} - -static void smap_tx_work(struct work_struct *w) -{ - struct smap_psock *psock; - struct sk_buff *skb; - int rem, off, n; - - psock = container_of(w, struct smap_psock, tx_work); - - /* lock sock to avoid losing sk_socket at some point during loop */ - lock_sock(psock->sock); - if (psock->save_skb) { - skb = psock->save_skb; - rem = psock->save_rem; - off = psock->save_off; - psock->save_skb = NULL; - goto start; - } - - while ((skb = skb_dequeue(&psock->rxqueue))) { - __u32 flags; - - rem = skb->len; - off = 0; -start: - flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; - do { - if (likely(psock->sock->sk_socket)) { - if (flags) - n = smap_do_ingress(psock, skb); - else - n = skb_send_sock_locked(psock->sock, - skb, off, rem); - } else { - n = -EINVAL; - } - - if (n <= 0) { - if (n == -EAGAIN) { - /* Retry when space is available */ - psock->save_skb = skb; - psock->save_rem = rem; - psock->save_off = off; - goto out; - } - /* Hard errors break pipe and stop xmit */ - smap_report_sk_error(psock, n ? 
-n : EPIPE);
- clear_bit(SMAP_TX_RUNNING, &psock->state);
- kfree_skb(skb);
- goto out;
- }
- rem -= n;
- off += n;
- } while (rem);
-
- if (!flags)
- kfree_skb(skb);
- }
-out:
- release_sock(psock->sock);
-}
-
-static void smap_write_space(struct sock *sk)
-{
- struct smap_psock *psock;
- void (*write_space)(struct sock *sk);
-
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
- schedule_work(&psock->tx_work);
- write_space = psock->save_write_space;
- rcu_read_unlock();
- write_space(sk);
-}
-
-static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
-{
- if (!psock->strp_enabled)
- return;
- sk->sk_data_ready = psock->save_data_ready;
- sk->sk_write_space = psock->save_write_space;
- psock->save_data_ready = NULL;
- psock->save_write_space = NULL;
- strp_stop(&psock->strp);
- psock->strp_enabled = false;
-}
-
-static void smap_destroy_psock(struct rcu_head *rcu)
-{
- struct smap_psock *psock = container_of(rcu,
- struct smap_psock, rcu);
-
- /* Now that a grace period has passed there is no longer
- * any reference to this sock in the sockmap so we can
- * destroy the psock, strparser, and bpf programs. But,
- * because we use workqueue sync operations we cannot
- * do it in RCU context.
- */
- schedule_work(&psock->gc_work);
-}
-
-static bool psock_is_smap_sk(struct sock *sk)
-{
- return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops;
-}
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
-{
- if (refcount_dec_and_test(&psock->refcnt)) {
- if (psock_is_smap_sk(sock))
- bpf_tcp_release(sock);
- write_lock_bh(&sock->sk_callback_lock);
- smap_stop_sock(psock, sock);
- write_unlock_bh(&sock->sk_callback_lock);
- clear_bit(SMAP_TX_RUNNING, &psock->state);
- rcu_assign_sk_user_data(sock, NULL);
- call_rcu_sched(&psock->rcu, smap_destroy_psock);
- }
-}
-
-static int smap_parse_func_strparser(struct strparser *strp,
- struct sk_buff *skb)
-{
- struct smap_psock *psock;
- struct bpf_prog *prog;
- int rc;
-
- rcu_read_lock();
- psock = container_of(strp, struct smap_psock, strp);
- prog = READ_ONCE(psock->bpf_parse);
-
- if (unlikely(!prog)) {
- rcu_read_unlock();
- return skb->len;
- }
-
- /* Attach the socket for the bpf program to use if needed. We can
- * do this because strparser clones the skb before handing it to an
- * upper layer, meaning skb_orphan has been called. We NULL sk on the
- * way out to ensure we don't trigger a BUG_ON in skb/sk operations
- * later and because we are not charging the memory of this skb to
- * any socket yet.
- */ - skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); - rc = (*prog->bpf_func)(skb, prog->insnsi); - skb->sk = NULL; - rcu_read_unlock(); - return rc; -} - -static int smap_read_sock_done(struct strparser *strp, int err) -{ - return err; -} - -static int smap_init_sock(struct smap_psock *psock, - struct sock *sk) -{ - static const struct strp_callbacks cb = { - .rcv_msg = smap_read_sock_strparser, - .parse_msg = smap_parse_func_strparser, - .read_sock_done = smap_read_sock_done, - }; - - return strp_init(&psock->strp, sk, &cb); -} - -static void smap_init_progs(struct smap_psock *psock, - struct bpf_prog *verdict, - struct bpf_prog *parse) -{ - struct bpf_prog *orig_parse, *orig_verdict; - - orig_parse = xchg(&psock->bpf_parse, parse); - orig_verdict = xchg(&psock->bpf_verdict, verdict); - - if (orig_verdict) - bpf_prog_put(orig_verdict); - if (orig_parse) - bpf_prog_put(orig_parse); -} - -static void smap_start_sock(struct smap_psock *psock, struct sock *sk) -{ - if (sk->sk_data_ready == smap_data_ready) - return; - psock->save_data_ready = sk->sk_data_ready; - psock->save_write_space = sk->sk_write_space; - sk->sk_data_ready = smap_data_ready; - sk->sk_write_space = smap_write_space; - psock->strp_enabled = true; -} - -static void sock_map_remove_complete(struct bpf_stab *stab) -{ - bpf_map_area_free(stab->sock_map); - kfree(stab); -} - -static void smap_gc_work(struct work_struct *w) -{ - struct smap_psock_map_entry *e, *tmp; - struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; - - psock = container_of(w, struct smap_psock, gc_work); - - /* no callback lock needed because we already detached sockmap ops */ - if (psock->strp_enabled) - strp_done(&psock->strp); - - cancel_work_sync(&psock->tx_work); - __skb_queue_purge(&psock->rxqueue); - - /* At this point all strparser and xmit work must be complete */ - if (psock->bpf_parse) - bpf_prog_put(psock->bpf_parse); - if (psock->bpf_verdict) - bpf_prog_put(psock->bpf_verdict); - if (psock->bpf_tx_msg) - bpf_prog_put(psock->bpf_tx_msg); - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - } - - list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { - list_del(&md->list); - free_start_sg(psock->sock, md, true); - kfree(md); - } - - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - list_del(&e->list); - kfree(e); - } - - if (psock->sk_redir) - sock_put(psock->sk_redir); - - sock_put(psock->sock); - kfree(psock); -} - -static struct smap_psock *smap_init_psock(struct sock *sock, int node) -{ - struct smap_psock *psock; - - psock = kzalloc_node(sizeof(struct smap_psock), - GFP_ATOMIC | __GFP_NOWARN, - node); - if (!psock) - return ERR_PTR(-ENOMEM); - - psock->eval = __SK_NONE; - psock->sock = sock; - skb_queue_head_init(&psock->rxqueue); - INIT_WORK(&psock->tx_work, smap_tx_work); - INIT_WORK(&psock->gc_work, smap_gc_work); - INIT_LIST_HEAD(&psock->maps); - INIT_LIST_HEAD(&psock->ingress); - refcount_set(&psock->refcnt, 1); - spin_lock_init(&psock->maps_lock); - - rcu_assign_sk_user_data(sock, psock); - sock_hold(sock); - return psock; -} - -static struct bpf_map *sock_map_alloc(union bpf_attr *attr) -{ - struct bpf_stab *stab; - u64 cost; - int err; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - /* check sanity of attributes */ - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); - - stab = kzalloc(sizeof(*stab), GFP_USER); - if (!stab) - return 
ERR_PTR(-ENOMEM);
-
- bpf_map_init_from_attr(&stab->map, attr);
- raw_spin_lock_init(&stab->lock);
-
- /* make sure page count doesn't overflow */
- cost = (u64) stab->map.max_entries * sizeof(struct sock *);
- err = -EINVAL;
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_stab;
-
- stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
- /* if map size is larger than memlock limit, reject it early */
- err = bpf_map_precharge_memlock(stab->map.pages);
- if (err)
- goto free_stab;
-
- err = -ENOMEM;
- stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
- sizeof(struct sock *),
- stab->map.numa_node);
- if (!stab->sock_map)
- goto free_stab;
-
- return &stab->map;
-free_stab:
- kfree(stab);
- return ERR_PTR(err);
-}
-
-static void smap_list_map_remove(struct smap_psock *psock,
- struct sock **entry)
-{
- struct smap_psock_map_entry *e, *tmp;
-
- spin_lock_bh(&psock->maps_lock);
- list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- if (e->entry == entry) {
- list_del(&e->list);
- kfree(e);
- }
- }
- spin_unlock_bh(&psock->maps_lock);
-}
-
-static void smap_list_hash_remove(struct smap_psock *psock,
- struct htab_elem *hash_link)
-{
- struct smap_psock_map_entry *e, *tmp;
-
- spin_lock_bh(&psock->maps_lock);
- list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- struct htab_elem *c = rcu_dereference(e->hash_link);
-
- if (c == hash_link) {
- list_del(&e->list);
- kfree(e);
- }
- }
- spin_unlock_bh(&psock->maps_lock);
-}
-
-static void sock_map_free(struct bpf_map *map)
-{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
- int i;
-
- synchronize_rcu();
-
- /* At this point no update, lookup or delete operations can happen.
- * However, be aware we can still get socket state event updates
- * and data ready callbacks that reference the psock from sk_user_data.
- * Also, psock worker threads are still in-flight. So smap_release_sock
- * will only free the psock after cancel_sync on the worker threads
- * and a grace period expires to ensure the psock is really safe to remove.
- */
- rcu_read_lock();
- raw_spin_lock_bh(&stab->lock);
- for (i = 0; i < stab->map.max_entries; i++) {
- struct smap_psock *psock;
- struct sock *sock;
-
- sock = stab->sock_map[i];
- if (!sock)
- continue;
- stab->sock_map[i] = NULL;
- psock = smap_psock_sk(sock);
- /* This check handles a racing sock event that can get the
- * sk_callback_lock before this case but after xchg happens
- * causing the refcnt to hit zero and sock user data (psock)
- * to be null and queued for garbage collection.
- */
- if (likely(psock)) {
- smap_list_map_remove(psock, &stab->sock_map[i]);
- smap_release_sock(psock, sock);
- }
- }
- raw_spin_unlock_bh(&stab->lock);
- rcu_read_unlock();
-
- sock_map_remove_complete(stab);
-}
-
-static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
-{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
- u32 i = key ?
*(u32 *)key : U32_MAX;
- u32 *next = (u32 *)next_key;
-
- if (i >= stab->map.max_entries) {
- *next = 0;
- return 0;
- }
-
- if (i == stab->map.max_entries - 1)
- return -ENOENT;
-
- *next = i + 1;
- return 0;
-}
-
-struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
-{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
- if (key >= map->max_entries)
- return NULL;
-
- return READ_ONCE(stab->sock_map[key]);
-}
-
-static int sock_map_delete_elem(struct bpf_map *map, void *key)
-{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
- struct smap_psock *psock;
- int k = *(u32 *)key;
- struct sock *sock;
-
- if (k >= map->max_entries)
- return -EINVAL;
-
- raw_spin_lock_bh(&stab->lock);
- sock = stab->sock_map[k];
- stab->sock_map[k] = NULL;
- raw_spin_unlock_bh(&stab->lock);
- if (!sock)
- return -EINVAL;
-
- psock = smap_psock_sk(sock);
- if (!psock)
- return 0;
- if (psock->bpf_parse) {
- write_lock_bh(&sock->sk_callback_lock);
- smap_stop_sock(psock, sock);
- write_unlock_bh(&sock->sk_callback_lock);
- }
- smap_list_map_remove(psock, &stab->sock_map[k]);
- smap_release_sock(psock, sock);
- return 0;
-}
-
-/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
- * done inside rcu critical sections. This ensures on updates that the psock
- * will not be released via smap_release_sock() until concurrent updates/deletes
- * complete. All operations operate on sock_map using cmpxchg and xchg
- * operations to ensure we do not get stale references. Any reads into the
- * map must be done with READ_ONCE() because of this.
- *
- * A psock is destroyed via call_rcu and after any worker threads are cancelled
- * and synced, so we are certain all references from the update/lookup/delete
- * operations as well as references in the data path are no longer in use.
- *
- * Psocks may exist in multiple maps, but only a single set of parse/verdict
- * programs may be inherited from the maps it belongs to. A reference count
- * is kept with the total number of references to the psock from all maps. The
- * psock will not be released until this reaches zero. The psock and sock
- * user data use the sk_callback_lock to protect critical data structures
- * from concurrent access. This allows us to avoid two updates from modifying
- * the user data in sock, and the lock is required anyway for modifying
- * callbacks; we simply increase its scope slightly.
- *
- * Rules to follow:
- * - psock must always be read inside RCU critical section
- * - sk_user_data must only be modified inside sk_callback_lock and read
- * inside RCU critical section.
- * - psock->maps list must only be read & modified inside sk_callback_lock
- * - sock_map must use READ_ONCE and (cmp)xchg operations
- * - BPF verdict/parse programs must use READ_ONCE and xchg operations
- */
-
-static int __sock_map_ctx_update_elem(struct bpf_map *map,
- struct bpf_sock_progs *progs,
- struct sock *sock,
- void *key)
-{
- struct bpf_prog *verdict, *parse, *tx_msg;
- struct smap_psock *psock;
- bool new = false;
- int err = 0;
-
- /* 1. If the sock map has BPF programs, those will be inherited by the
- * sock being added. If the sock is already attached to BPF programs,
- * this results in an error.
- */ - verdict = READ_ONCE(progs->bpf_verdict); - parse = READ_ONCE(progs->bpf_parse); - tx_msg = READ_ONCE(progs->bpf_tx_msg); - - if (parse && verdict) { - /* bpf prog refcnt may be zero if a concurrent attach operation - * removes the program after the above READ_ONCE() but before - * we increment the refcnt. If this is the case abort with an - * error. - */ - verdict = bpf_prog_inc_not_zero(verdict); - if (IS_ERR(verdict)) - return PTR_ERR(verdict); - - parse = bpf_prog_inc_not_zero(parse); - if (IS_ERR(parse)) { - bpf_prog_put(verdict); - return PTR_ERR(parse); - } - } - - if (tx_msg) { - tx_msg = bpf_prog_inc_not_zero(tx_msg); - if (IS_ERR(tx_msg)) { - if (parse && verdict) { - bpf_prog_put(parse); - bpf_prog_put(verdict); - } - return PTR_ERR(tx_msg); - } - } - - psock = smap_psock_sk(sock); - - /* 2. Do not allow inheriting programs if psock exists and has - * already inherited programs. This would create confusion on - * which parser/verdict program is running. If no psock exists - * create one. Inside sk_callback_lock to ensure concurrent create - * doesn't update user data. - */ - if (psock) { - if (!psock_is_smap_sk(sock)) { - err = -EBUSY; - goto out_progs; - } - if (READ_ONCE(psock->bpf_parse) && parse) { - err = -EBUSY; - goto out_progs; - } - if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { - err = -EBUSY; - goto out_progs; - } - if (!refcount_inc_not_zero(&psock->refcnt)) { - err = -EAGAIN; - goto out_progs; - } - } else { - psock = smap_init_psock(sock, map->numa_node); - if (IS_ERR(psock)) { - err = PTR_ERR(psock); - goto out_progs; - } - - set_bit(SMAP_TX_RUNNING, &psock->state); - new = true; - } - - /* 3. At this point we have a reference to a valid psock that is - * running. Attach any BPF programs needed. - */ - if (tx_msg) - bpf_tcp_msg_add(psock, sock, tx_msg); - if (new) { - err = bpf_tcp_init(sock); - if (err) - goto out_free; - } - - if (parse && verdict && !psock->strp_enabled) { - err = smap_init_sock(psock, sock); - if (err) - goto out_free; - smap_init_progs(psock, verdict, parse); - write_lock_bh(&sock->sk_callback_lock); - smap_start_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - } - - return err; -out_free: - smap_release_sock(psock, sock); -out_progs: - if (parse && verdict) { - bpf_prog_put(parse); - bpf_prog_put(verdict); - } - if (tx_msg) - bpf_prog_put(tx_msg); - return err; -} - -static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 flags) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct bpf_sock_progs *progs = &stab->progs; - struct sock *osock, *sock = skops->sk; - struct smap_psock_map_entry *e; - struct smap_psock *psock; - u32 i = *(u32 *)key; - int err; - - if (unlikely(flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(i >= stab->map.max_entries)) - return -E2BIG; - - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) - return -ENOMEM; - - err = __sock_map_ctx_update_elem(map, progs, sock, key); - if (err) - goto out; - - /* psock guaranteed to be present. 
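- * __sock_map_ctx_update_elem() above either took a reference on an
- * existing psock or created a new one, so smap_psock_sk() cannot
- * return NULL here.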
*/ - psock = smap_psock_sk(sock); - raw_spin_lock_bh(&stab->lock); - osock = stab->sock_map[i]; - if (osock && flags == BPF_NOEXIST) { - err = -EEXIST; - goto out_unlock; - } - if (!osock && flags == BPF_EXIST) { - err = -ENOENT; - goto out_unlock; - } - - e->entry = &stab->sock_map[i]; - e->map = map; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - - stab->sock_map[i] = sock; - if (osock) { - psock = smap_psock_sk(osock); - smap_list_map_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, osock); - } - raw_spin_unlock_bh(&stab->lock); - return 0; -out_unlock: - smap_release_sock(psock, sock); - raw_spin_unlock_bh(&stab->lock); -out: - kfree(e); - return err; -} - -int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) -{ - struct bpf_sock_progs *progs; - struct bpf_prog *orig; - - if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - progs = &stab->progs; - } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - - progs = &htab->progs; - } else { - return -EINVAL; - } - - switch (type) { - case BPF_SK_MSG_VERDICT: - orig = xchg(&progs->bpf_tx_msg, prog); - break; - case BPF_SK_SKB_STREAM_PARSER: - orig = xchg(&progs->bpf_parse, prog); - break; - case BPF_SK_SKB_STREAM_VERDICT: - orig = xchg(&progs->bpf_verdict, prog); - break; - default: - return -EOPNOTSUPP; - } - - if (orig) - bpf_prog_put(orig); - - return 0; -} - -int sockmap_get_from_fd(const union bpf_attr *attr, int type, - struct bpf_prog *prog) -{ - int ufd = attr->target_fd; - struct bpf_map *map; - struct fd f; - int err; - - f = fdget(ufd); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return PTR_ERR(map); - - err = sock_map_prog(map, prog, attr->attach_type); - fdput(f); - return err; -} - -static void *sock_map_lookup(struct bpf_map *map, void *key) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static int sock_map_update_elem(struct bpf_map *map, - void *key, void *value, u64 flags) -{ - struct bpf_sock_ops_kern skops; - u32 fd = *(u32 *)value; - struct socket *socket; - int err; - - socket = sockfd_lookup(fd, &err); - if (!socket) - return err; - - skops.sk = socket->sk; - if (!skops.sk) { - fput(socket->file); - return -EINVAL; - } - - /* ULPs are currently supported only for TCP sockets in ESTABLISHED - * state. 
- */
- if (skops.sk->sk_type != SOCK_STREAM ||
- skops.sk->sk_protocol != IPPROTO_TCP ||
- skops.sk->sk_state != TCP_ESTABLISHED) {
- fput(socket->file);
- return -EOPNOTSUPP;
- }
-
- lock_sock(skops.sk);
- preempt_disable();
- rcu_read_lock();
- err = sock_map_ctx_update_elem(&skops, map, key, flags);
- rcu_read_unlock();
- preempt_enable();
- release_sock(skops.sk);
- fput(socket->file);
- return err;
-}
-
-static void sock_map_release(struct bpf_map *map)
-{
- struct bpf_sock_progs *progs;
- struct bpf_prog *orig;
-
- if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
- progs = &stab->progs;
- } else {
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-
- progs = &htab->progs;
- }
-
- orig = xchg(&progs->bpf_parse, NULL);
- if (orig)
- bpf_prog_put(orig);
- orig = xchg(&progs->bpf_verdict, NULL);
- if (orig)
- bpf_prog_put(orig);
-
- orig = xchg(&progs->bpf_tx_msg, NULL);
- if (orig)
- bpf_prog_put(orig);
-}
-
-static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
-{
- struct bpf_htab *htab;
- int i, err;
- u64 cost;
-
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
-
- /* check sanity of attributes */
- if (attr->max_entries == 0 ||
- attr->key_size == 0 ||
- attr->value_size != 4 ||
- attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
- return ERR_PTR(-EINVAL);
-
- if (attr->key_size > MAX_BPF_STACK)
- /* eBPF programs initialize keys on stack, so they cannot be
- * larger than max stack size
- */
- return ERR_PTR(-E2BIG);
-
- htab = kzalloc(sizeof(*htab), GFP_USER);
- if (!htab)
- return ERR_PTR(-ENOMEM);
-
- bpf_map_init_from_attr(&htab->map, attr);
-
- htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
- htab->elem_size = sizeof(struct htab_elem) +
- round_up(htab->map.key_size, 8);
- err = -EINVAL;
- if (htab->n_buckets == 0 ||
- htab->n_buckets > U32_MAX / sizeof(struct bucket))
- goto free_htab;
-
- cost = (u64) htab->n_buckets * sizeof(struct bucket) +
- (u64) htab->elem_size * htab->map.max_entries;
-
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_htab;
-
- htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- err = bpf_map_precharge_memlock(htab->map.pages);
- if (err)
- goto free_htab;
-
- err = -ENOMEM;
- htab->buckets = bpf_map_area_alloc(
- htab->n_buckets * sizeof(struct bucket),
- htab->map.numa_node);
- if (!htab->buckets)
- goto free_htab;
-
- for (i = 0; i < htab->n_buckets; i++) {
- INIT_HLIST_HEAD(&htab->buckets[i].head);
- raw_spin_lock_init(&htab->buckets[i].lock);
- }
-
- return &htab->map;
-free_htab:
- kfree(htab);
- return ERR_PTR(err);
-}
-
-static void __bpf_htab_free(struct rcu_head *rcu)
-{
- struct bpf_htab *htab;
-
- htab = container_of(rcu, struct bpf_htab, rcu);
- bpf_map_area_free(htab->buckets);
- kfree(htab);
-}
-
-static void sock_hash_free(struct bpf_map *map)
-{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- int i;
-
- synchronize_rcu();
-
- /* At this point no update, lookup or delete operations can happen.
- * However, be aware we can still get socket state event updates
- * and data ready callbacks that reference the psock from sk_user_data.
- * Also, psock worker threads are still in-flight. So smap_release_sock
- * will only free the psock after cancel_sync on the worker threads
- * and a grace period expires to ensure the psock is really safe to remove.
- */ - rcu_read_lock(); - for (i = 0; i < htab->n_buckets; i++) { - struct bucket *b = __select_bucket(htab, i); - struct hlist_head *head; - struct hlist_node *n; - struct htab_elem *l; - - raw_spin_lock_bh(&b->lock); - head = &b->head; - hlist_for_each_entry_safe(l, n, head, hash_node) { - struct sock *sock = l->sk; - struct smap_psock *psock; - - hlist_del_rcu(&l->hash_node); - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get - * the sk_callback_lock before this case but after xchg - * causing the refcnt to hit zero and sock user data - * (psock) to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_hash_remove(psock, l); - smap_release_sock(psock, sock); - } - free_htab_elem(htab, l); - } - raw_spin_unlock_bh(&b->lock); - } - rcu_read_unlock(); - call_rcu(&htab->rcu, __bpf_htab_free); -} - -static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, - void *key, u32 key_size, u32 hash, - struct sock *sk, - struct htab_elem *old_elem) -{ - struct htab_elem *l_new; - - if (atomic_inc_return(&htab->count) > htab->map.max_entries) { - if (!old_elem) { - atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); - } - } - l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); - if (!l_new) { - atomic_dec(&htab->count); - return ERR_PTR(-ENOMEM); - } - - memcpy(l_new->key, key, key_size); - l_new->sk = sk; - l_new->hash = hash; - return l_new; -} - -static inline u32 htab_map_hash(const void *key, u32 key_len) -{ - return jhash(key, key_len, 0); -} - -static int sock_hash_get_next_key(struct bpf_map *map, - void *key, void *next_key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l, *next_l; - struct hlist_head *h; - u32 hash, key_size; - int i = 0; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - key_size = map->key_size; - if (!key) - goto find_first_elem; - hash = htab_map_hash(key, key_size); - h = select_bucket(htab, hash); - - l = lookup_elem_raw(h, hash, key, key_size); - if (!l) - goto find_first_elem; - next_l = hlist_entry_safe( - rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), - struct htab_elem, hash_node); - if (next_l) { - memcpy(next_key, next_l->key, key_size); - return 0; - } - - /* no more elements in this hash list, go to the next bucket */ - i = hash & (htab->n_buckets - 1); - i++; - -find_first_elem: - /* iterate over buckets */ - for (; i < htab->n_buckets; i++) { - h = select_bucket(htab, i); - - /* pick first element in the bucket */ - next_l = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(h)), - struct htab_elem, hash_node); - if (next_l) { - /* if it's not empty, just return it */ - memcpy(next_key, next_l->key, key_size); - return 0; - } - } - - /* iterated over all buckets and all elements */ - return -ENOENT; -} - -static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 map_flags) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_sock_progs *progs = &htab->progs; - struct htab_elem *l_new = NULL, *l_old; - struct smap_psock_map_entry *e = NULL; - struct hlist_head *head; - struct smap_psock *psock; - u32 key_size, hash; - struct sock *sock; - struct bucket *b; - int err; - - sock = skops->sk; - - if (sock->sk_type != SOCK_STREAM || - sock->sk_protocol != IPPROTO_TCP) - return -EOPNOTSUPP; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) - return 
-ENOMEM; - - WARN_ON_ONCE(!rcu_read_lock_held()); - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - err = __sock_map_ctx_update_elem(map, progs, sock, key); - if (err) - goto err; - - /* psock is valid here because otherwise above *ctx_update_elem would - * have thrown an error. It is safe to skip error check. - */ - psock = smap_psock_sk(sock); - raw_spin_lock_bh(&b->lock); - l_old = lookup_elem_raw(head, hash, key, key_size); - if (l_old && map_flags == BPF_NOEXIST) { - err = -EEXIST; - goto bucket_err; - } - if (!l_old && map_flags == BPF_EXIST) { - err = -ENOENT; - goto bucket_err; - } - - l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old); - if (IS_ERR(l_new)) { - err = PTR_ERR(l_new); - goto bucket_err; - } - - rcu_assign_pointer(e->hash_link, l_new); - e->map = map; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - - /* add new element to the head of the list, so that - * concurrent search will find it before old elem - */ - hlist_add_head_rcu(&l_new->hash_node, head); - if (l_old) { - psock = smap_psock_sk(l_old->sk); - - hlist_del_rcu(&l_old->hash_node); - smap_list_hash_remove(psock, l_old); - smap_release_sock(psock, l_old->sk); - free_htab_elem(htab, l_old); - } - raw_spin_unlock_bh(&b->lock); - return 0; -bucket_err: - smap_release_sock(psock, sock); - raw_spin_unlock_bh(&b->lock); -err: - kfree(e); - return err; -} - -static int sock_hash_update_elem(struct bpf_map *map, - void *key, void *value, u64 flags) -{ - struct bpf_sock_ops_kern skops; - u32 fd = *(u32 *)value; - struct socket *socket; - int err; - - socket = sockfd_lookup(fd, &err); - if (!socket) - return err; - - skops.sk = socket->sk; - if (!skops.sk) { - fput(socket->file); - return -EINVAL; - } - - /* ULPs are currently supported only for TCP sockets in ESTABLISHED - * state. - */ - if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP || - skops.sk->sk_state != TCP_ESTABLISHED) { - fput(socket->file); - return -EOPNOTSUPP; - } - - lock_sock(skops.sk); - preempt_disable(); - rcu_read_lock(); - err = sock_hash_ctx_update_elem(&skops, map, key, flags); - rcu_read_unlock(); - preempt_enable(); - release_sock(skops.sk); - fput(socket->file); - return err; -} - -static int sock_hash_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; - struct bucket *b; - struct htab_elem *l; - u32 hash, key_size; - int ret = -ENOENT; - - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - raw_spin_lock_bh(&b->lock); - l = lookup_elem_raw(head, hash, key, key_size); - if (l) { - struct sock *sock = l->sk; - struct smap_psock *psock; - - hlist_del_rcu(&l->hash_node); - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get the - * sk_callback_lock before this case but after xchg happens - * causing the refcnt to hit zero and sock user data (psock) - * to be null and queued for garbage collection. 
- */ - if (likely(psock)) { - smap_list_hash_remove(psock, l); - smap_release_sock(psock, sock); - } - free_htab_elem(htab, l); - ret = 0; - } - raw_spin_unlock_bh(&b->lock); - return ret; -} - -struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; - struct htab_elem *l; - u32 key_size, hash; - struct bucket *b; - struct sock *sk; - - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - l = lookup_elem_raw(head, hash, key, key_size); - sk = l ? l->sk : NULL; - return sk; -} - -const struct bpf_map_ops sock_map_ops = { - .map_alloc = sock_map_alloc, - .map_free = sock_map_free, - .map_lookup_elem = sock_map_lookup, - .map_get_next_key = sock_map_get_next_key, - .map_update_elem = sock_map_update_elem, - .map_delete_elem = sock_map_delete_elem, - .map_release_uref = sock_map_release, - .map_check_btf = map_check_no_btf, -}; - -const struct bpf_map_ops sock_hash_ops = { - .map_alloc = sock_hash_alloc, - .map_free = sock_hash_free, - .map_lookup_elem = sock_map_lookup, - .map_get_next_key = sock_hash_get_next_key, - .map_update_elem = sock_hash_update_elem, - .map_delete_elem = sock_hash_delete_elem, - .map_release_uref = sock_map_release, - .map_check_btf = map_check_no_btf, -}; - -static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops) -{ - return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || - ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; -} -BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, void *, key, u64, flags) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* ULPs are currently supported only for TCP sockets in ESTABLISHED - * state. This checks that the sock ops triggering the update is - * one indicating we are (or will be soon) in an ESTABLISHED state. 
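- * The two ops that qualify are BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB
- * and BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, see bpf_is_valid_sock_op()
- * above.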
- */
- if (!bpf_is_valid_sock_op(bpf_sock))
- return -EOPNOTSUPP;
- return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
-}
-
-const struct bpf_func_proto bpf_sock_map_update_proto = {
- .func = bpf_sock_map_update,
- .gpl_only = false,
- .pkt_access = true,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_PTR_TO_MAP_KEY,
- .arg4_type = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
- struct bpf_map *, map, void *, key, u64, flags)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- if (!bpf_is_valid_sock_op(bpf_sock))
- return -EOPNOTSUPP;
- return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
-}
-
-const struct bpf_func_proto bpf_sock_hash_update_proto = {
- .func = bpf_sock_hash_update,
- .gpl_only = false,
- .pkt_access = true,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_PTR_TO_MAP_KEY,
- .arg4_type = ARG_ANYTHING,
-};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9d7499817719..65ae5071dd6e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1692,7 +1692,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 switch (ptype) {
 case BPF_PROG_TYPE_SK_SKB:
 case BPF_PROG_TYPE_SK_MSG:
- ret = sockmap_get_from_fd(attr, ptype, prog);
+ ret = sock_map_get_from_fd(attr, prog);
 break;
 case BPF_PROG_TYPE_LIRC_MODE2:
 ret = lirc_prog_attach(attr, prog);
@@ -1748,10 +1748,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
 break;
 case BPF_SK_MSG_VERDICT:
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
+ return sock_map_get_from_fd(attr, NULL);
 case BPF_SK_SKB_STREAM_PARSER:
 case BPF_SK_SKB_STREAM_VERDICT:
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
+ return sock_map_get_from_fd(attr, NULL);
 case BPF_LIRC_MODE2:
 return lirc_prog_detach(attr);
 case BPF_FLOW_DISSECTOR:
diff --git a/net/Kconfig b/net/Kconfig
index 7cce8b19b11a..cb530725373c 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -300,8 +300,11 @@ config BPF_JIT
 config BPF_STREAM_PARSER
 bool "enable BPF STREAM_PARSER"
+ depends on INET
 depends on BPF_SYSCALL
+ depends on CGROUP_BPF
 select STREAM_PARSER
+ select NET_SOCK_MSG
 ---help---
 Enabling this allows a stream parser to be used with
 BPF_MAP_TYPE_SOCKMAP.
@@ -436,6 +439,14 @@ config GRO_CELLS
 bool
 default n
+config NET_SOCK_MSG
+ bool
+ default n
+ help
+ NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or
+ ULPs (upper layer modules, e.g. TLS) to process L7 application data
+ with the help of BPF programs.
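
To illustrate the kind of program this framework hosts, a minimal
BPF_PROG_TYPE_SK_MSG program that could be attached to a sockmap to
police L7 data might look like the sketch below. This is illustrative
only and not part of the patch; it assumes the uapi definitions from
linux/bpf.h and defines a local SEC() macro instead of relying on any
particular helper header.

	#include <linux/bpf.h>

	#ifndef SEC
	#define SEC(name) __attribute__((section(name), used))
	#endif

	SEC("sk_msg")
	int msg_pass_all(struct sk_msg_md *msg)
	{
		/* msg->data .. msg->data_end covers the queued application
		 * data; a real policy would inspect it here before choosing
		 * between SK_PASS and SK_DROP.
		 */
		return SK_PASS;
	}

	char _license[] SEC("license") = "GPL";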
+ config NET_DEVLINK tristate "Network physical/parent device Netlink interface" help diff --git a/net/core/Makefile b/net/core/Makefile index 95a78f92b636..40a5ddefc003 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,6 +16,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o +obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o @@ -28,6 +29,7 @@ obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_SOCKEV_NLMCAST) += sockev_nlmcast.o +obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/filter.c b/net/core/filter.c index 49cf7744fdfd..d8254ca65b5a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -2150,123 +2151,7 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, - struct bpf_map *, map, void *, key, u64, flags) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); - if (!tcb->bpf.sk_redir) - return SK_DROP; - - return SK_PASS; -} - -static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { - .func = bpf_sk_redirect_hash, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, - struct bpf_map *, map, u32, key, u64, flags) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); - if (!tcb->bpf.sk_redir) - return SK_DROP; - - return SK_PASS; -} - -struct sock *do_sk_redirect_map(struct sk_buff *skb) -{ - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - - return tcb->bpf.sk_redir; -} - -static const struct bpf_func_proto bpf_sk_redirect_map_proto = { - .func = bpf_sk_redirect_map, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, - struct bpf_map *, map, void *, key, u64, flags) -{ - /* If user passes invalid input drop the packet. 
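- * BPF_F_INGRESS is the only flag accepted here; anything else in
- * flags is treated as invalid input.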
*/ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - msg->flags = flags; - msg->sk_redir = __sock_hash_lookup_elem(map, key); - if (!msg->sk_redir) - return SK_DROP; - - return SK_PASS; -} - -static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { - .func = bpf_msg_redirect_hash, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, - struct bpf_map *, map, u32, key, u64, flags) -{ - /* If user passes invalid input drop the packet. */ - if (unlikely(flags & ~(BPF_F_INGRESS))) - return SK_DROP; - - msg->flags = flags; - msg->sk_redir = __sock_map_lookup_elem(map, key); - if (!msg->sk_redir) - return SK_DROP; - - return SK_PASS; -} - -struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) -{ - return msg->sk_redir; -} - -static const struct bpf_func_proto bpf_msg_redirect_map_proto = { - .func = bpf_msg_redirect_map, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes) +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) { msg->apply_bytes = bytes; return 0; @@ -2280,7 +2165,7 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes) +BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) { msg->cork_bytes = bytes; return 0; @@ -2294,45 +2179,37 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .arg2_type = ARG_ANYTHING, }; -#define sk_msg_iter_var(var) \ - do { \ - var++; \ - if (var == MAX_SKB_FRAGS) \ - var = 0; \ - } while (0) - -BPF_CALL_4(bpf_msg_pull_data, - struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) +BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, + u32, end, u64, flags) { - unsigned int len = 0, offset = 0, copy = 0, poffset = 0; - int bytes = end - start, bytes_sg_total; - struct scatterlist *sg = msg->sg_data; - int first_sg, last_sg, i, shift; - unsigned char *p, *to, *from; + u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start; + u32 first_sge, last_sge, i, shift, bytes_sg_total; + struct scatterlist *sge; + u8 *raw, *to, *from; struct page *page; if (unlikely(flags || end <= start)) return -EINVAL; /* First find the starting scatterlist element */ - i = msg->sg_start; + i = msg->sg.start; do { - len = sg[i].length; + len = sk_msg_elem(msg, i)->length; if (start < offset + len) break; offset += len; - sk_msg_iter_var(i); - } while (i != msg->sg_end); + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); if (unlikely(start >= offset + len)) return -EINVAL; - first_sg = i; + first_sge = i; /* The start may point into the sg element so we need to also * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!msg->sg_copy[i] && bytes_sg_total <= len) + if (!msg->sg.copy[i] && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2346,12 +2223,12 @@ BPF_CALL_4(bpf_msg_pull_data, * will copy the entire sg entry. 
*/ do { - copy += sg[i].length; - sk_msg_iter_var(i); + copy += sk_msg_elem(msg, i)->length; + sk_msg_iter_var_next(i); if (bytes_sg_total <= copy) break; - } while (i != msg->sg_end); - last_sg = i; + } while (i != msg->sg.end); + last_sge = i; if (unlikely(bytes_sg_total > copy)) return -EINVAL; @@ -2360,63 +2237,61 @@ BPF_CALL_4(bpf_msg_pull_data, get_order(copy)); if (unlikely(!page)) return -ENOMEM; - p = page_address(page); - i = first_sg; + raw = page_address(page); + i = first_sge; do { - from = sg_virt(&sg[i]); - len = sg[i].length; - to = p + poffset; + sge = sk_msg_elem(msg, i); + from = sg_virt(sge); + len = sge->length; + to = raw + poffset; memcpy(to, from, len); poffset += len; - sg[i].length = 0; - put_page(sg_page(&sg[i])); + sge->length = 0; + put_page(sg_page(sge)); - sk_msg_iter_var(i); - } while (i != last_sg); + sk_msg_iter_var_next(i); + } while (i != last_sge); - sg[first_sg].length = copy; - sg_set_page(&sg[first_sg], page, copy, 0); + sg_set_page(&msg->sg.data[first_sge], page, copy, 0); /* To repair sg ring we need to shift entries. If we only * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ - WARN_ON_ONCE(last_sg == first_sg); - shift = last_sg > first_sg ? - last_sg - first_sg - 1 : - MAX_SKB_FRAGS - first_sg + last_sg - 1; + WARN_ON_ONCE(last_sge == first_sge); + shift = last_sge > first_sge ? + last_sge - first_sge - 1 : + MAX_SKB_FRAGS - first_sge + last_sge - 1; if (!shift) goto out; - i = first_sg; - sk_msg_iter_var(i); + i = first_sge; + sk_msg_iter_var_next(i); do { - int move_from; + u32 move_from; - if (i + shift >= MAX_SKB_FRAGS) - move_from = i + shift - MAX_SKB_FRAGS; + if (i + shift >= MAX_MSG_FRAGS) + move_from = i + shift - MAX_MSG_FRAGS; else move_from = i + shift; - - if (move_from == msg->sg_end) + if (move_from == msg->sg.end) break; - sg[i] = sg[move_from]; - sg[move_from].length = 0; - sg[move_from].page_link = 0; - sg[move_from].offset = 0; - - sk_msg_iter_var(i); + msg->sg.data[i] = msg->sg.data[move_from]; + msg->sg.data[move_from].length = 0; + msg->sg.data[move_from].page_link = 0; + msg->sg.data[move_from].offset = 0; + sk_msg_iter_var_next(i); } while (1); - msg->sg_end -= shift; - if (msg->sg_end < 0) - msg->sg_end += MAX_SKB_FRAGS; -out: - msg->data = sg_virt(&sg[first_sg]) + start - offset; - msg->data_end = msg->data + bytes; + msg->sg.end = msg->sg.end - shift > msg->sg.end ? 
+ msg->sg.end - shift + MAX_MSG_FRAGS : + msg->sg.end - shift; +out: + msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; + msg->data_end = msg->data + bytes; return 0; } @@ -5186,6 +5061,9 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_sock_map_update_proto __weak; +const struct bpf_func_proto bpf_sock_hash_update_proto __weak; + static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5209,6 +5087,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; +const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; + static const struct bpf_func_proto * sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5230,6 +5111,9 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; +const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; + static const struct bpf_func_proto * sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -6984,22 +6868,22 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, switch (si->off) { case offsetof(struct sk_msg_md, data): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, data)); + offsetof(struct sk_msg, data)); break; case offsetof(struct sk_msg_md, data_end): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, data_end)); + offsetof(struct sk_msg, data_end)); break; case offsetof(struct sk_msg_md, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; @@ -7008,9 +6892,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; @@ -7020,9 +6904,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); @@ -7037,9 +6921,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, off = si->off; off -= offsetof(struct sk_msg_md, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, 
skc_v6_daddr.s6_addr32[0]) + @@ -7058,9 +6942,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, off = si->off; off -= offsetof(struct sk_msg_md, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + @@ -7074,9 +6958,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD @@ -7088,9 +6972,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct sk_msg_buff, sk), + struct sk_msg, sk), si->dst_reg, si->src_reg, - offsetof(struct sk_msg_buff, sk)); + offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; diff --git a/net/core/skmsg.c b/net/core/skmsg.c new file mode 100644 index 000000000000..ae2b281c9c57 --- /dev/null +++ b/net/core/skmsg.c @@ -0,0 +1,763 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include +#include +#include + +#include +#include + +static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) +{ + if (msg->sg.end > msg->sg.start && + elem_first_coalesce < msg->sg.end) + return true; + + if (msg->sg.end < msg->sg.start && + (elem_first_coalesce > msg->sg.start || + elem_first_coalesce < msg->sg.end)) + return true; + + return false; +} + +int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, + int elem_first_coalesce) +{ + struct page_frag *pfrag = sk_page_frag(sk); + int ret = 0; + + len -= msg->sg.size; + while (len > 0) { + struct scatterlist *sge; + u32 orig_offset; + int use, i; + + if (!sk_page_frag_refill(sk, pfrag)) + return -ENOMEM; + + orig_offset = pfrag->offset; + use = min_t(int, len, pfrag->size - orig_offset); + if (!sk_wmem_schedule(sk, use)) + return -ENOMEM; + + i = msg->sg.end; + sk_msg_iter_var_prev(i); + sge = &msg->sg.data[i]; + + if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) && + sg_page(sge) == pfrag->page && + sge->offset + sge->length == orig_offset) { + sge->length += use; + } else { + if (sk_msg_full(msg)) { + ret = -ENOSPC; + break; + } + + sge = &msg->sg.data[msg->sg.end]; + sg_unmark_end(sge); + sg_set_page(sge, pfrag->page, use, orig_offset); + get_page(pfrag->page); + sk_msg_iter_next(msg, end); + } + + sk_mem_charge(sk, use); + msg->sg.size += use; + pfrag->offset += use; + len -= use; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_alloc); + +void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes) +{ + int i = msg->sg.start; + + do { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (bytes < sge->length) { + sge->length -= bytes; + sge->offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sge->length); + bytes -= sge->length; + sge->length = 0; + sge->offset = 0; + sk_msg_iter_var_next(i); + } while (bytes && i != msg->sg.end); + 
msg->sg.start = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_return_zero);
+
+void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+ int i = msg->sg.start;
+
+ do {
+ struct scatterlist *sge = &msg->sg.data[i];
+ int uncharge = (bytes < sge->length) ? bytes : sge->length;
+
+ sk_mem_uncharge(sk, uncharge);
+ bytes -= uncharge;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+}
+EXPORT_SYMBOL_GPL(sk_msg_return);
+
+static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
+ bool charge)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+ u32 len = sge->length;
+
+ if (charge)
+ sk_mem_uncharge(sk, len);
+ if (!msg->skb)
+ put_page(sg_page(sge));
+ memset(sge, 0, sizeof(*sge));
+ return len;
+}
+
+static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
+ bool charge)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+ int freed = 0;
+
+ while (msg->sg.size) {
+ msg->sg.size -= sge->length;
+ freed += sk_msg_free_elem(sk, msg, i, charge);
+ sk_msg_iter_var_next(i);
+ sk_msg_check_to_free(msg, i, msg->sg.size);
+ sge = sk_msg_elem(msg, i);
+ }
+ if (msg->skb)
+ consume_skb(msg->skb);
+ sk_msg_init(msg);
+ return freed;
+}
+
+int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
+{
+ return __sk_msg_free(sk, msg, msg->sg.start, false);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
+
+int sk_msg_free(struct sock *sk, struct sk_msg *msg)
+{
+ return __sk_msg_free(sk, msg, msg->sg.start, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free);
+
+static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
+ u32 bytes, bool charge)
+{
+ struct scatterlist *sge;
+ u32 i = msg->sg.start;
+
+ while (bytes) {
+ sge = sk_msg_elem(msg, i);
+ if (!sge->length)
+ break;
+ if (bytes < sge->length) {
+ if (charge)
+ sk_mem_uncharge(sk, bytes);
+ sge->length -= bytes;
+ sge->offset += bytes;
+ msg->sg.size -= bytes;
+ break;
+ }
+
+ msg->sg.size -= sge->length;
+ bytes -= sge->length;
+ sk_msg_free_elem(sk, msg, i, charge);
+ sk_msg_iter_var_next(i);
+ sk_msg_check_to_free(msg, i, bytes);
+ }
+ msg->sg.start = i;
+}
+
+void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
+{
+ __sk_msg_free_partial(sk, msg, bytes, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_partial);
+
+void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
+ u32 bytes)
+{
+ __sk_msg_free_partial(sk, msg, bytes, false);
+}
+
+void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
+{
+ int trim = msg->sg.size - len;
+ u32 i = msg->sg.end;
+
+ if (trim <= 0) {
+ WARN_ON(trim < 0);
+ return;
+ }
+
+ sk_msg_iter_var_prev(i);
+ msg->sg.size = len;
+ while (msg->sg.data[i].length &&
+ trim >= msg->sg.data[i].length) {
+ trim -= msg->sg.data[i].length;
+ sk_msg_free_elem(sk, msg, i, true);
+ sk_msg_iter_var_prev(i);
+ if (!trim)
+ goto out;
+ }
+
+ msg->sg.data[i].length -= trim;
+ sk_mem_uncharge(sk, trim);
+out:
+ /* If we trim data before the curr pointer, update copybreak and curr
+ * so that any future copy operations start at the new copy location.
+ * However, trimmed data that has not yet been used in a copy op
+ * does not require an update.
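+ * E.g. a msg holding three 10-byte entries with curr on the last
+ * entry, trimmed down to 15 bytes: the last entry is freed, curr
+ * moves back one entry and copybreak becomes its new 5-byte length.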
+ */
+ if (msg->sg.curr >= i) {
+ msg->sg.curr = i;
+ msg->sg.copybreak = msg->sg.data[i].length;
+ }
+ sk_msg_iter_var_next(i);
+ msg->sg.end = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_trim);
+
+int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+ struct sk_msg *msg, u32 bytes)
+{
+ int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
+ const int to_max_pages = MAX_MSG_FRAGS;
+ struct page *pages[MAX_MSG_FRAGS];
+ ssize_t orig, copied, use, offset;
+
+ orig = msg->sg.size;
+ while (bytes > 0) {
+ i = 0;
+ maxpages = to_max_pages - num_elems;
+ if (maxpages == 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ copied = iov_iter_get_pages(from, pages, bytes, maxpages,
+ &offset);
+ if (copied <= 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ iov_iter_advance(from, copied);
+ bytes -= copied;
+ msg->sg.size += copied;
+
+ while (copied) {
+ use = min_t(int, copied, PAGE_SIZE - offset);
+ sg_set_page(&msg->sg.data[msg->sg.end],
+ pages[i], use, offset);
+ sg_unmark_end(&msg->sg.data[msg->sg.end]);
+ sk_mem_charge(sk, use);
+
+ offset = 0;
+ copied -= use;
+ sk_msg_iter_next(msg, end);
+ num_elems++;
+ i++;
+ }
+ /* When zerocopy is mixed with sk_msg_*copy* operations we
+ * may have a copybreak set; in this case, clear it and prefer
+ * the zerocopy remainder when possible.
+ */
+ msg->sg.copybreak = 0;
+ msg->sg.curr = msg->sg.end;
+ }
+out:
+ /* Revert iov_iter updates; the msg will need to use 'trim' later
+ * if it also needs to be cleared.
+ */
+ if (ret)
+ iov_iter_revert(from, msg->sg.size - orig);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
+
+int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+ struct sk_msg *msg, u32 bytes)
+{
+ int ret = -ENOSPC, i = msg->sg.curr;
+ struct scatterlist *sge;
+ u32 copy, buf_size;
+ void *to;
+
+ do {
+ sge = sk_msg_elem(msg, i);
+ /* This is possible if a trim operation shrunk the buffer */
+ if (msg->sg.copybreak >= sge->length) {
+ msg->sg.copybreak = 0;
+ sk_msg_iter_var_next(i);
+ if (i == msg->sg.end)
+ break;
+ sge = sk_msg_elem(msg, i);
+ }
+
+ buf_size = sge->length - msg->sg.copybreak;
+ copy = (buf_size > bytes) ? bytes : buf_size;
+ to = sg_virt(sge) + msg->sg.copybreak;
+ msg->sg.copybreak += copy;
+ if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
+ ret = copy_from_iter_nocache(to, copy, from);
+ else
+ ret = copy_from_iter(to, copy, from);
+ if (ret != copy) {
+ ret = -EFAULT;
+ goto out;
+ }
+ bytes -= copy;
+ if (!bytes)
+ break;
+ msg->sg.copybreak = 0;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+out:
+ msg->sg.curr = i;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
+
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+{
+ struct sock *sk = psock->sk;
+ int copied = 0, num_sge;
+ struct sk_msg *msg;
+
+ msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+ if (unlikely(!msg))
+ return -EAGAIN;
+ if (!sk_rmem_schedule(sk, skb, skb->len)) {
+ kfree(msg);
+ return -EAGAIN;
+ }
+
+ sk_msg_init(msg);
+ num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
+ if (unlikely(num_sge < 0)) {
+ kfree(msg);
+ return num_sge;
+ }
+
+ sk_mem_charge(sk, skb->len);
+ copied = skb->len;
+ msg->sg.start = 0;
+ msg->sg.end = num_sge == MAX_MSG_FRAGS ?
0 : num_sge; + msg->skb = skb; + + sk_psock_queue_msg(psock, msg); + sk->sk_data_ready(sk); + return copied; +} + +static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len, bool ingress) +{ + if (ingress) + return sk_psock_skb_ingress(psock, skb); + else + return skb_send_sock_locked(psock->sk, skb, off, len); +} + +static void sk_psock_backlog(struct work_struct *work) +{ + struct sk_psock *psock = container_of(work, struct sk_psock, work); + struct sk_psock_work_state *state = &psock->work_state; + struct sk_buff *skb; + bool ingress; + u32 len, off; + int ret; + + /* Lock sock to avoid losing sk_socket during loop. */ + lock_sock(psock->sk); + if (state->skb) { + skb = state->skb; + len = state->len; + off = state->off; + state->skb = NULL; + goto start; + } + + while ((skb = skb_dequeue(&psock->ingress_skb))) { + len = skb->len; + off = 0; +start: + ingress = tcp_skb_bpf_ingress(skb); + do { + ret = -EIO; + if (likely(psock->sk->sk_socket)) + ret = sk_psock_handle_skb(psock, skb, off, + len, ingress); + if (ret <= 0) { + if (ret == -EAGAIN) { + state->skb = skb; + state->len = len; + state->off = off; + goto end; + } + /* Hard errors break pipe and stop xmit. */ + sk_psock_report_error(psock, ret ? -ret : EPIPE); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + kfree_skb(skb); + goto end; + } + off += ret; + len -= ret; + } while (len); + + if (!ingress) + kfree_skb(skb); + } +end: + release_sock(psock->sk); +} + +struct sk_psock *sk_psock_init(struct sock *sk, int node) +{ + struct sk_psock *psock = kzalloc_node(sizeof(*psock), + GFP_ATOMIC | __GFP_NOWARN, + node); + if (!psock) + return NULL; + + psock->sk = sk; + psock->eval = __SK_NONE; + + INIT_LIST_HEAD(&psock->link); + spin_lock_init(&psock->link_lock); + + INIT_WORK(&psock->work, sk_psock_backlog); + INIT_LIST_HEAD(&psock->ingress_msg); + skb_queue_head_init(&psock->ingress_skb); + + sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); + refcount_set(&psock->refcnt, 1); + + rcu_assign_sk_user_data(sk, psock); + sock_hold(sk); + + return psock; +} +EXPORT_SYMBOL_GPL(sk_psock_init); + +struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) +{ + struct sk_psock_link *link; + + spin_lock_bh(&psock->link_lock); + link = list_first_entry_or_null(&psock->link, struct sk_psock_link, + list); + if (link) + list_del(&link->list); + spin_unlock_bh(&psock->link_lock); + return link; +} + +void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +{ + struct sk_msg *msg, *tmp; + + list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { + list_del(&msg->list); + sk_msg_free(psock->sk, msg); + kfree(msg); + } +} + +static void sk_psock_zap_ingress(struct sk_psock *psock) +{ + __skb_queue_purge(&psock->ingress_skb); + __sk_psock_purge_ingress_msg(psock); +} + +static void sk_psock_link_destroy(struct sk_psock *psock) +{ + struct sk_psock_link *link, *tmp; + + list_for_each_entry_safe(link, tmp, &psock->link, list) { + list_del(&link->list); + sk_psock_free_link(link); + } +} + +static void sk_psock_destroy_deferred(struct work_struct *gc) +{ + struct sk_psock *psock = container_of(gc, struct sk_psock, gc); + + /* No sk_callback_lock since already detached. 
*/ + if (psock->parser.enabled) + strp_done(&psock->parser.strp); + + cancel_work_sync(&psock->work); + + psock_progs_drop(&psock->progs); + + sk_psock_link_destroy(psock); + sk_psock_cork_free(psock); + sk_psock_zap_ingress(psock); + + if (psock->sk_redir) + sock_put(psock->sk_redir); + sock_put(psock->sk); + kfree(psock); +} + +void sk_psock_destroy(struct rcu_head *rcu) +{ + struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); + + INIT_WORK(&psock->gc, sk_psock_destroy_deferred); + schedule_work(&psock->gc); +} +EXPORT_SYMBOL_GPL(sk_psock_destroy); + +void sk_psock_drop(struct sock *sk, struct sk_psock *psock) +{ + rcu_assign_sk_user_data(sk, NULL); + sk_psock_cork_free(psock); + sk_psock_restore_proto(sk, psock); + + write_lock_bh(&sk->sk_callback_lock); + if (psock->progs.skb_parser) + sk_psock_stop_strp(sk, psock); + write_unlock_bh(&sk->sk_callback_lock); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + + call_rcu_sched(&psock->rcu, sk_psock_destroy); +} +EXPORT_SYMBOL_GPL(sk_psock_drop); + +static int sk_psock_map_verd(int verdict, bool redir) +{ + switch (verdict) { + case SK_PASS: + return redir ? __SK_REDIRECT : __SK_PASS; + case SK_DROP: + default: + break; + } + + return __SK_DROP; +} + +int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg) +{ + struct bpf_prog *prog; + int ret; + + preempt_disable(); + rcu_read_lock(); + prog = READ_ONCE(psock->progs.msg_parser); + if (unlikely(!prog)) { + ret = __SK_PASS; + goto out; + } + + sk_msg_compute_data_pointers(msg); + msg->sk = sk; + ret = BPF_PROG_RUN(prog, msg); + ret = sk_psock_map_verd(ret, msg->sk_redir); + psock->apply_bytes = msg->apply_bytes; + if (ret == __SK_REDIRECT) { + if (psock->sk_redir) + sock_put(psock->sk_redir); + psock->sk_redir = msg->sk_redir; + if (!psock->sk_redir) { + ret = __SK_DROP; + goto out; + } + sock_hold(psock->sk_redir); + } +out: + rcu_read_unlock(); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); + +static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, + struct sk_buff *skb) +{ + int ret; + + skb->sk = psock->sk; + bpf_compute_data_end_sk_skb(skb); + preempt_disable(); + ret = BPF_PROG_RUN(prog, skb); + preempt_enable(); + /* strparser clones the skb before handing it to a upper layer, + * meaning skb_orphan has been called. We NULL sk on the way out + * to ensure we don't trigger a BUG_ON() in skb/sk operations + * later and because we are not charging the memory of this skb + * to any socket yet. 
+ */ + skb->sk = NULL; + return ret; +} + +static struct sk_psock *sk_psock_from_strp(struct strparser *strp) +{ + struct sk_psock_parser *parser; + + parser = container_of(strp, struct sk_psock_parser, strp); + return container_of(parser, struct sk_psock, parser); +} + +static void sk_psock_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + struct sk_psock *psock_other; + struct sock *sk_other; + bool ingress; + + switch (verdict) { + case __SK_REDIRECT: + sk_other = tcp_skb_bpf_redirect_fetch(skb); + if (unlikely(!sk_other)) + goto out_free; + psock_other = sk_psock(sk_other); + if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) + goto out_free; + ingress = tcp_skb_bpf_ingress(skb); + if ((!ingress && sock_writeable(sk_other)) || + (ingress && + atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf)) { + if (!ingress) + skb_set_owner_w(skb, sk_other); + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); + break; + } + /* fall-through */ + case __SK_DROP: + /* fall-through */ + default: +out_free: + kfree_skb(skb); + } +} + +static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) +{ + struct sk_psock *psock = sk_psock_from_strp(strp); + struct bpf_prog *prog; + int ret = __SK_DROP; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + skb_orphan(skb); + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + rcu_read_unlock(); + sk_psock_verdict_apply(psock, skb, ret); +} + +static int sk_psock_strp_read_done(struct strparser *strp, int err) +{ + return err; +} + +static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) +{ + struct sk_psock *psock = sk_psock_from_strp(strp); + struct bpf_prog *prog; + int ret = skb->len; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_parser); + if (likely(prog)) + ret = sk_psock_bpf_run(psock, prog, skb); + rcu_read_unlock(); + return ret; +} + +/* Called with socket lock held. 
*/ +static void sk_psock_data_ready(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->parser.strp); + write_unlock_bh(&sk->sk_callback_lock); + } + rcu_read_unlock(); +} + +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk); + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + rcu_read_unlock(); + write_space(sk); +} + +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + psock->parser.enabled = false; + return strp_init(&psock->parser.strp, sk, &cb); +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + strp_stop(&parser->strp); + parser->enabled = false; +} diff --git a/net/core/sock_map.c b/net/core/sock_map.c new file mode 100644 index 000000000000..3c0e44cb811a --- /dev/null +++ b/net/core/sock_map.c @@ -0,0 +1,1002 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct bpf_stab { + struct bpf_map map; + struct sock **sks; + struct sk_psock_progs progs; + raw_spinlock_t lock; +}; + +#define SOCK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + +static struct bpf_map *sock_map_alloc(union bpf_attr *attr) +{ + struct bpf_stab *stab; + u64 cost; + int err; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->max_entries == 0 || + attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + stab = kzalloc(sizeof(*stab), GFP_USER); + if (!stab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&stab->map, attr); + raw_spin_lock_init(&stab->lock); + + /* Make sure page count doesn't overflow. 
*/ + cost = (u64) stab->map.max_entries * sizeof(struct sock *); + if (cost >= U32_MAX - PAGE_SIZE) { + err = -EINVAL; + goto free_stab; + } + + stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(stab->map.pages); + if (err) + goto free_stab; + + stab->sks = bpf_map_area_alloc(stab->map.max_entries * + sizeof(struct sock *), + stab->map.numa_node); + if (stab->sks) + return &stab->map; + err = -ENOMEM; +free_stab: + kfree(stab); + return ERR_PTR(err); +} + +int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) +{ + u32 ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int ret; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + ret = sock_map_prog_update(map, prog, attr->attach_type); + fdput(f); + return ret; +} + +static void sock_map_sk_acquire(struct sock *sk) + __acquires(&sk->sk_lock.slock) +{ + lock_sock(sk); + preempt_disable(); + rcu_read_lock(); +} + +static void sock_map_sk_release(struct sock *sk) + __releases(&sk->sk_lock.slock) +{ + rcu_read_unlock(); + preempt_enable(); + release_sock(sk); +} + +static void sock_map_add_link(struct sk_psock *psock, + struct sk_psock_link *link, + struct bpf_map *map, void *link_raw) +{ + link->link_raw = link_raw; + link->map = map; + spin_lock_bh(&psock->link_lock); + list_add_tail(&link->list, &psock->link); + spin_unlock_bh(&psock->link_lock); +} + +static void sock_map_del_link(struct sock *sk, + struct sk_psock *psock, void *link_raw) +{ + struct sk_psock_link *link, *tmp; + bool strp_stop = false; + + spin_lock_bh(&psock->link_lock); + list_for_each_entry_safe(link, tmp, &psock->link, list) { + if (link->link_raw == link_raw) { + struct bpf_map *map = link->map; + struct bpf_stab *stab = container_of(map, struct bpf_stab, + map); + if (psock->parser.enabled && stab->progs.skb_parser) + strp_stop = true; + list_del(&link->list); + sk_psock_free_link(link); + } + } + spin_unlock_bh(&psock->link_lock); + if (strp_stop) { + write_lock_bh(&sk->sk_callback_lock); + sk_psock_stop_strp(sk, psock); + write_unlock_bh(&sk->sk_callback_lock); + } +} + +static void sock_map_unref(struct sock *sk, void *link_raw) +{ + struct sk_psock *psock = sk_psock(sk); + + if (likely(psock)) { + sock_map_del_link(sk, psock, link_raw); + sk_psock_put(sk, psock); + } +} + +static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, + struct sock *sk) +{ + struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; + bool skb_progs, sk_psock_is_new = false; + struct sk_psock *psock; + int ret; + + skb_verdict = READ_ONCE(progs->skb_verdict); + skb_parser = READ_ONCE(progs->skb_parser); + skb_progs = skb_parser && skb_verdict; + if (skb_progs) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) + return PTR_ERR(skb_verdict); + skb_parser = bpf_prog_inc_not_zero(skb_parser); + if (IS_ERR(skb_parser)) { + bpf_prog_put(skb_verdict); + return PTR_ERR(skb_parser); + } + } + + msg_parser = READ_ONCE(progs->msg_parser); + if (msg_parser) { + msg_parser = bpf_prog_inc_not_zero(msg_parser); + if (IS_ERR(msg_parser)) { + ret = PTR_ERR(msg_parser); + goto out; + } + } + + psock = sk_psock_get(sk); + if (psock) { + if (!sk_has_psock(sk)) { + ret = -EBUSY; + goto out_progs; + } + if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || + (skb_progs && READ_ONCE(psock->progs.skb_parser))) { + sk_psock_put(sk, psock); + ret = -EBUSY; + goto out_progs; + } + } else { + psock = sk_psock_init(sk, map->numa_node); + if (!psock) { 
+ ret = -ENOMEM; + goto out_progs; + } + sk_psock_is_new = true; + } + + if (msg_parser) + psock_set_prog(&psock->progs.msg_parser, msg_parser); + if (sk_psock_is_new) { + ret = tcp_bpf_init(sk); + if (ret < 0) + goto out_drop; + } else { + tcp_bpf_reinit(sk); + } + + write_lock_bh(&sk->sk_callback_lock); + if (skb_progs && !psock->parser.enabled) { + ret = sk_psock_init_strp(sk, psock); + if (ret) { + write_unlock_bh(&sk->sk_callback_lock); + goto out_drop; + } + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + psock_set_prog(&psock->progs.skb_parser, skb_parser); + sk_psock_start_strp(sk, psock); + } + write_unlock_bh(&sk->sk_callback_lock); + return 0; +out_drop: + sk_psock_put(sk, psock); +out_progs: + if (msg_parser) + bpf_prog_put(msg_parser); +out: + if (skb_progs) { + bpf_prog_put(skb_verdict); + bpf_prog_put(skb_parser); + } + return ret; +} + +static void sock_map_free(struct bpf_map *map) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + int i; + + synchronize_rcu(); + rcu_read_lock(); + raw_spin_lock_bh(&stab->lock); + for (i = 0; i < stab->map.max_entries; i++) { + struct sock **psk = &stab->sks[i]; + struct sock *sk; + + sk = xchg(psk, NULL); + if (sk) + sock_map_unref(sk, psk); + } + raw_spin_unlock_bh(&stab->lock); + rcu_read_unlock(); + + bpf_map_area_free(stab->sks); + kfree(stab); +} + +static void sock_map_release_progs(struct bpf_map *map) +{ + psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); +} + +static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(key >= map->max_entries)) + return NULL; + return READ_ONCE(stab->sks[key]); +} + +static void *sock_map_lookup(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, + struct sock **psk) +{ + struct sock *sk; + + raw_spin_lock_bh(&stab->lock); + sk = *psk; + if (!sk_test || sk_test == sk) + *psk = NULL; + raw_spin_unlock_bh(&stab->lock); + if (unlikely(!sk)) + return -EINVAL; + sock_map_unref(sk, psk); + return 0; +} + +static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, + void *link_raw) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + __sock_map_delete(stab, sk, link_raw); +} + +static int sock_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = *(u32 *)key; + struct sock **psk; + + if (unlikely(i >= map->max_entries)) + return -EINVAL; + + psk = &stab->sks[i]; + return __sock_map_delete(stab, NULL, psk); +} + +static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = key ? 
*(u32 *)key : U32_MAX; + u32 *key_next = next; + + if (i == stab->map.max_entries - 1) + return -ENOENT; + if (i >= stab->map.max_entries) + *key_next = 0; + else + *key_next = i + 1; + return 0; +} + +static int sock_map_update_common(struct bpf_map *map, u32 idx, + struct sock *sk, u64 flags) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct sk_psock_link *link; + struct sk_psock *psock; + struct sock *osk; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(idx >= map->max_entries)) + return -E2BIG; + + link = sk_psock_init_link(); + if (!link) + return -ENOMEM; + + ret = sock_map_link(map, &stab->progs, sk); + if (ret < 0) + goto out_free; + + psock = sk_psock(sk); + WARN_ON_ONCE(!psock); + + raw_spin_lock_bh(&stab->lock); + osk = stab->sks[idx]; + if (osk && flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out_unlock; + } else if (!osk && flags == BPF_EXIST) { + ret = -ENOENT; + goto out_unlock; + } + + sock_map_add_link(psock, link, map, &stab->sks[idx]); + stab->sks[idx] = sk; + if (osk) + sock_map_unref(osk, &stab->sks[idx]); + raw_spin_unlock_bh(&stab->lock); + return 0; +out_unlock: + raw_spin_unlock_bh(&stab->lock); + if (psock) + sk_psock_put(sk, psock); +out_free: + sk_psock_free_link(link); + return ret; +} + +static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; +} + +static bool sock_map_sk_is_suitable(const struct sock *sk) +{ + return sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static int sock_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + u32 ufd = *(u32 *)value; + u32 idx = *(u32 *)key; + struct socket *sock; + struct sock *sk; + int ret; + + sock = sockfd_lookup(ufd, &ret); + if (!sock) + return ret; + sk = sock->sk; + if (!sk) { + ret = -EINVAL; + goto out; + } + if (!sock_map_sk_is_suitable(sk) || + sk->sk_state != TCP_ESTABLISHED) { + ret = -EOPNOTSUPP; + goto out; + } + + sock_map_sk_acquire(sk); + ret = sock_map_update_common(map, idx, sk, flags); + sock_map_sk_release(sk); +out: + fput(sock->file); + return ret; +} + +BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(sock_map_sk_is_suitable(sops->sk) && + sock_map_op_okay(sops))) + return sock_map_update_common(map, *(u32 *)key, sops->sk, + flags); + return -EOPNOTSUPP; +} + +const struct bpf_func_proto bpf_sock_map_update_proto = { + .func = bpf_sock_map_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, + struct bpf_map *, map, u32, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_sk_redirect_map_proto = { + .func = bpf_sk_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_map, struct 
sk_msg *, msg, + struct bpf_map *, map, u32, key, u64, flags) +{ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + msg->flags = flags; + msg->sk_redir = __sock_map_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_msg_redirect_map_proto = { + .func = bpf_msg_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_map_ops sock_map_ops = { + .map_alloc = sock_map_alloc, + .map_free = sock_map_free, + .map_get_next_key = sock_map_get_next_key, + .map_update_elem = sock_map_update_elem, + .map_delete_elem = sock_map_delete_elem, + .map_lookup_elem = sock_map_lookup, + .map_release_uref = sock_map_release_progs, + .map_check_btf = map_check_no_btf, +}; + +struct bpf_htab_elem { + struct rcu_head rcu; + u32 hash; + struct sock *sk; + struct hlist_node node; + u8 key[0]; +}; + +struct bpf_htab_bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + +struct bpf_htab { + struct bpf_map map; + struct bpf_htab_bucket *buckets; + u32 buckets_num; + u32 elem_size; + struct sk_psock_progs progs; + atomic_t count; +}; + +static inline u32 sock_hash_bucket_hash(const void *key, u32 len) +{ + return jhash(key, len, 0); +} + +static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab, + u32 hash) +{ + return &htab->buckets[hash & (htab->buckets_num - 1)]; +} + +static struct bpf_htab_elem * +sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, + u32 key_size) +{ + struct bpf_htab_elem *elem; + + hlist_for_each_entry_rcu(elem, head, node) { + if (elem->hash == hash && + !memcmp(&elem->key, key, key_size)) + return elem; + } + + return NULL; +} + +static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 key_size = map->key_size, hash; + struct bpf_htab_bucket *bucket; + struct bpf_htab_elem *elem; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + + return elem ? elem->sk : NULL; +} + +static void sock_hash_free_elem(struct bpf_htab *htab, + struct bpf_htab_elem *elem) +{ + atomic_dec(&htab->count); + kfree_rcu(elem, rcu); +} + +static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, + void *link_raw) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_htab_elem *elem_probe, *elem = link_raw; + struct bpf_htab_bucket *bucket; + + WARN_ON_ONCE(!rcu_read_lock_held()); + bucket = sock_hash_select_bucket(htab, elem->hash); + + /* elem may be deleted in parallel from the map, but access here + * is okay since it's going away only after RCU grace period. + * However, we need to check whether it's still present. 
+ */ + raw_spin_lock_bh(&bucket->lock); + elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, + elem->key, map->key_size); + if (elem_probe && elem_probe == elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + } + raw_spin_unlock_bh(&bucket->lock); +} + +static int sock_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 hash, key_size = map->key_size; + struct bpf_htab_bucket *bucket; + struct bpf_htab_elem *elem; + int ret = -ENOENT; + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + + raw_spin_lock_bh(&bucket->lock); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + if (elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + ret = 0; + } + raw_spin_unlock_bh(&bucket->lock); + return ret; +} + +static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab, + void *key, u32 key_size, + u32 hash, struct sock *sk, + struct bpf_htab_elem *old) +{ + struct bpf_htab_elem *new; + + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + if (!old) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + } + + new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); + if (!new) { + atomic_dec(&htab->count); + return ERR_PTR(-ENOMEM); + } + memcpy(new->key, key, key_size); + new->sk = sk; + new->hash = hash; + return new; +} + +static int sock_hash_update_common(struct bpf_map *map, void *key, + struct sock *sk, u64 flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + u32 key_size = map->key_size, hash; + struct bpf_htab_elem *elem, *elem_new; + struct bpf_htab_bucket *bucket; + struct sk_psock_link *link; + struct sk_psock *psock; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + link = sk_psock_init_link(); + if (!link) + return -ENOMEM; + + ret = sock_map_link(map, &htab->progs, sk); + if (ret < 0) + goto out_free; + + psock = sk_psock(sk); + WARN_ON_ONCE(!psock); + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + + raw_spin_lock_bh(&bucket->lock); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + if (elem && flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out_unlock; + } else if (!elem && flags == BPF_EXIST) { + ret = -ENOENT; + goto out_unlock; + } + + elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); + if (IS_ERR(elem_new)) { + ret = PTR_ERR(elem_new); + goto out_unlock; + } + + sock_map_add_link(psock, link, map, elem_new); + /* Add new element to the head of the list, so that + * concurrent search will find it before old elem. 
+ */ + hlist_add_head_rcu(&elem_new->node, &bucket->head); + if (elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + } + raw_spin_unlock_bh(&bucket->lock); + return 0; +out_unlock: + raw_spin_unlock_bh(&bucket->lock); + sk_psock_put(sk, psock); +out_free: + sk_psock_free_link(link); + return ret; +} + +static int sock_hash_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + u32 ufd = *(u32 *)value; + struct socket *sock; + struct sock *sk; + int ret; + + sock = sockfd_lookup(ufd, &ret); + if (!sock) + return ret; + sk = sock->sk; + if (!sk) { + ret = -EINVAL; + goto out; + } + if (!sock_map_sk_is_suitable(sk) || + sk->sk_state != TCP_ESTABLISHED) { + ret = -EOPNOTSUPP; + goto out; + } + + sock_map_sk_acquire(sk); + ret = sock_hash_update_common(map, key, sk, flags); + sock_map_sk_release(sk); +out: + fput(sock->file); + return ret; +} + +static int sock_hash_get_next_key(struct bpf_map *map, void *key, + void *key_next) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_htab_elem *elem, *elem_next; + u32 hash, key_size = map->key_size; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first_elem; + hash = sock_hash_bucket_hash(key, key_size); + head = &sock_hash_select_bucket(htab, hash)->head; + elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); + if (!elem) + goto find_first_elem; + + elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)), + struct bpf_htab_elem, node); + if (elem_next) { + memcpy(key_next, elem_next->key, key_size); + return 0; + } + + i = hash & (htab->buckets_num - 1); + i++; +find_first_elem: + for (; i < htab->buckets_num; i++) { + head = &sock_hash_select_bucket(htab, i)->head; + elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct bpf_htab_elem, node); + if (elem_next) { + memcpy(key_next, elem_next->key, key_size); + return 0; + } + } + + return -ENOENT; +} + +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int i, err; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->max_entries == 0 || + attr->key_size == 0 || + attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + if (attr->key_size > MAX_BPF_STACK) + return ERR_PTR(-E2BIG); + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&htab->map, attr); + + htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); + htab->elem_size = sizeof(struct bpf_htab_elem) + + round_up(htab->map.key_size, 8); + if (htab->buckets_num == 0 || + htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) { + err = -EINVAL; + goto free_htab; + } + + cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) + + (u64) htab->elem_size * htab->map.max_entries; + if (cost >= U32_MAX - PAGE_SIZE) { + err = -EINVAL; + goto free_htab; + } + + htab->buckets = bpf_map_area_alloc(htab->buckets_num * + sizeof(struct bpf_htab_bucket), + htab->map.numa_node); + if (!htab->buckets) { + err = -ENOMEM; + goto free_htab; + } + + for (i = 0; i < htab->buckets_num; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } + + return &htab->map; +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static void sock_hash_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct 
bpf_htab_bucket *bucket; + struct bpf_htab_elem *elem; + struct hlist_node *node; + int i; + + synchronize_rcu(); + rcu_read_lock(); + for (i = 0; i < htab->buckets_num; i++) { + bucket = sock_hash_select_bucket(htab, i); + raw_spin_lock_bh(&bucket->lock); + hlist_for_each_entry_safe(elem, node, &bucket->head, node) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + } + raw_spin_unlock_bh(&bucket->lock); + } + rcu_read_unlock(); + + bpf_map_area_free(htab->buckets); + kfree(htab); +} + +static void sock_hash_release_progs(struct bpf_map *map) +{ + psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs); +} + +BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(sock_map_sk_is_suitable(sops->sk) && + sock_map_op_okay(sops))) + return sock_hash_update_common(map, key, sops->sk, flags); + return -EOPNOTSUPP; +} + +const struct bpf_func_proto bpf_sock_hash_update_proto = { + .func = bpf_sock_hash_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, + struct bpf_map *, map, void *, key, u64, flags) +{ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + msg->flags = flags; + msg->sk_redir = __sock_hash_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; + return SK_PASS; +} + +const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_map_ops sock_hash_ops = { + .map_alloc = sock_hash_alloc, + .map_free = sock_hash_free, + .map_get_next_key = sock_hash_get_next_key, + .map_update_elem = sock_hash_update_elem, + .map_delete_elem = sock_hash_delete_elem, + .map_lookup_elem = sock_map_lookup, + .map_release_uref = sock_hash_release_progs, + .map_check_btf = map_check_no_btf, +}; + +static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) +{ + switch (map->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + return &container_of(map, struct bpf_stab, map)->progs; + case BPF_MAP_TYPE_SOCKHASH: + return &container_of(map, struct bpf_htab, map)->progs; + default: + break; + } + + return NULL; +} + +int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + u32 which) +{ + struct sk_psock_progs *progs = sock_map_progs(map); + + if (!progs) + return -EOPNOTSUPP; + + switch (which) { + case BPF_SK_MSG_VERDICT: + psock_set_prog(&progs->msg_parser, prog); + break; + case BPF_SK_SKB_STREAM_PARSER: + psock_set_prog(&progs->skb_parser, prog); + break; + case 
BPF_SK_SKB_STREAM_VERDICT: + psock_set_prog(&progs->skb_verdict, prog); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link) +{ + switch (link->map->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + return sock_map_delete_from_link(link->map, sk, + link->link_raw); + case BPF_MAP_TYPE_SOCKHASH: + return sock_hash_delete_from_link(link->map, sk, + link->link_raw); + default: + break; + } +} diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 398cc0a301b1..9b6a0e031e22 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o +obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c new file mode 100644 index 000000000000..80debb0daf37 --- /dev/null +++ b/net/ipv4/tcp_bpf.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include +#include +#include +#include +#include + +#include + +static bool tcp_bpf_stream_read(const struct sock *sk) +{ + struct sk_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) + empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); + return !empty; +} + +static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, + int flags, long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + +int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, + struct msghdr *msg, int len) +{ + struct iov_iter *iter = &msg->msg_iter; + int i, ret, copied = 0; + + while (copied != len) { + struct scatterlist *sge; + struct sk_msg *msg_rx; + + msg_rx = list_first_entry_or_null(&psock->ingress_msg, + struct sk_msg, list); + if (unlikely(!msg_rx)) + break; + + i = msg_rx->sg.start; + do { + struct page *page; + int copy; + + sge = sk_msg_elem(msg_rx, i); + copy = sge->length; + page = sg_page(sge); + if (copied + copy > len) + copy = len - copied; + ret = copy_page_to_iter(page, sge->offset, copy, iter); + if (ret != copy) { + msg_rx->sg.start = i; + return -EFAULT; + } + + copied += copy; + sge->offset += copy; + sge->length -= copy; + sk_mem_uncharge(sk, copy); + if (!sge->length) { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + if (!msg_rx->skb) + put_page(page); + } + + if (copied == len) + break; + } while (i != msg_rx->sg.end); + + msg_rx->sg.start = i; + if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { + list_del(&msg_rx->list); + if (msg_rx->skb) + consume_skb(msg_rx->skb); + kfree(msg_rx); + } + } + + return copied; +} +EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); + +int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct sk_psock *psock; + int copied, ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, 
nonblock, flags, addr_len); + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + lock_sock(sk); +msg_bytes_ready: + copied = __tcp_bpf_recvmsg(sk, psock, msg, len); + if (!copied) { + int data, err = 0; + long timeo; + + timeo = sock_rcvtimeo(sk, nonblock); + data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); + if (data) { + if (skb_queue_empty(&sk->sk_receive_queue)) + goto msg_bytes_ready; + release_sock(sk); + sk_psock_put(sk, psock); + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + } + if (err) { + ret = err; + goto out; + } + } + ret = copied; +out: + release_sock(sk); + sk_psock_put(sk, psock); + return ret; +} + +static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg, u32 apply_bytes, int flags) +{ + bool apply = apply_bytes; + struct scatterlist *sge; + u32 size, copied = 0; + struct sk_msg *tmp; + int i, ret = 0; + + tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL); + if (unlikely(!tmp)) + return -ENOMEM; + + lock_sock(sk); + tmp->sg.start = msg->sg.start; + i = msg->sg.start; + do { + sge = sk_msg_elem(msg, i); + size = (apply && apply_bytes < sge->length) ? + apply_bytes : sge->length; + if (!sk_wmem_schedule(sk, size)) { + if (!copied) + ret = -ENOMEM; + break; + } + + sk_mem_charge(sk, size); + sk_msg_xfer(tmp, msg, i, size); + copied += size; + if (sge->length) + get_page(sk_msg_page(tmp, i)); + sk_msg_iter_var_next(i); + tmp->sg.end = i; + if (apply) { + apply_bytes -= size; + if (!apply_bytes) + break; + } + } while (i != msg->sg.end); + + if (!ret) { + msg->sg.start = i; + msg->sg.size -= apply_bytes; + sk_psock_queue_msg(psock, tmp); + sk->sk_data_ready(sk); + } else { + sk_msg_free(sk, tmp); + kfree(tmp); + } + + release_sock(sk); + return ret; +} + +static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, + int flags, bool uncharge) +{ + bool apply = apply_bytes; + struct scatterlist *sge; + struct page *page; + int size, ret = 0; + u32 off; + + while (1) { + sge = sk_msg_elem(msg, msg->sg.start); + size = (apply && apply_bytes < sge->length) ? + apply_bytes : sge->length; + off = sge->offset; + page = sg_page(sge); + + tcp_rate_check_app_limited(sk); +retry: + ret = do_tcp_sendpages(sk, page, off, size, flags); + if (ret <= 0) + return ret; + if (apply) + apply_bytes -= ret; + msg->sg.size -= ret; + sge->offset += ret; + sge->length -= ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + if (ret != size) { + size -= ret; + off += ret; + goto retry; + } + if (!sge->length) { + put_page(page); + sk_msg_iter_next(msg, start); + sg_init_table(sge, 1); + if (msg->sg.start == msg->sg.end) + break; + } + if (apply && !apply_bytes) + break; + } + + return 0; +} + +static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, + u32 apply_bytes, int flags, bool uncharge) +{ + int ret; + + lock_sock(sk); + ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge); + release_sock(sk); + return ret; +} + +int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, + u32 bytes, int flags) +{ + bool ingress = sk_msg_to_ingress(msg); + struct sk_psock *psock = sk_psock_get(sk); + int ret; + + if (unlikely(!psock)) { + sk_msg_free(sk, msg); + return 0; + } + ret = ingress ? 
bpf_tcp_ingress(sk, psock, msg, bytes, flags) : + tcp_bpf_push_locked(sk, msg, bytes, flags, false); + sk_psock_put(sk, psock); + return ret; +} +EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); + +static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg, int *copied, int flags) +{ + bool cork = false, enospc = msg->sg.start == msg->sg.end; + struct sock *sk_redir; + u32 tosend; + int ret; + +more_data: + if (psock->eval == __SK_NONE) + psock->eval = sk_psock_msg_verdict(sk, psock, msg); + + if (msg->cork_bytes && + msg->cork_bytes > msg->sg.size && !enospc) { + psock->cork_bytes = msg->cork_bytes - msg->sg.size; + if (!psock->cork) { + psock->cork = kzalloc(sizeof(*psock->cork), + GFP_ATOMIC | __GFP_NOWARN); + if (!psock->cork) + return -ENOMEM; + } + memcpy(psock->cork, msg, sizeof(*msg)); + return 0; + } + + tosend = msg->sg.size; + if (psock->apply_bytes && psock->apply_bytes < tosend) + tosend = psock->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + ret = tcp_bpf_push(sk, msg, tosend, flags, true); + if (unlikely(ret)) { + *copied -= sk_msg_free(sk, msg); + break; + } + sk_msg_apply_bytes(psock, tosend); + break; + case __SK_REDIRECT: + sk_redir = psock->sk_redir; + sk_msg_apply_bytes(psock, tosend); + if (psock->cork) { + cork = true; + psock->cork = NULL; + } + sk_msg_return(sk, msg, tosend); + release_sock(sk); + ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + lock_sock(sk); + if (unlikely(ret < 0)) { + int free = sk_msg_free_nocharge(sk, msg); + + if (!cork) + *copied -= free; + } + if (cork) { + sk_msg_free(sk, msg); + kfree(msg); + msg = NULL; + ret = 0; + } + break; + case __SK_DROP: + default: + sk_msg_free_partial(sk, msg, tosend); + sk_msg_apply_bytes(psock, tosend); + *copied -= tosend; + return -EACCES; + } + + if (likely(!ret)) { + if (!psock->apply_bytes) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } + if (msg && + msg->sg.data[msg->sg.start].page_link && + msg->sg.data[msg->sg.start].length) + goto more_data; + } + return ret; +} + +static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + struct sk_msg tmp, *msg_tx = NULL; + int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; + int copied = 0, err = 0; + struct sk_psock *psock; + long timeo; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_sendmsg(sk, msg, size); + + lock_sock(sk); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + while (msg_data_left(msg)) { + bool enospc = false; + u32 copy, osize; + + if (sk->sk_err) { + err = -sk->sk_err; + goto out_err; + } + + copy = msg_data_left(msg); + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + if (psock->cork) { + msg_tx = psock->cork; + } else { + msg_tx = &tmp; + sk_msg_init(msg_tx); + } + + osize = msg_tx->sg.size; + err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1); + if (err) { + if (err != -ENOSPC) + goto wait_for_memory; + enospc = true; + copy = msg_tx->sg.size - osize; + } + + err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, + copy); + if (err < 0) { + sk_msg_trim(sk, msg_tx, osize); + goto out_err; + } + + copied += copy; + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + if (psock->cork_bytes && !enospc) + goto out_err; + /* All cork bytes are accounted, rerun the prog. 
*/ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags); + if (unlikely(err < 0)) + goto out_err; + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + err = sk_stream_wait_memory(sk, &timeo); + if (err) { + if (msg_tx && msg_tx != psock->cork) + sk_msg_free(sk, msg_tx); + goto out_err; + } + } +out_err: + if (err < 0) + err = sk_stream_error(sk, msg->msg_flags, err); + release_sock(sk); + sk_psock_put(sk, psock); + return copied ? copied : err; +} + +static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct sk_msg tmp, *msg = NULL; + int err = 0, copied = 0; + struct sk_psock *psock; + bool enospc = false; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_sendpage(sk, page, offset, size, flags); + + lock_sock(sk); + if (psock->cork) { + msg = psock->cork; + } else { + msg = &tmp; + sk_msg_init(msg); + } + + /* Catch case where ring is full and sendpage is stalled. */ + if (unlikely(sk_msg_full(msg))) + goto out_err; + + sk_msg_page_add(msg, page, size, offset); + sk_mem_charge(sk, size); + copied = size; + if (sk_msg_full(msg)) + enospc = true; + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + if (psock->cork_bytes && !enospc) + goto out_err; + /* All cork bytes are accounted, rerun the prog. */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags); +out_err: + release_sock(sk); + sk_psock_put(sk, psock); + return copied ? copied : err; +} + +static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_link *link; + + sk_psock_cork_free(psock); + __sk_psock_purge_ingress_msg(psock); + while ((link = sk_psock_link_pop(psock))) { + sk_psock_unlink(sk, link); + sk_psock_free_link(link); + } +} + +static void tcp_bpf_unhash(struct sock *sk) +{ + void (*saved_unhash)(struct sock *sk); + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + + saved_unhash = psock->saved_unhash; + tcp_bpf_remove(sk, psock); + rcu_read_unlock(); + saved_unhash(sk); +} + +static void tcp_bpf_close(struct sock *sk, long timeout) +{ + void (*saved_close)(struct sock *sk, long timeout); + struct sk_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + + saved_close = psock->saved_close; + tcp_bpf_remove(sk, psock); + rcu_read_unlock(); + release_sock(sk); + saved_close(sk, timeout); +} + +enum { + TCP_BPF_IPV4, + TCP_BPF_IPV6, + TCP_BPF_NUM_PROTS, +}; + +enum { + TCP_BPF_BASE, + TCP_BPF_TX, + TCP_BPF_NUM_CFGS, +}; + +static struct proto *tcpv6_prot_saved __read_mostly; +static DEFINE_SPINLOCK(tcpv6_prot_lock); +static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS]; + +static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], + struct proto *base) +{ + prot[TCP_BPF_BASE] = *base; + prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash; + prot[TCP_BPF_BASE].close = tcp_bpf_close; + prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; + prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; + + prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; + prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; + 
prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage;
+}
+
+static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+{
+	if (sk->sk_family == AF_INET6 &&
+	    unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
+		spin_lock_bh(&tcpv6_prot_lock);
+		if (likely(ops != tcpv6_prot_saved)) {
+			tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
+			smp_store_release(&tcpv6_prot_saved, ops);
+		}
+		spin_unlock_bh(&tcpv6_prot_lock);
+	}
+}
+
+static int __init tcp_bpf_v4_build_proto(void)
+{
+	tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot);
+	return 0;
+}
+core_initcall(tcp_bpf_v4_build_proto);
+
+static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+	int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+
+	sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]);
+}
+
+static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+	int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+
+	/* Reinit occurs when program types change, e.g. TCP_BPF_TX is removed
+	 * or added, requiring sk_prot hook updates. We keep the original saved
+	 * hooks in this case.
+	 */
+	sk->sk_prot = &tcp_bpf_prots[family][config];
+}
+
+static int tcp_bpf_assert_proto_ops(struct proto *ops)
+{
+	/* In order to avoid retpoline, we make assumptions when we call
+	 * into ops if e.g. a psock is not present. Make sure they are
+	 * indeed valid assumptions.
+	 */
+	return ops->recvmsg == tcp_recvmsg &&
+	       ops->sendmsg == tcp_sendmsg &&
+	       ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
+}
+
+void tcp_bpf_reinit(struct sock *sk)
+{
+	struct sk_psock *psock;
+
+	sock_owned_by_me(sk);
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	tcp_bpf_reinit_sk_prot(sk, psock);
+	rcu_read_unlock();
+}
+
+int tcp_bpf_init(struct sock *sk)
+{
+	struct proto *ops = READ_ONCE(sk->sk_prot);
+	struct sk_psock *psock;
+
+	sock_owned_by_me(sk);
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (unlikely(!psock || psock->sk_proto ||
+		     tcp_bpf_assert_proto_ops(ops))) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+	tcp_bpf_check_v6_needs_rebuild(sk, ops);
+	tcp_bpf_update_sk_prot(sk, psock);
+	rcu_read_unlock();
+	return 0;
+}
diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig
index 6cff3f6d0c3a..94da19a2a220 100644
--- a/net/strparser/Kconfig
+++ b/net/strparser/Kconfig
@@ -1,4 +1,2 @@
-
 config STREAM_PARSER
-	tristate
-	default n
+	def_bool n

From 74d966c14e22907decf9696c1c04f9128548fdd1 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B
Date: Thu, 18 Oct 2018 15:16:09 +0200
Subject: [PATCH 0941/1640] UPSTREAM: bpf: rename stack trace map operations

In the following patches, queue and stack maps (FIFO and LIFO data
structures) will be implemented.
In order to avoid confusion and a possible name clash, rename
stack_map_ops to stack_trace_map_ops.

Change-Id: I4083e53979275e3f710fca7aa60da879416afcf5
Signed-off-by: Mauricio Vasquez B
Acked-by: Song Liu
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_types.h | 2 +-
 kernel/bpf/stackmap.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index e40d23a8e8a2..fa44bb57bd33 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -51,7 +51,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_PERCPU_HASH, htab_lru_percpu_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_LPM_TRIE, trie_map_ops)
 #ifdef CONFIG_PERF_EVENTS
-BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 887698ed9256..d97c080567a6 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -619,7 +619,7 @@ static void stack_map_free(struct bpf_map *map)
 	put_callchain_buffers();
 }
 
-const struct bpf_map_ops stack_map_ops = {
+const struct bpf_map_ops stack_trace_map_ops = {
 	.map_alloc = stack_map_alloc,
 	.map_free = stack_map_free,
 	.map_get_next_key = stack_map_get_next_key,

From 43b3869784e6a71c8f89e61db8ae7b1ae1e176f1 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B
Date: Thu, 18 Oct 2018 15:16:14 +0200
Subject: [PATCH 0942/1640] UPSTREAM: bpf/syscall: allow key to be null in map functions

This commit adds the required logic to allow the key to be NULL when the
key_size of the map is 0. The new __bpf_copy_key helper copies the key
from userspace only when key_size != 0; otherwise it enforces that the
key must be NULL.
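
A minimal userspace sketch of what this enables, assuming a map created
with key_size == 0 (such as the queue and stack maps added later in this
series). The libbpf wrapper, header path and map_fd are illustrative
assumptions, not part of this patch:

    #include <linux/types.h>
    #include <bpf/bpf.h>	/* libbpf syscall wrappers (assumed install path) */

    /* Hypothetical helper: map_fd must refer to a key_size == 0 map. */
    static int read_front(int map_fd, __u32 *value)
    {
    	/* With key_size == 0 the key pointer must be NULL; a non-NULL
    	 * key is now rejected with -EINVAL by __bpf_copy_key().
    	 */
    	return bpf_map_lookup_elem(map_fd, /* key */ NULL, value);
    }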
Change-Id: Ic960c4d3861511c50e007cc37292cd1a3ea2ecaf
Signed-off-by: Mauricio Vasquez B
Acked-by: Song Liu
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/syscall.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 65ae5071dd6e..f1a23b05051d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -652,6 +652,17 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 	return -ENOTSUPP;
 }
 
+static void *__bpf_copy_key(void __user *ukey, u64 key_size)
+{
+	if (key_size)
+		return memdup_user(ukey, key_size);
+
+	if (ukey)
+		return ERR_PTR(-EINVAL);
+
+	return NULL;
+}
+
 /* last field in 'union bpf_attr' used by this command */
 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
 
@@ -679,7 +690,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	key = memdup_user(ukey, map->key_size);
+	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
 		goto err_put;
@@ -798,7 +809,7 @@ static int map_update_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	key = memdup_user(ukey, map->key_size);
+	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
 		goto err_put;
@@ -901,7 +912,7 @@ static int map_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	key = memdup_user(ukey, map->key_size);
+	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
 		goto err_put;
@@ -954,7 +965,7 @@ static int map_get_next_key(union bpf_attr *attr)
 	}
 
 	if (ukey) {
-		key = memdup_user(ukey, map->key_size);
+		key = __bpf_copy_key(ukey, map->key_size);
 		if (IS_ERR(key)) {
 			err = PTR_ERR(key);
 			goto err_put;

From d72f407b9639a7e5aa5bd4820871498dcbe3e8c9 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B
Date: Thu, 18 Oct 2018 15:16:20 +0200
Subject: [PATCH 0943/1640] UPSTREAM: bpf/verifier: add ARG_PTR_TO_UNINIT_MAP_VALUE

The ARG_PTR_TO_UNINIT_MAP_VALUE argument is a pointer to a memory zone
used to save the value of a map. It is basically the same as
ARG_PTR_TO_UNINIT_MEM, except that the size does not have to be passed
as an extra argument.

This will be used in the following patch that implements some new
helpers that receive a pointer to be filled with a map value.
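
As an illustration of how a helper declares the new argument type, here
is a hedged sketch; bpf_my_pop and its proto are hypothetical, and the
real users (bpf_map_pop_elem and bpf_map_peek_elem) arrive in the next
patch:

    /* The verifier checks that arg2 points to map->value_size bytes of
     * stack; ARG_PTR_TO_UNINIT_MAP_VALUE sets raw_mode, so the buffer
     * does not have to be initialized before the call.
     */
    BPF_CALL_2(bpf_my_pop, struct bpf_map *, map, void *, value)
    {
    	return map->ops->map_pop_elem(map, value);
    }

    const struct bpf_func_proto bpf_my_pop_proto = {
    	.func		= bpf_my_pop,
    	.gpl_only	= false,
    	.ret_type	= RET_INTEGER,
    	.arg1_type	= ARG_CONST_MAP_PTR,
    	.arg2_type	= ARG_PTR_TO_UNINIT_MAP_VALUE,
    };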
Change-Id: I9434140fd0d1638a0779197c9ff867326e5a61b4
Signed-off-by: Mauricio Vasquez B
Acked-by: Song Liu
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h   | 1 +
 kernel/bpf/verifier.c | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 439d486e5dfd..3e8827a667c0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -143,6 +143,7 @@ enum bpf_arg_type {
 	ARG_CONST_MAP_PTR,	/* const argument used as pointer to bpf_map */
 	ARG_PTR_TO_MAP_KEY,	/* pointer to stack used as map key */
 	ARG_PTR_TO_MAP_VALUE,	/* pointer to stack used as map value */
+	ARG_PTR_TO_UNINIT_MAP_VALUE,	/* pointer to valid memory used to store a map value */
 
 	/* the following constraints used to prototype bpf_memcmp() and other
 	 * functions that access data on eBPF program stack
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 997cf8e99ff4..6a578f352db1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2225,7 +2225,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	}
 
 	if (arg_type == ARG_PTR_TO_MAP_KEY ||
-	    arg_type == ARG_PTR_TO_MAP_VALUE) {
+	    arg_type == ARG_PTR_TO_MAP_VALUE ||
+	    arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
 		if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
 		    type != expected_type)
@@ -2295,7 +2296,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		err = check_helper_mem_access(env, regno,
 					      meta->map_ptr->key_size, false,
 					      NULL);
-	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
+	} else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
+		   arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
 		 */
@@ -2304,9 +2306,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
+		meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE);
 		err = check_helper_mem_access(env, regno,
 					      meta->map_ptr->value_size, false,
-					      NULL);
+					      meta);
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 

From 27f22bf8fea76411cfd4139178591f19d7ebc352 Mon Sep 17 00:00:00 2001
From: Mauricio Vasquez B
Date: Thu, 18 Oct 2018 15:16:25 +0200
Subject: [PATCH 0944/1640] BACKPORT: bpf: add queue and stack maps

Queue/stack maps implement a FIFO/LIFO data storage for eBPF programs.
These maps support peek, pop and push operations that are exposed to
eBPF programs through the new bpf_map[peek/pop/push] helpers. Those
operations are exposed to userspace applications through the already
existing syscalls in the following way:

BPF_MAP_LOOKUP_ELEM            -> peek
BPF_MAP_LOOKUP_AND_DELETE_ELEM -> pop
BPF_MAP_UPDATE_ELEM            -> push

Queue/stack maps are implemented using a buffer, tail and head indexes,
hence BPF_F_NO_PREALLOC is not supported.

As opposed to other maps, queue and stack maps do not use RCU for
protecting map values; the bpf_map[peek/pop] helpers take an
ARG_PTR_TO_UNINIT_MAP_VALUE argument that is a pointer to a memory zone
where the value of the map is saved. This is basically the same as
ARG_PTR_TO_UNINIT_MEM, except that the size does not have to be passed
as an extra argument.

Our main motivation for implementing queue/stack maps was to keep track
of a pool of elements, like network ports in a SNAT; however, we foresee
other use cases, such as saving the last N kernel events in a map and
then analysing them from userspace.
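
A hedged sketch of the SNAT port-pool use case from the BPF program
side, written against the 4.19-era conventions; the map definition
macro, section names, the helper declarations in "bpf_helpers.h" and the
TC return codes are assumptions of the example, not part of this patch:

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include "bpf_helpers.h"	/* selftests-style SEC() and helper stubs */

    struct bpf_map_def SEC("maps") port_pool = {
    	.type		= BPF_MAP_TYPE_QUEUE,
    	.key_size	= 0,	/* queue/stack maps are keyless */
    	.value_size	= sizeof(__u16),
    	.max_entries	= 1024,
    };

    SEC("classifier")
    int pick_snat_port(struct __sk_buff *skb)
    {
    	__u16 port;

    	/* Pop a free source port; a teardown path would return it
    	 * with bpf_map_push_elem(&port_pool, &port, 0).
    	 */
    	if (bpf_map_pop_elem(&port_pool, &port))
    		return TC_ACT_SHOT;	/* pool exhausted */

    	/* ... rewrite the packet source port to 'port' ... */
    	return TC_ACT_OK;
    }

Userspace refills the same pool through the existing syscall path, e.g.
bpf_map_update_elem(fd, NULL, &port, 0), which maps to push as described
above.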
Change-Id: Ie216fff6dc596719c7fa6eef7ce932198888d201 Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 + include/linux/bpf_types.h | 2 + include/uapi/linux/bpf.h | 24 +++ kernel/bpf/Makefile | 2 +- kernel/bpf/core.c | 3 + kernel/bpf/helpers.c | 43 +++++ kernel/bpf/queue_stack_maps.c | 288 ++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 + kernel/bpf/verifier.c | 19 ++- net/core/filter.c | 6 + 10 files changed, 397 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/queue_stack_maps.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3e8827a667c0..ff873523c32c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -40,6 +40,9 @@ struct bpf_map_ops { void *(*map_lookup_elem)(struct bpf_map *map, void *key); int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_map *map, void *key); + int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); + int (*map_pop_elem)(struct bpf_map *map, void *value); + int (*map_peek_elem)(struct bpf_map *map, void *value); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -830,6 +833,9 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_map_push_elem_proto; +extern const struct bpf_func_proto bpf_map_pop_elem_proto; +extern const struct bpf_func_proto bpf_map_peek_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fa44bb57bd33..c72ac19a9b2a 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -69,3 +69,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) #endif #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 204a78da938d..2bdbeb72aba3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -128,6 +128,8 @@ enum bpf_map_type { BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, }; enum bpf_prog_type { @@ -478,6 +480,28 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is removed to + * make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. 
+ * * int bpf_probe_read(void *dst, u32 size, const void *src) + * Description + * For tracing programs, safely attempt to read *size* bytes from diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 47bbc691c983..29d781061cd5 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,7 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 426ce68a203d..9a53ba30947c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1888,6 +1888,9 @@ BPF_CALL_0(bpf_user_rnd_u32) const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; +const struct bpf_func_proto bpf_map_push_elem_proto __weak; +const struct bpf_func_proto bpf_map_pop_elem_proto __weak; +const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f3a8b3475d43..758345279705 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -76,6 +76,49 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; +BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) +{ + return map->ops->map_push_elem(map, value, flags); +} + +const struct bpf_func_proto bpf_map_push_elem_proto = { + .func = bpf_map_push_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_VALUE, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) +{ + return map->ops->map_pop_elem(map, value); +} + +const struct bpf_func_proto bpf_map_pop_elem_proto = { + .func = bpf_map_pop_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, +}; + +BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) +{ + return map->ops->map_peek_elem(map, value); +} + +const struct bpf_func_proto bpf_map_peek_elem_proto = { + .func = bpf_map_peek_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, +}; + const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c new file mode 100644 index 000000000000..12a93fb37449 --- /dev/null +++ b/kernel/bpf/queue_stack_maps.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * queue_stack_maps.c: BPF queue and stack maps + * + * Copyright (c) 2018 Politecnico di Torino + */ +#include <linux/bpf.h> +#include <linux/list.h> +#include <linux/slab.h> +#include "percpu_freelist.h" + +#define QUEUE_STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + + +struct bpf_queue_stack { + struct bpf_map map; + raw_spinlock_t lock; + u32 head, tail; + u32 size; /* max_entries + 1 */ +
+ char elements[0] __aligned(8); +}; + +static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) +{ + return container_of(map, struct bpf_queue_stack, map); +} + +static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs) +{ + return qs->head == qs->tail; +} + +static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) +{ + u32 head = qs->head + 1; + + if (unlikely(head >= qs->size)) + head = 0; + + return head == qs->tail; +} + +/* Called from syscall */ +static int queue_stack_map_alloc_check(union bpf_attr *attr) +{ + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 0 || + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + return -EINVAL; + + if (attr->value_size > KMALLOC_MAX_SIZE) + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + return -E2BIG; + + return 0; +} + +static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) +{ + int ret, numa_node = bpf_map_attr_numa_node(attr); + struct bpf_queue_stack *qs; + u32 size, value_size; + u64 queue_size, cost; + + size = attr->max_entries + 1; + value_size = attr->value_size; + + queue_size = sizeof(*qs) + (u64) value_size * size; + + cost = queue_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(cost); + if (ret < 0) + return ERR_PTR(ret); + + qs = bpf_map_area_alloc(queue_size, numa_node); + if (!qs) + return ERR_PTR(-ENOMEM); + + memset(qs, 0, sizeof(*qs)); + + bpf_map_init_from_attr(&qs->map, attr); + + qs->map.pages = cost; + qs->size = size; + + raw_spin_lock_init(&qs->lock); + + return &qs->map; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void queue_stack_map_free(struct bpf_map *map) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. 
Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + bpf_map_area_free(qs); +} + +static int __queue_map_get(struct bpf_map *map, void *value, bool delete) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long flags; + int err = 0; + void *ptr; + + raw_spin_lock_irqsave(&qs->lock, flags); + + if (queue_stack_map_is_empty(qs)) { + err = -ENOENT; + goto out; + } + + ptr = &qs->elements[qs->tail * qs->map.value_size]; + memcpy(value, ptr, qs->map.value_size); + + if (delete) { + if (unlikely(++qs->tail >= qs->size)) + qs->tail = 0; + } + +out: + raw_spin_unlock_irqrestore(&qs->lock, flags); + return err; +} + + +static int __stack_map_get(struct bpf_map *map, void *value, bool delete) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long flags; + int err = 0; + void *ptr; + u32 index; + + raw_spin_lock_irqsave(&qs->lock, flags); + + if (queue_stack_map_is_empty(qs)) { + err = -ENOENT; + goto out; + } + + index = qs->head - 1; + if (unlikely(index >= qs->size)) + index = qs->size - 1; + + ptr = &qs->elements[index * qs->map.value_size]; + memcpy(value, ptr, qs->map.value_size); + + if (delete) + qs->head = index; + +out: + raw_spin_unlock_irqrestore(&qs->lock, flags); + return err; +} + +/* Called from syscall or from eBPF program */ +static int queue_map_peek_elem(struct bpf_map *map, void *value) +{ + return __queue_map_get(map, value, false); +} + +/* Called from syscall or from eBPF program */ +static int stack_map_peek_elem(struct bpf_map *map, void *value) +{ + return __stack_map_get(map, value, false); +} + +/* Called from syscall or from eBPF program */ +static int queue_map_pop_elem(struct bpf_map *map, void *value) +{ + return __queue_map_get(map, value, true); +} + +/* Called from syscall or from eBPF program */ +static int stack_map_pop_elem(struct bpf_map *map, void *value) +{ + return __stack_map_get(map, value, true); +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_push_elem(struct bpf_map *map, void *value, + u64 flags) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long irq_flags; + int err = 0; + void *dst; + + /* BPF_EXIST is used to force making room for a new element in case the + * map is full + */ + bool replace = (flags & BPF_EXIST); + + /* Check supported flags for queue and stack maps */ + if (flags & BPF_NOEXIST || flags > BPF_EXIST) + return -EINVAL; + + raw_spin_lock_irqsave(&qs->lock, irq_flags); + + if (queue_stack_map_is_full(qs)) { + if (!replace) { + err = -E2BIG; + goto out; + } + /* advance tail pointer to overwrite oldest element */ + if (unlikely(++qs->tail >= qs->size)) + qs->tail = 0; + } + + dst = &qs->elements[qs->head * qs->map.value_size]; + memcpy(dst, value, qs->map.value_size); + + if (unlikely(++qs->head >= qs->size)) + qs->head = 0; + +out: + raw_spin_unlock_irqrestore(&qs->lock, irq_flags); + return err; +} + +/* Called from syscall or from eBPF program */ +static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +/* Called from syscall */ +static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -EINVAL; +} + 
+const struct bpf_map_ops queue_map_ops = { + .map_alloc_check = queue_stack_map_alloc_check, + .map_alloc = queue_stack_map_alloc, + .map_free = queue_stack_map_free, + .map_lookup_elem = queue_stack_map_lookup_elem, + .map_update_elem = queue_stack_map_update_elem, + .map_delete_elem = queue_stack_map_delete_elem, + .map_push_elem = queue_stack_map_push_elem, + .map_pop_elem = queue_map_pop_elem, + .map_peek_elem = queue_map_peek_elem, + .map_get_next_key = queue_stack_map_get_next_key, +}; + +const struct bpf_map_ops stack_map_ops = { + .map_alloc_check = queue_stack_map_alloc_check, + .map_alloc = queue_stack_map_alloc, + .map_free = queue_stack_map_free, + .map_lookup_elem = queue_stack_map_lookup_elem, + .map_update_elem = queue_stack_map_update_elem, + .map_delete_elem = queue_stack_map_delete_elem, + .map_push_elem = queue_stack_map_push_elem, + .map_pop_elem = stack_map_pop_elem, + .map_peek_elem = stack_map_peek_elem, + .map_get_next_key = queue_stack_map_get_next_key, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f1a23b05051d..3a47e5b1edc3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -733,6 +733,9 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_htab_map_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { err = bpf_fd_reuseport_array_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_peek_elem(map, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) @@ -870,6 +873,9 @@ static int map_update_elem(union bpf_attr *attr) /* rcu_read_lock() is not needed */ err = bpf_fd_reuseport_array_update_elem(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_push_elem(map, value, attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a578f352db1..1c84ec0149d6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2431,6 +2431,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_sk_select_reuseport) goto error; break; + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + if (func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_map_pop_elem && + func_id != BPF_FUNC_map_push_elem) + goto error; + break; default: break; } @@ -2487,6 +2494,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) goto error; break; + case BPF_FUNC_map_peek_elem: + case BPF_FUNC_map_pop_elem: + case BPF_FUNC_map_push_elem: + if (map->map_type != BPF_MAP_TYPE_QUEUE && + map->map_type != BPF_MAP_TYPE_STACK) + goto error; + break; default: break; } @@ -2805,7 +2819,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && func_id != BPF_FUNC_map_update_elem && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_map_push_elem && + func_id != BPF_FUNC_map_pop_elem && + func_id != BPF_FUNC_map_peek_elem) return 0; if (meta->map_ptr == NULL) { diff --git a/net/core/filter.c b/net/core/filter.c index d8254ca65b5a..32f207f87a2f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4854,6 +4854,12 @@ 
bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: From bf1010045ce6cea7f23aa1d1dc0605a53911a307 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Thu, 18 Oct 2018 15:16:30 +0200 Subject: [PATCH 0945/1640] UPSTREAM: bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall The previous patch implemented bpf queue/stack maps that provide the peek/pop/push functions. There is no direct relationship between those functions and the existing map syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added; it is mapped to the pop operation in the queue/stack maps and is still to be implemented for other kinds of maps. Change-Id: I7f49d6066b875810781172a4aa27377652209795 Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 66 ++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2bdbeb72aba3..e32a26bd7a9d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -103,6 +103,7 @@ enum bpf_cmd { BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, + BPF_MAP_LOOKUP_AND_DELETE_ELEM, }; enum bpf_map_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3a47e5b1edc3..8440520be4b5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1012,6 +1012,69 @@ err_put: return err; } +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value + +static int map_lookup_and_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); + int ufd = attr->map_fd; + struct bpf_map *map; + void *key, *value, *ptr; + u32 value_size; + struct fd f; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + + key = __bpf_copy_key(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto err_put; + } + + value_size = map->value_size; + + err = -ENOMEM; + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + if (!value) + goto free_key; + + if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_pop_elem(map, value); + } else { + err = -ENOTSUPP; + } + + if (err) + goto free_value; + + if (copy_to_user(uvalue, value, value_size) != 0) { + err = -EFAULT; + goto free_value; + } + + err = 0; + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -2514,6 +2577,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_TASK_FD_QUERY: err = bpf_task_fd_query(&attr, uattr); break; + case BPF_MAP_LOOKUP_AND_DELETE_ELEM: + err = map_lookup_and_delete_elem(&attr); + break; default: err = -EINVAL; break; From 5caab0ddeb6d9a12ba3269899b4ee79198274d73 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 19 Oct 2018
09:57:57 -0700 Subject: [PATCH 0946/1640] UPSTREAM: bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB BPF programs of BPF_PROG_TYPE_CGROUP_SKB need to access headers in the skb. This patch enables direct access of skb for these programs. Two helper functions bpf_compute_and_save_data_end() and bpf_restore_data_end() are introduced. They are used in __cgroup_bpf_run_filter_skb() to compute the proper data_end for the BPF program and to restore the original data afterwards. Change-Id: I9c50949aee0134767df0d607ad8730b3affa49e9 Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 21 +++++++++++++++++++++ kernel/bpf/cgroup.c | 6 ++++++ net/core/filter.c | 36 +++++++++++++++++++++++++++++++++- 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index ccd2f16de004..0a6c8d5e6c07 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -632,6 +632,27 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb) cb->data_end = skb->data + skb_headlen(skb); } +/* Similar to bpf_compute_data_pointers(), except that the original + * data_end is saved in *saved_data_end for a later restore. + */ +static inline void bpf_compute_and_save_data_end( + struct sk_buff *skb, void **saved_data_end) +{ + struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; + + *saved_data_end = cb->data_end; + cb->data_end = skb->data + skb_headlen(skb); +} + +/* Restore data_end saved by bpf_compute_and_save_data_end(). */ +static inline void bpf_restore_data_end( + struct sk_buff *skb, void *saved_data_end) +{ + struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; + + cb->data_end = saved_data_end; +} + static inline u8 *bpf_skb_cb(struct sk_buff *skb) { /* eBPF programs may read/write skb->cb[] area to transfer meta diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 00f6ed2e4f9a..9425c2fb872f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -553,6 +553,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, { unsigned int offset = skb->data - skb_network_header(skb); struct sock *save_sk; + void *saved_data_end; struct cgroup *cgrp; int ret; @@ -566,8 +567,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, save_sk = skb->sk; skb->sk = sk; __skb_push(skb, offset); + + /* compute pointers for the bpf prog */ + bpf_compute_and_save_data_end(skb, &saved_data_end); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, bpf_prog_run_save_cb); + bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; return ret == 1 ?
0 : -EPERM; diff --git a/net/core/filter.c b/net/core/filter.c index 32f207f87a2f..6d2ace812f83 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5330,6 +5330,40 @@ static bool sk_filter_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } +static bool cg_skb_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, flow_keys): + return false; + } + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -7024,7 +7058,7 @@ const struct bpf_prog_ops xdp_prog_ops = { const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = cg_skb_func_proto, - .is_valid_access = sk_filter_is_valid_access, + .is_valid_access = cg_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; From bac67db367d286631bf3beef7b348ed864dc3c46 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 19 Oct 2018 13:52:38 -0700 Subject: [PATCH 0947/1640] UPSTREAM: bpf: remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix the following warning ../kernel/bpf/syscall.c: In function ‘map_lookup_and_delete_elem’: ../kernel/bpf/syscall.c:1010:22: warning: unused variable ‘ptr’ [-Wunused-variable] void *key, *value, *ptr; ^~~ Fixes: bd513cd08f10 ("bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall") Change-Id: I81b937ec919b03ef6a81a24305aedfb7c88b6801 Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8440520be4b5..93a01a838e20 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1020,7 +1020,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; - void *key, *value, *ptr; + void *key, *value; u32 value_size; struct fd f; int err; From 899a8ffa12d314f01478921002e8b33f95045f91 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 21 Oct 2018 02:09:24 +0200 Subject: [PATCH 0948/1640] BACKPORT: bpf, verifier: fix register type dump in xadd and st Using reg_type_str[insn->dst_reg] is incorrect since insn->dst_reg contains the register number but not the actual register type. Add a small reg_state() helper and use it to get to the type. Also fix up the test_verifier test cases that have an incorrect errstr. 
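The mistake is easy to miss because both values are small integers; a side-by-side sketch (illustrative only, mirroring the fix below):

	/* Buggy: reg_type_str[] is indexed by enum bpf_reg_type, but
	 * insn->dst_reg is a register number, so for dst_reg == 2 this
	 * prints the name of whatever type has enum value 2, regardless
	 * of what R2 actually holds.
	 */
	verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
		insn->dst_reg, reg_type_str[insn->dst_reg]);

	/* Fixed: look up the tracked state of that register first. */
	const struct bpf_reg_state *reg = cur_regs(env) + insn->dst_reg;

	verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
		insn->dst_reg, reg_type_str[reg->type]);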
Fixes: 9d2be44a7f33 ("bpf: Reuse canonical string formatter for ctx errs") Change-Id: Ib5cb28d28dcf2c1613772421aaca94dadf6622ea Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1c84ec0149d6..1350af03a6d0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1575,14 +1575,19 @@ static bool __is_pointer_value(bool allow_ptr_leaks, return reg->type != SCALAR_VALUE; } +static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) +{ + return cur_regs(env) + regno; +} + static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { - return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); + return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno)); } static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { - const struct bpf_reg_state *reg = cur_regs(env) + regno; + const struct bpf_reg_state *reg = reg_state(env, regno); return reg->type == PTR_TO_CTX || reg->type == PTR_TO_SOCKET; @@ -1590,7 +1595,7 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) { - const struct bpf_reg_state *reg = cur_regs(env) + regno; + const struct bpf_reg_state *reg = reg_state(env, regno); return type_is_pkt_pointer(reg->type); } @@ -1991,7 +1996,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", - insn->dst_reg, reg_type_str[insn->dst_reg]); + insn->dst_reg, + reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; } @@ -2039,7 +2045,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = cur_regs(env) + regno; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); int err, min_off, max_off, i, j, slot, spi; @@ -5829,7 +5835,8 @@ static int do_check(struct bpf_verifier_env *env) if (is_ctx_reg(env, insn->dst_reg)) { verbose(env, "BPF_ST stores into R%d %s is not allowed\n", - insn->dst_reg, reg_type_str[insn->dst_reg]); + insn->dst_reg, + reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; } From 5d8f5f6bebdacf8f7c75b504451a7b0d1f84f0c5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 21 Oct 2018 02:09:25 +0200 Subject: [PATCH 0949/1640] UPSTREAM: bpf, verifier: reject xadd on flow key memory We should not enable xadd operation for flow key memory if not needed there anyway. There is no such issue as described in the commit f37a8cb84cce ("bpf: reject stores into ctx via st and xadd") since there's no context rewriter for flow keys today, but it also shouldn't become part of the user facing behavior to allow for it. 
After patch: 0: (79) r7 = *(u64 *)(r1 +144) 1: (b7) r3 = 4096 2: (db) lock *(u64 *)(r7 +0) += r3 BPF_XADD stores into R7 flow_keys is not allowed Change-Id: I7fa26aafca6dbc05d5fb61d4e5ce3b8cb4dc1112 Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1350af03a6d0..6a5b347e562d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1600,6 +1600,14 @@ static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) return type_is_pkt_pointer(reg->type); } +static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = reg_state(env, regno); + + /* Separate to is_ctx_reg() since we still want to allow BPF_ST here. */ + return reg->type == PTR_TO_FLOW_KEYS; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -1994,7 +2002,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins } if (is_ctx_reg(env, insn->dst_reg) || - is_pkt_reg(env, insn->dst_reg)) { + is_pkt_reg(env, insn->dst_reg) || + is_flow_key_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); From efe3359b8ed88992b7a5bf67f3fa514b6dfad3f4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 21 Oct 2018 02:09:26 +0200 Subject: [PATCH 0950/1640] UPSTREAM: bpf, verifier: remove unneeded flow key in check_helper_mem_access The PTR_TO_FLOW_KEYS type is not passed into a helper as memory today, so it can be removed from check_helper_mem_access(). Change-Id: I41f340a3aed2a76d35d8e4a88b2ed8706a533b09 Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a5b347e562d..c086294a6938 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2185,8 +2185,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); - case PTR_TO_FLOW_KEYS: - return check_flow_keys_access(env, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); From 09612ab41b77fd1064893dc9e6554d9c2601e9d0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 21 Oct 2018 02:09:27 +0200 Subject: [PATCH 0951/1640] UPSTREAM: bpf, verifier: avoid retpoline for map push/pop/peek operation Extend prior work from 09772d92cd5a ("bpf: avoid retpoline for lookup/update/delete calls on maps") to also apply to the recently added map helpers that perform push/pop/peek operations so that the indirect call can be avoided.
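The mechanism, sketched below (this mirrors the existing lookup/update/delete handling in fixup_bpf_calls() rather than introducing anything new): when the JIT is enabled and the map pointer is constant for the call site, the verifier rewrites the generic helper call into a direct call to that map type's own implementation, so the map->ops indirection (and its retpoline) inside the helper is never executed:

	/* Conceptually, in fixup_bpf_calls(): */
	case BPF_FUNC_map_pop_elem:
		/* "call bpf_map_pop_elem" becomes a direct call to e.g.
		 * queue_map_pop_elem for this particular map.
		 */
		insn->imm = BPF_CAST_CALL(ops->map_pop_elem) -
			    __bpf_call_base;
		continue;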
Change-Id: I1b7e67c6ffe096cb656aed85b45224525b612bfb Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c086294a6938..fe9296232631 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6813,7 +6813,10 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (prog->jit_requested && BITS_PER_LONG == 64 && (insn->imm == BPF_FUNC_map_lookup_elem || insn->imm == BPF_FUNC_map_update_elem || - insn->imm == BPF_FUNC_map_delete_elem)) { + insn->imm == BPF_FUNC_map_delete_elem || + insn->imm == BPF_FUNC_map_push_elem || + insn->imm == BPF_FUNC_map_pop_elem || + insn->imm == BPF_FUNC_map_peek_elem)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -6846,6 +6849,14 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) BUILD_BUG_ON(!__same_type(ops->map_update_elem, (int (*)(struct bpf_map *map, void *key, void *value, u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_push_elem, + (int (*)(struct bpf_map *map, void *value, + u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_pop_elem, + (int (*)(struct bpf_map *map, void *value))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_peek_elem, + (int (*)(struct bpf_map *map, void *value))NULL)); + switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - @@ -6859,6 +6870,18 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->imm = BPF_CAST_CALL(ops->map_delete_elem) - __bpf_call_base; continue; + case BPF_FUNC_map_push_elem: + insn->imm = BPF_CAST_CALL(ops->map_push_elem) - + __bpf_call_base; + continue; + case BPF_FUNC_map_pop_elem: + insn->imm = BPF_CAST_CALL(ops->map_pop_elem) - + __bpf_call_base; + continue; + case BPF_FUNC_map_peek_elem: + insn->imm = BPF_CAST_CALL(ops->map_peek_elem) - + __bpf_call_base; + continue; } goto patch_call_imm; From a9762745b0efed1e5b828a14b7cbb4c8555715b6 Mon Sep 17 00:00:00 2001 From: Martin Lau Date: Wed, 24 Oct 2018 20:42:25 +0000 Subject: [PATCH 0952/1640] UPSTREAM: bpf, btf: fix a missing check bug in btf_parse Wenwen Wang reported: In btf_parse(), the header of the user-space btf data 'btf_data' is first parsed and verified through btf_parse_hdr(). In btf_parse_hdr(), the header is copied from user-space 'btf_data' to kernel-space 'btf->hdr' and then verified. If no error happens during the verification process, the whole data of 'btf_data', including the header, is then copied to 'data' in btf_parse(). It is obvious that the header is copied twice here. More importantly, no check is enforced after the second copy to make sure the headers obtained in these two copies are the same. Given that 'btf_data' resides in the user space, a malicious user can race to modify the header between these two copies. By doing so, the user can inject inconsistent data, which can cause undefined behavior of the kernel and introduce a potential security risk. This issue is similar to the one fixed in commit 8af03d1ae2e1 ("bpf: btf: Fix a missing check bug"). To fix it, this patch copies the user 'btf_data' *before* parsing / verifying the BTF header.
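To make the window concrete, here is a userspace sketch of the race (illustrative only; filling buf with a minimal valid BTF blob is elided):

	#include <linux/bpf.h>
	#include <linux/btf.h>
	#include <pthread.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static unsigned char buf[4096];	/* assumed to hold a valid BTF blob */
	static volatile int stop;

	static void *flipper(void *arg)
	{
		struct btf_header *hdr = (struct btf_header *)buf;

		while (!stop)
			hdr->hdr_len ^= 0x40;	/* mutate between the two copies */
		return NULL;
	}

	int main(void)
	{
		union bpf_attr attr;
		pthread_t t;

		pthread_create(&t, NULL, flipper, NULL);
		memset(&attr, 0, sizeof(attr));
		attr.btf = (__u64)(unsigned long)buf;
		attr.btf_size = sizeof(buf);
		syscall(__NR_bpf, BPF_BTF_LOAD, &attr, sizeof(attr));	/* racy load */
		stop = 1;
		pthread_join(t, NULL);
		return 0;
	}

Before the fix, the header verified by btf_parse_hdr() (first copy) and the header embedded in the full data copy (second copy) could disagree; after it, the header is parsed out of the kernel's own copy.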
Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Change-Id: Idf5c86c4bd5c406f3e03c1f62eb83d2cde812e7c Signed-off-by: Martin KaFai Lau Co-developed-by: Wenwen Wang Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 62e05b6283c0..0fb791da6a5b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2190,9 +2190,6 @@ static int btf_parse_hdr(struct btf_verifier_env *env) hdr = &btf->hdr; - if (hdr->hdr_len != hdr_len) - return -EINVAL; - btf_verifier_log_hdr(env, btf_data_size); if (hdr->magic != BTF_MAGIC) { From 2e2d326644e7ff522f170ff1a67b28cb42f5e41b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Oct 2018 22:05:45 +0200 Subject: [PATCH 0953/1640] UPSTREAM: bpf: fix direct packet access for flow dissector progs Commit d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") added direct packet access for skbs in the may_access_direct_pkt_data() function, where this enables read and write access to the skb->data. This is buggy because without a prologue generator such as bpf_unclone_prologue() we would allow for writing into cloned skbs. The original intention might have been to only allow read access, as write access is not needed here (similar to what flow_dissector_func_proto() indicates, which enables only bpf_skb_load_bytes()); therefore this patch restricts it to read-only. Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Change-Id: I6e81fc189916ae634d070dd05eabe31ff51cffce Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Petar Penkov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fe9296232631..6d4c16698e68 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1434,21 +1434,23 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, enum bpf_access_type t) { switch (env->prog->type) { + /* Program types only with direct read access go here! */ case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_REUSEPORT: - /* dst_input() and dst_output() can't write for now */ + case BPF_PROG_TYPE_FLOW_DISSECTOR: if (t == BPF_WRITE) return false; /* fallthrough */ + + /* Program types with direct read + write access go here! */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: - case BPF_PROG_TYPE_FLOW_DISSECTOR: if (meta) return meta->pkt_access; From 532d0cc15f5010a35647d7896479c1aa7c789cf1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Oct 2018 22:05:46 +0200 Subject: [PATCH 0954/1640] UPSTREAM: bpf: fix cg_skb types to hint access type in may_access_direct_pkt_data Commit b39b5f411dcf ("bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB") added direct packet access for skbs in cg_skb program types, however the allowed access type was not added to the may_access_direct_pkt_data() helper. Therefore the latter always returns false. This is not directly an issue; it just means that writes are unconditionally disabled (which is correct), but so are reads. The latter is relevant in this function when BPF helpers may read direct packet data, which is then unconditionally disabled as well. Fix it by properly adding BPF_PROG_TYPE_CGROUP_SKB to may_access_direct_pkt_data().
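With this and the earlier cg_skb patch applied, a cgroup skb program can read packet data directly; an illustrative fragment (SEC() is the usual libbpf annotation, and for cgroup_skb programs skb->data begins at the network header):

	#include <linux/bpf.h>
	#include <linux/in.h>
	#include <linux/ip.h>

	SEC("cgroup_skb/ingress")
	int allow_tcp_only(struct __sk_buff *skb)
	{
		void *data = (void *)(long)skb->data;
		void *data_end = (void *)(long)skb->data_end;
		struct iphdr *ip = data;

		if ((void *)(ip + 1) > data_end)	/* mandatory bounds check */
			return 1;			/* 1 = allow */
		return ip->protocol == IPPROTO_TCP;
	}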
Fixes: b39b5f411dcf ("bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB") Change-Id: I8c4c775fd2e187a85cdcb8ba16637cb538896303 Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d4c16698e68..f3769e63de8d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1440,6 +1440,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_REUSEPORT: case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_SKB: if (t == BPF_WRITE) return false; /* fallthrough */ From 3bcab52dffb239022053b248c6e64fcb4ed57f6e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Oct 2018 22:05:47 +0200 Subject: [PATCH 0955/1640] UPSTREAM: bpf: fix direct packet write into pop/peek helpers Commit f1a2e44a3aec ("bpf: add queue and stack maps") probably just copy-pasted .pkt_access for the bpf_map_{pop,peek}_elem() helpers, but this is buggy in this context since it would allow writes into cloned skbs, which is invalid. Therefore, disable .pkt_access for the two. Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Change-Id: I33fd03dea306e9a6827cef0658b279bfa89d861b Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Mauricio Vasquez B Acked-by: Mauricio Vasquez B Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 758345279705..769f3377828d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -99,7 +99,6 @@ BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) const struct bpf_func_proto bpf_map_pop_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, - .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, @@ -113,7 +112,6 @@ BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) const struct bpf_func_proto bpf_map_peek_elem_proto = { .func = bpf_map_peek_elem, .gpl_only = false, - .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, From be97ffb2fad942c9190e8e01c5a29590a55fdedc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Oct 2018 22:05:48 +0200 Subject: [PATCH 0956/1640] UPSTREAM: bpf: fix leaking uninitialized memory on pop/peek helpers Commit f1a2e44a3aec ("bpf: add queue and stack maps") added helpers with ARG_PTR_TO_UNINIT_MAP_VALUE. Meaning, the helper is supposed to fill the map value buffer with data instead of reading from it like in other helpers such as map update. However, given the buffer is allowed to be uninitialized (since we fill it in the helper anyway), it also means that the helper is obliged to wipe the memory in case of an error in order to not allow for leaking uninitialized memory. Given pop/peek are both handled inside __{stack,queue}_map_get(), let's wipe it there in the error case, that is, an empty stack/queue.
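The leak is subtle because the verifier marks the destination slots as initialized by the helper call even when the helper fails; an illustrative program fragment (queue_map stands for an assumed BPF_MAP_TYPE_QUEUE map, bpf_printk() is the usual libbpf convenience macro):

	__u64 v;

	/* Returns -ENOENT on an empty queue, but the verifier accepts the
	 * read below either way; without the memset() added here, it
	 * would observe whatever was left on the BPF stack.
	 */
	bpf_map_pop_elem(&queue_map, &v);
	bpf_printk("popped %llu", v);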
Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Change-Id: Iac5ddd4bd3530ff764ca5cc129711b2e2b73348e Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Mauricio Vasquez B Acked-by: Mauricio Vasquez B Signed-off-by: Alexei Starovoitov --- kernel/bpf/queue_stack_maps.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 12a93fb37449..8bbd72d3a121 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -122,6 +122,7 @@ static int __queue_map_get(struct bpf_map *map, void *value, bool delete) raw_spin_lock_irqsave(&qs->lock, flags); if (queue_stack_map_is_empty(qs)) { + memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } @@ -151,6 +152,7 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete) raw_spin_lock_irqsave(&qs->lock, flags); if (queue_stack_map_is_empty(qs)) { + memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } From 6bfc612ed88f744342b1a307c284dfdf82a2b0ed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Oct 2018 22:05:49 +0200 Subject: [PATCH 0957/1640] UPSTREAM: bpf: make direct packet write unclone more robust Given this seems to be quite fragile and can easily slip through the cracks, let's make direct packet write more robust by requiring that future program types which allow for such write must provide a prologue callback. In the case of XDP and sk_msg it's a noop, thus add a generic noop handler there. The latter starts out with NULL data/data_end unconditionally when sg pages are shared. Change-Id: I78e24b3ebacf60097222f186daf3f8159463530b Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- net/core/filter.c | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3769e63de8d..5f9090e746ed 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6278,7 +6278,11 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) enum bpf_access_type type; bool is_narrower_load; - if (ops->gen_prologue) { + if (ops->gen_prologue || env->seen_direct_write) { + if (!ops->gen_prologue) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { diff --git a/net/core/filter.c b/net/core/filter.c index 6d2ace812f83..8ff14481a6af 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5482,6 +5482,15 @@ static bool sock_filter_is_valid_access(int off, int size, prog->expected_attach_type); } +static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + /* Neither direct read nor direct write requires any preliminary + * action.
+ */ + return 0; +} + static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { @@ -7050,6 +7059,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops xdp_prog_ops = { @@ -7148,6 +7158,7 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = { .get_func_proto = sk_msg_func_proto, .is_valid_access = sk_msg_is_valid_access, .convert_ctx_access = sk_msg_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops sk_msg_prog_ops = { From 940bee676734d698f1cba7f106aa935790d9b68b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 1 Nov 2018 00:05:53 +0100 Subject: [PATCH 0958/1640] UPSTREAM: bpf: don't set id on after map lookup with ptr_to_map_val return In the verifier there are no semantics under which registers of PTR_TO_MAP_VALUE type have an id assigned to them. The id is only used in PTR_TO_MAP_VALUE_OR_NULL and later nullified once the test against NULL has been pattern matched and the type transformed into PTR_TO_MAP_VALUE. Fixes: 3e6a4b3e0289 ("bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE") Change-Id: I1309a295e52bc90279207ea1c746184f16dc6e07 Signed-off-by: Daniel Borkmann Cc: Roman Gushchin Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f9090e746ed..000df6e7c2da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2982,10 +2982,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || fn->ret_type == RET_PTR_TO_MAP_VALUE) { - if (fn->ret_type == RET_PTR_TO_MAP_VALUE) - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - else - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -2998,7 +2994,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; - regs[BPF_REG_0].id = ++env->id_gen; + if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + } else { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { int id = acquire_reference_state(env, insn_idx); if (id < 0) From 7aac45ba9cb7a4d7266f7124e8b56358e2eaba83 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 2 Nov 2018 10:16:15 -0700 Subject: [PATCH 0959/1640] UPSTREAM: bpf: show real jited prog address in /proc/kallsyms Currently, /proc/kallsyms shows the page address of a jited bpf program. The main reason here is to not expose the randomized start address. However, this is not ideal for detailed profiling (finding hot instructions from stack traces). This patch replaces the page address with the real prog start address. This change is OK because these addresses are still protected by sysctl kptr_restrict (see kallsyms_show_value()), and only programs loaded by root are added to kallsyms (see bpf_prog_kallsyms_add()).
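For illustration (hypothetical profiler-side fragment; sym is assumed to be parsed from /proc/kallsyms, with sym->len standing for the size inferred from the next symbol): with page-aligned symbol values, two programs JITed into the same page were indistinguishable and intra-program offsets were off by up to PAGE_SIZE - 1 bytes, while with the real start address the usual symbolization just works:

	/* addr is a sampled instruction pointer inside JITed BPF code. */
	if (addr >= sym->start && addr < sym->start + sym->len)
		printf("%s+0x%lx\n", sym->name, addr - sym->start);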
Change-Id: I0b87c5f6750867dee089d9158c43860b715b0d5e Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9a53ba30947c..60bef6ed659a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -561,7 +561,6 @@ bool is_bpf_text_address(unsigned long addr) int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { - unsigned long symbol_start, symbol_end; struct bpf_prog_aux *aux; unsigned int it = 0; int ret = -ERANGE; @@ -574,10 +573,9 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (it++ != symnum) continue; - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); bpf_get_prog_name(aux->prog, sym); - *value = symbol_start; + *value = (unsigned long)aux->prog->bpf_func; *type = BPF_SYM_ELF_TYPE; ret = 0; From 8221956dac92572f13283913601c5dddbcdf33e6 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 2 Nov 2018 10:16:16 -0700 Subject: [PATCH 0960/1640] UPSTREAM: bpf: show real jited address in bpf_prog_info->jited_ksyms Currently, jited_ksyms in bpf_prog_info shows page addresses of jited bpf program. The main reason here is to not expose randomized start address. However, this is not ideal for detailed profiling (find hot instructions from stack traces). This patch replaces the page address with real prog start address. This change is OK because bpf_prog_get_info_by_fd() is only available to root. Change-Id: Id3351bc83cf55deeb28a8175a4fde5022620d82e Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 93a01a838e20..fef41fe95da2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2208,7 +2208,6 @@ static int bpf_prog_get_info_by_fd(struct file *file, user_ksyms = u64_to_user_ptr(info.jited_ksyms); for (i = 0; i < ulen; i++) { ksym_addr = (ulong) prog->aux->func[i]->bpf_func; - ksym_addr &= PAGE_MASK; if (put_user((u64) ksym_addr, &user_ksyms[i])) return -EFAULT; } From af16b242a11610dd5e57801d9e607d722191d98e Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 2 Nov 2018 10:16:17 -0700 Subject: [PATCH 0961/1640] BACKPORT: bpf: show main program address and length in bpf_prog_info Currently, when there is no subprog (prog->aux->func_cnt == 0), bpf_prog_info does not return any jited_ksyms or jited_func_lens. This patch adds main program address (prog->bpf_func) and main program length (prog->jited_len) to bpf_prog_info. Change-Id: I6b587709f3f55e70a3ac8b204467a8cf89d3bd31 Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fef41fe95da2..5debb897df59 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2194,11 +2194,11 @@ static int bpf_prog_get_info_by_fd(struct file *file, } ulen = info.nr_jited_ksyms; - info.nr_jited_ksyms = prog->aux->func_cnt; + info.nr_jited_ksyms = prog->aux->func_cnt ? 
: 1; if (info.nr_jited_ksyms && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { unsigned long ksym_addr; u64 __user *user_ksyms; u32 i; /* copy the address of the kernel symbol @@ -2206,9 +2206,17 @@ static int bpf_prog_get_info_by_fd(struct file *file, */ ulen = min_t(u32, info.nr_jited_ksyms, ulen); user_ksyms = u64_to_user_ptr(info.jited_ksyms); - for (i = 0; i < ulen; i++) { - ksym_addr = (ulong) prog->aux->func[i]->bpf_func; - if (put_user((u64) ksym_addr, &user_ksyms[i])) + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + ksym_addr = (unsigned long) + prog->aux->func[i]->bpf_func; + if (put_user((u64) ksym_addr, + &user_ksyms[i])) + return -EFAULT; + } + } else { + ksym_addr = (unsigned long) prog->bpf_func; + if (put_user((u64) ksym_addr, &user_ksyms[0])) return -EFAULT; } } else { @@ -2217,7 +2225,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, } ulen = info.nr_jited_func_lens; - info.nr_jited_func_lens = prog->aux->func_cnt; + info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; if (info.nr_jited_func_lens && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; /* copy the JITed image lengths for each function */ ulen = min_t(u32, info.nr_jited_func_lens, ulen); user_lens = u64_to_user_ptr(info.jited_func_lens); - for (i = 0; i < ulen; i++) { - func_len = prog->aux->func[i]->jited_len; - if (put_user(func_len, &user_lens[i])) + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + func_len = + prog->aux->func[i]->jited_len; + if (put_user(func_len, &user_lens[i])) + return -EFAULT; + } + } else { + func_len = prog->jited_len; + if (put_user(func_len, &user_lens[0])) return -EFAULT; } } else { From 4b97126d8d723c6866c979d5b37b37d246b478e3 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 8 Nov 2018 04:08:42 -0500 Subject: [PATCH 0962/1640] UPSTREAM: bpf: let verifier to calculate and record max_pkt_offset In check_packet_access, update max_pkt_offset after the offset has passed __check_packet_access. It should be safe to use u32 for max_pkt_offset, as explained in the code comment. Also, when there is a tail call, the max_pkt_offset of the called program is unknown, so conservatively set max_pkt_offset to MAX_PACKET_OFF in such a case. Reviewed-by: Jakub Kicinski Change-Id: I4ae0f9257f4ad1fc2323d5acee5e89fd62e6d3b2 Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ff873523c32c..cf0350ff2b46 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -298,6 +298,7 @@ struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; + u32 max_pkt_offset; u32 stack_depth; u32 id; u32 func_cnt; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 000df6e7c2da..b34dba09285f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1502,6 +1502,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, verbose(env, "R%d offset is outside of the packet\n", regno); return err; } + + /* __check_packet_access has made sure "off + size - 1" is within u16. + * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, + * otherwise find_good_pkt_pointers would have refused to set range info + * that __check_packet_access would have rejected this pkt access.
+ * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. + */ + env->prog->aux->max_pkt_offset = + max_t(u32, env->prog->aux->max_pkt_offset, + off + reg->umax_value + size - 1); + return err; } @@ -6771,6 +6782,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ prog->cb_access = 1; env->prog->aux->stack_depth = MAX_BPF_STACK; + env->prog->aux->max_pkt_offset = MAX_PACKET_OFF; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpreter for every normal From f4a479b485c3d64fa2c0e20a771342ea75d5c3ff Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:25 +0000 Subject: [PATCH 0963/1640] BACKPORT: bpf: pass a struct with offload callbacks to bpf_offload_dev_create() For passing device functions for offloaded eBPF programs, there used to be no place to store the pointer without making the non-offloaded programs pay a memory price. As a consequence, three functions were called with ndo_bpf() through specific commands. Now that we have struct bpf_offload_dev, and since none of those operations rely on RTNL, we can turn these three commands into hooks inside the struct bpf_prog_offload_ops, and pass them as part of bpf_offload_dev_create(). This commit effectively passes a pointer to the struct to bpf_offload_dev_create(). We temporarily have two struct bpf_prog_offload_ops instances, one under offdev->ops and one under offload->dev_ops. The next patches will make the transition towards the former, so that offload->dev_ops can be removed, and callbacks relying on ndo_bpf() added to offdev->ops as well. While at it, rename "nfp_bpf_analyzer_ops" as "nfp_bpf_dev_ops" (and similarly for netdevsim). Suggested-by: Jakub Kicinski Change-Id: Iab21632d51a34d98cc97597c71a56b93d91b2bab Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- kernel/bpf/offload.c | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cf0350ff2b46..10269fd05fff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -709,7 +709,8 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); -struct bpf_offload_dev *bpf_offload_dev_create(void); +struct bpf_offload_dev * +bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops); void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev); int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index e82645e53953..3ce2214c7910 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -33,6 +33,7 @@ static DECLARE_RWSEM(bpf_devs_lock); struct bpf_offload_dev { + const struct bpf_prog_offload_ops *ops; struct list_head netdevs; }; @@ -655,7 +656,8 @@ unlock: } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); -struct bpf_offload_dev *bpf_offload_dev_create(void) +struct bpf_offload_dev * +bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops) { struct bpf_offload_dev *offdev; int err; @@ -675,6 +677,7 @@ struct bpf_offload_dev *bpf_offload_dev_create(void) if (!offdev) return ERR_PTR(-ENOMEM); + offdev->ops = ops; INIT_LIST_HEAD(&offdev->netdevs); return offdev; From 224ed11fc9e8289d9ed439bbb5e957ada2f6dc73 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:26 +0000 Subject: [PATCH 0964/1640] UPSTREAM: bpf: call verify_insn from its callback in
struct bpf_offload_dev We intend to remove the dev_ops in struct bpf_prog_offload, and to only keep the ops in struct bpf_offload_dev instead, which is accessible from more locations for passing function pointers. But dev_ops is used for calling the verify_insn hook. Switch to the newly added ops in struct bpf_prog_offload instead. To avoid table lookups for each eBPF instruction to verify, we remember the offdev attached to a netdev and modify bpf_offload_find_netdev() to avoid performing more than once a lookup for a given offload object. Change-Id: I1a39402621d79044f39f35bfae1c0bd734a4dc43 Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/offload.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 10269fd05fff..3b4323be69b1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -278,6 +278,7 @@ struct bpf_prog_offload_ops { struct bpf_prog_offload { struct bpf_prog *prog; struct net_device *netdev; + struct bpf_offload_dev *offdev; void *dev_priv; struct list_head offloads; bool dev_state; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 3ce2214c7910..642418368bb8 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -107,6 +107,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) err = -EINVAL; goto err_unlock; } + offload->offdev = ondev->offdev; prog->aux->offload = offload; list_add_tail(&offload->offloads, &ondev->progs); dev_put(offload->netdev); @@ -167,7 +168,8 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) - ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); + ret = offload->offdev->ops->insn_hook(env, insn_idx, + prev_insn_idx); up_read(&bpf_devs_lock); return ret; From 9adeee702179991d8f478132fe0bd1d121ffbde5 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:27 +0000 Subject: [PATCH 0965/1640] UPSTREAM: bpf: call finalize() from its callback in struct bpf_offload_dev In a way similar to the change previously brought to the verify_insn hook, switch to the newly added ops in struct bpf_prog_offload for calling the functions used to perform final verification steps for offloaded programs. Change-Id: Id58d76bbc7f78e8915188f7d3a8445d463f48b98 Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- kernel/bpf/offload.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 642418368bb8..1ad6bc6f35f0 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -183,8 +183,8 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env) down_read(&bpf_devs_lock); offload = env->prog->aux->offload; if (offload) { - if (offload->dev_ops->finalize) - ret = offload->dev_ops->finalize(env); + if (offload->offdev->ops->finalize) + ret = offload->offdev->ops->finalize(env); else ret = 0; } From ffe02db8e64619e320c3fe16ad7b87188b53ac1b Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:28 +0000 Subject: [PATCH 0966/1640] BACKPORT: bpf: call verifier_prep from its callback in struct bpf_offload_dev In a way similar to the change previously brought to the verify_insn hook and to the finalize callback, switch to the newly added ops in struct bpf_prog_offload for calling the functions used to prepare driver verifiers. 
Since the dev_ops pointer in struct bpf_prog_offload is no longer used by any callback, we can now remove it from struct bpf_prog_offload. Change-Id: I311beb7cadfc7dccb2d0d27ac72c14d8458be1c2 Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- include/linux/netdevice.h | 6 ------ kernel/bpf/offload.c | 22 +++++++++------------- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3b4323be69b1..efbc20d544b4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -273,6 +273,7 @@ struct bpf_prog_offload_ops { int (*insn_hook)(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); + int (*prepare)(struct net_device *netdev, struct bpf_verifier_env *env); }; struct bpf_prog_offload { @@ -282,7 +283,6 @@ struct bpf_prog_offload { void *dev_priv; struct list_head offloads; bool dev_state; - const struct bpf_prog_offload_ops *dev_ops; void *jited_image; u32 jited_len; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dea63063f66e..7c7d765038da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -814,7 +814,6 @@ enum bpf_netdev_command { */ XDP_QUERY_PROG, /* BPF program for offload callbacks, invoked at program load time. */ - BPF_OFFLOAD_VERIFIER_PREP, BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY, BPF_OFFLOAD_MAP_ALLOC, @@ -840,11 +839,6 @@ struct netdev_bpf { /* flags with which program was installed */ u32 prog_flags; }; - /* BPF_OFFLOAD_VERIFIER_PREP */ - struct { - struct bpf_prog *prog; - const struct bpf_prog_offload_ops *ops; /* callee set */ - } verifier; /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */ struct { struct bpf_prog *prog; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 1ad6bc6f35f0..f4b8d316ba63 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -142,21 +142,17 @@ static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) { - struct netdev_bpf data = {}; - int err; + struct bpf_prog_offload *offload; + int ret = -ENODEV; - data.verifier.prog = env->prog; + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) + ret = offload->offdev->ops->prepare(offload->netdev, env); + offload->dev_state = !ret; + up_read(&bpf_devs_lock); - rtnl_lock(); - err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data); - if (err) - goto exit_unlock; - - env->prog->aux->offload->dev_ops = data.verifier.ops; - env->prog->aux->offload->dev_state = true; -exit_unlock: - rtnl_unlock(); - return err; + return ret; } int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, From 4d7d4f8b185fcc5418517687eca7ed770d3a5b96 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:29 +0000 Subject: [PATCH 0967/1640] BACKPORT: bpf: pass translate() as a callback and remove its ndo_bpf subcommand As part of the transition from ndo_bpf() to callbacks attached to struct bpf_offload_dev for some of the eBPF offload operations, move the functions related to code translation to the struct and remove the subcommand that was used to call them through the NDO. 
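For illustration, a driver converting to this interface might end up with an ops table along the following lines. This is only a rough sketch with hypothetical "mydrv" names, and the insn_hook/finalize/prepare handlers are assumed to exist elsewhere in the driver; only the struct bpf_prog_offload_ops layout and the bpf_offload_dev_create() call reflect this series as it stands at this patch:

static int mydrv_bpf_translate(struct net_device *netdev,
			       struct bpf_prog *prog)
{
	/* generate device-specific code for the verified program */
	return 0;
}

static const struct bpf_prog_offload_ops mydrv_bpf_dev_ops = {
	.insn_hook = mydrv_bpf_verify_insn,
	.finalize  = mydrv_bpf_finalize,
	.prepare   = mydrv_bpf_verifier_prep,
	.translate = mydrv_bpf_translate,
};

/* at device probe time */
struct bpf_offload_dev *offdev = bpf_offload_dev_create(&mydrv_bpf_dev_ops);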
Change-Id: I41268d2264e4baf277de3d3512e5cfe314b876bf Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/netdevice.h | 3 +-- kernel/bpf/offload.c | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index efbc20d544b4..9386a871f366 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -274,6 +274,7 @@ struct bpf_prog_offload_ops { int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); int (*prepare)(struct net_device *netdev, struct bpf_verifier_env *env); + int (*translate)(struct net_device *netdev, struct bpf_prog *prog); }; struct bpf_prog_offload { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7c7d765038da..07655a021748 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -814,7 +814,6 @@ enum bpf_netdev_command { */ XDP_QUERY_PROG, /* BPF program for offload callbacks, invoked at program load time. */ - BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY, BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, @@ -839,7 +838,7 @@ struct netdev_bpf { /* flags with which program was installed */ u32 prog_flags; }; - /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */ + /* BPF_OFFLOAD_DESTROY */ struct { struct bpf_prog *prog; } offload; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index f4b8d316ba63..9b14a09d43e1 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -219,14 +219,14 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) static int bpf_prog_offload_translate(struct bpf_prog *prog) { - struct netdev_bpf data = {}; - int ret; + struct bpf_prog_offload *offload; + int ret = -ENODEV; - data.offload.prog = prog; - - rtnl_lock(); - ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); - rtnl_unlock(); + down_read(&bpf_devs_lock); + offload = prog->aux->offload; + if (offload) + ret = offload->offdev->ops->translate(offload->netdev, prog); + up_read(&bpf_devs_lock); return ret; } From 51319ad43ceab71df887dc4f982195fe60658ede Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:30 +0000 Subject: [PATCH 0968/1640] BACKPORT: bpf: pass destroy() as a callback and remove its ndo_bpf subcommand As part of the transition from ndo_bpf() to callbacks attached to struct bpf_offload_dev for some of the eBPF offload operations, move the functions related to program destruction to the struct and remove the subcommand that was used to call them through the NDO. Remove function __bpf_offload_ndo(), which is no longer used. 
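With this change the destroy hook runs under bpf_devs_lock alone, with no RTNL held. A driver-side handler could look roughly like the sketch below (hypothetical "mydrv" naming; the dev_priv teardown is a placeholder for whatever per-program state the driver allocated in its prepare() hook):

static void mydrv_bpf_destroy(struct bpf_prog *prog)
{
	struct bpf_prog_offload *offload = prog->aux->offload;

	/* release translator/device state attached to this program */
	kfree(offload->dev_priv);
	offload->dev_priv = NULL;
}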
Change-Id: Ibd8de86bf01cd52322205cf64c0fe4a94c1c716c Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/netdevice.h | 5 ----- kernel/bpf/offload.c | 24 +----------------------- 3 files changed, 2 insertions(+), 28 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9386a871f366..b9859d073f5c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -275,6 +275,7 @@ struct bpf_prog_offload_ops { int (*finalize)(struct bpf_verifier_env *env); int (*prepare)(struct net_device *netdev, struct bpf_verifier_env *env); int (*translate)(struct net_device *netdev, struct bpf_prog *prog); + void (*destroy)(struct bpf_prog *prog); }; struct bpf_prog_offload { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 07655a021748..d9c5b6f4167c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -814,7 +814,6 @@ enum bpf_netdev_command { */ XDP_QUERY_PROG, /* BPF program for offload callbacks, invoked at program load time. */ - BPF_OFFLOAD_DESTROY, BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, }; @@ -838,10 +837,6 @@ struct netdev_bpf { /* flags with which program was installed */ u32 prog_flags; }; - /* BPF_OFFLOAD_DESTROY */ - struct { - struct bpf_prog *prog; - } offload; /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */ struct { struct bpf_offloaded_map *offmap; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 9b14a09d43e1..41e2fb351ab8 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -123,23 +123,6 @@ err_maybe_put: return err; } -static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, - struct netdev_bpf *data) -{ - struct bpf_prog_offload *offload = prog->aux->offload; - struct net_device *netdev; - - ASSERT_RTNL(); - - if (!offload) - return -ENODEV; - netdev = offload->netdev; - - data->command = cmd; - - return netdev->netdev_ops->ndo_bpf(netdev, data); -} - int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) { struct bpf_prog_offload *offload; @@ -192,12 +175,9 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env) static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_prog_offload *offload = prog->aux->offload; - struct netdev_bpf data = {}; - - data.offload.prog = prog; if (offload->dev_state) - WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + offload->offdev->ops->destroy(prog); /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ bpf_prog_free_id(prog, true); @@ -209,12 +189,10 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) void bpf_prog_offload_destroy(struct bpf_prog *prog) { - rtnl_lock(); down_write(&bpf_devs_lock); if (prog->aux->offload) __bpf_prog_offload_destroy(prog); up_write(&bpf_devs_lock); - rtnl_unlock(); } static int bpf_prog_offload_translate(struct bpf_prog *prog) From 667d3c3be2013e14e554c9fe8ac21f38860b29bf Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:31 +0000 Subject: [PATCH 0969/1640] BACKPORT: bpf: pass prog instead of env to bpf_prog_offload_verifier_prep() Function bpf_prog_offload_verifier_prep(), called from the kernel BPF verifier to run a driver-specific callback for preparing for the verification step for offloaded programs, takes a pointer to a struct bpf_verifier_env object. 
However, no driver callback needs the whole structure at this time: the two drivers supporting this, nfp and netdevsim, only need a pointer to the struct bpf_prog instance held by env. Update the callback accordingly, on kernel side and in these two drivers. Change-Id: I79193f7bda74cf70bf2cd4a627f9e8e161435595 Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- include/linux/bpf_verifier.h | 2 +- kernel/bpf/offload.c | 6 +++--- kernel/bpf/verifier.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b9859d073f5c..88880c682060 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -273,7 +273,7 @@ struct bpf_prog_offload_ops { int (*insn_hook)(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); - int (*prepare)(struct net_device *netdev, struct bpf_verifier_env *env); + int (*prepare)(struct net_device *netdev, struct bpf_prog *prog); int (*translate)(struct net_device *netdev, struct bpf_prog *prog); void (*destroy)(struct bpf_prog *prog); }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1c169ab2f3be..1001573674a1 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -268,7 +268,7 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) return cur_func(env)->regs; } -int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); +int bpf_prog_offload_verifier_prep(struct bpf_prog *prog); int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int bpf_prog_offload_finalize(struct bpf_verifier_env *env); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 41e2fb351ab8..e21d2950b1d2 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -123,15 +123,15 @@ err_maybe_put: return err; } -int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) { struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); - offload = env->prog->aux->offload; + offload = prog->aux->offload; if (offload) - ret = offload->offdev->ops->prepare(offload->netdev, env); + ret = offload->offdev->ops->prepare(offload->netdev, prog); offload->dev_state = !ret; up_read(&bpf_devs_lock); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b34dba09285f..5441fa621fd8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7004,7 +7004,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) goto skip_full_check; if (bpf_prog_is_dev_bound(env->prog->aux)) { - ret = bpf_prog_offload_verifier_prep(env); + ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) goto skip_full_check; } From ab20913c5000d496074378c0fdb0a1de8d4f3216 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 9 Nov 2018 13:03:32 +0000 Subject: [PATCH 0970/1640] BACKPORT: bpf: do not pass netdev to translate() and prepare() offload callbacks The kernel functions to prepare verifier and translate for offloaded program retrieve "offload" from "prog", and "netdev" from "offload". Then both "prog" and "netdev" are passed to the callbacks. Simplify this by letting the drivers retrieve the net device themselves from the offload object attached to prog - if they need it at all. There is currently no need to pass the netdev as an argument to those functions. 
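Concretely, a driver that does still need the device can fetch it from the offload object attached to the program, e.g. (sketch with a hypothetical "mydrv" name):

static int mydrv_bpf_translate(struct bpf_prog *prog)
{
	struct net_device *netdev = prog->aux->offload->netdev;

	/* use netdev only where the hardware backend requires it */
	return 0;
}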
Change-Id: I02837695e5c0aeaf5496a9311b2d84a718909890 Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- kernel/bpf/offload.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 88880c682060..4b50da4e47cc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -273,8 +273,8 @@ struct bpf_prog_offload_ops { int (*insn_hook)(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int (*finalize)(struct bpf_verifier_env *env); - int (*prepare)(struct net_device *netdev, struct bpf_prog *prog); - int (*translate)(struct net_device *netdev, struct bpf_prog *prog); + int (*prepare)(struct bpf_prog *prog); + int (*translate)(struct bpf_prog *prog); void (*destroy)(struct bpf_prog *prog); }; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index e21d2950b1d2..e094680f4280 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -131,7 +131,7 @@ int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) down_read(&bpf_devs_lock); offload = prog->aux->offload; if (offload) - ret = offload->offdev->ops->prepare(offload->netdev, prog); + ret = offload->offdev->ops->prepare(prog); offload->dev_state = !ret; up_read(&bpf_devs_lock); @@ -203,7 +203,7 @@ static int bpf_prog_offload_translate(struct bpf_prog *prog) down_read(&bpf_devs_lock); offload = prog->aux->offload; if (offload) - ret = offload->offdev->ops->translate(offload->netdev, prog); + ret = offload->offdev->ops->translate(prog); up_read(&bpf_devs_lock); return ret; From 65a8a450021cded61288a3f90925ead378d104a3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 13 Nov 2018 09:29:26 +0000 Subject: [PATCH 0971/1640] UPSTREAM: bpf: fix null pointer dereference on pointer offload Pointer offload is being null checked however the following statement dereferences the potentially null pointer offload when assigning offload->dev_state. Fix this by only assigning it if offload is not null. Detected by CoverityScan, CID#1475437 ("Dereference after null check") Fixes: 00db12c3d141 ("bpf: call verifier_prep from its callback in struct bpf_offload_dev") Change-Id: Icba10b65cfeb53dfb4b423dfb291df817b86fc05 Signed-off-by: Colin Ian King Acked-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- kernel/bpf/offload.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index e094680f4280..454736159480 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -130,9 +130,10 @@ int bpf_prog_offload_verifier_prep(struct bpf_prog *prog) down_read(&bpf_devs_lock); offload = prog->aux->offload; - if (offload) + if (offload) { ret = offload->offdev->ops->prepare(prog); - offload->dev_state = !ret; + offload->dev_state = !ret; + } up_read(&bpf_devs_lock); return ret; From 4a4579f6a455dbe12959006be35f22125f9ca729 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 16 Nov 2018 11:41:08 +0000 Subject: [PATCH 0972/1640] UPSTREAM: bpf: allow zero-initializing hash map seed Add a new flag BPF_F_ZERO_SEED, which forces a hash map to initialize the seed to zero. This is useful when doing performance analysis both on individual BPF programs, as well as the kernel's hash table implementation. 
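From user space the new flag is simply passed in map_flags at map creation time. A minimal sketch using the raw syscall, with no libbpf wrapper assumed (note the CAP_SYS_ADMIN requirement this patch enforces):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int create_zero_seed_map(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_HASH;
	attr.key_size    = sizeof(__u32);
	attr.value_size  = sizeof(__u64);
	attr.max_entries = 1024;
	attr.map_flags   = BPF_F_ZERO_SEED; /* deterministic hashing, testing only */

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}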
Change-Id: Ibc5daa9ddcc46b27589d45551af0c416233073ff Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/hashtab.c | 13 +++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e32a26bd7a9d..4595144529c9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -285,6 +285,9 @@ enum bpf_attach_type { /* Flag for stack_map, store build_id+offset instead of pointer */ #define BPF_F_STACK_BUILD_ID (1U << 5) +/* Zero-initialize hash function seed. This should only be used for testing. */ +#define BPF_F_ZERO_SEED (1U << 6) + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ae6ec91035cc..de1748d9eab0 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -23,7 +23,7 @@ #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY) + BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED) struct bucket { struct hlist_nulls_head head; @@ -244,6 +244,7 @@ static int htab_map_alloc_check(union bpf_attr *attr) */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); int numa_node = bpf_map_attr_numa_node(attr); BUILD_BUG_ON(offsetof(struct htab_elem, htab) != @@ -257,6 +258,10 @@ static int htab_map_alloc_check(union bpf_attr *attr) */ return -EPERM; + if (zero_seed && !capable(CAP_SYS_ADMIN)) + /* Guard against local DoS, and discourage production use. */ + return -EPERM; + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) /* reserved bits should not be used */ return -EINVAL; @@ -380,7 +385,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_htab; - htab->hashrnd = get_random_int(); + if (htab->map.map_flags & BPF_F_ZERO_SEED) + htab->hashrnd = 0; + else + htab->hashrnd = get_random_int(); + for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); raw_spin_lock_init(&htab->buckets[i].lock); From cd3154768eaa89fc5d7cfba17a2f8c5da77e8e7c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 19 Nov 2018 15:29:06 -0800 Subject: [PATCH 0973/1640] UPSTREAM: bpf: btf: Break up btf_type_is_void() This patch breaks up btf_type_is_void() into btf_type_is_void() and btf_type_is_fwd(). It also adds btf_type_nosize() to better describe it is testing a type has nosize info. Change-Id: Id427a71323da413fadd1eab0948a31ecacb793de Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0fb791da6a5b..f425f30dd541 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -307,15 +307,22 @@ static bool btf_type_is_modifier(const struct btf_type *t) static bool btf_type_is_void(const struct btf_type *t) { - /* void => no type and size info. - * Hence, FWD is also treated as void. 
- */ - return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD; + return t == &btf_void; } -static bool btf_type_is_void_or_null(const struct btf_type *t) +static bool btf_type_is_fwd(const struct btf_type *t) { - return !t || btf_type_is_void(t); + return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; +} + +static bool btf_type_nosize(const struct btf_type *t) +{ + return btf_type_is_void(t) || btf_type_is_fwd(t); +} + +static bool btf_type_nosize_or_null(const struct btf_type *t) +{ + return !t || btf_type_nosize(t); } /* union is only a special case of struct: @@ -851,7 +858,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, u32 size = 0; size_type = btf_type_by_id(btf, size_type_id); - if (btf_type_is_void_or_null(size_type)) + if (btf_type_nosize_or_null(size_type)) return NULL; if (btf_type_has_size(size_type)) { @@ -867,7 +874,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, size = btf->resolved_sizes[size_type_id]; size_type_id = btf->resolved_ids[size_type_id]; size_type = btf_type_by_id(btf, size_type_id); - if (btf_type_is_void(size_type)) + if (btf_type_nosize_or_null(size_type)) return NULL; } @@ -1205,7 +1212,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, } /* "typedef void new_void", "const void"...etc */ - if (btf_type_is_void(next_type)) + if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type)) goto resolved; if (!env_type_is_resolve_sink(env, next_type) && @@ -1219,7 +1226,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, * pretty print). */ if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && - !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { + !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1246,7 +1253,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, } /* "void *" */ - if (btf_type_is_void(next_type)) + if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type)) goto resolved; if (!env_type_is_resolve_sink(env, next_type) && @@ -1276,7 +1283,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, } if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && - !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { + !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1450,7 +1457,7 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_is_void_or_null(index_type)) { + if (btf_type_nosize_or_null(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1469,7 +1476,7 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_is_void_or_null(elem_type)) { + if (btf_type_nosize_or_null(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -1682,7 +1689,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_is_void_or_null(member_type)) { + if (btf_type_nosize_or_null(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; From 5edc5f48f18613e1eca33ff8f90fa94f37700c3a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 
19 Nov 2018 15:29:08 -0800 Subject: [PATCH 0974/1640] UPSTREAM: bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO This patch adds BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO to support the function debug info. BTF_KIND_FUNC_PROTO must not have a name (i.e. !t->name_off) and it is followed by >= 0 'struct bpf_param' objects to describe the function arguments. The BTF_KIND_FUNC must have a valid name and it must refer back to a BTF_KIND_FUNC_PROTO. The above is the conclusion after the discussion between Edward Cree, Alexei, Daniel, Yonghong and Martin. By combining BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO, a complete function signature can be obtained. It will be used in the later patches to learn the function signature of a running bpf program. Change-Id: I896a9d3c2a9f2a3f05b309faaa4df5668e0f216e Signed-off-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/uapi/linux/btf.h | 18 +- kernel/bpf/btf.c | 364 +++++++++++++++++++++++++++++++++------ 2 files changed, 329 insertions(+), 53 deletions(-) diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 972265f32871..14f66948fc95 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -40,7 +40,8 @@ struct btf_type { /* "size" is used by INT, ENUM, STRUCT and UNION. * "size" tells the size of the type it is describing. * - * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT. + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC and FUNC_PROTO. * "type" is a type_id referring to another type. */ union { @@ -64,8 +65,10 @@ struct btf_type { #define BTF_KIND_VOLATILE 9 /* Volatile */ #define BTF_KIND_CONST 10 /* Const */ #define BTF_KIND_RESTRICT 11 /* Restrict */ -#define BTF_KIND_MAX 11 -#define NR_BTF_KINDS 12 +#define BTF_KIND_FUNC 12 /* Function */ +#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ +#define BTF_KIND_MAX 13 +#define NR_BTF_KINDS 14 /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. @@ -110,4 +113,13 @@ struct btf_member { __u32 offset; /* offset in bits */ };
+ */ +struct btf_param { + __u32 name_off; + __u32 type; +}; + #endif /* _UAPI__LINUX_BTF_H__ */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f425f30dd541..50c6d05879e1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -260,6 +260,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_VOLATILE] = "VOLATILE", [BTF_KIND_CONST] = "CONST", [BTF_KIND_RESTRICT] = "RESTRICT", + [BTF_KIND_FUNC] = "FUNC", + [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", }; struct btf_kind_operations { @@ -282,6 +284,9 @@ struct btf_kind_operations { static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; static struct btf_type btf_void; +static int btf_resolve(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id); + static bool btf_type_is_modifier(const struct btf_type *t) { /* Some of them is not strictly a C modifier @@ -315,9 +320,20 @@ static bool btf_type_is_fwd(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; } +static bool btf_type_is_func(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; +} + +static bool btf_type_is_func_proto(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; +} + static bool btf_type_nosize(const struct btf_type *t) { - return btf_type_is_void(t) || btf_type_is_fwd(t); + return btf_type_is_void(t) || btf_type_is_fwd(t) || + btf_type_is_func(t) || btf_type_is_func_proto(t); } static bool btf_type_nosize_or_null(const struct btf_type *t) @@ -772,11 +788,15 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, /* int, enum or void is a sink */ return !btf_type_needs_resolve(next_type); case RESOLVE_PTR: - /* int, enum, void, struct or array is a sink for ptr */ + /* int, enum, void, struct, array, func or func_proto is a sink + * for ptr + */ return !btf_type_is_modifier(next_type) && !btf_type_is_ptr(next_type); case RESOLVE_STRUCT_OR_ARRAY: - /* int, enum, void or ptr is a sink for struct and array */ + /* int, enum, void, ptr, func or func_proto is a sink + * for struct and array + */ return !btf_type_is_modifier(next_type) && !btf_type_is_array(next_type) && !btf_type_is_struct(next_type); @@ -1211,10 +1231,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, return -EINVAL; } - /* "typedef void new_void", "const void"...etc */ - if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type)) - goto resolved; - if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); @@ -1225,13 +1241,18 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, * save us a few type-following when we use it later (e.g. in * pretty print). 
*/ - if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && - !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) { - btf_verifier_log_type(env, v->t, "Invalid type_id"); - return -EINVAL; + if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + if (env_type_is_resolved(env, next_type_id)) + next_type = btf_type_id_resolve(btf, &next_type_id); + + /* "typedef void new_void", "const void"...etc */ + if (!btf_type_is_void(next_type) && + !btf_type_is_fwd(next_type)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } } -resolved: env_stack_pop_resolved(env, next_type_id, next_type_size); return 0; @@ -1244,7 +1265,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; - u32 next_type_size = 0; next_type = btf_type_by_id(btf, next_type_id); if (!next_type) { @@ -1252,10 +1272,6 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, return -EINVAL; } - /* "void *" */ - if (btf_type_is_void(next_type) || btf_type_is_fwd(next_type)) - goto resolved; - if (!env_type_is_resolve_sink(env, next_type) && !env_type_is_resolved(env, next_type_id)) return env_stack_push(env, next_type, next_type_id); @@ -1282,13 +1298,18 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, resolved_type_id); } - if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && - !btf_type_nosize(btf_type_id_resolve(btf, &next_type_id))) { - btf_verifier_log_type(env, v->t, "Invalid type_id"); - return -EINVAL; + if (!btf_type_id_size(btf, &next_type_id, NULL)) { + if (env_type_is_resolved(env, next_type_id)) + next_type = btf_type_id_resolve(btf, &next_type_id); + + if (!btf_type_is_void(next_type) && + !btf_type_is_fwd(next_type) && + !btf_type_is_func_proto(next_type)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } } -resolved: env_stack_pop_resolved(env, next_type_id, 0); return 0; @@ -1869,6 +1890,232 @@ static struct btf_kind_operations enum_ops = { .seq_show = btf_enum_seq_show, }; +static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_func_proto_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_param *args = (const struct btf_param *)(t + 1); + u16 nr_args = btf_type_vlen(t), i; + + btf_verifier_log(env, "return=%u args=(", t->type); + if (!nr_args) { + btf_verifier_log(env, "void"); + goto done; + } + + if (nr_args == 1 && !args[0].type) { + /* Only one vararg */ + btf_verifier_log(env, "vararg"); + goto done; + } + + btf_verifier_log(env, "%u %s", args[0].type, + btf_name_by_offset(env->btf, + args[0].name_off)); + for (i = 1; i < nr_args - 1; i++) + btf_verifier_log(env, ", %u %s", args[i].type, + btf_name_by_offset(env->btf, + args[i].name_off)); + + if (nr_args > 1) { + const struct btf_param *last_arg = &args[nr_args - 1]; + + if (last_arg->type) + btf_verifier_log(env, ", %u %s", last_arg->type, + btf_name_by_offset(env->btf, + last_arg->name_off)); + else + btf_verifier_log(env, ", vararg"); + } + +done: + btf_verifier_log(env, ")"); +} + 
+static struct btf_kind_operations func_proto_ops = { + .check_meta = btf_func_proto_check_meta, + .resolve = btf_df_resolve, + /* + * BTF_KIND_FUNC_PROTO cannot be directly referred by + * a struct's member. + * + * It should be a funciton pointer instead. + * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO) + * + * Hence, there is no btf_func_check_member(). + */ + .check_member = btf_df_check_member, + .log_details = btf_func_proto_log, + .seq_show = btf_df_seq_show, +}; + +static s32 btf_func_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static struct btf_kind_operations func_ops = { + .check_meta = btf_func_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_df_check_member, + .log_details = btf_ref_type_log, + .seq_show = btf_df_seq_show, +}; + +static int btf_func_proto_check(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_type *ret_type; + const struct btf_param *args; + const struct btf *btf; + u16 nr_args, i; + int err; + + btf = env->btf; + args = (const struct btf_param *)(t + 1); + nr_args = btf_type_vlen(t); + + /* Check func return type which could be "void" (t->type == 0) */ + if (t->type) { + u32 ret_type_id = t->type; + + ret_type = btf_type_by_id(btf, ret_type_id); + if (!ret_type) { + btf_verifier_log_type(env, t, "Invalid return type"); + return -EINVAL; + } + + if (btf_type_needs_resolve(ret_type) && + !env_type_is_resolved(env, ret_type_id)) { + err = btf_resolve(env, ret_type, ret_type_id); + if (err) + return err; + } + + /* Ensure the return type is a type that has a size */ + if (!btf_type_id_size(btf, &ret_type_id, NULL)) { + btf_verifier_log_type(env, t, "Invalid return type"); + return -EINVAL; + } + } + + if (!nr_args) + return 0; + + /* Last func arg type_id could be 0 if it is a vararg */ + if (!args[nr_args - 1].type) { + if (args[nr_args - 1].name_off) { + btf_verifier_log_type(env, t, "Invalid arg#%u", + nr_args); + return -EINVAL; + } + nr_args--; + } + + err = 0; + for (i = 0; i < nr_args; i++) { + const struct btf_type *arg_type; + u32 arg_type_id; + + arg_type_id = args[i].type; + arg_type = btf_type_by_id(btf, arg_type_id); + if (!arg_type) { + btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + err = -EINVAL; + break; + } + + if (args[i].name_off && + (!btf_name_offset_valid(btf, args[i].name_off) || + !btf_name_valid_identifier(btf, args[i].name_off))) { + btf_verifier_log_type(env, t, + "Invalid arg#%u", i + 1); + err = -EINVAL; + break; + } + + if (btf_type_needs_resolve(arg_type) && + !env_type_is_resolved(env, arg_type_id)) { + err = btf_resolve(env, arg_type, arg_type_id); + if (err) + break; + } + + if (!btf_type_id_size(btf, &arg_type_id, NULL)) { + btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + err = -EINVAL; + break; + } + } + + return err; +} + +static int btf_func_check(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_type *proto_type; + const struct btf_param *args; + const struct btf *btf; + u16 nr_args, i; + + btf = env->btf; + proto_type = btf_type_by_id(btf, t->type); + + if (!proto_type || !btf_type_is_func_proto(proto_type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return 
-EINVAL; + } + + args = (const struct btf_param *)(proto_type + 1); + nr_args = btf_type_vlen(proto_type); + for (i = 0; i < nr_args; i++) { + if (!args[i].name_off && args[i].type) { + btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + return -EINVAL; + } + } + + return 0; +} + static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_INT] = &int_ops, [BTF_KIND_PTR] = &ptr_ops, @@ -1881,6 +2128,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_VOLATILE] = &modifier_ops, [BTF_KIND_CONST] = &modifier_ops, [BTF_KIND_RESTRICT] = &modifier_ops, + [BTF_KIND_FUNC] = &func_ops, + [BTF_KIND_FUNC_PROTO] = &func_proto_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -1952,30 +2201,6 @@ static int btf_check_all_metas(struct btf_verifier_env *env) return 0; } -static int btf_resolve(struct btf_verifier_env *env, - const struct btf_type *t, u32 type_id) -{ - const struct resolve_vertex *v; - int err = 0; - - env->resolve_mode = RESOLVE_TBD; - env_stack_push(env, t, type_id); - while (!err && (v = env_stack_peak(env))) { - env->log_type_id = v->type_id; - err = btf_type_ops(v->t)->resolve(env, v); - } - - env->log_type_id = type_id; - if (err == -E2BIG) - btf_verifier_log_type(env, t, - "Exceeded max resolving depth:%u", - MAX_RESOLVE_DEPTH); - else if (err == -EEXIST) - btf_verifier_log_type(env, t, "Loop detected"); - - return err; -} - static bool btf_resolve_valid(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) @@ -2009,6 +2234,39 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, return false; } +static int btf_resolve(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + u32 save_log_type_id = env->log_type_id; + const struct resolve_vertex *v; + int err = 0; + + env->resolve_mode = RESOLVE_TBD; + env_stack_push(env, t, type_id); + while (!err && (v = env_stack_peak(env))) { + env->log_type_id = v->type_id; + err = btf_type_ops(v->t)->resolve(env, v); + } + + env->log_type_id = type_id; + if (err == -E2BIG) { + btf_verifier_log_type(env, t, + "Exceeded max resolving depth:%u", + MAX_RESOLVE_DEPTH); + } else if (err == -EEXIST) { + btf_verifier_log_type(env, t, "Loop detected"); + } + + /* Final sanity check */ + if (!err && !btf_resolve_valid(env, t, type_id)) { + btf_verifier_log_type(env, t, "Invalid resolve state"); + err = -EINVAL; + } + + env->log_type_id = save_log_type_id; + return err; +} + static int btf_check_all_types(struct btf_verifier_env *env) { struct btf *btf = env->btf; @@ -2031,10 +2289,16 @@ static int btf_check_all_types(struct btf_verifier_env *env) return err; } - if (btf_type_needs_resolve(t) && - !btf_resolve_valid(env, t, type_id)) { - btf_verifier_log_type(env, t, "Invalid resolve state"); - return -EINVAL; + if (btf_type_is_func_proto(t)) { + err = btf_func_proto_check(env, t); + if (err) + return err; + } + + if (btf_type_is_func(t)) { + err = btf_func_check(env, t); + if (err) + return err; } } From 7a15d642331b6565227cada1128dfd75f8b63551 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 19 Nov 2018 15:29:11 -0800 Subject: [PATCH 0975/1640] UPSTREAM: bpf: Introduce bpf_func_info This patch added interface to load a program with the following additional information: . prog_btf_fd . func_info, func_info_rec_size and func_info_cnt where func_info will provide function range and type_id corresponding to each function. 
The func_info_rec_size is introduced in the UAPI to specify struct bpf_func_info size passed from user space. This intends to make bpf_func_info structure growable in the future. If the kernel gets a different bpf_func_info size from userspace, it will try to handle user request with part of bpf_func_info it can understand. In this patch, kernel can understand struct bpf_func_info { __u32 insn_offset; __u32 type_id; }; If user passed a bpf func_info record size of 16 bytes, the kernel can still handle part of records with the above definition. If verifier agrees with function range provided by the user, the bpf_prog ksym for each function will use the func name provided in the type_id, which is supposed to provide better encoding as it is not limited by 16 bytes program name limitation and this is better for bpf program which contains multiple subprograms. The bpf_prog_info interface is also extended to return btf_id, func_info, func_info_rec_size and func_info_cnt to userspace, so userspace can print out the function prototype for each xlated function. The insn_offset in the returned func_info corresponds to the insn offset for xlated functions. With other jit related fields in bpf_prog_info, userspace can also print out function prototypes for each jited function. Change-Id: Ic0b0f678153f135b70711bfab7d33207d99b8308 Signed-off-by: Yonghong Song Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +- include/linux/bpf_verifier.h | 1 + include/linux/btf.h | 2 + include/uapi/linux/bpf.h | 13 ++++ kernel/bpf/btf.c | 4 +- kernel/bpf/core.c | 13 ++++ kernel/bpf/syscall.c | 59 +++++++++++++++-- kernel/bpf/verifier.c | 120 ++++++++++++++++++++++++++++++++++- 8 files changed, 209 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4b50da4e47cc..cf4903c8863a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -321,6 +321,8 @@ struct bpf_prog_aux { void *security; #endif struct bpf_prog_offload *offload; + struct btf *btf; + u32 type_id; /* type id for this prog/func */ union { struct work_struct work; struct rcu_head rcu; @@ -533,7 +535,8 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) } /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, + union bpf_attr __user *uattr); void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); /* Map specifics */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1001573674a1..0d754d0c1de9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -223,6 +223,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u16 stack_depth; /* max. 
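A loader passing the new fields might fill the BPF_PROG_LOAD attributes roughly as follows. This is a sketch only: btf_fd, the two type ids and the subprogram start offset are hypothetical names assumed to have been obtained elsewhere (e.g. from the object file and a prior BPF_BTF_LOAD):

struct bpf_func_info finfo[2] = {
	{ .insn_offset = 0,             .type_id = main_func_type_id }, /* first record must start at insn 0 */
	{ .insn_offset = subprog_start, .type_id = sub_func_type_id },
};

attr.prog_btf_fd        = btf_fd;
attr.func_info          = (__u64)(unsigned long)finfo;
attr.func_info_rec_size = sizeof(struct bpf_func_info);
attr.func_info_cnt      = 2; /* must match the number of (sub)programs seen by the verifier */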
stack depth used by this function */ + u32 type_id; /* btf type_id for this subprog */ }; /* single container for all structs diff --git a/include/linux/btf.h b/include/linux/btf.h index e076c4697049..7f2c0a4a45ea 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -46,5 +46,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); +const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); +const char *btf_name_by_offset(const struct btf *btf, u32 offset); #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4595144529c9..6fea445b4955 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -354,6 +354,10 @@ union bpf_attr { * (context accesses, allowed helpers, etc). */ __u32 expected_attach_type; + __u32 prog_btf_fd; /* fd pointing to BTF type data */ + __u32 func_info_rec_size; /* userspace bpf_func_info size */ + __aligned_u64 func_info; /* func info */ + __u32 func_info_cnt; /* number of bpf_func_info records */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -2681,6 +2685,10 @@ struct bpf_prog_info { __u32 nr_jited_func_lens; __aligned_u64 jited_ksyms; __aligned_u64 jited_func_lens; + __u32 btf_id; + __u32 func_info_rec_size; + __aligned_u64 func_info; + __u32 func_info_cnt; } __attribute__((aligned(8))); struct bpf_map_info { @@ -2992,4 +3000,9 @@ struct bpf_flow_keys { }; }; +struct bpf_func_info { + __u32 insn_offset; + __u32 type_id; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 50c6d05879e1..ccfdf4914ae8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -474,7 +474,7 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return !*src; } -static const char *btf_name_by_offset(const struct btf *btf, u32 offset) +const char *btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) return "(anon)"; @@ -484,7 +484,7 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset) return "(invalid-name-offset)"; } -static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) +const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { if (type_id > btf->nr_types) return NULL; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 60bef6ed659a..ecc6d850b0ec 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -21,12 +21,14 @@ * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ +#include #include #include #include #include #include #include +#include #include #include #include @@ -398,6 +400,8 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { const char *end = sym + KSYM_NAME_LEN; + const struct btf_type *type; + const char *func_name; BUILD_BUG_ON(sizeof("bpf_prog_") + sizeof(prog->tag) * 2 + @@ -412,6 +416,15 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); + + /* prog->aux->name will be ignored if full btf name is available */ + if (prog->aux->btf) { + type = btf_type_by_id(prog->aux->btf, prog->aux->type_id); + func_name = btf_name_by_offset(prog->aux->btf, type->name_off); + snprintf(sym, (size_t)(end - sym), "_%s", func_name); + return; + } + if (prog->aux->name[0]) snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); else diff 
--git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5debb897df59..ce7b3a77f4c4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1226,6 +1226,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del_all(prog); + btf_put(prog->aux->btf); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -1452,9 +1453,9 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type +#define BPF_PROG_LOAD_LAST_FIELD func_info_cnt -static int bpf_prog_load(union bpf_attr *attr) +static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog; @@ -1545,7 +1546,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - err = bpf_check(&prog, attr); + err = bpf_check(&prog, attr, uattr); if (err < 0) goto free_used_maps; @@ -2114,6 +2115,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; info.nr_jited_func_lens = 0; + info.func_info_cnt = 0; goto done; } @@ -2251,6 +2253,55 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } + if (prog->aux->btf) { + u32 ucnt, urec_size; + + info.btf_id = btf_id(prog->aux->btf); + + ucnt = info.func_info_cnt; + info.func_info_cnt = prog->aux->func_cnt ? : 1; + urec_size = info.func_info_rec_size; + info.func_info_rec_size = sizeof(struct bpf_func_info); + if (ucnt) { + /* expect passed-in urec_size is what the kernel expects */ + if (urec_size != info.func_info_rec_size) + return -EINVAL; + + if (bpf_dump_raw_ok(file->f_cred)) { + struct bpf_func_info kern_finfo; + char __user *user_finfo; + u32 i, insn_offset; + + user_finfo = u64_to_user_ptr(info.func_info); + if (prog->aux->func_cnt) { + ucnt = min_t(u32, info.func_info_cnt, ucnt); + insn_offset = 0; + for (i = 0; i < ucnt; i++) { + kern_finfo.insn_offset = insn_offset; + kern_finfo.type_id = prog->aux->func[i]->aux->type_id; + if (copy_to_user(user_finfo, &kern_finfo, + sizeof(kern_finfo))) + return -EFAULT; + + /* func[i]->len holds the prog len */ + insn_offset += prog->aux->func[i]->len; + user_finfo += urec_size; + } + } else { + kern_finfo.insn_offset = 0; + kern_finfo.type_id = prog->aux->type_id; + if (copy_to_user(user_finfo, &kern_finfo, + sizeof(kern_finfo))) + return -EFAULT; + } + } else { + info.func_info_cnt = 0; + } + } + } else { + info.func_info_cnt = 0; + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) @@ -2542,7 +2593,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = map_get_next_key(&attr); break; case BPF_PROG_LOAD: - err = bpf_prog_load(&attr); + err = bpf_prog_load(&attr, uattr); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5441fa621fd8..430ceecdd525 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11,10 +11,12 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
*/ +#include #include #include #include #include +#include #include #include #include @@ -5200,6 +5202,114 @@ err_free: return ret; } +/* The minimum supported BTF func info size */ +#define MIN_BPF_FUNCINFO_SIZE 8 +#define MAX_FUNCINFO_REC_SIZE 252 + +static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, + union bpf_attr *attr, union bpf_attr __user *uattr) +{ + u32 i, nfuncs, urec_size, min_size, prev_offset; + u32 krec_size = sizeof(struct bpf_func_info); + struct bpf_func_info krecord = {}; + const struct btf_type *type; + void __user *urecord; + struct btf *btf; + int ret = 0; + + nfuncs = attr->func_info_cnt; + if (!nfuncs) + return 0; + + if (nfuncs != env->subprog_cnt) { + verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); + return -EINVAL; + } + + urec_size = attr->func_info_rec_size; + if (urec_size < MIN_BPF_FUNCINFO_SIZE || + urec_size > MAX_FUNCINFO_REC_SIZE || + urec_size % sizeof(u32)) { + verbose(env, "invalid func info rec size %u\n", urec_size); + return -EINVAL; + } + + btf = btf_get_by_fd(attr->prog_btf_fd); + if (IS_ERR(btf)) { + verbose(env, "unable to get btf from fd\n"); + return PTR_ERR(btf); + } + + urecord = u64_to_user_ptr(attr->func_info); + min_size = min_t(u32, krec_size, urec_size); + + for (i = 0; i < nfuncs; i++) { + ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); + if (ret) { + if (ret == -E2BIG) { + verbose(env, "nonzero tailing record in func info"); + /* set the size kernel expects so loader can zero + * out the rest of the record. + */ + if (put_user(min_size, &uattr->func_info_rec_size)) + ret = -EFAULT; + } + goto free_btf; + } + + if (copy_from_user(&krecord, urecord, min_size)) { + ret = -EFAULT; + goto free_btf; + } + + /* check insn_offset */ + if (i == 0) { + if (krecord.insn_offset) { + verbose(env, + "nonzero insn_offset %u for the first func info record", + krecord.insn_offset); + ret = -EINVAL; + goto free_btf; + } + } else if (krecord.insn_offset <= prev_offset) { + verbose(env, + "same or smaller insn offset (%u) than previous func info record (%u)", + krecord.insn_offset, prev_offset); + ret = -EINVAL; + goto free_btf; + } + + if (env->subprog_info[i].start != krecord.insn_offset) { + verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); + ret = -EINVAL; + goto free_btf; + } + + /* check type_id */ + type = btf_type_by_id(btf, krecord.type_id); + if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) { + verbose(env, "invalid type id %d in func info", + krecord.type_id); + ret = -EINVAL; + goto free_btf; + } + + if (i == 0) + prog->aux->type_id = krecord.type_id; + env->subprog_info[i].type_id = krecord.type_id; + + prev_offset = krecord.insn_offset; + urecord += urec_size; + } + + prog->aux->btf = btf; + return 0; + +free_btf: + btf_put(btf); + return ret; +} + /* check %cur's range satisfies %old's */ static bool range_within(struct bpf_reg_state *old, struct bpf_reg_state *cur) @@ -6507,6 +6617,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + /* the btf will be freed only at prog->aux */ + func[i]->aux->btf = prog->aux->btf; + func[i]->aux->type_id = env->subprog_info[i].type_id; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -6947,7 +7060,8 @@ static void free_states(struct bpf_verifier_env *env) kfree(env->explored_states); } -int bpf_check(struct bpf_prog **prog, 
union bpf_attr *attr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, + union bpf_attr __user *uattr) { struct bpf_verifier_env *env; struct bpf_verifier_log *log; @@ -7022,6 +7136,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; + ret = check_btf_func(env->prog, env, attr, uattr); + if (ret < 0) + goto skip_full_check; + ret = do_check(env); if (env->cur_state) { free_verifier_state(env->cur_state, true); From 6512e9fba3ab83bba501a7abe0307cce279b3e92 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Nov 2018 21:39:52 -0800 Subject: [PATCH 0976/1640] UPSTREAM: bpf, lpm: make longest_prefix_match() faster At LPC 2018 in Vancouver, Vlad Dumitrescu mentioned that longest_prefix_match() has a high cost [1]. One reason for that cost is a loop handling one byte at a time. We can handle more bytes at a time, if enough attention is paid to endianness. I was able to remove ~55 % of longest_prefix_match() cpu costs. [1] https://linuxplumbersconf.org/event/2/contributions/88/attachments/76/87/lpc-bpf-2018-shaping.pdf Change-Id: I35186774b2bb2ad844584f071dcb01ec0b3ee52f Signed-off-by: Eric Dumazet Cc: Vlad Dumitrescu Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 59 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index a929ee0e86b1..ac461d3eae34 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -168,20 +168,59 @@ static size_t longest_prefix_match(const struct lpm_trie *trie, const struct lpm_trie_node *node, const struct bpf_lpm_trie_key *key) { - size_t prefixlen = 0; - size_t i; + u32 limit = min(node->prefixlen, key->prefixlen); + u32 prefixlen = 0, i = 0; - for (i = 0; i < trie->data_size; i++) { - size_t b; + BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32)); + BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32)); - b = 8 - fls(node->data[i] ^ key->data[i]); - prefixlen += b; +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT) - if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen) - return min(node->prefixlen, key->prefixlen); + /* data_size >= 16 has very small probability. + * We do not use a loop for optimal code generation. 
+ */ + if (trie->data_size >= 8) { + u64 diff = be64_to_cpu(*(__be64 *)node->data ^ + *(__be64 *)key->data); - if (b < 8) - break; + prefixlen = 64 - fls64(diff); + if (prefixlen >= limit) + return limit; + if (diff) + return prefixlen; + i = 8; + } +#endif + + while (trie->data_size >= i + 4) { + u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^ + *(__be32 *)&key->data[i]); + + prefixlen += 32 - fls(diff); + if (prefixlen >= limit) + return limit; + if (diff) + return prefixlen; + i += 4; + } + + if (trie->data_size >= i + 2) { + u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^ + *(__be16 *)&key->data[i]); + + prefixlen += 16 - fls(diff); + if (prefixlen >= limit) + return limit; + if (diff) + return prefixlen; + i += 2; + } + + if (trie->data_size >= i + 1) { + prefixlen += 8 - fls(node->data[i] ^ key->data[i]); + + if (prefixlen >= limit) + return limit; } return prefixlen; From d5ee8a78df886fcf92c582b3e348348f0ae4d365 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 22 Nov 2018 10:49:56 -0800 Subject: [PATCH 0977/1640] UPSTREAM: bpf: fix integer overflow in queue_stack_map Fix the following issues: - allow queue_stack_map for root only - fix u32 max_entries overflow - disallow value_size == 0 Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Reported-by: Wei Wu Change-Id: I68e30e5aeca185856529953ba23cdb5ee8045229 Signed-off-by: Alexei Starovoitov Cc: Mauricio Vasquez B Signed-off-by: Daniel Borkmann --- kernel/bpf/queue_stack_maps.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8bbd72d3a121..b384ea9f3254 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ @@ -45,8 +46,12 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || + attr->value_size == 0 || attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) return -EINVAL; @@ -63,15 +68,10 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int ret, numa_node = bpf_map_attr_numa_node(attr); struct bpf_queue_stack *qs; - u32 size, value_size; - u64 queue_size, cost; + u64 size, queue_size, cost; - size = attr->max_entries + 1; - value_size = attr->value_size; - - queue_size = sizeof(*qs) + (u64) value_size * size; - - cost = queue_size; + size = (u64) attr->max_entries + 1; + cost = queue_size = sizeof(*qs) + size * attr->value_size; if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); From a66177268f36dd0328e382f0f4249e4598be8dcd Mon Sep 17 00:00:00 2001 From: Rustam Kovhaev Date: Fri, 23 Nov 2018 15:48:16 -0800 Subject: [PATCH 0978/1640] UPSTREAM: bpf, tags: Fix DEFINE_PER_CPU expansion Building tags produces warning: ctags: Warning: kernel/bpf/local_storage.c:10: null expansion of name pattern "\1" Let's use the same fix as in commit 25528213fe9f ("tags: Fix DEFINE_PER_CPU expansions"), even though it violates the usual code style. 
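For illustration, a minimal sketch of why the wrapped form trips ctags (the type and array name below are invented, and this restates the mechanism only as far as the single-line regex patterns in scripts/tags.sh can see): the name capture group matches nothing once the arguments wrap, so ctags reports a null expansion.

	/* wrapped: the single-line tags pattern cannot capture the name */
	DEFINE_PER_CPU(struct foo *,
		       bar[MAX_BAR]);

	/* single line: tags build cleanly, at the cost of line length */
	DEFINE_PER_CPU(struct foo *, bar[MAX_BAR]);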
Change-Id: I16fb10701f474e04fdc9bf1856c691be05b42fbc Signed-off-by: Rustam Kovhaev Signed-off-by: Daniel Borkmann --- kernel/bpf/local_storage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index bed9d48a7ae9..b65017dead44 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,8 +7,7 @@ #include #include -DEFINE_PER_CPU(struct bpf_cgroup_storage*, - bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF From 7f64f8cc003ee5680883396b92d1e5baecd81f4a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 25 Nov 2018 23:32:51 +0000 Subject: [PATCH 0979/1640] UPSTREAM: bpf: btf: fix spelling mistake "Memmber" -> "Member" There is a spelling mistake in a btf_verifier_log_member message, fix it. Change-Id: Ic0a82a340f25e1c8e0805c237378a3ee081e0527 Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ccfdf4914ae8..170081b29a2e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1663,7 +1663,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { btf_verifier_log_member(env, t, member, - "Memmber bits_offset exceeds its struct size"); + "Member bits_offset exceeds its struct size"); return -EINVAL; } From d890fc7d329b4fca30c5c3c0db27f71e517f02c0 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 24 May 2018 12:26:47 +0530 Subject: [PATCH 0980/1640] UPSTREAM: bpf: powerpc64: add JIT support for multi-function programs This adds support for bpf-to-bpf function calls in the powerpc64 JIT compiler. The JIT compiler converts the bpf call instructions to native branch instructions. After a round of the usual passes, the start addresses of the JITed images for the callee functions are known. Finally, to fix up the branch target addresses, we need to perform an extra pass. Because of the address range in which JITed images are allocated on powerpc64, the offsets of the start addresses of these images from __bpf_call_base are as large as 64 bits. So, for a function call, we cannot use the imm field of the instruction to determine the callee's address. Instead, we use the alternative method of getting it from the list of function addresses in the auxiliary data of the caller by using the off field as an index.
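Condensed into a standalone sketch (this restates the logic visible in the diff below; it is not a separate kernel API, and the function name is invented), the callee-address resolution reads:

	/* Sketch only: how a BPF_JMP | BPF_CALL target is picked.
	 * For bpf-to-bpf calls, src_reg == BPF_PSEUDO_CALL and the
	 * off field indexes the subprog array in the caller's aux data.
	 */
	static u64 resolve_call_target(struct bpf_prog *fp,
				       const struct bpf_insn *insn,
				       bool extra_pass)
	{
		if (insn->src_reg != BPF_PSEUDO_CALL)
			/* helper call: imm is relative to __bpf_call_base */
			return (u64)((u8 *)__bpf_call_base + insn->imm);
		if (!extra_pass)
			/* subprog images not laid out yet; emit a stub */
			return 0;
		return (u64)fp->aux->func[insn->off]->bpf_func;
	}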
Signed-off-by: Sandipan Das Signed-off-by: Daniel Borkmann --- arch/powerpc/net/bpf_jit_comp64.c | 76 +++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index bfbac413c5ae..259616fdaa1f 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -304,7 +304,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o /* Assemble the body code between the prologue & epilogue */ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, - u32 *addrs) + u32 *addrs, bool extra_pass) { const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; @@ -768,11 +768,25 @@ emit_clear: break; /* - * Call kernel helper + * Call kernel helper or bpf function */ case BPF_JMP | BPF_CALL: ctx->seen |= SEEN_FUNC; - func = (u8 *) __bpf_call_base + imm; + + /* bpf function call */ + if (insn[i].src_reg == BPF_PSEUDO_CALL) + if (!extra_pass) + func = NULL; + else if (fp->aux->func && off < fp->aux->func_cnt) + /* use the subprog id from the off + * field to lookup the callee address + */ + func = (u8 *) fp->aux->func[off]->bpf_func; + else + return -EINVAL; + /* kernel helper call */ + else + func = (u8 *) __bpf_call_base + imm; /* Save skb pointer if we need to re-cache skb data */ if ((ctx->seen & SEEN_SKB) && @@ -994,6 +1008,14 @@ common_load: return 0; } +struct powerpc64_jit_data { + struct bpf_binary_header *header; + u32 *addrs; + u8 *image; + u32 proglen; + struct codegen_context ctx; +}; + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { u32 proglen; @@ -1001,6 +1023,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) u8 *image = NULL; u32 *code_base; u32 *addrs; + struct powerpc64_jit_data *jit_data; struct codegen_context cgctx; int pass; int flen; @@ -1008,6 +1031,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_prog *org_fp = fp; struct bpf_prog *tmp_fp; bool bpf_blinded = false; + bool extra_pass = false; if (!fp->jit_requested) return org_fp; @@ -1021,20 +1045,41 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) fp = tmp_fp; } + jit_data = fp->aux->jit_data; + if (!jit_data) { + jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); + if (!jit_data) { + fp = org_fp; + goto out; + } + fp->aux->jit_data = jit_data; + } + flen = fp->len; + addrs = jit_data->addrs; + if (addrs) { + cgctx = jit_data->ctx; + image = jit_data->image; + bpf_hdr = jit_data->header; + proglen = jit_data->proglen; + alloclen = proglen + FUNCTION_DESCR_SIZE; + extra_pass = true; + goto skip_init_ctx; + } + addrs = kzalloc((flen+1) * sizeof(*addrs), GFP_KERNEL); if (addrs == NULL) { fp = org_fp; - goto out; + goto out_addrs; } memset(&cgctx, 0, sizeof(struct codegen_context)); /* Scouting faux-generate pass 0 */ - if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) { + if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) { /* We hit something illegal or unsupported. */ fp = org_fp; - goto out; + goto out_addrs; } /* @@ -1052,9 +1097,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) bpf_jit_fill_ill_insns); if (!bpf_hdr) { fp = org_fp; - goto out; + goto out_addrs; } +skip_init_ctx: code_base = (u32 *)(image + FUNCTION_DESCR_SIZE); /* Code generation passes 1-2 */ @@ -1062,7 +1108,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) /* Now build the prologue, body code & epilogue for real. 
*/ cgctx.idx = 0; bpf_jit_build_prologue(code_base, &cgctx); - bpf_jit_build_body(fp, code_base, &cgctx, addrs); + bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass); bpf_jit_build_epilogue(code_base, &cgctx); if (bpf_jit_enable > 1) @@ -1088,10 +1134,20 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) fp->jited_len = alloclen; bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE)); + if (!fp->is_func || extra_pass) { +out_addrs: + kfree(addrs); + kfree(jit_data); + fp->aux->jit_data = NULL; + } else { + jit_data->addrs = addrs; + jit_data->ctx = cgctx; + jit_data->proglen = proglen; + jit_data->image = image; + jit_data->header = bpf_hdr; + } out: - kfree(addrs); - if (bpf_blinded) bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp); From 7f76451e0ae395c4913ae7dc88d388071d384a64 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:21 +0200 Subject: [PATCH 0981/1640] UPSTREAM: bpf, ppc64: remove ld_abs/ld_ind Since LD_ABS/LD_IND instructions are now removed from the core and reimplemented through a combination of inlined BPF instructions and a slow-path helper, we can get rid of the complexity from ppc64 JIT. Signed-off-by: Daniel Borkmann Acked-by: Naveen N. Rao Acked-by: Alexei Starovoitov Tested-by: Sandipan Das Signed-off-by: Alexei Starovoitov --- arch/powerpc/net/Makefile | 2 +- arch/powerpc/net/bpf_jit64.h | 37 ++---- arch/powerpc/net/bpf_jit_asm64.S | 180 ------------------------------ arch/powerpc/net/bpf_jit_comp64.c | 109 +----------------- 4 files changed, 11 insertions(+), 317 deletions(-) delete mode 100644 arch/powerpc/net/bpf_jit_asm64.S diff --git a/arch/powerpc/net/Makefile b/arch/powerpc/net/Makefile index 02d369ca6a53..809f019d3cba 100644 --- a/arch/powerpc/net/Makefile +++ b/arch/powerpc/net/Makefile @@ -3,7 +3,7 @@ # Arch-specific network modules # ifeq ($(CONFIG_PPC64),y) -obj-$(CONFIG_BPF_JIT) += bpf_jit_asm64.o bpf_jit_comp64.o +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp64.o else obj-$(CONFIG_BPF_JIT) += bpf_jit_asm.o bpf_jit_comp.o endif diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h index bb944b6018d7..d2353270223e 100644 --- a/arch/powerpc/net/bpf_jit64.h +++ b/arch/powerpc/net/bpf_jit64.h @@ -20,7 +20,7 @@ * with our redzone usage. 
* * [ prev sp ] <------------- - * [ nv gpr save area ] 8*8 | + * [ nv gpr save area ] 6*8 | * [ tail_call_cnt ] 8 | * [ local_tmp_var ] 8 | * fp (r31) --> [ ebpf stack space ] 512 | @@ -28,8 +28,8 @@ * sp (r1) ---> [ stack pointer ] -------------- */ -/* for gpr non volatile registers BPG_REG_6 to 10, plus skb cache registers */ -#define BPF_PPC_STACK_SAVE (8*8) +/* for gpr non volatile registers BPG_REG_6 to 10 */ +#define BPF_PPC_STACK_SAVE (6*8) /* for bpf JIT code internal usage */ #define BPF_PPC_STACK_LOCALS 16 /* Ensure this is quadword aligned */ @@ -39,10 +39,8 @@ #ifndef __ASSEMBLY__ /* BPF register usage */ -#define SKB_HLEN_REG (MAX_BPF_JIT_REG + 0) -#define SKB_DATA_REG (MAX_BPF_JIT_REG + 1) -#define TMP_REG_1 (MAX_BPF_JIT_REG + 2) -#define TMP_REG_2 (MAX_BPF_JIT_REG + 3) +#define TMP_REG_1 (MAX_BPF_JIT_REG + 0) +#define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* BPF to ppc register mappings */ static const int b2p[] = { @@ -63,28 +61,12 @@ static const int b2p[] = { [BPF_REG_FP] = 31, /* eBPF jit internal registers */ [BPF_REG_AX] = 2, - [SKB_HLEN_REG] = 25, - [SKB_DATA_REG] = 26, [TMP_REG_1] = 9, [TMP_REG_2] = 10 }; -/* PPC NVR range -- update this if we ever use NVRs below r24 */ -#define BPF_PPC_NVR_MIN 24 - -/* Assembly helpers */ -#define DECLARE_LOAD_FUNC(func) u64 func(u64 r3, u64 r4); \ - u64 func##_negative_offset(u64 r3, u64 r4); \ - u64 func##_positive_offset(u64 r3, u64 r4); - -DECLARE_LOAD_FUNC(sk_load_word); -DECLARE_LOAD_FUNC(sk_load_half); -DECLARE_LOAD_FUNC(sk_load_byte); - -#define CHOOSE_LOAD_FUNC(imm, func) \ - (imm < 0 ? \ - (imm >= SKF_LL_OFF ? func##_negative_offset : func) : \ - func##_positive_offset) +/* PPC NVR range -- update this if we ever use NVRs below r27 */ +#define BPF_PPC_NVR_MIN 27 /* * WARNING: These can use TMP_REG_2 if the offset is not at word boundary, @@ -108,15 +90,14 @@ DECLARE_LOAD_FUNC(sk_load_byte); #define SEEN_FUNC 0x1000 /* might call external helpers */ #define SEEN_STACK 0x2000 /* uses BPF stack */ -#define SEEN_SKB 0x4000 /* uses sk_buff */ -#define SEEN_TAILCALL 0x8000 /* uses tail calls */ +#define SEEN_TAILCALL 0x4000 /* uses tail calls */ struct codegen_context { /* * This is used to track register usage as well * as calls to external helpers. * - register usage is tracked with corresponding - * bits (r3-r10 and r25-r31) + * bits (r3-r10 and r27-r31) * - rest of the bits can be used to track other * things -- for now, we use bits 16 to 23 * encoded in SEEN_* macros above diff --git a/arch/powerpc/net/bpf_jit_asm64.S b/arch/powerpc/net/bpf_jit_asm64.S deleted file mode 100644 index 7e4c51430b84..000000000000 --- a/arch/powerpc/net/bpf_jit_asm64.S +++ /dev/null @@ -1,180 +0,0 @@ -/* - * bpf_jit_asm64.S: Packet/header access helper functions - * for PPC64 BPF compiler. - * - * Copyright 2016, Naveen N. Rao - * IBM Corporation - * - * Based on bpf_jit_asm.S by Matt Evans - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
- */ - -#include -#include -#include "bpf_jit64.h" - -/* - * All of these routines are called directly from generated code, - * with the below register usage: - * r27 skb pointer (ctx) - * r25 skb header length - * r26 skb->data pointer - * r4 offset - * - * Result is passed back in: - * r8 data read in host endian format (accumulator) - * - * r9 is used as a temporary register - */ - -#define r_skb r27 -#define r_hlen r25 -#define r_data r26 -#define r_off r4 -#define r_val r8 -#define r_tmp r9 - -_GLOBAL_TOC(sk_load_word) - cmpdi r_off, 0 - blt bpf_slow_path_word_neg - b sk_load_word_positive_offset - -_GLOBAL_TOC(sk_load_word_positive_offset) - /* Are we accessing past headlen? */ - subi r_tmp, r_hlen, 4 - cmpd r_tmp, r_off - blt bpf_slow_path_word - /* Nope, just hitting the header. cr0 here is eq or gt! */ - LWZX_BE r_val, r_data, r_off - blr /* Return success, cr0 != LT */ - -_GLOBAL_TOC(sk_load_half) - cmpdi r_off, 0 - blt bpf_slow_path_half_neg - b sk_load_half_positive_offset - -_GLOBAL_TOC(sk_load_half_positive_offset) - subi r_tmp, r_hlen, 2 - cmpd r_tmp, r_off - blt bpf_slow_path_half - LHZX_BE r_val, r_data, r_off - blr - -_GLOBAL_TOC(sk_load_byte) - cmpdi r_off, 0 - blt bpf_slow_path_byte_neg - b sk_load_byte_positive_offset - -_GLOBAL_TOC(sk_load_byte_positive_offset) - cmpd r_hlen, r_off - ble bpf_slow_path_byte - lbzx r_val, r_data, r_off - blr - -/* - * Call out to skb_copy_bits: - * Allocate a new stack frame here to remain ABI-compliant in - * stashing LR. - */ -#define bpf_slow_path_common(SIZE) \ - mflr r0; \ - std r0, PPC_LR_STKOFF(r1); \ - stdu r1, -(STACK_FRAME_MIN_SIZE + BPF_PPC_STACK_LOCALS)(r1); \ - mr r3, r_skb; \ - /* r4 = r_off as passed */ \ - addi r5, r1, STACK_FRAME_MIN_SIZE; \ - li r6, SIZE; \ - bl skb_copy_bits; \ - nop; \ - /* save r5 */ \ - addi r5, r1, STACK_FRAME_MIN_SIZE; \ - /* r3 = 0 on success */ \ - addi r1, r1, STACK_FRAME_MIN_SIZE + BPF_PPC_STACK_LOCALS; \ - ld r0, PPC_LR_STKOFF(r1); \ - mtlr r0; \ - cmpdi r3, 0; \ - blt bpf_error; /* cr0 = LT */ - -bpf_slow_path_word: - bpf_slow_path_common(4) - /* Data value is on stack, and cr0 != LT */ - LWZX_BE r_val, 0, r5 - blr - -bpf_slow_path_half: - bpf_slow_path_common(2) - LHZX_BE r_val, 0, r5 - blr - -bpf_slow_path_byte: - bpf_slow_path_common(1) - lbzx r_val, 0, r5 - blr - -/* - * Call out to bpf_internal_load_pointer_neg_helper - */ -#define sk_negative_common(SIZE) \ - mflr r0; \ - std r0, PPC_LR_STKOFF(r1); \ - stdu r1, -STACK_FRAME_MIN_SIZE(r1); \ - mr r3, r_skb; \ - /* r4 = r_off, as passed */ \ - li r5, SIZE; \ - bl bpf_internal_load_pointer_neg_helper; \ - nop; \ - addi r1, r1, STACK_FRAME_MIN_SIZE; \ - ld r0, PPC_LR_STKOFF(r1); \ - mtlr r0; \ - /* R3 != 0 on success */ \ - cmpldi r3, 0; \ - beq bpf_error_slow; /* cr0 = EQ */ - -bpf_slow_path_word_neg: - lis r_tmp, -32 /* SKF_LL_OFF */ - cmpd r_off, r_tmp /* addr < SKF_* */ - blt bpf_error /* cr0 = LT */ - b sk_load_word_negative_offset - -_GLOBAL_TOC(sk_load_word_negative_offset) - sk_negative_common(4) - LWZX_BE r_val, 0, r3 - blr - -bpf_slow_path_half_neg: - lis r_tmp, -32 /* SKF_LL_OFF */ - cmpd r_off, r_tmp /* addr < SKF_* */ - blt bpf_error /* cr0 = LT */ - b sk_load_half_negative_offset - -_GLOBAL_TOC(sk_load_half_negative_offset) - sk_negative_common(2) - LHZX_BE r_val, 0, r3 - blr - -bpf_slow_path_byte_neg: - lis r_tmp, -32 /* SKF_LL_OFF */ - cmpd r_off, r_tmp /* addr < SKF_* */ - blt bpf_error /* cr0 = LT */ - b sk_load_byte_negative_offset - -_GLOBAL_TOC(sk_load_byte_negative_offset) - sk_negative_common(1) - lbzx r_val, 0, r3 - 
blr - -bpf_error_slow: - /* fabricate a cr0 = lt */ - li r_tmp, -1 - cmpdi r_tmp, 0 -bpf_error: - /* - * Entered with cr0 = lt - * Generated code will 'blt epilogue', returning 0. - */ - li r_val, 0 - blr diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 259616fdaa1f..6a2c1c35a1c1 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -59,7 +59,7 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx) * [ prev sp ] <------------- * [ ... ] | * sp (r1) ---> [ stack pointer ] -------------- - * [ nv gpr save area ] 8*8 + * [ nv gpr save area ] 6*8 * [ tail_call_cnt ] 8 * [ local_tmp_var ] 8 * [ unused red zone ] 208 bytes protected @@ -87,21 +87,6 @@ static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg) BUG(); } -static void bpf_jit_emit_skb_loads(u32 *image, struct codegen_context *ctx) -{ - /* - * Load skb->len and skb->data_len - * r3 points to skb - */ - PPC_LWZ(b2p[SKB_HLEN_REG], 3, offsetof(struct sk_buff, len)); - PPC_LWZ(b2p[TMP_REG_1], 3, offsetof(struct sk_buff, data_len)); - /* header_len = len - data_len */ - PPC_SUB(b2p[SKB_HLEN_REG], b2p[SKB_HLEN_REG], b2p[TMP_REG_1]); - - /* skb->data pointer */ - PPC_BPF_LL(b2p[SKB_DATA_REG], 3, offsetof(struct sk_buff, data)); -} - static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) { int i; @@ -144,18 +129,6 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) if (bpf_is_seen_register(ctx, i)) PPC_BPF_STL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, b2p[i])); - /* - * Save additional non-volatile regs if we cache skb - * Also, setup skb data - */ - if (ctx->seen & SEEN_SKB) { - PPC_BPF_STL(b2p[SKB_HLEN_REG], 1, - bpf_jit_stack_offsetof(ctx, b2p[SKB_HLEN_REG])); - PPC_BPF_STL(b2p[SKB_DATA_REG], 1, - bpf_jit_stack_offsetof(ctx, b2p[SKB_DATA_REG])); - bpf_jit_emit_skb_loads(image, ctx); - } - /* Setup frame pointer to point to the bpf stack area */ if (bpf_is_seen_register(ctx, BPF_REG_FP)) PPC_ADDI(b2p[BPF_REG_FP], 1, @@ -171,14 +144,6 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx if (bpf_is_seen_register(ctx, i)) PPC_BPF_LL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, b2p[i])); - /* Restore non-volatile registers used for skb cache */ - if (ctx->seen & SEEN_SKB) { - PPC_BPF_LL(b2p[SKB_HLEN_REG], 1, - bpf_jit_stack_offsetof(ctx, b2p[SKB_HLEN_REG])); - PPC_BPF_LL(b2p[SKB_DATA_REG], 1, - bpf_jit_stack_offsetof(ctx, b2p[SKB_DATA_REG])); - } - /* Tear down our stack frame */ if (bpf_has_stack_frame(ctx)) { PPC_ADDI(1, 1, BPF_PPC_STACKFRAME); @@ -788,23 +753,10 @@ emit_clear: else func = (u8 *) __bpf_call_base + imm; - /* Save skb pointer if we need to re-cache skb data */ - if ((ctx->seen & SEEN_SKB) && - bpf_helper_changes_pkt_data(func)) - PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx)); - bpf_jit_emit_func_call(image, ctx, (u64)func); /* move return value from r3 to BPF_REG_0 */ PPC_MR(b2p[BPF_REG_0], 3); - - /* refresh skb cache */ - if ((ctx->seen & SEEN_SKB) && - bpf_helper_changes_pkt_data(func)) { - /* reload skb pointer to r3 */ - PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx)); - bpf_jit_emit_skb_loads(image, ctx); - } break; /* @@ -921,65 +873,6 @@ cond_branch: PPC_BCC(true_cond, addrs[i + 1 + off]); break; - /* - * Loads from packet header/data - * Assume 32-bit input value in imm and X (src_reg) - */ - - /* Absolute loads */ - case BPF_LD | BPF_W | BPF_ABS: - func = (u8 *)CHOOSE_LOAD_FUNC(imm, sk_load_word); - goto common_load_abs; - case BPF_LD | BPF_H | 
BPF_ABS: - func = (u8 *)CHOOSE_LOAD_FUNC(imm, sk_load_half); - goto common_load_abs; - case BPF_LD | BPF_B | BPF_ABS: - func = (u8 *)CHOOSE_LOAD_FUNC(imm, sk_load_byte); -common_load_abs: - /* - * Load from [imm] - * Load into r4, which can just be passed onto - * skb load helpers as the second parameter - */ - PPC_LI32(4, imm); - goto common_load; - - /* Indirect loads */ - case BPF_LD | BPF_W | BPF_IND: - func = (u8 *)sk_load_word; - goto common_load_ind; - case BPF_LD | BPF_H | BPF_IND: - func = (u8 *)sk_load_half; - goto common_load_ind; - case BPF_LD | BPF_B | BPF_IND: - func = (u8 *)sk_load_byte; -common_load_ind: - /* - * Load from [src_reg + imm] - * Treat src_reg as a 32-bit value - */ - PPC_EXTSW(4, src_reg); - if (imm) { - if (imm >= -32768 && imm < 32768) - PPC_ADDI(4, 4, IMM_L(imm)); - else { - PPC_LI32(b2p[TMP_REG_1], imm); - PPC_ADD(4, 4, b2p[TMP_REG_1]); - } - } - -common_load: - ctx->seen |= SEEN_SKB; - ctx->seen |= SEEN_FUNC; - bpf_jit_emit_func_call(image, ctx, (u64)func); - - /* - * Helper returns 'lt' condition on error, and an - * appropriate return value in BPF_REG_0 - */ - PPC_BCC(COND_LT, exit_addr); - break; - /* * Tail call */ From dfc143b86edf4b278855ad0bff41b4c832c9834d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 26 Nov 2018 14:05:38 +0100 Subject: [PATCH 0982/1640] UPSTREAM: bpf, ppc64: generalize fetching subprog into bpf_jit_get_func_addr Make fetching of the BPF call address from ppc64 JIT generic. ppc64 was using a slightly different variant rather than through the insns' imm field encoding as the target address would not fit into that space. Therefore, the target subprog number was encoded into the insns' offset and fetched through fp->aux->func[off]->bpf_func instead. Given there are other JITs with this issue and the mechanism of fetching the address is JIT-generic, move it into the core as a helper instead. On the JIT side, we get information on whether the retrieved address is a fixed one, that is, not changing through JIT passes, or a dynamic one. For the former, JITs can optimize their imm emission because this doesn't change jump offsets throughout JIT process. Change-Id: If6d9bc37bf165ebfbdb50ed3e6f52743be7299ab Signed-off-by: Daniel Borkmann Reviewed-by: Sandipan Das Tested-by: Sandipan Das Signed-off-by: Alexei Starovoitov --- arch/powerpc/net/bpf_jit_comp64.c | 55 +++++++++++++++++++++---------- include/linux/filter.h | 4 +++ kernel/bpf/core.c | 34 +++++++++++++++++++ 3 files changed, 75 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 6a2c1c35a1c1..1d213bdb393c 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -164,7 +164,33 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) PPC_BLR(); } -static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func) +static void bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, + u64 func) +{ +#ifdef PPC64_ELF_ABI_v1 + /* func points to the function descriptor */ + PPC_LI64(b2p[TMP_REG_2], func); + /* Load actual entry point from function descriptor */ + PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0); + /* ... and move it to LR */ + PPC_MTLR(b2p[TMP_REG_1]); + /* + * Load TOC from function descriptor at offset 8. + * We can clobber r2 since we get called through a + * function pointer (so caller will save/restore r2) + * and since we don't use a TOC ourself. 
+ */ + PPC_BPF_LL(2, b2p[TMP_REG_2], 8); +#else + /* We can clobber r12 */ + PPC_FUNC_ADDR(12, func); + PPC_MTLR(12); +#endif + PPC_BLRL(); +} + +static void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, + u64 func) { unsigned int i, ctx_idx = ctx->idx; @@ -284,8 +310,9 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 src_reg = b2p[insn[i].src_reg]; s16 off = insn[i].off; s32 imm = insn[i].imm; + bool func_addr_fixed; + u64 func_addr; u64 imm64; - u8 *func; u32 true_cond; u32 tmp_idx; @@ -738,23 +765,15 @@ emit_clear: case BPF_JMP | BPF_CALL: ctx->seen |= SEEN_FUNC; - /* bpf function call */ - if (insn[i].src_reg == BPF_PSEUDO_CALL) - if (!extra_pass) - func = NULL; - else if (fp->aux->func && off < fp->aux->func_cnt) - /* use the subprog id from the off - * field to lookup the callee address - */ - func = (u8 *) fp->aux->func[off]->bpf_func; - else - return -EINVAL; - /* kernel helper call */ + ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass, + &func_addr, &func_addr_fixed); + if (ret < 0) + return ret; + + if (func_addr_fixed) + bpf_jit_emit_func_call_hlp(image, ctx, func_addr); else - func = (u8 *) __bpf_call_base + imm; - - bpf_jit_emit_func_call(image, ctx, (u64)func); - + bpf_jit_emit_func_call_rel(image, ctx, func_addr); /* move return value from r3 to BPF_REG_0 */ PPC_MR(b2p[BPF_REG_0], 3); break; diff --git a/include/linux/filter.h b/include/linux/filter.h index 0a6c8d5e6c07..ad7c95a503d5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -939,6 +939,10 @@ void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed); + struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ecc6d850b0ec..cc8c76dacf11 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -717,6 +717,40 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed) +{ + s16 off = insn->off; + s32 imm = insn->imm; + u8 *addr; + + *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; + if (!*func_addr_fixed) { + /* Place-holder address till the last pass has collected + * all addresses for JITed subprograms in which case we + * can pick them up from prog->aux. + */ + if (!extra_pass) + addr = NULL; + else if (prog->aux->func && + off >= 0 && off < prog->aux->func_cnt) + addr = (u8 *)prog->aux->func[off]->bpf_func; + else + return -EINVAL; + } else { + /* Address of a BPF helper call. Since part of the core + * kernel, it's always at a fixed location. __bpf_call_base + * and the helper with imm relative to it are both in core + * kernel. + */ + addr = (u8 *)__bpf_call_base + imm; + } + + *func_addr = (unsigned long)addr; + return 0; +} + static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) From c3e2bd892cfc17e7fe9790e4474e5d913f1a1f73 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 24 Nov 2018 23:20:44 -0800 Subject: [PATCH 0983/1640] UPSTREAM: bpf: btf: support proper non-jit func info Commit 838e96904ff3 ("bpf: Introduce bpf_func_info") added bpf func info support. 
With it, userspace gets better ksyms for BPF programs when the JIT is enabled and can print out func prototypes. For a program containing func-to-func calls, the existing implementation returns the user-specified number of function calls and BTF types only if the JIT is enabled. If the JIT is not enabled, it returns only the type for the main function. This is undesirable: the interpreter may still be used, and the feature should stay identical regardless of whether the JIT is enabled. This patch fixes this discrepancy. Fixes: 838e96904ff3 ("bpf: Introduce bpf_func_info") Change-Id: I208a85d244c88fe71c9301945cec0099b1284459 Signed-off-by: Yonghong Song Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++-- include/linux/bpf_verifier.h | 1 - kernel/bpf/core.c | 3 +- kernel/bpf/syscall.c | 33 ++++++---------------- kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++----------- 5 files changed, 52 insertions(+), 46 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cf4903c8863a..848e0962861f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -304,7 +304,8 @@ struct bpf_prog_aux { u32 max_pkt_offset; u32 stack_depth; u32 id; - u32 func_cnt; + u32 func_cnt; /* used by non-func prog as the number of func progs */ + u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ bool offload_requested; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ @@ -322,7 +323,8 @@ struct bpf_prog_aux { #endif struct bpf_prog_offload *offload; struct btf *btf; - u32 type_id; /* type id for this prog/func */ + struct bpf_func_info *func_info; + u32 func_info_cnt; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0d754d0c1de9..1001573674a1 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -223,7 +223,6 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u16 stack_depth; /* max. stack depth used by this function */ - u32 type_id; /* btf type_id for this subprog */ }; /* single container for all structs diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cc8c76dacf11..6a6a6ceb5eb5 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -419,7 +419,8 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) /* prog->aux->name will be ignored if full btf name is available */ if (prog->aux->btf) { - type = btf_type_by_id(prog->aux->btf, prog->aux->type_id); + type = btf_type_by_id(prog->aux->btf, + prog->aux->func_info[prog->aux->func_idx].type_id); func_name = btf_name_by_offset(prog->aux->btf, type->name_off); snprintf(sym, (size_t)(end - sym), "_%s", func_name); return; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ce7b3a77f4c4..7c9c6db99a47 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1227,6 +1227,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); + kvfree(prog->aux->func_info); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -2254,46 +2255,28 @@ static int bpf_prog_get_info_by_fd(struct file *file, } if (prog->aux->btf) { + u32 krec_size = sizeof(struct bpf_func_info); u32 ucnt, urec_size; info.btf_id = btf_id(prog->aux->btf); ucnt = info.func_info_cnt; - info.func_info_cnt = prog->aux->func_cnt ?
: 1; + info.func_info_cnt = prog->aux->func_info_cnt; urec_size = info.func_info_rec_size; - info.func_info_rec_size = sizeof(struct bpf_func_info); + info.func_info_rec_size = krec_size; if (ucnt) { /* expect passed-in urec_size is what the kernel expects */ if (urec_size != info.func_info_rec_size) return -EINVAL; if (bpf_dump_raw_ok(file->f_cred)) { - struct bpf_func_info kern_finfo; char __user *user_finfo; - u32 i, insn_offset; user_finfo = u64_to_user_ptr(info.func_info); - if (prog->aux->func_cnt) { - ucnt = min_t(u32, info.func_info_cnt, ucnt); - insn_offset = 0; - for (i = 0; i < ucnt; i++) { - kern_finfo.insn_offset = insn_offset; - kern_finfo.type_id = prog->aux->func[i]->aux->type_id; - if (copy_to_user(user_finfo, &kern_finfo, - sizeof(kern_finfo))) - return -EFAULT; - - /* func[i]->len holds the prog len */ - insn_offset += prog->aux->func[i]->len; - user_finfo += urec_size; - } - } else { - kern_finfo.insn_offset = 0; - kern_finfo.type_id = prog->aux->type_id; - if (copy_to_user(user_finfo, &kern_finfo, - sizeof(kern_finfo))) - return -EFAULT; - } + ucnt = min_t(u32, info.func_info_cnt, ucnt); + if (copy_to_user(user_finfo, prog->aux->func_info, + krec_size * ucnt)) + return -EFAULT; } else { info.func_info_cnt = 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 430ceecdd525..0fabe2cb6b23 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5211,7 +5211,7 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, { u32 i, nfuncs, urec_size, min_size, prev_offset; u32 krec_size = sizeof(struct bpf_func_info); - struct bpf_func_info krecord = {}; + struct bpf_func_info *krecord = NULL; const struct btf_type *type; void __user *urecord; struct btf *btf; @@ -5243,6 +5243,12 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, urecord = u64_to_user_ptr(attr->func_info); min_size = min_t(u32, krec_size, urec_size); + krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); + if (!krecord) { + ret = -ENOMEM; + goto free_btf; + } + for (i = 0; i < nfuncs; i++) { ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); if (ret) { @@ -5257,59 +5263,69 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, goto free_btf; } - if (copy_from_user(&krecord, urecord, min_size)) { + if (copy_from_user(&krecord[i], urecord, min_size)) { ret = -EFAULT; goto free_btf; } /* check insn_offset */ if (i == 0) { - if (krecord.insn_offset) { + if (krecord[i].insn_offset) { verbose(env, "nonzero insn_offset %u for the first func info record", - krecord.insn_offset); + krecord[i].insn_offset); ret = -EINVAL; goto free_btf; } - } else if (krecord.insn_offset <= prev_offset) { + } else if (krecord[i].insn_offset <= prev_offset) { verbose(env, "same or smaller insn offset (%u) than previous func info record (%u)", - krecord.insn_offset, prev_offset); + krecord[i].insn_offset, prev_offset); ret = -EINVAL; goto free_btf; } - if (env->subprog_info[i].start != krecord.insn_offset) { + if (env->subprog_info[i].start != krecord[i].insn_offset) { verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); ret = -EINVAL; goto free_btf; } /* check type_id */ - type = btf_type_by_id(btf, krecord.type_id); + type = btf_type_by_id(btf, krecord[i].type_id); if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) { verbose(env, "invalid type id %d in func info", - krecord.type_id); + krecord[i].type_id); ret = -EINVAL; goto free_btf; } - if (i == 0) - 
prog->aux->type_id = krecord.type_id; - env->subprog_info[i].type_id = krecord.type_id; - - prev_offset = krecord.insn_offset; + prev_offset = krecord[i].insn_offset; urecord += urec_size; } prog->aux->btf = btf; + prog->aux->func_info = krecord; + prog->aux->func_info_cnt = nfuncs; return 0; free_btf: btf_put(btf); + kvfree(krecord); return ret; } +static void adjust_btf_func(struct bpf_verifier_env *env) +{ + int i; + + if (!env->prog->aux->func_info) + return; + + for (i = 0; i < env->subprog_cnt; i++) + env->prog->aux->func_info[i].insn_offset = env->subprog_info[i].start; +} + /* check %cur's range satisfies %old's */ static bool range_within(struct bpf_reg_state *old, struct bpf_reg_state *cur) @@ -6611,15 +6627,17 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (bpf_prog_calc_tag(func[i])) goto out_free; func[i]->is_func = 1; + func[i]->aux->func_idx = i; + /* the btf and func_info will be freed only at prog->aux */ + func[i]->aux->btf = prog->aux->btf; + func[i]->aux->func_info = prog->aux->func_info; + /* Use bpf_prog_F_tag to indicate functions in stack traces. * Long term would need debug info to populate names */ func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; - /* the btf will be freed only at prog->aux */ - func[i]->aux->btf = prog->aux->btf; - func[i]->aux->type_id = env->subprog_info[i].type_id; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -7197,6 +7215,9 @@ skip_full_check: convert_pseudo_ld_imm64(env); } + if (ret == 0) + adjust_btf_func(env); + err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release From 900a3e2df8955714b4bf206a2de71787ccc31151 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 1 Dec 2018 17:08:44 -0800 Subject: [PATCH 0984/1640] UPSTREAM: bpf: Fix memleak in aux->func_info and aux->btf The aux->func_info and aux->btf are leaked in the error out cases during bpf_prog_load(). This patch fixes it. Fixes: ba64e7d85252 ("bpf: btf: support proper non-jit func info") Cc: Yonghong Song Change-Id: Ic290514ffa2f0659c835c467c72aaea171c67565 Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7c9c6db99a47..58e84e27372f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1581,6 +1581,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return err; free_used_maps: + kvfree(prog->aux->func_info); + btf_put(prog->aux->btf); bpf_prog_kallsyms_del_subprogs(prog); free_used_maps(prog->aux); free_prog: From faa3075b2393896303bbfca48d7c1a52fb57d483 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 23 Nov 2018 23:18:03 +0100 Subject: [PATCH 0985/1640] UPSTREAM: bpf: add __weak hook for allocating executable memory By default, BPF uses module_alloc() to allocate executable memory, but this is not necessary on all arches and potentially undesirable on some of them. So break out the module_alloc() and module_memfree() calls into __weak functions to allow them to be overridden in arch code. 
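As a hypothetical example of what these hooks enable (not part of this patch; the region constants are invented), an architecture could redirect JIT allocations into a dedicated virtual address range instead of the module area:

	/* Hypothetical arch override of the __weak hooks below.
	 * MY_BPF_JIT_START/END do not exist in this tree and stand
	 * in for an arch-chosen VA range.
	 */
	void *bpf_jit_alloc_exec(unsigned long size)
	{
		return __vmalloc_node_range(size, PAGE_SIZE,
					    MY_BPF_JIT_START, MY_BPF_JIT_END,
					    GFP_KERNEL, PAGE_KERNEL_EXEC,
					    0, NUMA_NO_NODE,
					    __builtin_return_address(0));
	}

	void bpf_jit_free_exec(void *addr)
	{
		vfree(addr);
	}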
Change-Id: I76cc6fb24abbb2d463fe6d2588b8d897e8e5437d Signed-off-by: Ard Biesheuvel Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6a6a6ceb5eb5..e526955dbd5e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -651,6 +651,16 @@ bool __weak arch_bpf_jit_check_func(const struct bpf_prog *prog) EXPORT_SYMBOL_GPL(arch_bpf_jit_check_func); #endif +void *__weak bpf_jit_alloc_exec(unsigned long size) +{ + return module_alloc(size); +} + +void __weak bpf_jit_free_exec(void *addr) +{ + module_memfree(addr); +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -668,7 +678,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, if (bpf_jit_charge_modmem(pages)) return NULL; - hdr = module_alloc(size); + hdr = bpf_jit_alloc_exec(size); if (!hdr) { bpf_jit_uncharge_modmem(pages); return NULL; @@ -696,7 +706,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) #ifdef CONFIG_RKP_MODULE_SUPPORT uh_call(UH_APP_RKP, RKP_BFP_LOAD, (u64)hdr, (u64)(hdr->pages * PAGE_SIZE), RKP_BPF_JIT_FREE, 0); #endif - module_memfree(hdr); + bpf_jit_free_exec(hdr); bpf_jit_uncharge_modmem(pages); } From c783635e96735e65cab9792b3709b35754750c68 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 5 Dec 2018 17:35:43 -0800 Subject: [PATCH 0986/1640] BACKPORT: bpf: Improve the info.func_info and info.func_info_rec_size behavior 1) When bpf_dump_raw_ok() == false and the kernel can provide >=1 func_info to userspace, the current behavior is to set info.func_info_cnt to 0 instead of setting info.func_info to 0. This differs from the behavior of jited_func_lens/nr_jited_func_lens, jited_ksyms/nr_jited_ksyms, etc. This patch fixes it (i.e. set func_info to 0 instead of func_info_cnt to 0 when bpf_dump_raw_ok() == false). 2) When userspace passes in info.func_info_cnt == 0, the kernel sets the expected func_info record size back in info.func_info_rec_size. This is how userspace learns the func_info_rec_size the kernel expects, introduced in commit 838e96904ff3 ("bpf: Introduce bpf_func_info"). An exception is that the expected size is not set when func_info is not available for a bpf_prog, which makes the returned info.func_info_rec_size have different values depending on the returned value of info.func_info_cnt. This patch sets the kernel expected size in info.func_info_rec_size independent of info.func_info_cnt. 3) The current logic only rejects an invalid func_info_rec_size if func_info_cnt is nonzero. This patch also rejects a nonzero info.func_info_rec_size that is not equal to the kernel expected size. 4) Set info.btf_id as long as prog->aux->btf != NULL. That makes the later copy_to_user() code look the same as the others, which is easier to understand and maintain. prog->aux->btf is non-NULL only if prog->aux->func_info_cnt > 0. Breaking info.btf_id away from prog->aux->func_info_cnt is needed for the later line info patch anyway. A similar change is made to bpf_get_prog_name().
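A hedged userspace sketch of the discovery pattern that item 2) enables (two calls to libbpf's bpf_obj_get_info_by_fd(); error handling trimmed, prog_fd assumed to be a loaded program fd):

	struct bpf_prog_info info = {};
	__u32 info_len = sizeof(info);
	__u32 cnt, rec_size;
	struct bpf_func_info *recs;

	/* 1st call with func_info_cnt == 0: the kernel reports the
	 * record count and the record size it expects.
	 */
	bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
	cnt = info.func_info_cnt;
	rec_size = info.func_info_rec_size;

	/* 2nd call: hand back a buffer of the advertised size */
	recs = calloc(cnt, rec_size);
	memset(&info, 0, sizeof(info));
	info.func_info_cnt = cnt;
	info.func_info_rec_size = rec_size;
	info.func_info = (__u64)(unsigned long)recs;
	bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);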
Fixes: 838e96904ff3 ("bpf: Introduce bpf_func_info") Change-Id: I11f04196794fe750dc346d13683de1140f96ea90 Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 44 +++++++++++++++++++------------------------- 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e526955dbd5e..dceab0743228 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -418,7 +418,7 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); /* prog->aux->name will be ignored if full btf name is available */ - if (prog->aux->btf) { + if (prog->aux->func_info_cnt) { type = btf_type_by_id(prog->aux->btf, prog->aux->func_info[prog->aux->func_idx].type_id); func_name = btf_name_by_offset(prog->aux->btf, type->name_off); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 58e84e27372f..e0611f3a33e5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2113,6 +2113,12 @@ static int bpf_prog_get_info_by_fd(struct file *file, return -EFAULT; } + if ((info.func_info_cnt || info.func_info_rec_size) && + info.func_info_rec_size != sizeof(struct bpf_func_info)) + return -EINVAL; + + info.func_info_rec_size = sizeof(struct bpf_func_info); + if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; @@ -2256,35 +2262,23 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } - if (prog->aux->btf) { - u32 krec_size = sizeof(struct bpf_func_info); - u32 ucnt, urec_size; - + if (prog->aux->btf) info.btf_id = btf_id(prog->aux->btf); - ucnt = info.func_info_cnt; - info.func_info_cnt = prog->aux->func_info_cnt; - urec_size = info.func_info_rec_size; - info.func_info_rec_size = krec_size; - if (ucnt) { - /* expect passed-in urec_size is what the kernel expects */ - if (urec_size != info.func_info_rec_size) - return -EINVAL; + ulen = info.func_info_cnt; + info.func_info_cnt = prog->aux->func_info_cnt; + if (info.func_info_cnt && ulen) { + if (bpf_dump_raw_ok(file->f_cred)) { + char __user *user_finfo; - if (bpf_dump_raw_ok(file->f_cred)) { - char __user *user_finfo; - - user_finfo = u64_to_user_ptr(info.func_info); - ucnt = min_t(u32, info.func_info_cnt, ucnt); - if (copy_to_user(user_finfo, prog->aux->func_info, - krec_size * ucnt)) - return -EFAULT; - } else { - info.func_info_cnt = 0; - } + user_finfo = u64_to_user_ptr(info.func_info); + ulen = min_t(u32, info.func_info_cnt, ulen); + if (copy_to_user(user_finfo, prog->aux->func_info, + info.func_info_rec_size * ulen)) + return -EFAULT; + } else { + info.func_info = 0; } - } else { - info.func_info_cnt = 0; } done: From fbd83f59fb77991933c051a9e6ad8edebe5ba15b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 5 Dec 2018 17:35:44 -0800 Subject: [PATCH 0987/1640] UPSTREAM: bpf: Change insn_offset to insn_off in bpf_func_info The later patch will introduce "struct bpf_line_info" which has member "line_off" and "file_off" referring back to the string section in btf. The line_"off" and file_"off" are more consistent to the naming convention in btf.h that means "offset" (e.g. name_off in "struct btf_type"). The to-be-added "struct bpf_line_info" also has another member, "insn_off" which is the same as the "insn_offset" in "struct bpf_func_info". Hence, this patch renames "insn_offset" to "insn_off" for "struct bpf_func_info". 
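To make the renamed field concrete (all values invented), a two-function program whose subprog starts at instruction 42 would carry records like the following; check_btf_func() requires insn_off of the first record to be 0 and subsequent offsets to be strictly increasing, and each type_id must name a BTF_KIND_FUNC type:

	struct bpf_func_info finfo[2] = {
		{ .insn_off = 0,  .type_id = 4 },	/* main prog */
		{ .insn_off = 42, .type_id = 7 },	/* subprog */
	};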
Change-Id: I239669161b1994507bd22141dea4177579fc5447 Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- kernel/bpf/verifier.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6fea445b4955..dd6e0cef9750 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3001,7 +3001,7 @@ struct bpf_flow_keys { }; struct bpf_func_info { - __u32 insn_offset; + __u32 insn_off; __u32 type_id; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0fabe2cb6b23..3e0adef5ce97 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5268,24 +5268,24 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, goto free_btf; } - /* check insn_offset */ + /* check insn_off */ if (i == 0) { - if (krecord[i].insn_offset) { + if (krecord[i].insn_off) { verbose(env, - "nonzero insn_offset %u for the first func info record", - krecord[i].insn_offset); + "nonzero insn_off %u for the first func info record", + krecord[i].insn_off); ret = -EINVAL; goto free_btf; } - } else if (krecord[i].insn_offset <= prev_offset) { + } else if (krecord[i].insn_off <= prev_offset) { verbose(env, "same or smaller insn offset (%u) than previous func info record (%u)", - krecord[i].insn_offset, prev_offset); + krecord[i].insn_off, prev_offset); ret = -EINVAL; goto free_btf; } - if (env->subprog_info[i].start != krecord[i].insn_offset) { + if (env->subprog_info[i].start != krecord[i].insn_off) { verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); ret = -EINVAL; goto free_btf; @@ -5300,7 +5300,7 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, goto free_btf; } - prev_offset = krecord[i].insn_offset; + prev_offset = krecord[i].insn_off; urecord += urec_size; } @@ -5323,7 +5323,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env) return; for (i = 0; i < env->subprog_cnt; i++) - env->prog->aux->func_info[i].insn_offset = env->subprog_info[i].start; + env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start; } /* check %cur's range satisfies %old's */ From b977ff3e69183cd97f28677d035d15ecba7224b2 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Wed, 5 Dec 2018 13:52:34 -0500 Subject: [PATCH 0988/1640] UPSTREAM: bpf: interpreter support BPF_ALU | BPF_ARSH This patch implements interpreting BPF_ALU | BPF_ARSH. Do arithmetic right shift on low 32-bit sub-register, and zero the high 32 bits. Reviewed-by: Jakub Kicinski Change-Id: Ia505f22efa86bed9306bf4a219457b4af9a76c60 Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 52 +++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index dceab0743228..c3525dea65bc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -982,32 +982,34 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); #define BPF_INSN_MAP(INSN_2, INSN_3) \ /* 32 bit ALU operations. */ \ /* Register based. 
*/ \ - INSN_3(ALU, ADD, X), \ - INSN_3(ALU, SUB, X), \ - INSN_3(ALU, AND, X), \ - INSN_3(ALU, OR, X), \ - INSN_3(ALU, LSH, X), \ - INSN_3(ALU, RSH, X), \ - INSN_3(ALU, XOR, X), \ - INSN_3(ALU, MUL, X), \ - INSN_3(ALU, MOV, X), \ - INSN_3(ALU, DIV, X), \ - INSN_3(ALU, MOD, X), \ + INSN_3(ALU, ADD, X), \ + INSN_3(ALU, SUB, X), \ + INSN_3(ALU, AND, X), \ + INSN_3(ALU, OR, X), \ + INSN_3(ALU, LSH, X), \ + INSN_3(ALU, RSH, X), \ + INSN_3(ALU, XOR, X), \ + INSN_3(ALU, MUL, X), \ + INSN_3(ALU, MOV, X), \ + INSN_3(ALU, ARSH, X), \ + INSN_3(ALU, DIV, X), \ + INSN_3(ALU, MOD, X), \ INSN_2(ALU, NEG), \ INSN_3(ALU, END, TO_BE), \ INSN_3(ALU, END, TO_LE), \ /* Immediate based. */ \ - INSN_3(ALU, ADD, K), \ - INSN_3(ALU, SUB, K), \ - INSN_3(ALU, AND, K), \ - INSN_3(ALU, OR, K), \ - INSN_3(ALU, LSH, K), \ - INSN_3(ALU, RSH, K), \ - INSN_3(ALU, XOR, K), \ - INSN_3(ALU, MUL, K), \ - INSN_3(ALU, MOV, K), \ - INSN_3(ALU, DIV, K), \ - INSN_3(ALU, MOD, K), \ + INSN_3(ALU, ADD, K), \ + INSN_3(ALU, SUB, K), \ + INSN_3(ALU, AND, K), \ + INSN_3(ALU, OR, K), \ + INSN_3(ALU, LSH, K), \ + INSN_3(ALU, RSH, K), \ + INSN_3(ALU, XOR, K), \ + INSN_3(ALU, MUL, K), \ + INSN_3(ALU, MOV, K), \ + INSN_3(ALU, ARSH, K), \ + INSN_3(ALU, DIV, K), \ + INSN_3(ALU, MOD, K), \ /* 64 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU64, ADD, X), \ @@ -1187,6 +1189,12 @@ select_insn: DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; insn++; CONT; + ALU_ARSH_X: + DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); + CONT; + ALU_ARSH_K: + DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); + CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; CONT; From 3a7ec174e91b778c7f94fa172932c31bfbdcd7e1 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Wed, 5 Dec 2018 13:52:35 -0500 Subject: [PATCH 0989/1640] UPSTREAM: bpf: verifier remove the rejection on BPF_ALU | BPF_ARSH This patch remove the rejection on BPF_ALU | BPF_ARSH as we have supported them on interpreter and all JIT back-ends Reviewed-by: Jakub Kicinski Change-Id: Iaf84977ffe2fb50704056a19e8245671d303dbf2 Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3e0adef5ce97..778c7a0d4a81 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4115,11 +4115,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { - verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); - return -EINVAL; - } - if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; From f46b0355ad7398715b0929ecb91b41ea077c0501 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 20 Nov 2018 14:08:20 -0800 Subject: [PATCH 0990/1640] UPSTREAM: bpf: fix a compilation error when CONFIG_BPF_SYSCALL is not defined Kernel test robot (lkp@intel.com) reports a compilation error at https://www.spinics.net/lists/netdev/msg534913.html introduced by commit 838e96904ff3 ("bpf: Introduce bpf_func_info"). If CONFIG_BPF is defined and CONFIG_BPF_SYSCALL is not defined, the following error will appear: kernel/bpf/core.c:414: undefined reference to `btf_type_by_id' kernel/bpf/core.c:415: undefined reference to `btf_name_by_offset' When CONFIG_BPF_SYSCALL is not defined, let us define stub inline functions for btf_type_by_id() and btf_name_by_offset() in include/linux/btf.h. 
This way, the compilation failure can be avoided. Fixes: 838e96904ff3 ("bpf: Introduce bpf_func_info") Reported-by: kbuild test robot Cc: Martin KaFai Lau Change-Id: I36eb14214637283d1429dc886c7612b04325e660 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/btf.h b/include/linux/btf.h index 7f2c0a4a45ea..8c2199b5d250 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -46,7 +46,21 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); + +#ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); +#else +static inline const struct btf_type *btf_type_by_id(const struct btf *btf, + u32 type_id) +{ + return NULL; +} +static inline const char *btf_name_by_offset(const struct btf *btf, + u32 offset) +{ + return NULL; +} +#endif #endif From 1efa3f9fdbd87646a4f70d5178cd434517937574 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 2 May 2018 20:12:23 +0200 Subject: [PATCH 0991/1640] UPSTREAM: bpf, x64: fix memleak when not converging on calls The JIT logic in jit_subprogs() is as follows: for each subprog we allocate a bpf_prog with bpf_prog_alloc(), populate it (prog->is_func = 1 here), and pass it to bpf_int_jit_compile(). If a failure occurred during JIT and prog->jited is not set, then we bail out from attempting to JIT the whole program, and punt to the interpreter instead. If JITing succeeded, we fix up BPF call offsets and do another pass through bpf_int_jit_compile() (extra_pass is true at that point) to complete JITing of the calls. Since that requires passing JIT context around, addrs and jit_data from the x86 JIT are freed in the extra_pass in bpf_int_jit_compile() when calls are involved (if not, they can be freed immediately). However, if the JIT image didn't converge in the original pass, we leak addrs and jit_data: image itself is NULL, prog->is_func is set, and extra_pass is false in that case, so both become unreachable and are never cleaned up. Therefore we need to free them on !image as well. Only the x64 JIT is affected. Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: David S. Miller Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a2a10a66070e..bba4abce1772 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1262,7 +1262,7 @@ out_image: prog = orig_prog; } - if (!prog->is_func || extra_pass) { + if (!image || !prog->is_func || extra_pass) { out_addrs: kfree(addrs); kfree(jit_data); From 67ccc1f25b8ed8ebb1f5a30f2966920186ef28bc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 7 Dec 2018 16:42:25 -0800 Subject: [PATCH 0992/1640] BACKPORT: bpf: Add bpf_line_info support This patch adds bpf_line_info support. It accepts an array of bpf_line_info objects during BPF_PROG_LOAD. The "line_info", "line_info_cnt" and "line_info_rec_size" are added to the "union bpf_attr". The "line_info_rec_size" makes bpf_line_info extensible in the future. The new "check_btf_line()" ensures the userspace line_info is valid for the kernel to use.
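For reference, a sketch of the records a loader might pass (all offsets invented): file_name_off and line_off must be valid offsets into the BTF string section, insn_off must be strictly increasing within a prog, and line_col packs line and column as the BPF_LINE_INFO_LINE_NUM()/BPF_LINE_INFO_LINE_COL() macros added to the uapi header below expect.

	struct bpf_line_info linfo[2] = {
		{ .insn_off = 0, .file_name_off = 10, .line_off = 25,
		  .line_col = (42 << 10) | 3 },	/* line 42, col 3 */
		{ .insn_off = 5, .file_name_off = 10, .line_off = 60,
		  .line_col = (43 << 10) | 1 },	/* line 43, col 1 */
	};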
When the verifier is translating/patching the bpf_prog (through "bpf_patch_insn_single()"), the line_infos' insn_off is also adjusted by the newly added "bpf_adj_linfo()". If the bpf_prog is jited, this patch also provides the jited addrs (in aux->jited_linfo) for the corresponding line_info.insn_off. "bpf_prog_fill_jited_linfo()" is added to fill the aux->jited_linfo. It is currently called by the x86 jit. Other jits can also use "bpf_prog_fill_jited_linfo()" and it will be done in the followup patches. In the future, if it deemed necessary, a particular jit could also provide its own "bpf_prog_fill_jited_linfo()" implementation. A few "*line_info*" fields are added to the bpf_prog_info such that the user can get the xlated line_info back (i.e. the line_info with its insn_off reflecting the translated prog). The jited_line_info is available if the prog is jited. It is an array of __u64. If the prog is not jited, jited_line_info_cnt is 0. The verifier's verbose log with line_info will be done in a follow up patch. Change-Id: Iec9c1ec3b75aa358d4961d2434e2dee92543f193 Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 2 + include/linux/bpf.h | 21 ++++ include/linux/bpf_verifier.h | 1 + include/linux/btf.h | 1 + include/linux/filter.h | 7 ++ include/uapi/linux/bpf.h | 19 ++++ kernel/bpf/btf.c | 2 +- kernel/bpf/core.c | 118 ++++++++++++++++++++- kernel/bpf/syscall.c | 83 +++++++++++++-- kernel/bpf/verifier.c | 198 ++++++++++++++++++++++++++++++----- 10 files changed, 419 insertions(+), 33 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index bba4abce1772..e9f20eafafbc 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1263,6 +1263,8 @@ out_image: } if (!image || !prog->is_func || extra_pass) { + if (image) + bpf_prog_fill_jited_linfo(prog, addrs); out_addrs: kfree(addrs); kfree(jit_data); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 848e0962861f..09b4f77fd0a9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -324,7 +324,28 @@ struct bpf_prog_aux { struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; + /* bpf_line_info loaded from userspace. linfo->insn_off + * has the xlated insn offset. + * Both the main and sub prog share the same linfo. + * The subprog can access its first linfo by + * using the linfo_idx. + */ + struct bpf_line_info *linfo; + /* jited_linfo is the jited addr of the linfo. It has a + * one to one mapping to linfo: + * jited_linfo[i] is the jited addr for the linfo[i]->insn_off. + * Both the main and sub prog share the same jited_linfo. + * The subprog can access its first jited_linfo by + * using the linfo_idx. + */ + void **jited_linfo; u32 func_info_cnt; + u32 nr_linfo; + /* subprog can use linfo_idx to access its first linfo and + * jited_linfo. + * main prog always has linfo_idx == 0 + */ + u32 linfo_idx; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1001573674a1..c3b9c7ebd8e0 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -222,6 +222,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ + u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. 
stack depth used by this function */ }; diff --git a/include/linux/btf.h b/include/linux/btf.h index 8c2199b5d250..b98405a56383 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -46,6 +46,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); +bool btf_name_offset_valid(const struct btf *btf, u32 offset); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); diff --git a/include/linux/filter.h b/include/linux/filter.h index ad7c95a503d5..2a4d16122bec 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -802,6 +802,13 @@ void bpf_prog_free(struct bpf_prog *fp); bool bpf_opcode_in_insntable(u8 code); +void bpf_prog_free_linfo(struct bpf_prog *prog); +void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, + const u32 *insn_to_jit_off); +int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog); +void bpf_prog_free_jited_linfo(struct bpf_prog *prog); +void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog); + struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dd6e0cef9750..3711679a9035 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -358,6 +358,9 @@ union bpf_attr { __u32 func_info_rec_size; /* userspace bpf_func_info size */ __aligned_u64 func_info; /* func info */ __u32 func_info_cnt; /* number of bpf_func_info records */ + __u32 line_info_rec_size; /* userspace bpf_line_info size */ + __aligned_u64 line_info; /* line info */ + __u32 line_info_cnt; /* number of bpf_line_info records */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -2689,6 +2692,12 @@ struct bpf_prog_info { __u32 func_info_rec_size; __aligned_u64 func_info; __u32 func_info_cnt; + __u32 line_info_cnt; + __aligned_u64 line_info; + __aligned_u64 jited_line_info; + __u32 jited_line_info_cnt; + __u32 line_info_rec_size; + __u32 jited_line_info_rec_size; } __attribute__((aligned(8))); struct bpf_map_info { @@ -3005,4 +3014,14 @@ struct bpf_func_info { __u32 type_id; }; +#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) +#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) + +struct bpf_line_info { + __u32 insn_off; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 170081b29a2e..5ac21e49b7d6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -444,7 +444,7 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) return kind_ops[BTF_INFO_KIND(t->info)]; } -static bool btf_name_offset_valid(const struct btf *btf, u32 offset) +bool btf_name_offset_valid(const struct btf *btf, u32 offset) { return BTF_STR_OFFSET_VALID(offset) && offset < btf->hdr.str_len; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c3525dea65bc..f9c70cff2818 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -114,6 +114,91 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) } EXPORT_SYMBOL_GPL(bpf_prog_alloc); +int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) +{ + if (!prog->aux->nr_linfo || !prog->jit_requested) + return 0; + + prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo, + sizeof(*prog->aux->jited_linfo), + GFP_KERNEL | __GFP_NOWARN); + if 
(!prog->aux->jited_linfo) + return -ENOMEM; + + return 0; +} + +void bpf_prog_free_jited_linfo(struct bpf_prog *prog) +{ + kfree(prog->aux->jited_linfo); + prog->aux->jited_linfo = NULL; +} + +void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog) +{ + if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0]) + bpf_prog_free_jited_linfo(prog); +} + +/* The jit engine is responsible to provide an array + * for insn_off to the jited_off mapping (insn_to_jit_off). + * + * The idx to this array is the insn_off. Hence, the insn_off + * here is relative to the prog itself instead of the main prog. + * This array has one entry for each xlated bpf insn. + * + * jited_off is the byte off to the last byte of the jited insn. + * + * Hence, with + * insn_start: + * The first bpf insn off of the prog. The insn off + * here is relative to the main prog. + * e.g. if prog is a subprog, insn_start > 0 + * linfo_idx: + * The prog's idx to prog->aux->linfo and jited_linfo + * + * jited_linfo[linfo_idx] = prog->bpf_func + * + * For i > linfo_idx, + * + * jited_linfo[i] = prog->bpf_func + + * insn_to_jit_off[linfo[i].insn_off - insn_start - 1] + */ +void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, + const u32 *insn_to_jit_off) +{ + u32 linfo_idx, insn_start, insn_end, nr_linfo, i; + const struct bpf_line_info *linfo; + void **jited_linfo; + + if (!prog->aux->jited_linfo) + /* Userspace did not provide linfo */ + return; + + linfo_idx = prog->aux->linfo_idx; + linfo = &prog->aux->linfo[linfo_idx]; + insn_start = linfo[0].insn_off; + insn_end = insn_start + prog->len; + + jited_linfo = &prog->aux->jited_linfo[linfo_idx]; + jited_linfo[0] = prog->bpf_func; + + nr_linfo = prog->aux->nr_linfo - linfo_idx; + + for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++) + /* The verifier ensures that linfo[i].insn_off is + * strictly increasing + */ + jited_linfo[i] = prog->bpf_func + + insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; +} + +void bpf_prog_free_linfo(struct bpf_prog *prog) +{ + bpf_prog_free_jited_linfo(prog); + kvfree(prog->aux->linfo); +} + struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { @@ -303,6 +388,26 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, return ret; } +static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta) +{ + struct bpf_line_info *linfo; + u32 i, nr_linfo; + + nr_linfo = prog->aux->nr_linfo; + if (!nr_linfo || !delta) + return; + + linfo = prog->aux->linfo; + + for (i = 0; i < nr_linfo; i++) + if (off < linfo[i].insn_off) + break; + + /* Push all off < linfo[i].insn_off by delta */ + for (; i < nr_linfo; i++) + linfo[i].insn_off += delta; +} + struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len) { @@ -358,6 +463,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, */ BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false)); + bpf_adj_linfo(prog_adj, off, insn_delta); + return prog_adj; } @@ -1653,13 +1760,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * be JITed, but falls back to the interpreter. 
*/ if (!bpf_prog_is_dev_bound(fp->aux)) { + *err = bpf_prog_alloc_jited_linfo(fp); + if (*err) + return fp; + fp = bpf_int_jit_compile(fp); -#ifdef CONFIG_BPF_JIT_ALWAYS_ON if (!fp->jited) { + bpf_prog_free_jited_linfo(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON *err = -ENOTSUPP; return fp; - } #endif + } else { + bpf_prog_free_unused_jited_linfo(fp); + } } else { *err = bpf_prog_offload_compile(fp); if (*err) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e0611f3a33e5..c3f8b7b1e906 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1228,6 +1228,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); kvfree(prog->aux->func_info); + bpf_prog_free_linfo(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -1454,7 +1455,7 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD func_info_cnt +#define BPF_PROG_LOAD_LAST_FIELD line_info_cnt static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -1581,6 +1582,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return err; free_used_maps: + bpf_prog_free_linfo(prog); kvfree(prog->aux->func_info); btf_put(prog->aux->btf); bpf_prog_kallsyms_del_subprogs(prog); @@ -2069,6 +2071,37 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, return insns; } +static int set_info_rec_size(struct bpf_prog_info *info) +{ + /* + * Ensure info.*_rec_size is the same as kernel expected size + * + * or + * + * Only allow zero *_rec_size if both _rec_size and _cnt are + * zero. In this case, the kernel will set the expected + * _rec_size back to the info. 
+ */ + + if ((info->func_info_cnt || info->func_info_rec_size) && + info->func_info_rec_size != sizeof(struct bpf_func_info)) + return -EINVAL; + + if ((info->line_info_cnt || info->line_info_rec_size) && + info->line_info_rec_size != sizeof(struct bpf_line_info)) + return -EINVAL; + + if ((info->jited_line_info_cnt || info->jited_line_info_rec_size) && + info->jited_line_info_rec_size != sizeof(__u64)) + return -EINVAL; + + info->func_info_rec_size = sizeof(struct bpf_func_info); + info->line_info_rec_size = sizeof(struct bpf_line_info); + info->jited_line_info_rec_size = sizeof(__u64); + + return 0; +} + static int bpf_prog_get_info_by_fd(struct file *file, struct bpf_prog *prog, const union bpf_attr *attr, @@ -2113,11 +2146,9 @@ static int bpf_prog_get_info_by_fd(struct file *file, return -EFAULT; } - if ((info.func_info_cnt || info.func_info_rec_size) && - info.func_info_rec_size != sizeof(struct bpf_func_info)) - return -EINVAL; - - info.func_info_rec_size = sizeof(struct bpf_func_info); + err = set_info_rec_size(&info); + if (err) + return err; if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; @@ -2125,6 +2156,8 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.nr_jited_ksyms = 0; info.nr_jited_func_lens = 0; info.func_info_cnt = 0; + info.line_info_cnt = 0; + info.jited_line_info_cnt = 0; goto done; } @@ -2281,6 +2314,44 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } + ulen = info.line_info_cnt; + info.line_info_cnt = prog->aux->nr_linfo; + if (info.line_info_cnt && ulen) { + if (bpf_dump_raw_ok(file->f_cred)) { + __u8 __user *user_linfo; + + user_linfo = u64_to_user_ptr(info.line_info); + ulen = min_t(u32, info.line_info_cnt, ulen); + if (copy_to_user(user_linfo, prog->aux->linfo, + info.line_info_rec_size * ulen)) + return -EFAULT; + } else { + info.line_info = 0; + } + } + + ulen = info.jited_line_info_cnt; + if (prog->aux->jited_linfo) + info.jited_line_info_cnt = prog->aux->nr_linfo; + else + info.jited_line_info_cnt = 0; + if (info.jited_line_info_cnt && ulen) { + if (bpf_dump_raw_ok(file->f_cred)) { + __u64 __user *user_linfo; + u32 i; + + user_linfo = u64_to_user_ptr(info.jited_line_info); + ulen = min_t(u32, info.jited_line_info_cnt, ulen); + for (i = 0; i < ulen; i++) { + if (put_user((__u64)(long)prog->aux->jited_linfo[i], + &user_linfo[i])) + return -EFAULT; + } + } else { + info.jited_line_info = 0; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 778c7a0d4a81..6fdb6d85187c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5201,15 +5201,17 @@ err_free: #define MIN_BPF_FUNCINFO_SIZE 8 #define MAX_FUNCINFO_REC_SIZE 252 -static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, - union bpf_attr *attr, union bpf_attr __user *uattr) +static int check_btf_func(struct bpf_verifier_env *env, + const union bpf_attr *attr, + union bpf_attr __user *uattr) { u32 i, nfuncs, urec_size, min_size, prev_offset; u32 krec_size = sizeof(struct bpf_func_info); - struct bpf_func_info *krecord = NULL; + struct bpf_func_info *krecord; const struct btf_type *type; + struct bpf_prog *prog; + const struct btf *btf; void __user *urecord; - struct btf *btf; int ret = 0; nfuncs = attr->func_info_cnt; @@ -5229,20 +5231,15 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, return -EINVAL; } - btf = btf_get_by_fd(attr->prog_btf_fd); - if (IS_ERR(btf)) { - verbose(env, "unable to get btf 
from fd\n"); - return PTR_ERR(btf); - } + prog = env->prog; + btf = prog->aux->btf; urecord = u64_to_user_ptr(attr->func_info); min_size = min_t(u32, krec_size, urec_size); krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); - if (!krecord) { - ret = -ENOMEM; - goto free_btf; - } + if (!krecord) + return -ENOMEM; for (i = 0; i < nfuncs; i++) { ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); @@ -5255,12 +5252,12 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, if (put_user(min_size, &uattr->func_info_rec_size)) ret = -EFAULT; } - goto free_btf; + goto err_free; } if (copy_from_user(&krecord[i], urecord, min_size)) { ret = -EFAULT; - goto free_btf; + goto err_free; } /* check insn_off */ @@ -5270,20 +5267,20 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, "nonzero insn_off %u for the first func info record", krecord[i].insn_off); ret = -EINVAL; - goto free_btf; + goto err_free; } } else if (krecord[i].insn_off <= prev_offset) { verbose(env, "same or smaller insn offset (%u) than previous func info record (%u)", krecord[i].insn_off, prev_offset); ret = -EINVAL; - goto free_btf; + goto err_free; } if (env->subprog_info[i].start != krecord[i].insn_off) { verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); ret = -EINVAL; - goto free_btf; + goto err_free; } /* check type_id */ @@ -5292,20 +5289,18 @@ static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env, verbose(env, "invalid type id %d in func info", krecord[i].type_id); ret = -EINVAL; - goto free_btf; + goto err_free; } prev_offset = krecord[i].insn_off; urecord += urec_size; } - prog->aux->btf = btf; prog->aux->func_info = krecord; prog->aux->func_info_cnt = nfuncs; return 0; -free_btf: - btf_put(btf); +err_free: kvfree(krecord); return ret; } @@ -5321,6 +5316,150 @@ static void adjust_btf_func(struct bpf_verifier_env *env) env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start; } +#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \ + sizeof(((struct bpf_line_info *)(0))->line_col)) +#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE + +static int check_btf_line(struct bpf_verifier_env *env, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0; + struct bpf_subprog_info *sub; + struct bpf_line_info *linfo; + struct bpf_prog *prog; + const struct btf *btf; + void __user *ulinfo; + int err; + + nr_linfo = attr->line_info_cnt; + if (!nr_linfo) + return 0; + + rec_size = attr->line_info_rec_size; + if (rec_size < MIN_BPF_LINEINFO_SIZE || + rec_size > MAX_LINEINFO_REC_SIZE || + rec_size & (sizeof(u32) - 1)) + return -EINVAL; + + /* Need to zero it in case the userspace may + * pass in a smaller bpf_line_info object. 
+ */ + linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info), + GFP_KERNEL | __GFP_NOWARN); + if (!linfo) + return -ENOMEM; + + prog = env->prog; + btf = prog->aux->btf; + + s = 0; + sub = env->subprog_info; + ulinfo = u64_to_user_ptr(attr->line_info); + expected_size = sizeof(struct bpf_line_info); + ncopy = min_t(u32, expected_size, rec_size); + for (i = 0; i < nr_linfo; i++) { + err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size); + if (err) { + if (err == -E2BIG) { + verbose(env, "nonzero tailing record in line_info"); + if (put_user(expected_size, + &uattr->line_info_rec_size)) + err = -EFAULT; + } + goto err_free; + } + + if (copy_from_user(&linfo[i], ulinfo, ncopy)) { + err = -EFAULT; + goto err_free; + } + + /* + * Check insn_off to ensure + * 1) strictly increasing AND + * 2) bounded by prog->len + * + * The linfo[0].insn_off == 0 check logically falls into + * the later "missing bpf_line_info for func..." case + * because the first linfo[0].insn_off must be the + * first sub also and the first sub must have + * subprog_info[0].start == 0. + */ + if ((i && linfo[i].insn_off <= prev_offset) || + linfo[i].insn_off >= prog->len) { + verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n", + i, linfo[i].insn_off, prev_offset, + prog->len); + err = -EINVAL; + goto err_free; + } + + if (!btf_name_offset_valid(btf, linfo[i].line_off) || + !btf_name_offset_valid(btf, linfo[i].file_name_off)) { + verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); + err = -EINVAL; + goto err_free; + } + + if (s != env->subprog_cnt) { + if (linfo[i].insn_off == sub[s].start) { + sub[s].linfo_idx = i; + s++; + } else if (sub[s].start < linfo[i].insn_off) { + verbose(env, "missing bpf_line_info for func#%u\n", s); + err = -EINVAL; + goto err_free; + } + } + + prev_offset = linfo[i].insn_off; + ulinfo += rec_size; + } + + if (s != env->subprog_cnt) { + verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n", + env->subprog_cnt - s, s); + err = -EINVAL; + goto err_free; + } + + prog->aux->linfo = linfo; + prog->aux->nr_linfo = nr_linfo; + + return 0; + +err_free: + kvfree(linfo); + return err; +} + +static int check_btf_info(struct bpf_verifier_env *env, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct btf *btf; + int err; + + if (!attr->func_info_cnt && !attr->line_info_cnt) + return 0; + + btf = btf_get_by_fd(attr->prog_btf_fd); + if (IS_ERR(btf)) + return PTR_ERR(btf); + env->prog->aux->btf = btf; + + err = check_btf_func(env, attr, uattr); + if (err) + return err; + + err = check_btf_line(env, attr, uattr); + if (err) + return err; + + return 0; +} + /* check %cur's range satisfies %old's */ static bool range_within(struct bpf_reg_state *old, struct bpf_reg_state *cur) @@ -6572,7 +6711,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) int i, j, subprog_start, subprog_end = 0, len, subprog; struct bpf_insn *insn; void *old_bpf_func; - int err = -ENOMEM; + int err; if (env->subprog_cnt <= 1) return 0; @@ -6603,6 +6742,11 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->imm = 1; } + err = bpf_prog_alloc_jited_linfo(prog); + if (err) + goto out_undo_insn; + + err = -ENOMEM; func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); if (!func) goto out_undo_insn; @@ -6633,6 +6777,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->aux->linfo = 
prog->aux->linfo; + func[i]->aux->nr_linfo = prog->aux->nr_linfo; + func[i]->aux->jited_linfo = prog->aux->jited_linfo; + func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -6706,6 +6854,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->bpf_func = func[0]->bpf_func; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; + bpf_prog_free_unused_jited_linfo(prog); return 0; out_free: for (i = 0; i < env->subprog_cnt; i++) @@ -6722,6 +6871,7 @@ out_undo_insn: insn->off = 0; insn->imm = env->insn_aux_data[i].call_imm; } + bpf_prog_free_jited_linfo(prog); return err; } @@ -7149,7 +7299,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; - ret = check_btf_func(env->prog, env, attr, uattr); + ret = check_btf_info(env, attr, uattr); if (ret < 0) goto skip_full_check; From c1c9c7502711e9d9136f1dbd8b95a7c95aa49a29 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 10 Dec 2018 11:17:50 -0800 Subject: [PATCH 0993/1640] BACKPORT: bpf: clean up bpf_prog_get_info_by_fd() info.nr_jited_ksyms and info.nr_jited_func_lens cannot be 0 in these two statements, so we don't need to check them. Change-Id: I15baf7ffba2f2bdca56c1f3fcd3bc2fb6c290401 Signed-off-by: Song Liu Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c3f8b7b1e906..55100fc7f84b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2239,7 +2239,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; - if (info.nr_jited_ksyms && ulen) { + if (ulen) { if (bpf_dump_raw_ok(file->f_cred)) { unsigned long ksym_addr; u64 __user *user_ksyms; @@ -2270,7 +2270,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; - if (info.nr_jited_func_lens && ulen) { + if (ulen) { if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; From 11f4afe6516488d66248a49a965342c62564a81c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 10 Dec 2018 14:14:08 -0800 Subject: [PATCH 0994/1640] BACKPORT: bpf: rename *_info_cnt to nr_*_info in bpf_prog_info In uapi bpf.h, currently we have the following fields in the struct bpf_prog_info: __u32 func_info_cnt; __u32 line_info_cnt; __u32 jited_line_info_cnt; The above field names "func_info_cnt" and "line_info_cnt" also appear in union bpf_attr for program loading. The original intention is to keep the names the same between bpf_prog_info and bpf_attr so it will imply what we returned to user space will be the same as what the user space passed to the kernel. Such a naming convention in bpf_prog_info is not consistent with other fields like: __u32 nr_jited_ksyms; __u32 nr_jited_func_lens; This patch made this adjustment so in bpf_prog_info newly introduced *_info_cnt becomes nr_*_info. 
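As a hedged sketch of the effect (not from the patch itself; error
handling trimmed), a BPF_OBJ_GET_INFO_BY_FD query now reads the counts
from the nr_* names:

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static void print_counts(int prog_fd)
{
	struct bpf_prog_info info;
	union bpf_attr attr;

	memset(&info, 0, sizeof(info));
	memset(&attr, 0, sizeof(attr));
	attr.info.bpf_fd = prog_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (__u64)(unsigned long)&info;

	/* with zeroed pointers, the kernel only reports the counts */
	if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
		return;

	/* renamed fields: *_info_cnt -> nr_*_info */
	printf("nr_func_info=%u nr_line_info=%u nr_jited_line_info=%u\n",
	       info.nr_func_info, info.nr_line_info,
	       info.nr_jited_line_info);
}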
Acked-by: Song Liu Acked-by: Martin KaFai Lau Change-Id: If09e2055af54d981612e2a9d4e9aca402ed3119c Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 6 +++--- kernel/bpf/syscall.c | 38 +++++++++++++++++++------------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3711679a9035..a841e411e2fb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2691,11 +2691,11 @@ struct bpf_prog_info { __u32 btf_id; __u32 func_info_rec_size; __aligned_u64 func_info; - __u32 func_info_cnt; - __u32 line_info_cnt; + __u32 nr_func_info; + __u32 nr_line_info; __aligned_u64 line_info; __aligned_u64 jited_line_info; - __u32 jited_line_info_cnt; + __u32 nr_jited_line_info; __u32 line_info_rec_size; __u32 jited_line_info_rec_size; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 55100fc7f84b..0320a1c0a64d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2083,15 +2083,15 @@ static int set_info_rec_size(struct bpf_prog_info *info) * _rec_size back to the info. */ - if ((info->func_info_cnt || info->func_info_rec_size) && + if ((info->nr_func_info || info->func_info_rec_size) && info->func_info_rec_size != sizeof(struct bpf_func_info)) return -EINVAL; - if ((info->line_info_cnt || info->line_info_rec_size) && + if ((info->nr_line_info || info->line_info_rec_size) && info->line_info_rec_size != sizeof(struct bpf_line_info)) return -EINVAL; - if ((info->jited_line_info_cnt || info->jited_line_info_rec_size) && + if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && info->jited_line_info_rec_size != sizeof(__u64)) return -EINVAL; @@ -2155,9 +2155,9 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; info.nr_jited_func_lens = 0; - info.func_info_cnt = 0; - info.line_info_cnt = 0; - info.jited_line_info_cnt = 0; + info.nr_func_info = 0; + info.nr_line_info = 0; + info.nr_jited_line_info = 0; goto done; } @@ -2298,14 +2298,14 @@ static int bpf_prog_get_info_by_fd(struct file *file, if (prog->aux->btf) info.btf_id = btf_id(prog->aux->btf); - ulen = info.func_info_cnt; - info.func_info_cnt = prog->aux->func_info_cnt; - if (info.func_info_cnt && ulen) { + ulen = info.nr_func_info; + info.nr_func_info = prog->aux->func_info_cnt; + if (info.nr_func_info && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { char __user *user_finfo; user_finfo = u64_to_user_ptr(info.func_info); - ulen = min_t(u32, info.func_info_cnt, ulen); + ulen = min_t(u32, info.nr_func_info, ulen); if (copy_to_user(user_finfo, prog->aux->func_info, info.func_info_rec_size * ulen)) return -EFAULT; @@ -2314,14 +2314,14 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } - ulen = info.line_info_cnt; - info.line_info_cnt = prog->aux->nr_linfo; - if (info.line_info_cnt && ulen) { + ulen = info.nr_line_info; + info.nr_line_info = prog->aux->nr_linfo; + if (info.nr_line_info && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { __u8 __user *user_linfo; user_linfo = u64_to_user_ptr(info.line_info); - ulen = min_t(u32, info.line_info_cnt, ulen); + ulen = min_t(u32, info.nr_line_info, ulen); if (copy_to_user(user_linfo, prog->aux->linfo, info.line_info_rec_size * ulen)) return -EFAULT; @@ -2330,18 +2330,18 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } - ulen = info.jited_line_info_cnt; + ulen = info.nr_jited_line_info; if (prog->aux->jited_linfo) - info.jited_line_info_cnt = 
prog->aux->nr_linfo; + info.nr_jited_line_info = prog->aux->nr_linfo; else - info.jited_line_info_cnt = 0; - if (info.jited_line_info_cnt && ulen) { + info.nr_jited_line_info = 0; + if (info.nr_jited_line_info && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { __u64 __user *user_linfo; u32 i; user_linfo = u64_to_user_ptr(info.jited_line_info); - ulen = min_t(u32, info.jited_line_info_cnt, ulen); + ulen = min_t(u32, info.nr_jited_line_info, ulen); for (i = 0; i < ulen; i++) { if (put_user((__u64)(long)prog->aux->jited_linfo[i], &user_linfo[i])) From 7a056468d82729d2aee9ac96909cd1ce8a416092 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 10 Dec 2018 15:43:00 -0800 Subject: [PATCH 0995/1640] UPSTREAM: bpf: pass struct btf pointer to the map_check_btf() callback If key_type or value_type are of non-trivial data types (e.g. structure or typedef), it's not possible to check them without the additional information, which can't be obtained without a pointer to the btf structure. So, let's pass btf pointer to the map_check_btf() callbacks. Change-Id: I95716060b450288d4ffcbe231d1cf5fdb530e292 Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ kernel/bpf/arraymap.c | 1 + kernel/bpf/lpm_trie.c | 1 + kernel/bpf/syscall.c | 3 ++- 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 09b4f77fd0a9..c22de2495772 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,6 +23,7 @@ struct bpf_prog; struct bpf_map; struct sock; struct seq_file; +struct btf; struct btf_type; /* map is generic key/value storage optionally accesible by eBPF programs */ @@ -57,6 +58,7 @@ struct bpf_map_ops { void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); int (*map_check_btf)(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); }; @@ -131,6 +133,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map) } int map_check_no_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8eac775ab709..73ee1cbf0604 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -385,6 +385,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, } static int array_map_check_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index ac461d3eae34..5373298670a5 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -737,6 +737,7 @@ free_stack: } static int trie_check_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0320a1c0a64d..bc16b85be524 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -457,6 +457,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) } int map_check_no_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { @@ -479,7 +480,7 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf, return -EINVAL; if (map->ops->map_check_btf) - ret = 
map->ops->map_check_btf(map, key_type, value_type); + ret = map->ops->map_check_btf(map, btf, key_type, value_type); return ret; } From 307bef0593b0f034cf7d6e56542aeb5f647ccffe Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 10 Dec 2018 15:43:01 -0800 Subject: [PATCH 0996/1640] UPSTREAM: bpf: add bpffs pretty print for cgroup local storage maps Implement bpffs pretty printing for cgroup local storage maps (both shared and per-cpu). Output example (captured for tools/testing/selftests/bpf/netcnt_prog.c): Shared: $ cat /sys/fs/bpf/map_2 # WARNING!! The output is for debug purpose only # WARNING!! The output format will change {4294968594,1}: {9999,1039896} Per-cpu: $ cat /sys/fs/bpf/map_1 # WARNING!! The output is for debug purpose only # WARNING!! The output format will change {4294968594,1}: { cpu0: {0,0,0,0,0} cpu1: {0,0,0,0,0} cpu2: {1,104,0,0,0} cpu3: {0,0,0,0,0} } Change-Id: Ib43827c88ab44a20075aac575d5d10f2b405d6d9 Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + kernel/bpf/btf.c | 22 +++++++++ kernel/bpf/local_storage.c | 93 +++++++++++++++++++++++++++++++++++++- 3 files changed, 115 insertions(+), 1 deletion(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index b98405a56383..a4cf075b89eb 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -47,6 +47,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); bool btf_name_offset_valid(const struct btf *btf, u32 offset); +bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5ac21e49b7d6..e69aa33c89cd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -514,6 +514,28 @@ static bool btf_type_int_is_regular(const struct btf_type *t) return true; } +/* + * Check that given type is a regular int and has the expected size. + */ +bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size) +{ + u8 nr_bits, nr_bytes; + u32 int_data; + + if (!btf_type_is_int(t)) + return false; + + int_data = btf_type_int(t); + nr_bits = BTF_INT_BITS(int_data); + nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + if (BITS_PER_BYTE_MASKED(nr_bits) || + BTF_INT_OFFSET(int_data) || + nr_bytes != expected_size) + return false; + + return true; +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) 
{ diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index b65017dead44..5eca03da0989 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -1,11 +1,13 @@ //SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include #include #include +#include DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); @@ -308,6 +310,94 @@ static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } +static int cgroup_storage_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + const struct btf_type *t; + struct btf_member *m; + u32 id, size; + + /* Key is expected to be of struct bpf_cgroup_storage_key type, + * which is: + * struct bpf_cgroup_storage_key { + * __u64 cgroup_inode_id; + * __u32 attach_type; + * }; + */ + + /* + * Key_type must be a structure with two fields. + */ + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || + BTF_INFO_VLEN(key_type->info) != 2) + return -EINVAL; + + /* + * The first field must be a 64 bit integer at 0 offset. + */ + m = (struct btf_member *)(key_type + 1); + if (m->offset) + return -EINVAL; + id = m->type; + t = btf_type_id_size(btf, &id, NULL); + size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id); + if (!t || !btf_type_is_reg_int(t, size)) + return -EINVAL; + + /* + * The second field must be a 32 bit integer at 64 bit offset. + */ + m++; + if (m->offset != offsetof(struct bpf_cgroup_storage_key, attach_type) * + BITS_PER_BYTE) + return -EINVAL; + id = m->type; + t = btf_type_id_size(btf, &id, NULL); + size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type); + if (!t || !btf_type_is_reg_int(t, size)) + return -EINVAL; + + return 0; +} + +static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, + struct seq_file *m) +{ + enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + int cpu; + + rcu_read_lock(); + storage = cgroup_storage_lookup(map_to_storage(map), key, false); + if (!storage) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + stype = cgroup_storage_type(map); + if (stype == BPF_CGROUP_STORAGE_SHARED) { + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, + &READ_ONCE(storage->buf)->data[0], m); + seq_puts(m, "\n"); + } else { + seq_puts(m, ": {\n"); + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(storage->percpu_buf, cpu), + m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + } + rcu_read_unlock(); +} + const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, @@ -315,7 +405,8 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_lookup_elem = cgroup_storage_lookup_elem, .map_update_elem = cgroup_storage_update_elem, .map_delete_elem = cgroup_storage_delete_elem, - .map_check_btf = map_check_no_btf, + .map_check_btf = cgroup_storage_check_btf, + .map_seq_show_elem = cgroup_storage_seq_show_elem, }; int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) From 90250c00ddd1bd53971e598af26cfa929eadb486 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 12 Dec 2018 10:18:21 -0800 Subject: [PATCH 0997/1640] BACKPORT: bpf: Remove bpf_dump_raw_ok() check 
 for func_info and line_info

The func_info and line_info have the bpf insn offset but they do not
contain kernel addresses. They will still be useful for the userspace
tool to annotate the xlated insn.

This patch removes the bpf_dump_raw_ok() guard for the func_info and
line_info during bpf_prog_get_info_by_fd(). The guard stays for
jited_line_info, which contains kernel addresses.

Although this bpf_dump_raw_ok() guard behavior dates back to the
earlier func_info patch series, I marked the Fixes tag to the latest
line_info patch series, which contains both func_info and line_info,
and this patch fixes both of them.

Fixes: c454a46b5efd ("bpf: Add bpf_line_info support")
Change-Id: Ibabbd53dfb5d10bb2df2bbc6e7dbd6ed6891e1bf
Signed-off-by: Martin KaFai Lau
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/syscall.c | 32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bc16b85be524..a142b5b0126c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2302,33 +2302,25 @@ static int bpf_prog_get_info_by_fd(struct file *file,
 	ulen = info.nr_func_info;
 	info.nr_func_info = prog->aux->func_info_cnt;
 	if (info.nr_func_info && ulen) {
-		if (bpf_dump_raw_ok(file->f_cred)) {
-			char __user *user_finfo;
+		char __user *user_finfo;
 
-			user_finfo = u64_to_user_ptr(info.func_info);
-			ulen = min_t(u32, info.nr_func_info, ulen);
-			if (copy_to_user(user_finfo, prog->aux->func_info,
-					 info.func_info_rec_size * ulen))
-				return -EFAULT;
-		} else {
-			info.func_info = 0;
-		}
+		user_finfo = u64_to_user_ptr(info.func_info);
+		ulen = min_t(u32, info.nr_func_info, ulen);
+		if (copy_to_user(user_finfo, prog->aux->func_info,
+				 info.func_info_rec_size * ulen))
+			return -EFAULT;
 	}
 
 	ulen = info.nr_line_info;
 	info.nr_line_info = prog->aux->nr_linfo;
 	if (info.nr_line_info && ulen) {
-		if (bpf_dump_raw_ok(file->f_cred)) {
-			__u8 __user *user_linfo;
+		__u8 __user *user_linfo;
 
-			user_linfo = u64_to_user_ptr(info.line_info);
-			ulen = min_t(u32, info.nr_line_info, ulen);
-			if (copy_to_user(user_linfo, prog->aux->linfo,
-					 info.line_info_rec_size * ulen))
-				return -EFAULT;
-		} else {
-			info.line_info = 0;
-		}
+		user_linfo = u64_to_user_ptr(info.line_info);
+		ulen = min_t(u32, info.nr_line_info, ulen);
+		if (copy_to_user(user_linfo, prog->aux->linfo,
+				 info.line_info_rec_size * ulen))
+			return -EFAULT;
 	}
 
 	ulen = info.nr_jited_line_info;

From 23a776978d7f4ce55db3cbc1eed2f97b650d3dd8 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Wed, 12 Dec 2018 09:37:46 -0800
Subject: [PATCH 0998/1640] UPSTREAM: bpf: include sub program tags in
 bpf_prog_info

Changes v2 -> v3:
1. remove check for bpf_dump_raw_ok().

Changes v1 -> v2:
1. Fix error path as Martin suggested.

This patch adds nr_prog_tags and prog_tags to bpf_prog_info. This is a
reliable way for user space to get tags of all sub programs. Before
this patch, user space needed to find sub program tags via kallsyms.

This feature will be used in BPF introspection, where user space
queries information about BPF programs via sys_bpf.
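For illustration only (not part of this patch; MAX_TAGS is an arbitrary
bound chosen for the example), the new fields can be queried like this:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#define MAX_TAGS 64

static int get_prog_tags(int prog_fd, __u8 tags[MAX_TAGS][BPF_TAG_SIZE],
			 __u32 *nr)
{
	struct bpf_prog_info info;
	union bpf_attr attr;

	memset(&info, 0, sizeof(info));
	memset(&attr, 0, sizeof(attr));
	info.nr_prog_tags = MAX_TAGS;
	info.prog_tags = (__u64)(unsigned long)tags;
	attr.info.bpf_fd = prog_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (__u64)(unsigned long)&info;

	if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
		return -1;

	/* the kernel writes back the real count (func_cnt, or 1 if there
	 * are no subprogs); if it exceeds MAX_TAGS, only the first
	 * MAX_TAGS entries were filled
	 */
	*nr = info.nr_prog_tags;
	return 0;
}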
Change-Id: I880935712b896e37e96f53de76ec7fcf0a011cb1 Signed-off-by: Song Liu Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a841e411e2fb..023a6fb75e82 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2698,6 +2698,8 @@ struct bpf_prog_info { __u32 nr_jited_line_info; __u32 line_info_rec_size; __u32 jited_line_info_rec_size; + __u32 nr_prog_tags; + __aligned_u64 prog_tags; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a142b5b0126c..2607f3b665f7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2345,6 +2345,28 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } + ulen = info.nr_prog_tags; + info.nr_prog_tags = prog->aux->func_cnt ? : 1; + if (ulen) { + __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; + u32 i; + + user_prog_tags = u64_to_user_ptr(info.prog_tags); + ulen = min_t(u32, info.nr_prog_tags, ulen); + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + if (copy_to_user(user_prog_tags[i], + prog->aux->func[i]->tag, + BPF_TAG_SIZE)) + return -EFAULT; + } + } else { + if (copy_to_user(user_prog_tags[0], + prog->tag, BPF_TAG_SIZE)) + return -EFAULT; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) From d6d37fc717f83b66770f0cf20b25c0b3cc39bfd5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 12 Dec 2018 16:29:07 -0800 Subject: [PATCH 0999/1640] BACKPORT: bpf: verifier: make sure callees don't prune with caller differences Currently for liveness and state pruning the register parentage chains don't include states of the callee. This makes some sense as the callee can't access those registers. However, this means that READs done after the callee returns will not propagate into the states of the callee. Callee will then perform pruning disregarding differences in caller state. Example: 0: (85) call bpf_user_rnd_u32 1: (b7) r8 = 0 2: (55) if r0 != 0x0 goto pc+1 3: (b7) r8 = 1 4: (bf) r1 = r8 5: (85) call pc+4 6: (15) if r8 == 0x1 goto pc+1 7: (05) *(u64 *)(r9 - 8) = r3 8: (b7) r0 = 0 9: (95) exit 10: (15) if r1 == 0x0 goto pc+0 11: (95) exit Here we acquire unknown state with call to get_random() [1]. Then we store this random state in r8 (either 0 or 1) [1 - 3], and make a call on line 5. Callee does nothing but a trivial conditional jump (to create a pruning point). Upon return caller checks the state of r8 and either performs an unsafe read or not. Verifier will first explore the path with r8 == 1, creating a pruning point at [11]. The parentage chain for r8 will include only callers states so once verifier reaches [6] it will mark liveness only on states in the caller, and not [11]. Now when verifier walks the paths with r8 == 0 it will reach [11] and since REG_LIVE_READ on r8 was not propagated there it will prune the walk entirely (stop walking the entire program, not just the callee). Since [6] was never walked with r8 == 0, [7] will be considered dead and replaced with "goto -1" causing hang at runtime. This patch weaves the callee's explored states onto the callers parentage chain. Rough parentage for r8 would have looked like this before: [0] [1] [2] [3] [4] [5] [10] [11] [6] [7] | | ,---|----. 
| | | sl0: sl0: / sl0: \ sl0: sl0: sl0: fr0: r8 <-- fr0: r8<+--fr0: r8 `fr0: r8 ,fr0: r8<-fr0: r8 \ fr1: r8 <- fr1: r8 / \__________________/ after: [0] [1] [2] [3] [4] [5] [10] [11] [6] [7] | | | | | | sl0: sl0: sl0: sl0: sl0: sl0: fr0: r8 <-- fr0: r8 <- fr0: r8 <- fr0: r8 <-fr0: r8<-fr0: r8 fr1: r8 <- fr1: r8 Now the mark from instruction 6 will travel through callees states. Note that we don't have to connect r0 because its overwritten by callees state on return and r1 - r5 because those are not alive any more once a call is made. v2: - don't connect the callees registers twice (Alexei: suggestion & code) - add more details to the comment (Ed & Alexei) v1: don't unnecessarily link caller saved regs (Jiong) Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Reported-by: David Beckett Change-Id: I3a4c57d7abe85b9d9a34795ada9692b0a9a2577d Signed-off-by: Jakub Kicinski Reviewed-by: Jiong Wang Reviewed-by: Edward Cree Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6fdb6d85187c..4d8052fe3a79 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5857,9 +5857,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; - /* connect new state to parentage chain */ - for (i = 0; i < BPF_REG_FP; i++) - cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i]; + /* connect new state to parentage chain. Current frame needs all + * registers connected. Only r6 - r9 of the callers are alive (pushed + * to the stack implicitly by JITs) so in callers' frames connect just + * r6 - r9 as an optimization. Callers will have r1 - r5 connected to + * the state of the call instruction (with WRITTEN set), and r0 comes + * from callee with its full parentage chain, anyway. + */ + for (j = 0; j <= cur->curframe; j++) + for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) + cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark From 716df5d2c348574670b7778886842a380233f0dd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 12 Dec 2018 10:45:38 +0100 Subject: [PATCH 1000/1640] UPSTREAM: bpf: remove obsolete prog->aux sanitation in bpf_insn_prepare_dump This logic is not needed anymore since we got rid of the verifier rewrite that was using prog->aux address in f6069b9aa993 ("bpf: fix redirect to map under tail calls"). 
Change-Id: If62ad9ebe0689b204c13496062604d76d4cfe665
Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/syscall.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2607f3b665f7..4ce41a76e5ee 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2060,13 +2060,6 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
 			insns[i + 1].imm = 0;
 			continue;
 		}
-
-		if (!bpf_dump_raw_ok(f_cred) &&
-		    imm == (unsigned long)prog->aux) {
-			insns[i].imm = 0;
-			insns[i + 1].imm = 0;
-			continue;
-		}
 	}
 
 	return insns;

From 429e7e323f7c33a50a7453b76c7014ac2177ffe8 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Thu, 13 Dec 2018 10:41:46 -0800
Subject: [PATCH 1001/1640] UPSTREAM: bpf: Create a new btf_name_by_offset()
 for non type name use case

The current btf_name_by_offset() returns the "(anon)" type name for
the offset == 0 case and "(invalid-name-offset)" for the out-of-bound
offset case.

It fits well for the internal BTF verbose log, which focuses on types.
For example, offset == 0 => "(anon)" => anonymous type/name.
Returning non-NULL for the bad offset case is needed during the BTF
verification process because the BTF verifier may complain about
another field first before discovering the name_off is invalid.

However, it may not be ideal for newer use cases which do not
necessarily mean type names. For example, when logging line_info in
the BPF verifier in the next patch, it is better to log an empty src
line instead of logging "(anon)".

The existing btf_name_by_offset() is renamed to __btf_name_by_offset()
and made static in btf.c. A new btf_name_by_offset() is added for
generic context usage. It returns "\0" for name_off == 0 (note that
btf->strings[0] is "\0") and NULL for an invalid offset. It allows the
caller to decide what the best output is in its context.

The new btf_name_by_offset() overlaps with btf_name_offset_valid().
Hence, btf_name_offset_valid() is removed from btf.h to keep the btf.h
API minimal. The existing btf_name_offset_valid() usage in btf.c could
also be replaced later.
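A minimal kernel-side sketch of the intended caller pattern under the
new semantics (illustrative only, not code from this patch):

static int show_src_line(const struct btf *btf, u32 name_off)
{
	const char *line = btf_name_by_offset(btf, name_off);

	if (!line)	/* out-of-bound offset: reject it */
		return -EINVAL;

	/* name_off == 0 yields "" since btf->strings[0] is '\0', so a
	 * caller like the verifier log can print an empty src line
	 * instead of "(anon)"
	 */
	pr_info("%s\n", line);
	return 0;
}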
Change-Id: Ibfdef031fd224d26f5d8be7a8fff5d02f32aaa6e Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 - kernel/bpf/btf.c | 31 ++++++++++++++++++++----------- kernel/bpf/verifier.c | 4 ++-- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index a4cf075b89eb..58000d7e06e3 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -46,7 +46,6 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); -bool btf_name_offset_valid(const struct btf *btf, u32 offset); bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size); #ifdef CONFIG_BPF_SYSCALL diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e69aa33c89cd..f8d47dfe0c1b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -474,7 +474,7 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return !*src; } -const char *btf_name_by_offset(const struct btf *btf, u32 offset) +static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) return "(anon)"; @@ -484,6 +484,14 @@ const char *btf_name_by_offset(const struct btf *btf, u32 offset) return "(invalid-name-offset)"; } +const char *btf_name_by_offset(const struct btf *btf, u32 offset) +{ + if (offset < btf->hdr.str_len) + return &btf->strings[offset]; + + return NULL; +} + const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { if (type_id > btf->nr_types) @@ -576,7 +584,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, btf_kind_str[kind], - btf_name_by_offset(btf, t->name_off), + __btf_name_by_offset(btf, t->name_off), log_details ? 
" " : ""); if (log_details) @@ -620,7 +628,7 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, btf_verifier_log_type(env, struct_type, NULL); __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", - btf_name_by_offset(btf, member->name_off), + __btf_name_by_offset(btf, member->name_off), member->type, member->offset); if (fmt && *fmt) { @@ -1872,7 +1880,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, btf_verifier_log(env, "\t%s val=%d\n", - btf_name_by_offset(btf, enums[i].name_off), + __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -1896,7 +1904,8 @@ static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, for (i = 0; i < nr_enums; i++) { if (v == enums[i].val) { seq_printf(m, "%s", - btf_name_by_offset(btf, enums[i].name_off)); + __btf_name_by_offset(btf, + enums[i].name_off)); return; } } @@ -1954,20 +1963,20 @@ static void btf_func_proto_log(struct btf_verifier_env *env, } btf_verifier_log(env, "%u %s", args[0].type, - btf_name_by_offset(env->btf, - args[0].name_off)); + __btf_name_by_offset(env->btf, + args[0].name_off)); for (i = 1; i < nr_args - 1; i++) btf_verifier_log(env, ", %u %s", args[i].type, - btf_name_by_offset(env->btf, - args[i].name_off)); + __btf_name_by_offset(env->btf, + args[i].name_off)); if (nr_args > 1) { const struct btf_param *last_arg = &args[nr_args - 1]; if (last_arg->type) btf_verifier_log(env, ", %u %s", last_arg->type, - btf_name_by_offset(env->btf, - last_arg->name_off)); + __btf_name_by_offset(env->btf, + last_arg->name_off)); else btf_verifier_log(env, ", vararg"); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4d8052fe3a79..7809a20832c3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5395,8 +5395,8 @@ static int check_btf_line(struct bpf_verifier_env *env, goto err_free; } - if (!btf_name_offset_valid(btf, linfo[i].line_off) || - !btf_name_offset_valid(btf, linfo[i].file_name_off)) { + if (!btf_name_by_offset(btf, linfo[i].line_off) || + !btf_name_by_offset(btf, linfo[i].file_name_off)) { verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); err = -EINVAL; goto err_free; From 11a9ec02cff7b21baf5e80626d456faff28cde2b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 13 Dec 2018 10:41:48 -0800 Subject: [PATCH 1002/1640] BACKPORT: bpf: verbose log bpf_line_info in verifier This patch adds bpf_line_info during the verifier's verbose. It can give error context for debug purpose. 
~~~~~~~~~~ Here is the verbose log for backedge: while (a) { a += bpf_get_smp_processor_id(); bpf_trace_printk(fmt, sizeof(fmt), a); } ~> bpftool prog load ./test_loop.o /sys/fs/bpf/test_loop type tracepoint 13: while (a) { 3: a += bpf_get_smp_processor_id(); back-edge from insn 13 to 3 ~~~~~~~~~~ Here is the verbose log for invalid pkt access: Modification to test_xdp_noinline.c: data = (void *)(long)xdp->data; data_end = (void *)(long)xdp->data_end; /* if (data + 4 > data_end) return XDP_DROP; */ *(u32 *)data = dst->dst; ~> bpftool prog load ./test_xdp_noinline.o /sys/fs/bpf/test_xdp_noinline type xdp ; data = (void *)(long)xdp->data; 224: (79) r2 = *(u64 *)(r10 -112) 225: (61) r2 = *(u32 *)(r2 +0) ; *(u32 *)data = dst->dst; 226: (63) *(u32 *)(r2 +0) = r1 invalid access to packet, off=0 size=4, R2(id=0,off=0,r=0) R2 offset is outside of the packet Change-Id: I3851c21fd61e282ac6bab184b57bce38d8795a20 Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 74 +++++++++++++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c3b9c7ebd8e0..a96bf8604c9d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -246,6 +246,7 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ + const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; struct bpf_id_pair idmap_scratch[BPF_ID_MAP_SIZE]; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7809a20832c3..60abd54a64c9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "disasm.h" @@ -215,6 +216,27 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); +static const struct bpf_line_info * +find_linfo(const struct bpf_verifier_env *env, u32 insn_off) +{ + const struct bpf_line_info *linfo; + const struct bpf_prog *prog; + u32 i, nr_linfo; + + prog = env->prog; + nr_linfo = prog->aux->nr_linfo; + + if (!nr_linfo || insn_off >= prog->len) + return NULL; + + linfo = prog->aux->linfo; + for (i = 1; i < nr_linfo; i++) + if (insn_off < linfo[i].insn_off) + break; + + return &linfo[i - 1]; +} + void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args) { @@ -265,6 +287,42 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) va_end(args); } +static const char *ltrim(const char *s) +{ + while (isspace(*s)) + s++; + + return s; +} + +__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env, + u32 insn_off, + const char *prefix_fmt, ...) 
+{ + const struct bpf_line_info *linfo; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + linfo = find_linfo(env, insn_off); + if (!linfo || linfo == env->prev_linfo) + return; + + if (prefix_fmt) { + va_list args; + + va_start(args, prefix_fmt); + bpf_verifier_vlog(&env->log, prefix_fmt, args); + va_end(args); + } + + verbose(env, "%s\n", + ltrim(btf_name_by_offset(env->prog->aux->btf, + linfo->line_off))); + + env->prev_linfo = linfo; +} + static bool type_is_pkt_pointer(enum bpf_reg_type type) { return type == PTR_TO_PACKET || @@ -5046,6 +5104,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { + verbose_linfo(env, t, "%d: ", t); verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -5063,6 +5122,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + verbose_linfo(env, t, "%d: ", t); + verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { @@ -5085,10 +5146,6 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; - ret = check_subprogs(env); - if (ret < 0) - return ret; - insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; @@ -5930,6 +5987,8 @@ static int do_check(struct bpf_verifier_env *env) int insn_processed = 0; bool do_print_state = false; + env->prev_linfo = NULL; + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); if (!state) return -ENOMEM; @@ -6008,6 +6067,7 @@ static int do_check(struct bpf_verifier_env *env) .private_data = env, }; + verbose_linfo(env, env->insn_idx, "; "); verbose(env, "%d: ", env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } @@ -7302,7 +7362,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - ret = check_cfg(env); + ret = check_subprogs(env); if (ret < 0) goto skip_full_check; @@ -7310,6 +7370,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; + ret = check_cfg(env); + if (ret < 0) + goto skip_full_check; + ret = do_check(env); if (env->cur_state) { free_verifier_state(env->cur_state, true); From c4452e4f4346982b48009edb10568721d9385d49 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Dec 2018 11:42:31 -0800 Subject: [PATCH 1003/1640] UPSTREAM: bpf: speed up stacksafe check Don't check the same stack liveness condition 8 times. once is enough. 
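In other words (a simplified excerpt of the resulting loop, per the
diff that follows): stacksafe() walks the stack a byte at a time while
liveness is tracked per 8-byte spill slot, so a slot that was never
read can now be skipped in one step instead of being re-tested for each
of its eight bytes:

	for (i = 0; i < old->allocated_stack; i++) {
		spi = i / BPF_REG_SIZE;

		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
			i += BPF_REG_SIZE - 1;
			/* explored state didn't use this slot */
			continue;
		}
		/* ... per-byte slot_type comparison follows ... */
	}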
Change-Id: I221e670d6291abe5c1f63d340ef4ef15e9339971 Signed-off-by: Alexei Starovoitov Acked-by: Edward Cree Acked-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 60abd54a64c9..1e93ec2637b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5683,9 +5683,11 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, for (i = 0; i < old->allocated_stack; i++) { spi = i / BPF_REG_SIZE; - if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) { + i += BPF_REG_SIZE - 1; /* explored state didn't use this */ continue; + } if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; From 6f1f0dc385d8e79c0afbdabc5f83334314aa7a42 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Dec 2018 11:42:33 -0800 Subject: [PATCH 1004/1640] BACKPORT: bpf: improve stacksafe state comparison "if (old->allocated_stack > cur->allocated_stack)" check is too conservative. In some cases explored stack could have allocated more space, but that stack space was not live. The test case improves from 19 to 15 processed insns and improvement on real programs is significant as well: before after bpf_lb-DLB_L3.o 1940 1831 bpf_lb-DLB_L4.o 3089 3029 bpf_lb-DUNKNOWN.o 1065 1064 bpf_lxc-DDROP_ALL.o 28052 26309 bpf_lxc-DUNKNOWN.o 35487 33517 bpf_netdev.o 10864 9713 bpf_overlay.o 6643 6184 bpf_lcx_jit.o 38437 37335 Change-Id: Ia52207499b80110161af901cc945dc22953f1881 Signed-off-by: Alexei Starovoitov Acked-by: Edward Cree Acked-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1e93ec2637b2..592a562841d1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5670,12 +5670,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, { int i, spi; - /* if explored stack has more populated slots than current stack - * such stacks are not equivalent - */ - if (old->allocated_stack > cur->allocated_stack) - return false; - /* walk slots of the explored stack and ignore any additional * slots in the current stack, since explored(safe) state * didn't use them @@ -5691,6 +5685,13 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; + + /* explored stack has more populated slots than current stack + * and these slots were used + */ + if (i >= cur->allocated_stack) + return false; + /* if old state was safe with misc data in the stack * it will be safe with zero-initialized stack. * The opposite is not true From 0fe079b7343a68c93caa5984bbf5f78465fccf32 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Dec 2018 11:42:34 -0800 Subject: [PATCH 1005/1640] UPSTREAM: bpf: add self-check logic to liveness analysis Introduce REG_LIVE_DONE to check the liveness propagation and prepare the states for merging. See algorithm description in clean_live_states(). 
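Condensed from the mark_reg_read() hunk below (a restatement, not new
code): once clean_live_states() has finalized a state, any later
attempt to propagate liveness into it is flagged as a verifier bug:

	if (parent->live & REG_LIVE_DONE) {
		verbose(env, "verifier BUG type %s var_off %lld off %d\n",
			reg_type_str[parent->type],
			parent->var_off.value, parent->off);
		return -EFAULT;
	}
	/* ... then we depend on parent's value */
	parent->live |= REG_LIVE_READ;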
Change-Id: Ia0063f8098770f9d1e0e1713fc267a5e1e0234fe Signed-off-by: Alexei Starovoitov Acked-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 108 ++++++++++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index a96bf8604c9d..c0c9bba96624 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -38,6 +38,7 @@ enum bpf_reg_liveness { REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */ REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */ + REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */ }; struct bpf_reg_state { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 592a562841d1..b726bc823bd8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -396,12 +396,14 @@ static char slot_type_char[] = { static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) verbose(env, "_"); if (live & REG_LIVE_READ) verbose(env, "r"); if (live & REG_LIVE_WRITTEN) verbose(env, "w"); + if (live & REG_LIVE_DONE) + verbose(env, "D"); } static struct bpf_func_state *func(struct bpf_verifier_env *env, @@ -1134,6 +1136,12 @@ static int mark_reg_read(struct bpf_verifier_env *env, /* if read wasn't screened by an earlier write ... */ if (writes && state->live & REG_LIVE_WRITTEN) break; + if (parent->live & REG_LIVE_DONE) { + verbose(env, "verifier BUG type %s var_off %lld off %d\n", + reg_type_str[parent->type], + parent->var_off.value, parent->off); + return -EFAULT; + } /* ... then we depend on parent's value */ parent->live |= REG_LIVE_READ; state = parent; @@ -5556,6 +5564,102 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) return false; } +static void clean_func_state(struct bpf_verifier_env *env, + struct bpf_func_state *st) +{ + enum bpf_reg_liveness live; + int i, j; + + for (i = 0; i < BPF_REG_FP; i++) { + live = st->regs[i].live; + /* liveness must not touch this register anymore */ + st->regs[i].live |= REG_LIVE_DONE; + if (!(live & REG_LIVE_READ)) + /* since the register is unused, clear its state + * to make further comparison simpler + */ + __mark_reg_not_init(&st->regs[i]); + } + + for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { + live = st->stack[i].spilled_ptr.live; + /* liveness must not touch this stack slot anymore */ + st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; + if (!(live & REG_LIVE_READ)) { + __mark_reg_not_init(&st->stack[i].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + st->stack[i].slot_type[j] = STACK_INVALID; + } + } +} + +static void clean_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + int i; + + if (st->frame[0]->regs[0].live & REG_LIVE_DONE) + /* all regs in this state in all frames were already marked */ + return; + + for (i = 0; i <= st->curframe; i++) + clean_func_state(env, st->frame[i]); +} + +/* the parentage chains form a tree. + * the verifier states are added to state lists at given insn and + * pushed into state stack for future exploration. 
+ * when the verifier reaches bpf_exit insn some of the verifer states + * stored in the state lists have their final liveness state already, + * but a lot of states will get revised from liveness point of view when + * the verifier explores other branches. + * Example: + * 1: r0 = 1 + * 2: if r1 == 100 goto pc+1 + * 3: r0 = 2 + * 4: exit + * when the verifier reaches exit insn the register r0 in the state list of + * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch + * of insn 2 and goes exploring further. At the insn 4 it will walk the + * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ. + * + * Since the verifier pushes the branch states as it sees them while exploring + * the program the condition of walking the branch instruction for the second + * time means that all states below this branch were already explored and + * their final liveness markes are already propagated. + * Hence when the verifier completes the search of state list in is_state_visited() + * we can call this clean_live_states() function to mark all liveness states + * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state' + * will not be used. + * This function also clears the registers and stack for states that !READ + * to simplify state merging. + * + * Important note here that walking the same branch instruction in the callee + * doesn't meant that the states are DONE. The verifier has to compare + * the callsites + */ +static void clean_live_states(struct bpf_verifier_env *env, int insn, + struct bpf_verifier_state *cur) +{ + struct bpf_verifier_state_list *sl; + int i; + + sl = env->explored_states[insn]; + if (!sl) + return; + + while (sl != STATE_LIST_MARK) { + if (sl->state.curframe != cur->curframe) + goto next; + for (i = 0; i <= cur->curframe; i++) + if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) + goto next; + clean_verifier_state(env, &sl->state); +next: + sl = sl->next; + } +} + /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) @@ -5872,6 +5976,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) */ return 0; + clean_live_states(env, insn_idx, cur); + while (sl != STATE_LIST_MARK) { if (states_equal(env, &sl->state, cur)) { /* reached equivalent register/stack state, From b5b779ece84e0f83d2ac9515e536e857ea5e3583 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 16 Dec 2018 00:49:47 +0100 Subject: [PATCH 1006/1640] UPSTREAM: bpf: remove useless version check for prog load Existing libraries and tracing frameworks work around this kernel version check by automatically deriving the kernel version from uname(3) or similar such that the user does not need to do it manually; these workarounds also make the version check useless at the same time. Moreover, most other BPF tracing types enabling bpf_probe_read()-like functionality have /not/ adapted this check, and in general these days it is well understood anyway that all the tracing programs are not stable with regards to future kernels as kernel internal data structures are subject to change from release to release. Back at last netconf we discussed [0] and agreed to remove this check from bpf_prog_load() and instead document it here in the uapi header that there is no such guarantee for stable API for these programs. 
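The workaround mentioned above is mechanical, which is exactly why the check
bought nothing. A sketch of how a loader derives the value that attr.kern_version
used to be compared against, assuming the usual "major.minor.patch" uname release
format (error handling trimmed):

#include <stdio.h>
#include <sys/utsname.h>

#define KVER(a, b, c) (((a) << 16) + ((b) << 8) + (c))

/* Derive the running kernel's LINUX_VERSION_CODE the way libraries and
 * tracing frameworks did to satisfy the (now removed) load-time check.
 */
static unsigned int kernel_version_from_uname(void)
{
	struct utsname uts;
	unsigned int major = 0, minor = 0, patch = 0;

	if (uname(&uts))
		return 0;
	/* release looks like "4.19.0-6-amd64"; trailing text is ignored */
	sscanf(uts.release, "%u.%u.%u", &major, &minor, &patch);
	return KVER(major, minor, patch);
}

int main(void)
{
	printf("kern_version = 0x%x\n", kernel_version_from_uname());
	return 0;
}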
[0] http://vger.kernel.org/netconf2018_files/DanielBorkmann_netconf2018.pdf Change-Id: Ibe112b27eadca1ac3b7cff1696684fb42fac1f8c Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Quentin Monnet Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 10 +++++++++- kernel/bpf/syscall.c | 5 ----- tools/include/uapi/linux/bpf.h | 10 +++++++++- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 023a6fb75e82..adfea446bfc5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,14 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK, }; +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + */ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, @@ -345,7 +353,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ - __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 kern_version; /* not used */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ce41a76e5ee..0db41febb193 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1488,11 +1488,6 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) return -E2BIG; - - if (type == BPF_PROG_TYPE_KPROBE && - attr->kern_version != LINUX_VERSION_CODE) - return -EINVAL; - if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && !capable(CAP_SYS_ADMIN)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b5957432a74d..3682622e129f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -115,6 +115,14 @@ enum bpf_map_type { BPF_MAP_TYPE_CPUMAP, }; +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + */ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, @@ -210,7 +218,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ - __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 kern_version; /* not used */ __u32 prog_flags; }; From 5fd21ca992188292c54c70c85879a84c7bd172ad Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Dec 2018 22:13:50 -0800 Subject: [PATCH 1007/1640] UPSTREAM: bpf: btf: refactor btf_int_bits_seq_show() Refactor function btf_int_bits_seq_show() by creating function btf_bitfield_seq_show() which has no dependence on btf and btf_type. 
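The factored-out helper is plain bitfield extraction: copy the covering bytes into
a 64-bit scratch value, shift left to drop the bits above the field, then shift
right to drop the bits below it. A little-endian user-space sketch of the same
arithmetic (the kernel version also handles big-endian via #ifdef; this sketch
assumes the field fits in 64 bits):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Extract nr_bits starting at bit_off from a little-endian byte buffer,
 * mirroring the shift/shift trick in btf_bitfield_seq_show().
 * Assumes bit_off + nr_bits <= 64 for this sketch.
 */
static uint64_t extract_bits(const void *data, unsigned int bit_off,
			     unsigned int nr_bits)
{
	const uint8_t *p = data;
	uint64_t num = 0;
	unsigned int copy_bits;

	p += bit_off / 8;		/* BITS_ROUNDDOWN_BYTES() */
	bit_off %= 8;			/* BITS_PER_BYTE_MASKED() */
	copy_bits = nr_bits + bit_off;
	memcpy(&num, p, (copy_bits + 7) / 8); /* BITS_ROUNDUP_BYTES() */

	num <<= 64 - copy_bits;		/* drop bits above the field */
	num >>= 64 - nr_bits;		/* drop bits below the field */
	return num;
}

int main(void)
{
	uint32_t v = 0x12345678;

	/* bits 4..15 of v are 0x567 */
	printf("0x%llx\n", (unsigned long long)extract_bits(&v, 4, 12));
	return 0;
}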
The function btf_bitfield_seq_show() will be in later patch to directly dump bitfield member values. Acked-by: Martin KaFai Lau Change-Id: Ie0609392a926c313b6574a87b3003ef79bd6ae23 Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f8d47dfe0c1b..484ef8413b87 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1068,26 +1068,16 @@ static void btf_int_log(struct btf_verifier_env *env, btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } -static void btf_int_bits_seq_show(const struct btf *btf, - const struct btf_type *t, - void *data, u8 bits_offset, - struct seq_file *m) +static void btf_bitfield_seq_show(void *data, u8 bits_offset, + u8 nr_bits, struct seq_file *m) { u16 left_shift_bits, right_shift_bits; - u32 int_data = btf_type_int(t); - u8 nr_bits = BTF_INT_BITS(int_data); - u8 total_bits_offset; u8 nr_copy_bytes; u8 nr_copy_bits; u64 print_num; - /* - * bits_offset is at most 7. - * BTF_INT_OFFSET() cannot exceed 64 bits. - */ - total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); - data += BITS_ROUNDDOWN_BYTES(total_bits_offset); - bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + data += BITS_ROUNDDOWN_BYTES(bits_offset); + bits_offset = BITS_PER_BYTE_MASKED(bits_offset); nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); @@ -1107,6 +1097,23 @@ static void btf_int_bits_seq_show(const struct btf *btf, seq_printf(m, "0x%llx", print_num); } +static void btf_int_bits_seq_show(const struct btf *btf, + const struct btf_type *t, + void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u8 nr_bits = BTF_INT_BITS(int_data); + u8 total_bits_offset; + + /* + * bits_offset is at most 7. + * BTF_INT_OFFSET() cannot exceed 64 bits. + */ + total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); + btf_bitfield_seq_show(data, total_bits_offset, nr_bits, m); +} + static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) From f3a885fb865403451d985ded6b74b5116c076291 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Dec 2018 22:13:51 -0800 Subject: [PATCH 1008/1640] UPSTREAM: bpf: btf: fix struct/union/fwd types with kind_flag This patch fixed two issues with BTF. One is related to struct/union bitfield encoding and the other is related to forward type. Issue #1 and solution: ====================== Current btf encoding of bitfield follows what pahole generates. For each bitfield, pahole will duplicate the type chain and put the bitfield size at the final int or enum type. Since the BTF enum type cannot encode bit size, pahole workarounds the issue by generating an int type whenever the enum bit size is not 32. 
For example, -bash-4.4$ cat t.c typedef int ___int; enum A { A1, A2, A3 }; struct t { int a[5]; ___int b:4; volatile enum A c:4; } g; -bash-4.4$ gcc -c -O2 -g t.c The current kernel supports the following BTF encoding: $ pahole -JV t.o [1] TYPEDEF ___int type_id=2 [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [3] ENUM A size=4 vlen=3 A1 val=0 A2 val=1 A3 val=2 [4] STRUCT t size=24 vlen=3 a type_id=5 bits_offset=0 b type_id=9 bits_offset=160 c type_id=11 bits_offset=164 [5] ARRAY (anon) type_id=2 index_type_id=2 nr_elems=5 [6] INT sizetype size=8 bit_offset=0 nr_bits=64 encoding=(none) [7] VOLATILE (anon) type_id=3 [8] INT int size=1 bit_offset=0 nr_bits=4 encoding=(none) [9] TYPEDEF ___int type_id=8 [10] INT (anon) size=1 bit_offset=0 nr_bits=4 encoding=SIGNED [11] VOLATILE (anon) type_id=10 Two issues are in the above: . by changing enum type to int, we lost the original type information and this will not be ideal later when we try to convert BTF to a header file. . the type duplication for bitfields will cause BTF bloat. Duplicated types cannot be deduplicated later if the bitfield size is different. To fix this issue, this patch implemented a compatible change for BTF struct type encoding: . the bit 31 of struct_type->info, previously reserved, now is used to indicate whether bitfield_size is encoded in btf_member or not. . if bit 31 of struct_type->info is set, btf_member->offset will encode like: bit 0 - 23: bit offset bit 24 - 31: bitfield size if bit 31 is not set, the old behavior is preserved: bit 0 - 31: bit offset So if the struct contains a bit field, the maximum bit offset will be reduced to (2^24 - 1) instead of MAX_UINT. The maximum bitfield size will be 256 which is enough for today as maximum bitfield in compiler can be 128 where int128 type is supported. This kernel patch intends to support the new BTF encoding: $ pahole -JV t.o [1] TYPEDEF ___int type_id=2 [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [3] ENUM A size=4 vlen=3 A1 val=0 A2 val=1 A3 val=2 [4] STRUCT t kind_flag=1 size=24 vlen=3 a type_id=5 bitfield_size=0 bits_offset=0 b type_id=1 bitfield_size=4 bits_offset=160 c type_id=7 bitfield_size=4 bits_offset=164 [5] ARRAY (anon) type_id=2 index_type_id=2 nr_elems=5 [6] INT sizetype size=8 bit_offset=0 nr_bits=64 encoding=(none) [7] VOLATILE (anon) type_id=3 Issue #2 and solution: ====================== Current forward type in BTF does not specify whether the original type is struct or union. This will not work for type pretty print and BTF-to-header-file conversion as struct/union must be specified. $ cat tt.c struct t; union u; int foo(struct t *t, union u *u) { return 0; } $ gcc -c -g -O2 tt.c $ pahole -JV tt.o [1] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [2] FWD t type_id=0 [3] PTR (anon) type_id=2 [4] FWD u type_id=0 [5] PTR (anon) type_id=4 To fix this issue, similar to issue #1, type->info bit 31 is used. If the bit is set, it is union type. Otherwise, it is a struct type. $ pahole -JV tt.o [1] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [2] FWD t kind_flag=0 type_id=0 [3] PTR (anon) kind_flag=0 type_id=2 [4] FWD u kind_flag=1 type_id=0 [5] PTR (anon) kind_flag=0 type_id=4 Pahole/LLVM change: =================== The new kind_flag functionality has been implemented in pahole and llvm: https://github.com/yonghong-song/pahole/tree/bitfield https://github.com/yonghong-song/llvm/tree/bitfield Note that pahole hasn't implemented func/func_proto kind and .BTF.ext. 
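To make the new member layout concrete: when the struct/union kind_flag is set,
btf_member.offset packs the bitfield size into the top 8 bits and the bit offset
into the low 24. A stand-alone sketch of the encode/decode, where the two macros
match the patched uapi header and pack_member_offset() is invented for the
example:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* from the patched include/uapi/linux/btf.h */
#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24)
#define BTF_MEMBER_BIT_OFFSET(val)    ((val) & 0xffffff)

/* inverse helper, for illustration only */
static uint32_t pack_member_offset(uint32_t bitfield_size, uint32_t bit_offset)
{
	assert(bitfield_size < 256);     /* size must fit in 8 bits */
	assert(bit_offset < (1u << 24)); /* offset must fit in 24 bits */
	return (bitfield_size << 24) | bit_offset;
}

int main(void)
{
	/* member "b" above: a 4-bit bitfield at bit offset 160 */
	uint32_t off = pack_member_offset(4, 160);

	printf("bitfield_size=%u bits_offset=%u\n",
	       (unsigned)BTF_MEMBER_BITFIELD_SIZE(off),
	       (unsigned)BTF_MEMBER_BIT_OFFSET(off));
	return 0;
}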
So to print function signature with bpftool, the llvm compiler should be used. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Acked-by: Martin KaFai Lau Change-Id: I606b0651706f2a1e7e1bf7674f35e73fc0d2312c Signed-off-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 20 ++- kernel/bpf/btf.c | 280 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 278 insertions(+), 22 deletions(-) diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 14f66948fc95..7b7475ef2f17 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -34,7 +34,9 @@ struct btf_type { * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-31: unused + * bits 28-30: unused + * bit 31: kind_flag, currently used by + * struct, union and fwd */ __u32 info; /* "size" is used by INT, ENUM, STRUCT and UNION. @@ -52,6 +54,7 @@ struct btf_type { #define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KFLAG(info) ((info) >> 31) #define BTF_KIND_UNKN 0 /* Unknown */ #define BTF_KIND_INT 1 /* Integer */ @@ -110,9 +113,22 @@ struct btf_array { struct btf_member { __u32 name_off; __u32 type; - __u32 offset; /* offset in bits */ + /* If the type info kind_flag is set, the btf_member offset + * contains both member bitfield size and bit offset. The + * bitfield size is set for bitfield members. If the type + * info kind_flag is not set, the offset contains only bit + * offset. + */ + __u32 offset; }; +/* If the struct/union type info kind_flag is set, the + * following two macros are used to access bitfield_size + * and bit_offset from btf_member.offset. + */ +#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) +#define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) + /* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". * The exact number of btf_param is stored in the vlen (of the * info in "struct btf_type"). diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 484ef8413b87..201c705f7e20 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -164,7 +164,7 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) -#define BTF_INFO_MASK 0x0f00ffff +#define BTF_INFO_MASK 0x8f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) @@ -274,6 +274,10 @@ struct btf_kind_operations { const struct btf_type *struct_type, const struct btf_member *member, const struct btf_type *member_type); + int (*check_kflag_member)(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); void (*seq_show)(const struct btf *btf, const struct btf_type *t, @@ -419,6 +423,25 @@ static u16 btf_type_vlen(const struct btf_type *t) return BTF_INFO_VLEN(t->info); } +static bool btf_type_kflag(const struct btf_type *t) +{ + return BTF_INFO_KFLAG(t->info); +} + +static u32 btf_member_bit_offset(const struct btf_type *struct_type, + const struct btf_member *member) +{ + return btf_type_kflag(struct_type) ? 
BTF_MEMBER_BIT_OFFSET(member->offset) + : member->offset; +} + +static u32 btf_member_bitfield_size(const struct btf_type *struct_type, + const struct btf_member *member) +{ + return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) + : 0; +} + static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); @@ -627,9 +650,17 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, if (env->phase != CHECK_META) btf_verifier_log_type(env, struct_type, NULL); - __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", - __btf_name_by_offset(btf, member->name_off), - member->type, member->offset); + if (btf_type_kflag(struct_type)) + __btf_verifier_log(log, + "\t%s type_id=%u bitfield_size=%u bits_offset=%u", + __btf_name_by_offset(btf, member->name_off), + member->type, + BTF_MEMBER_BITFIELD_SIZE(member->offset), + BTF_MEMBER_BIT_OFFSET(member->offset)); + else + __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", + __btf_name_by_offset(btf, member->name_off), + member->type, member->offset); if (fmt && *fmt) { __btf_verifier_log(log, " "); @@ -945,6 +976,38 @@ static int btf_df_check_member(struct btf_verifier_env *env, return -EINVAL; } +static int btf_df_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + btf_verifier_log_basic(env, struct_type, + "Unsupported check_kflag_member"); + return -EINVAL; +} + +/* Used for ptr, array and struct/union type members. + * int, enum and modifier types have their specific callback functions. + */ +static int btf_generic_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member bitfield_size"); + return -EINVAL; + } + + /* bitfield size is 0, so member->offset represents bit offset only. + * It is safe to call non kflag check_member variants. + */ + return btf_type_ops(member_type)->check_member(env, struct_type, + member, + member_type); +} + static int btf_df_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { @@ -997,6 +1060,62 @@ static int btf_int_check_member(struct btf_verifier_env *env, return 0; } +static int btf_int_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset; + u32 int_data = btf_type_int(member_type); + u32 struct_size = struct_type->size; + u32 nr_copy_bits; + + /* a regular int type is required for the kflag int member */ + if (!btf_type_int_is_regular(member_type)) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member base type"); + return -EINVAL; + } + + /* check sanity of bitfield size */ + nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); + struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); + nr_int_data_bits = BTF_INT_BITS(int_data); + if (!nr_bits) { + /* Not a bitfield member, member offset must be at byte + * boundary. 
+ */ + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member offset"); + return -EINVAL; + } + + nr_bits = nr_int_data_bits; + } else if (nr_bits > nr_int_data_bits) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member bitfield_size"); + return -EINVAL; + } + + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); + if (nr_copy_bits > BITS_PER_U64) { + btf_verifier_log_member(env, struct_type, member, + "nr_copy_bits exceeds 64"); + return -EINVAL; + } + + if (struct_size < bytes_offset || + struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + static s32 btf_int_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -1016,6 +1135,11 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + int_data = btf_type_int(t); if (int_data & ~BTF_INT_MASK) { btf_verifier_log_basic(env, t, "Invalid int_data:%x", @@ -1097,6 +1221,7 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset, seq_printf(m, "0x%llx", print_num); } + static void btf_int_bits_seq_show(const struct btf *btf, const struct btf_type *t, void *data, u8 bits_offset, @@ -1163,6 +1288,7 @@ static const struct btf_kind_operations int_ops = { .check_meta = btf_int_check_meta, .resolve = btf_df_resolve, .check_member = btf_int_check_member, + .check_kflag_member = btf_int_check_kflag_member, .log_details = btf_int_log, .seq_show = btf_int_seq_show, }; @@ -1192,6 +1318,31 @@ static int btf_modifier_check_member(struct btf_verifier_env *env, resolved_type); } +static int btf_modifier_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + const struct btf_type *resolved_type; + u32 resolved_type_id = member->type; + struct btf_member resolved_member; + struct btf *btf = env->btf; + + resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); + if (!resolved_type) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member"); + return -EINVAL; + } + + resolved_member = *member; + resolved_member.type = resolved_type_id; + + return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type, + &resolved_member, + resolved_type); +} + static int btf_ptr_check_member(struct btf_verifier_env *env, const struct btf_type *struct_type, const struct btf_member *member, @@ -1227,6 +1378,11 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + if (!BTF_TYPE_ID_VALID(t->type)) { btf_verifier_log_type(env, t, "Invalid type_id"); return -EINVAL; @@ -1380,6 +1536,7 @@ static struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, + .check_kflag_member = btf_modifier_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_modifier_seq_show, }; @@ -1388,6 +1545,7 @@ static struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, + 
.check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_ptr_seq_show, }; @@ -1422,6 +1580,7 @@ static struct btf_kind_operations fwd_ops = { .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_df_seq_show, }; @@ -1480,6 +1639,11 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + if (t->size) { btf_verifier_log_type(env, t, "size != 0"); return -EINVAL; @@ -1603,6 +1767,7 @@ static struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, + .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_array_log, .seq_show = btf_array_seq_show, }; @@ -1641,6 +1806,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, u32 meta_needed, last_offset; struct btf *btf = env->btf; u32 struct_size = t->size; + u32 offset; u16 i; meta_needed = btf_type_vlen(t) * sizeof(*member); @@ -1682,7 +1848,8 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (is_union && member->offset) { + offset = btf_member_bit_offset(t, member); + if (is_union && offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; @@ -1692,20 +1859,20 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, * ">" instead of ">=" because the last member could be * "char a[0];" */ - if (last_offset > member->offset) { + if (last_offset > offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); return -EINVAL; } - if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { + if (BITS_ROUNDUP_BYTES(offset) > struct_size) { btf_verifier_log_member(env, t, member, "Member bits_offset exceeds its struct size"); return -EINVAL; } btf_verifier_log_member(env, t, member, NULL); - last_offset = member->offset; + last_offset = offset; } return meta_needed; @@ -1735,9 +1902,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env, last_member_type = btf_type_by_id(env->btf, last_member_type_id); - err = btf_type_ops(last_member_type)->check_member(env, v->t, - last_member, - last_member_type); + if (btf_type_kflag(v->t)) + err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t, + last_member, + last_member_type); + else + err = btf_type_ops(last_member_type)->check_member(env, v->t, + last_member, + last_member_type); if (err) return err; } @@ -1759,9 +1931,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env, return env_stack_push(env, member_type, member_type_id); } - err = btf_type_ops(member_type)->check_member(env, v->t, - member, - member_type); + if (btf_type_kflag(v->t)) + err = btf_type_ops(member_type)->check_kflag_member(env, v->t, + member, + member_type); + else + err = btf_type_ops(member_type)->check_member(env, v->t, + member, + member_type); if (err) return err; } @@ -1789,17 +1966,26 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - u32 member_offset = member->offset; - u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); - u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset); const struct btf_kind_operations 
*ops; + u32 member_offset, bitfield_size; + u32 bytes_offset; + u8 bits8_offset; if (i) seq_puts(m, seq); - ops = btf_type_ops(member_type); - ops->seq_show(btf, member_type, member->type, - data + bytes_offset, bits8_offset, m); + member_offset = btf_member_bit_offset(t, member); + bitfield_size = btf_member_bitfield_size(t, member); + if (bitfield_size) { + btf_bitfield_seq_show(data, member_offset, + bitfield_size, m); + } else { + bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + bits8_offset = BITS_PER_BYTE_MASKED(member_offset); + ops = btf_type_ops(member_type); + ops->seq_show(btf, member_type, member->type, + data + bytes_offset, bits8_offset, m); + } } seq_puts(m, "}"); } @@ -1808,6 +1994,7 @@ static struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, + .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_struct_log, .seq_show = btf_struct_seq_show, }; @@ -1837,6 +2024,41 @@ static int btf_enum_check_member(struct btf_verifier_env *env, return 0; } +static int btf_enum_check_kflag_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off, nr_bits, bytes_end, struct_size; + u32 int_bitsize = sizeof(int) * BITS_PER_BYTE; + + struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); + nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); + if (!nr_bits) { + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + nr_bits = int_bitsize; + } else if (nr_bits > int_bitsize) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member bitfield_size"); + return -EINVAL; + } + + struct_size = struct_type->size; + bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits); + if (struct_size < bytes_end) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + static s32 btf_enum_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -1856,6 +2078,11 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + if (t->size != sizeof(int)) { btf_verifier_log_type(env, t, "Expected size:%zu", sizeof(int)); @@ -1924,6 +2151,7 @@ static struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, + .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, .seq_show = btf_enum_seq_show, }; @@ -1946,6 +2174,11 @@ static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); return meta_needed; @@ -2005,6 +2238,7 @@ static struct btf_kind_operations func_proto_ops = { * Hence, there is no btf_func_check_member(). 
*/ .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_func_proto_log, .seq_show = btf_df_seq_show, }; @@ -2024,6 +2258,11 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env, return -EINVAL; } + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); return 0; @@ -2033,6 +2272,7 @@ static struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, .seq_show = btf_df_seq_show, }; From 95f9037bf2434dbb84e798c4f539b489199683cc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Dec 2018 22:13:52 -0800 Subject: [PATCH 1009/1640] UPSTREAM: bpf: enable cgroup local storage map pretty print with kind_flag Commit 970289fc0a83 ("bpf: add bpffs pretty print for cgroup local storage maps") added bpffs pretty print for cgroup local storage maps. The commit worked for struct without kind_flag set. This patch refactored and made pretty print also work with kind_flag set for the struct. Acked-by: Martin KaFai Lau Change-Id: I4e1eb1245172ec78a492e539b251929f128b8fae Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 5 ++++- kernel/bpf/btf.c | 37 ++++++++++++++++++++++++++++--------- kernel/bpf/local_storage.c | 17 ++++------------- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index 58000d7e06e3..12502e25e767 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -7,6 +7,7 @@ #include struct btf; +struct btf_member; struct btf_type; union bpf_attr; @@ -46,7 +47,9 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); -bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size); +bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, + const struct btf_member *m, + u32 expected_offset, u32 expected_size); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 201c705f7e20..8c0c01917156 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -546,22 +546,41 @@ static bool btf_type_int_is_regular(const struct btf_type *t) } /* - * Check that given type is a regular int and has the expected size. + * Check that given struct member is a regular int with expected + * offset and size. 
*/ -bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size) +bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, + const struct btf_member *m, + u32 expected_offset, u32 expected_size) { - u8 nr_bits, nr_bytes; - u32 int_data; + const struct btf_type *t; + u32 id, int_data; + u8 nr_bits; - if (!btf_type_is_int(t)) + id = m->type; + t = btf_type_id_size(btf, &id, NULL); + if (!t || !btf_type_is_int(t)) return false; int_data = btf_type_int(t); nr_bits = BTF_INT_BITS(int_data); - nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); - if (BITS_PER_BYTE_MASKED(nr_bits) || - BTF_INT_OFFSET(int_data) || - nr_bytes != expected_size) + if (btf_type_kflag(s)) { + u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset); + u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset); + + /* if kflag set, int should be a regular int and + * bit offset should be at byte boundary. + */ + return !bitfield_size && + BITS_ROUNDUP_BYTES(bit_offset) == expected_offset && + BITS_ROUNDUP_BYTES(nr_bits) == expected_size; + } + + if (BTF_INT_OFFSET(int_data) || + BITS_PER_BYTE_MASKED(m->offset) || + BITS_ROUNDUP_BYTES(m->offset) != expected_offset || + BITS_PER_BYTE_MASKED(nr_bits) || + BITS_ROUNDUP_BYTES(nr_bits) != expected_size) return false; return true; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 5eca03da0989..07a34ef562a0 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -315,9 +315,8 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - const struct btf_type *t; struct btf_member *m; - u32 id, size; + u32 offset, size; /* Key is expected to be of struct bpf_cgroup_storage_key type, * which is: @@ -338,25 +337,17 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, * The first field must be a 64 bit integer at 0 offset. */ m = (struct btf_member *)(key_type + 1); - if (m->offset) - return -EINVAL; - id = m->type; - t = btf_type_id_size(btf, &id, NULL); size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id); - if (!t || !btf_type_is_reg_int(t, size)) + if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) return -EINVAL; /* * The second field must be a 32 bit integer at 64 bit offset. */ m++; - if (m->offset != offsetof(struct bpf_cgroup_storage_key, attach_type) * - BITS_PER_BYTE) - return -EINVAL; - id = m->type; - t = btf_type_id_size(btf, &id, NULL); + offset = offsetof(struct bpf_cgroup_storage_key, attach_type); size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type); - if (!t || !btf_type_is_reg_int(t, size)) + if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) return -EINVAL; return 0; From c6116b006ac90b4a44ef085f8343fca7f0cde177 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Wed, 12 Dec 2018 16:42:37 -0800 Subject: [PATCH 1010/1640] BACKPORT: bpf: support raw tracepoints in modules Distributions build drivers as modules, including network and filesystem drivers which export numerous tracepoints. This enables bpf(BPF_RAW_TRACEPOINT_OPEN) to attach to those tracepoints. 
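From userspace the attach path is unchanged; the name lookup simply extends to
module tracepoints. A hedged sketch of the BPF_RAW_TRACEPOINT_OPEN call, assuming
kernel headers new enough to declare it and a prog_fd obtained from a prior
BPF_PROG_LOAD:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Attach prog_fd to a raw tracepoint by name; with this patch the name
 * may now also resolve to a tracepoint exported by a loaded module.
 */
static int raw_tp_open(const char *name, int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.raw_tracepoint.name = (uint64_t)(unsigned long)name;
	attr.raw_tracepoint.prog_fd = prog_fd;

	return syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
}

The returned fd holds the attachment, and with this patch also a reference on the
owning module, until it is closed.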
Change-Id: I2ea2898f5dedf7e70aff39c0f8ae0a5d7aa1d2af Signed-off-by: Matt Mullins Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/module.h | 4 ++ include/linux/trace_events.h | 8 ++- kernel/bpf/syscall.c | 11 ++-- kernel/module.c | 5 ++ kernel/trace/bpf_trace.c | 99 +++++++++++++++++++++++++++++++++++- 5 files changed, 120 insertions(+), 7 deletions(-) diff --git a/include/linux/module.h b/include/linux/module.h index 49176730b851..b4bd7a4d2369 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -437,6 +437,10 @@ struct module { unsigned int num_tracepoints; struct tracepoint * const *tracepoints_ptrs; #endif +#ifdef CONFIG_BPF_EVENTS + unsigned int num_bpf_raw_events; + struct bpf_raw_event_map *bpf_raw_events; +#endif #ifdef HAVE_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9fa0bfab007e..173caf91cdbe 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -467,7 +467,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr); @@ -498,10 +499,13 @@ static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf { return -EOPNOTSUPP; } -static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { return NULL; } +static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) +{ +} static inline int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0db41febb193..4dd510bb6ef3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1625,6 +1625,7 @@ static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) bpf_probe_unregister(raw_tp->btp, raw_tp->prog); bpf_prog_put(raw_tp->prog); } + bpf_put_raw_tracepoint(raw_tp->btp); kfree(raw_tp); return 0; } @@ -1650,13 +1651,15 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) return -EFAULT; tp_name[sizeof(tp_name) - 1] = 0; - btp = bpf_find_raw_tracepoint(tp_name); + btp = bpf_get_raw_tracepoint(tp_name); if (!btp) return -ENOENT; raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); - if (!raw_tp) - return -ENOMEM; + if (!raw_tp) { + err = -ENOMEM; + goto out_put_btp; + } raw_tp->btp = btp; prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, @@ -1684,6 +1687,8 @@ out_put_prog: bpf_prog_put(prog); out_free_tp: kfree(raw_tp); +out_put_btp: + bpf_put_raw_tracepoint(btp); return err; } diff --git a/kernel/module.c b/kernel/module.c index 28948386223d..9e80cb64751c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3241,6 +3241,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->tracepoints_ptrs), &mod->num_tracepoints); #endif +#ifdef CONFIG_BPF_EVENTS + 
mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", + sizeof(*mod->bpf_raw_events), + &mod->num_bpf_raw_events); +#endif #ifdef HAVE_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", sizeof(*mod->jump_entries), diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 54ed70fd96ab..60e6e2a7ba24 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -20,6 +20,43 @@ #include "trace_probe.h" #include "trace.h" +#ifdef CONFIG_MODULES +struct bpf_trace_module { + struct module *module; + struct list_head list; +}; + +static LIST_HEAD(bpf_trace_modules); +static DEFINE_MUTEX(bpf_module_mutex); + +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) +{ + struct bpf_raw_event_map *btp, *ret = NULL; + struct bpf_trace_module *btm; + unsigned int i; + + mutex_lock(&bpf_module_mutex); + list_for_each_entry(btm, &bpf_trace_modules, list) { + for (i = 0; i < btm->module->num_bpf_raw_events; ++i) { + btp = &btm->module->bpf_raw_events[i]; + if (!strcmp(btp->tp->name, name)) { + if (try_module_get(btm->module)) + ret = btp; + goto out; + } + } + } +out: + mutex_unlock(&bpf_module_mutex); + return ret; +} +#else +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) +{ + return NULL; +} +#endif /* CONFIG_MODULES */ + u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -1071,7 +1108,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) extern struct bpf_raw_event_map __start__bpf_raw_tp[]; extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { struct bpf_raw_event_map *btp = __start__bpf_raw_tp; @@ -1079,7 +1116,16 @@ struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) if (!strcmp(btp->tp->name, name)) return btp; } - return NULL; + + return bpf_get_raw_tracepoint_module(name); +} + +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) +{ + struct module *mod = __module_address((unsigned long)btp); + + if (mod) + module_put(mod); } static __always_inline @@ -1217,3 +1263,52 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, return err; } + +#ifdef CONFIG_MODULES +int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) +{ + struct bpf_trace_module *btm, *tmp; + struct module *mod = module; + + if (mod->num_bpf_raw_events == 0 || + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + return 0; + + mutex_lock(&bpf_module_mutex); + + switch (op) { + case MODULE_STATE_COMING: + btm = kzalloc(sizeof(*btm), GFP_KERNEL); + if (btm) { + btm->module = module; + list_add(&btm->list, &bpf_trace_modules); + } + break; + case MODULE_STATE_GOING: + list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) { + if (btm->module == module) { + list_del(&btm->list); + kfree(btm); + break; + } + } + break; + } + + mutex_unlock(&bpf_module_mutex); + + return 0; +} + +static struct notifier_block bpf_module_nb = { + .notifier_call = bpf_event_notify, +}; + +int __init bpf_event_init(void) +{ + register_module_notifier(&bpf_module_nb); + return 0; +} + +fs_initcall(bpf_event_init); +#endif /* CONFIG_MODULES */ From 162305ef550d929bde334f887a51c5887df9a0b4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 18 Dec 2018 13:43:58 -0800 Subject: [PATCH 1011/1640] UPSTREAM: bpf: log struct/union attribute for 
forward type

Current btf internal verbose logger logs fwd type as
  [2] FWD A type_id=0
where A is the type name.

Commit 9d5f9f701b18 ("bpf: btf: fix struct/union/fwd types with kind_flag")
introduced kind_flag which can be used to distinguish
whether a forward type is a struct or union.

Also, "type_id=0" does not carry any meaningful information
for fwd type as btf_type.type = 0 is simply enforced
during btf verification and is not used anywhere else.

This commit changed the log to
  [2] FWD A struct
if kind_flag = 0, or
  [2] FWD A union
if kind_flag = 1.

Acked-by: Martin KaFai Lau
Change-Id: I8506533e4f2664c598ecd74ff204408ee22b83e0
Signed-off-by: Yonghong Song
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/btf.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 8c0c01917156..45617bbbd0a0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1595,12 +1595,18 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
 	return 0;
 }
 
+static void btf_fwd_type_log(struct btf_verifier_env *env,
+			     const struct btf_type *t)
+{
+	btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct");
+}
+
 static struct btf_kind_operations fwd_ops = {
 	.check_meta = btf_fwd_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_df_check_member,
 	.check_kflag_member = btf_df_check_kflag_member,
-	.log_details = btf_ref_type_log,
+	.log_details = btf_fwd_type_log,
 	.seq_show = btf_df_seq_show,
 };
 

From 5997d859e415da890e668d89b0abea806018851d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 19 Dec 2018 13:01:01 -0800
Subject: [PATCH 1012/1640] UPSTREAM: bpf: Ensure line_info.insn_off cannot
 point to insn with zero code

This patch rejects a line_info if the bpf insn code referred to
by line_info.insn_off is 0. F.e. a broken userspace tool might
generate a line_info.insn_off that points to the second
8 bytes of a BPF_LD_IMM64.

Change-Id: I28b5191bee0773a0fb92caa9ee849c2fe486f64a
Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b726bc823bd8..ff97e54eb941 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5460,6 +5460,14 @@ static int check_btf_line(struct bpf_verifier_env *env,
 			goto err_free;
 		}
 
+		if (!prog->insnsi[linfo[i].insn_off].code) {
+			verbose(env,
+				"Invalid insn code at line_info[%u].insn_off\n",
+				i);
+			err = -EINVAL;
+			goto err_free;
+		}
+
 		if (!btf_name_by_offset(btf, linfo[i].line_off) ||
 		    !btf_name_by_offset(btf, linfo[i].file_name_off)) {
 			verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);

From 0f40166bf0137d28fe496595ceb004a470caabb3 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Wed, 19 Dec 2018 22:13:04 -0800
Subject: [PATCH 1013/1640] UPSTREAM: bpf: verifier: teach the verifier to
 reason about the BPF_JSET instruction

Some JITs (nfp) try to optimize code on their own. It could make
sense in the case of the BPF_JSET instruction, which is currently not
interpreted by the verifier, meaning for instance that dead code
would not be detected if it was under a BPF_JSET branch. Teach the
verifier the basics of BPF_JSET; the JIT optimizations will be
removed shortly.
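The two new rules are easiest to see with the verifier's tristate numbers, where a
register is tracked as a (value, mask) pair and mask bits are unknown. If a
known-one bit overlaps val, reg & val is certainly non-zero; if even the unknown
bits cannot overlap val, it is certainly zero. A stand-alone sketch mirroring the
is_branch_taken() hunk below (-1 means the branch cannot be predicted):

#include <stdint.h>
#include <stdio.h>

struct tnum { uint64_t value; uint64_t mask; }; /* mask = unknown bits */

/* Predict "if (reg & val)" like the new BPF_JSET case:
 * 1 = always taken, 0 = never taken, -1 = cannot tell.
 */
static int jset_branch_taken(struct tnum reg, uint64_t val)
{
	if ((~reg.mask & reg.value) & val)   /* a known-one bit overlaps */
		return 1;
	if (!((reg.mask | reg.value) & val)) /* no possible bit overlaps */
		return 0;
	return -1;
}

int main(void)
{
	struct tnum r = { .value = 0x4, .mask = 0x3 }; /* bit 2 known one,
							  bits 0-1 unknown */

	printf("%d\n", jset_branch_taken(r, 0x4)); /* 1: bit 2 is set */
	printf("%d\n", jset_branch_taken(r, 0x8)); /* 0: bit 3 can't be set */
	printf("%d\n", jset_branch_taken(r, 0x1)); /* -1: bit 0 unknown */
	return 0;
}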
Change-Id: I54080752727c2fd3ae389536745c26e48b81e82b Signed-off-by: Jakub Kicinski Reviewed-by: Jiong Wang Acked-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff97e54eb941..ec377523bde2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4311,6 +4311,12 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) if (tnum_is_const(reg->var_off)) return !tnum_equals_const(reg->var_off, val); break; + case BPF_JSET: + if ((~reg->var_off.mask & reg->var_off.value) & val) + return 1; + if (!((reg->var_off.mask | reg->var_off.value) & val)) + return 0; + break; case BPF_JGT: if (reg->umin_value > val) return 1; @@ -4395,6 +4401,13 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, */ __mark_reg_known(false_reg, val); break; + case BPF_JSET: + false_reg->var_off = tnum_and(false_reg->var_off, + tnum_const(~val)); + if (is_power_of_2(val)) + true_reg->var_off = tnum_or(true_reg->var_off, + tnum_const(val)); + break; case BPF_JGT: false_reg->umax_value = min(false_reg->umax_value, val); true_reg->umin_value = max(true_reg->umin_value, val + 1); @@ -4467,6 +4480,13 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, */ __mark_reg_known(false_reg, val); break; + case BPF_JSET: + false_reg->var_off = tnum_and(false_reg->var_off, + tnum_const(~val)); + if (is_power_of_2(val)) + true_reg->var_off = tnum_or(true_reg->var_off, + tnum_const(val)); + break; case BPF_JGT: true_reg->umax_value = min(true_reg->umax_value, val - 1); false_reg->umin_value = max(false_reg->umin_value, val); From d27720a51fed25f329b9e2850bc129d8ae1e724c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 19 Dec 2018 22:13:06 -0800 Subject: [PATCH 1014/1640] UPSTREAM: bpf: verifier: reorder stack size check with dead code sanitization Reorder the calls to check_max_stack_depth() and sanitize_dead_code() to separate functions which can rewrite instructions from pure checks. No functional changes. Change-Id: I784098388cdabf3394f375753a796734720f3329 Signed-off-by: Jakub Kicinski Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ec377523bde2..bc58528f0ad9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7524,12 +7524,13 @@ skip_full_check: while (!pop_stack(env, NULL, NULL)); free_states(env); - if (ret == 0) - sanitize_dead_code(env); - if (ret == 0) ret = check_max_stack_depth(env); + /* instruction rewrites happen after this point */ + if (ret == 0) + sanitize_dead_code(env); + if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); From 2519826ba3e9b94e12ceea660bb24f8b3abbfbff Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:28 +0100 Subject: [PATCH 1015/1640] UPSTREAM: bpf: move tmp variable into ax register in interpreter This change moves the on-stack 64 bit tmp variable in ___bpf_prog_run() into the hidden ax register. The latter is currently only used in JITs for constant blinding as a temporary scratch register, meaning the BPF interpreter will never see the use of ax. Therefore it is safe to use it for the cases where tmp has been used earlier. This is needed to later on allow restricted hidden use of ax in both interpreter and JITs. 
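For context on why ax exists at all: constant blinding has the JIT rewrite every
immediate so attacker-controlled constants never land verbatim in executable
memory, and that rewrite needs a scratch register no program can name. A
schematic C model of the transform for one ALU instruction (the real sequence is
emitted as BPF instructions, and rnd is a fresh random value per immediate):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Model of constant blinding: instead of emitting "dst += imm" with the
 * raw immediate, emit the equivalent three-step sequence through the
 * hidden scratch register AX that BPF programs themselves cannot name.
 */
static uint64_t blinded_add(uint64_t dst, int32_t imm)
{
	uint32_t rnd = rand();    /* per-immediate random in the real JIT */
	uint64_t ax;

	ax = (uint32_t)imm ^ rnd; /* MOV AX, imm ^ rnd  (raw imm hidden) */
	ax ^= rnd;                /* XOR AX, rnd        (recover imm)    */
	dst += (int32_t)ax;       /* ADD DST, AX                         */
	return dst;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)blinded_add(40, 2)); /* 42 */
	return 0;
}

Because programs can never reference ax directly, the interpreter's div/mod paths
in the diff below can reuse it as their temporary without clashing.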
Change-Id: Ife126971856bd91c6fa58a6e8cc9bd3a51510c56 Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f9c70cff2818..f8076ec30c1a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1309,36 +1309,36 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - div64_u64_rem(DST, SRC, &tmp); - DST = tmp; + div64_u64_rem(DST, SRC, &AX); + DST = AX; CONT; ALU_MOD_X: - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); + AX = (u32) DST; + DST = do_div(AX, (u32) SRC); CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &tmp); - DST = tmp; + div64_u64_rem(DST, IMM, &AX); + DST = AX; CONT; ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); + AX = (u32) DST; + DST = do_div(AX, (u32) IMM); CONT; ALU64_DIV_X: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) SRC); + DST = (u32) AX; CONT; ALU64_DIV_K: DST = div64_u64(DST, IMM); CONT; ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) IMM); + DST = (u32) AX; CONT; ALU_END_TO_BE: switch (IMM) { From 76dcd608a21f9a74facc2c83563a709f82737dd3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 10 Jan 2019 11:14:00 -0800 Subject: [PATCH 1016/1640] UPSTREAM: bpf: fix bpffs bitfield pretty print Commit 9d5f9f701b18 ("bpf: btf: fix struct/union/fwd types with kind_flag") introduced kind_flag and used bitfield_size in the btf_member to directly pretty print member values. The commit contained a bug where the incorrect parameters could be passed to function btf_bitfield_seq_show(). The bits_offset parameter in the function expects a value less than 8. Instead, the member offset in the structure is passed. The below is btf_bitfield_seq_show() func signature: void btf_bitfield_seq_show(void *data, u8 bits_offset, u8 nr_bits, struct seq_file *m) both bits_offset and nr_bits are u8 type. If the bitfield member offset is greater than 256, incorrect value will be printed. This patch fixed the issue by calculating correct proper data offset and bits_offset similar to non kind_flag case. Fixes: 9d5f9f701b18 ("bpf: btf: fix struct/union/fwd types with kind_flag") Acked-by: Martin KaFai Lau Change-Id: I2701e1fa1b7d34cfad3ec46416864be77b49cfc1 Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 45617bbbd0a0..4c53e3a8c8a9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1219,8 +1219,6 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset, u8 nr_copy_bits; u64 print_num; - data += BITS_ROUNDDOWN_BYTES(bits_offset); - bits_offset = BITS_PER_BYTE_MASKED(bits_offset); nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); @@ -1255,7 +1253,9 @@ static void btf_int_bits_seq_show(const struct btf *btf, * BTF_INT_OFFSET() cannot exceed 64 bits. 
*/ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); - btf_bitfield_seq_show(data, total_bits_offset, nr_bits, m); + data += BITS_ROUNDDOWN_BYTES(total_bits_offset); + bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + btf_bitfield_seq_show(data, bits_offset, nr_bits, m); } static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, @@ -2001,12 +2001,12 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, member_offset = btf_member_bit_offset(t, member); bitfield_size = btf_member_bitfield_size(t, member); + bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { - btf_bitfield_seq_show(data, member_offset, + btf_bitfield_seq_show(data + bytes_offset, bits8_offset, bitfield_size, m); } else { - bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); - bits8_offset = BITS_PER_BYTE_MASKED(member_offset); ops = btf_type_ops(member_type); ops->seq_show(btf, member_type, member->type, data + bytes_offset, bits8_offset, m); From 08181da223495d1cf46b558ec8133732c7a3bfaa Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 15 Jan 2019 17:07:47 -0800 Subject: [PATCH 1017/1640] UPSTREAM: bpf: btf: support 128 bit integer type Currently, btf only supports up to 64-bit integer. On the other hand, 128bit support for gcc and clang has existed for a long time. For example, both gcc 4.8 and llvm 3.7 supports types "__int128" and "unsigned __int128" for virtually all 64bit architectures including bpf. The requirement for __int128 support comes from two areas: . bpf program may use __int128. For example, some bcc tools (https://github.com/iovisor/bcc/tree/master/tools), mostly tcp v6 related, tcpstates.py, tcpaccept.py, etc., are using __int128 to represent the ipv6 addresses. . linux itself is using __int128 types. Hence supporting __int128 type in BTF is required for vmlinux BTF, which will be used by "compile once and run everywhere" and other projects. For 128bit integer, instead of base-10, hex numbers are pretty printed out as large decimal number is hard to decipher, e.g., for ipv6 addresses. Acked-by: Martin KaFai Lau Change-Id: Icb0b4a68a05e08fb3c7e0c05ac0f314cf6bfde45 Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 104 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 85 insertions(+), 19 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4c53e3a8c8a9..9d40c36c3291 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -157,7 +157,7 @@ * */ -#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) +#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2) #define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) #define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) #define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) @@ -525,7 +525,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) /* * Regular int is not a bit field and it must be either - * u8/u16/u32/u64. + * u8/u16/u32/u64 or __int128. 
 */
 static bool btf_type_int_is_regular(const struct btf_type *t)
 {
@@ -538,7 +538,8 @@ static bool btf_type_int_is_regular(const struct btf_type *t)
 	if (BITS_PER_BYTE_MASKED(nr_bits) ||
 	    BTF_INT_OFFSET(int_data) ||
 	    (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
-	     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+	     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) &&
+	     nr_bytes != (2 * sizeof(u64)))) {
 		return false;
 	}
 
@@ -1063,9 +1064,9 @@ static int btf_int_check_member(struct btf_verifier_env *env,
 	nr_copy_bits = BTF_INT_BITS(int_data) +
 		BITS_PER_BYTE_MASKED(struct_bits_off);
 
-	if (nr_copy_bits > BITS_PER_U64) {
+	if (nr_copy_bits > BITS_PER_U128) {
 		btf_verifier_log_member(env, struct_type, member,
-					"nr_copy_bits exceeds 64");
+					"nr_copy_bits exceeds 128");
 		return -EINVAL;
 	}
 
@@ -1119,9 +1120,9 @@ static int btf_int_check_kflag_member(struct btf_verifier_env *env,
 	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
 	nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off);
 
-	if (nr_copy_bits > BITS_PER_U64) {
+	if (nr_copy_bits > BITS_PER_U128) {
 		btf_verifier_log_member(env, struct_type, member,
-					"nr_copy_bits exceeds 64");
+					"nr_copy_bits exceeds 128");
 		return -EINVAL;
 	}
 
@@ -1168,9 +1169,9 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
 
 	nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
 
-	if (nr_bits > BITS_PER_U64) {
+	if (nr_bits > BITS_PER_U128) {
 		btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
-				      BITS_PER_U64);
+				      BITS_PER_U128);
 		return -EINVAL;
 	}
 
@@ -1211,31 +1212,93 @@ static void btf_int_log(struct btf_verifier_env *env,
 			 btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
 }
 
+static void btf_int128_print(struct seq_file *m, void *data)
+{
+	/* data points to a __int128 number.
+	 * Suppose
+	 *     int128_num = *(__int128 *)data;
+	 * The below formulas shows what upper_num and lower_num represents:
+	 *     upper_num = int128_num >> 64;
+	 *     lower_num = int128_num & 0xffffffffFFFFFFFFULL;
+	 */
+	u64 upper_num, lower_num;
+
+#ifdef __BIG_ENDIAN_BITFIELD
+	upper_num = *(u64 *)data;
+	lower_num = *(u64 *)(data + 8);
+#else
+	upper_num = *(u64 *)(data + 8);
+	lower_num = *(u64 *)data;
+#endif
+	if (upper_num == 0)
+		seq_printf(m, "0x%llx", lower_num);
+	else
+		seq_printf(m, "0x%llx%016llx", upper_num, lower_num);
+}
+
+static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
+			     u16 right_shift_bits)
+{
+	u64 upper_num, lower_num;
+
+#ifdef __BIG_ENDIAN_BITFIELD
+	upper_num = print_num[0];
+	lower_num = print_num[1];
+#else
+	upper_num = print_num[1];
+	lower_num = print_num[0];
+#endif
+
+	/* shake out un-needed bits by shift/or operations */
+	if (left_shift_bits >= 64) {
+		upper_num = lower_num << (left_shift_bits - 64);
+		lower_num = 0;
+	} else {
+		upper_num = (upper_num << left_shift_bits) |
+			    (lower_num >> (64 - left_shift_bits));
+		lower_num = lower_num << left_shift_bits;
+	}
+
+	if (right_shift_bits >= 64) {
+		lower_num = upper_num >> (right_shift_bits - 64);
+		upper_num = 0;
+	} else {
+		lower_num = (lower_num >> right_shift_bits) |
+			    (upper_num << (64 - right_shift_bits));
+		upper_num = upper_num >> right_shift_bits;
+	}
+
+#ifdef __BIG_ENDIAN_BITFIELD
+	print_num[0] = upper_num;
+	print_num[1] = lower_num;
+#else
+	print_num[0] = lower_num;
+	print_num[1] = upper_num;
+#endif
+}
+
 static void btf_bitfield_seq_show(void *data, u8 bits_offset,
 				  u8 nr_bits, struct seq_file *m)
 {
 	u16 left_shift_bits, right_shift_bits;
 	u8 nr_copy_bytes;
 	u8 nr_copy_bits;
-	u64 print_num;
+	u64 print_num[2] = {};
 
 	nr_copy_bits = nr_bits + bits_offset;
 	nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
-	print_num = 0;
-	memcpy(&print_num, data, nr_copy_bytes);
+	memcpy(print_num, data, nr_copy_bytes);
 
 #ifdef __BIG_ENDIAN_BITFIELD
 	left_shift_bits = bits_offset;
 #else
-	left_shift_bits = BITS_PER_U64 - nr_copy_bits;
+	left_shift_bits = BITS_PER_U128 - nr_copy_bits;
 #endif
-	right_shift_bits = BITS_PER_U64 - nr_bits;
+	right_shift_bits = BITS_PER_U128 - nr_bits;
 
-	print_num <<= left_shift_bits;
-	print_num >>= right_shift_bits;
-
-	seq_printf(m, "0x%llx", print_num);
+	btf_int128_shift(print_num, left_shift_bits, right_shift_bits);
+	btf_int128_print(m, print_num);
 }
 
@@ -1250,7 +1313,7 @@ static void btf_int_bits_seq_show(const struct btf *btf,
 
 	/*
 	 * bits_offset is at most 7.
-	 * BTF_INT_OFFSET() cannot exceed 64 bits.
+	 * BTF_INT_OFFSET() cannot exceed 128 bits.
 	 */
 	total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
 	data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
@@ -1274,6 +1337,9 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
 	}
 
 	switch (nr_bits) {
+	case 128:
+		btf_int128_print(m, data);
+		break;
 	case 64:
 		if (sign)
 			seq_printf(m, "%lld", *(s64 *)data);

From 4273a6986788866e9150bcef7f795b0302f83a57 Mon Sep 17 00:00:00 2001
From: Peter Oskolkov
Date: Wed, 16 Jan 2019 10:43:01 -0800
Subject: [PATCH 1018/1640] UPSTREAM: bpf: fix a (false) compiler warning

An older GCC compiler complains:

kernel/bpf/verifier.c: In function 'bpf_check':
kernel/bpf/verifier.c:4***:13: error: 'prev_offset' may be used
    uninitialized in this function [-Werror=maybe-uninitialized]
   } else if (krecord[i].insn_offset <= prev_offset) {
             ^
kernel/bpf/verifier.c:4***:38: note: 'prev_offset' was declared here
   u32 i, nfuncs, urec_size, min_size, prev_offset;

Although the compiler is wrong here, the patch makes sure that
prev_offset is always initialized, just to silence the warning.

v2: fix a spelling error in the commit message.

Change-Id: Iac5d45e36019cdae5d0eecdf7ff359c3f6fc4aaa
Signed-off-by: Peter Oskolkov
Acked-by: Martin KaFai Lau
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/verifier.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bc58528f0ad9..a92f4db1928d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5290,13 +5290,14 @@ static int check_btf_func(struct bpf_verifier_env *env,
 			  const union bpf_attr *attr,
 			  union bpf_attr __user *uattr)
 {
-	u32 i, nfuncs, urec_size, min_size, prev_offset;
+	u32 i, nfuncs, urec_size, min_size;
 	u32 krec_size = sizeof(struct bpf_func_info);
 	struct bpf_func_info *krecord;
 	const struct btf_type *type;
 	struct bpf_prog *prog;
 	const struct btf *btf;
 	void __user *urecord;
+	u32 prev_offset = 0;
 	int ret = 0;
 
 	nfuncs = attr->func_info_cnt;

From 7727499d0fcce7e171913f399169caa4827e6f20 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre
Date: Wed, 16 Jan 2019 20:29:40 +0100
Subject: [PATCH 1019/1640] UPSTREAM: bpf: Make function btf_name_offset_valid
 static

Initially, in commit 69b693f0aefa ("bpf: btf: Introduce BPF Type Format
(BTF)"), the function 'btf_name_offset_valid' was introduced as a
static function; it was later changed to a non-static one, and then
finally, in commit 23127b33ec80 ("bpf: Create a new btf_name_by_offset()
for non type name use case"), the function prototype was removed.

Revert back to the original implementation and make the function
static.
Remove the warning triggered with W=1:

  kernel/bpf/btf.c:470:6: warning: no previous prototype for 'btf_name_offset_valid' [-Wmissing-prototypes]

Fixes: 23127b33ec80 ("bpf: Create a new btf_name_by_offset() for non type name use case")
Change-Id: I1bc81f52fd0f75cb803fa16397f445d6868747ba
Signed-off-by: Mathieu Malaterre
Acked-by: Martin KaFai Lau
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9d40c36c3291..ab01c1615c1d 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -467,7 +467,7 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
 	return kind_ops[BTF_INFO_KIND(t->info)];
 }
 
-bool btf_name_offset_valid(const struct btf *btf, u32 offset)
+static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 {
 	return BTF_STR_OFFSET_VALID(offset) &&
 		offset < btf->hdr.str_len;

From dd77a6ce209c5119a98c7f4eb051a66fbbde583f Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre
Date: Wed, 16 Jan 2019 20:35:29 +0100
Subject: [PATCH 1020/1640] UPSTREAM: bpf: Annotate implicit fall through in
 cgroup_dev_func_proto

There is a plan to build the kernel with -Wimplicit-fallthrough, and
this place in the code produced a warning (W=1). This commit removes
the following warning:

  kernel/bpf/cgroup.c:719:6: warning: this statement may fall through [-Wimplicit-fallthrough=]

Change-Id: Icb19e4815c8887e95dea5897cdcc7468680466dc
Signed-off-by: Mathieu Malaterre
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/cgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9425c2fb872f..ab612fe9862f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -718,6 +718,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_trace_printk:
 		if (capable(CAP_SYS_ADMIN))
 			return bpf_get_trace_printk_proto();
+		/* fall through */
 	default:
 		return NULL;
 	}

From 4b1f20231f4d6603ba18642429c6a7c7577b151c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Tue, 22 Jan 2019 22:45:18 -0800
Subject: [PATCH 1021/1640] UPSTREAM: bpf: change parameters of call/branch
 offset adjustment

In preparation for code removal, change the parameters of the branch
and call adjustment functions to be more universal. The current
parameters assume we are patching a single instruction with a longer
set.

A diagram may help in reading the change; this is for the single-patch
case, patching instruction 1 with a replacement of 4:

         ____
  0     |____|
  1     |____| <-- pos                  ^
  2     |    | <-- end old     ^        |
  3     |    |                 | delta  | len
  4     |____|                 |        | (patch region)
  5     |    | <-- end new     v        v
  6     |____|

  end_old = pos + 1
  end_new = pos + delta + 1

If we are before the patch region, the curr variable and the target are
fully in old coordinates (hence the comparison against end_old). If we
are after the region, curr is in new coordinates (hence the comparison
to end_new), but the target is in mixed coordinates, so we just check
whether it falls before end_new; if so, it needs the adjustment.

Note that we will not fix up branches which land in the removed region
in case of removal, which should be okay, as we are only going to
remove dead code.
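To make the coordinate rules concrete, here is a minimal standalone
userspace sketch — illustrative only, with a hypothetical adjust_off()
helper rather than the kernel's bpf_adj_delta_to_off()/_to_imm() — that
reproduces the two comparisons on the diagram's numbers:

  #include <stdio.h>

  /* Sketch of the adjustment rule above; delta = end_new - end_old. */
  static int adjust_off(int curr, int pos, int end_old, int end_new,
                        int off)
  {
          int delta = end_new - end_old;

          /* Before the region: curr and target are in old coordinates,
           * so targets at or beyond end_old move by delta.
           */
          if (curr < pos && curr + off + 1 >= end_old)
                  off += delta;
          /* After the region: curr is in new coordinates; targets that
           * still fall before end_new predate the patch.
           */
          else if (curr >= end_new && curr + off + 1 < end_new)
                  off -= delta;
          return off;
  }

  int main(void)
  {
          /* Diagram's numbers: pos = 1, end_old = 2, end_new = 5. */
          /* A jump at insn 0 over the patched insn to old insn 2 must
           * now target new insn 5: off 1 -> 4.
           */
          printf("%d\n", adjust_off(0, 1, 2, 5, 1));
          /* An insn at old index 3 (new index 6) jumping back to insn 0
           * keeps its pre-region target: off -4 -> -7.
           */
          printf("%d\n", adjust_off(6, 1, 2, 5, -4));
          return 0;
  }

Compiled as plain C, this prints 4 and -7, matching the coordinate
convention the patch encodes.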
Change-Id: I7156b968844bcdb3609dc77f07e34cb290f7bceb
Signed-off-by: Jakub Kicinski
Acked-by: Yonghong Song
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/core.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f8076ec30c1a..570f6a9740f5 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -315,15 +315,16 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
 	return 0;
 }
 
-static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
-				u32 curr, const bool probe_pass)
+static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
+				s32 end_new, u32 curr, const bool probe_pass)
 {
 	const s64 imm_min = S32_MIN, imm_max = S32_MAX;
+	s32 delta = end_new - end_old;
 	s64 imm = insn->imm;
 
-	if (curr < pos && curr + imm + 1 > pos)
+	if (curr < pos && curr + imm + 1 >= end_old)
 		imm += delta;
-	else if (curr > pos + delta && curr + imm + 1 <= pos + delta)
+	else if (curr >= end_new && curr + imm + 1 < end_new)
 		imm -= delta;
 	if (imm < imm_min || imm > imm_max)
 		return -ERANGE;
@@ -332,15 +333,16 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
 	return 0;
 }
 
-static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
-				u32 curr, const bool probe_pass)
+static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
+				s32 end_new, u32 curr, const bool probe_pass)
 {
 	const s32 off_min = S16_MIN, off_max = S16_MAX;
+	s32 delta = end_new - end_old;
 	s32 off = insn->off;
 
-	if (curr < pos && curr + off + 1 > pos)
+	if (curr < pos && curr + off + 1 >= end_old)
 		off += delta;
-	else if (curr > pos + delta && curr + off + 1 <= pos + delta)
+	else if (curr >= end_new && curr + off + 1 < end_new)
 		off -= delta;
 	if (off < off_min || off > off_max)
 		return -ERANGE;
@@ -349,10 +351,10 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
 	return 0;
 }
 
-static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
-			    const bool probe_pass)
+static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
+			    s32 end_new, const bool probe_pass)
 {
-	u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0);
+	u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
 	struct bpf_insn *insn = prog->insnsi;
 	int ret = 0;
 
@@ -364,8 +366,8 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
 		 * do any other adjustments. Therefore skip the patchlet.
 		 */
 		if (probe_pass && i == pos) {
-			i += delta + 1;
-			insn++;
+			i = end_new;
+			insn = prog->insnsi + end_old;
 		}
 		code = insn->code;
 		if (BPF_CLASS(code) != BPF_JMP ||
@@ -375,11 +377,11 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
 		if (BPF_OP(code) == BPF_CALL) {
 			if (insn->src_reg != BPF_PSEUDO_CALL)
 				continue;
-			ret = bpf_adj_delta_to_imm(insn, pos, delta, i,
-						   probe_pass);
+			ret = bpf_adj_delta_to_imm(insn, pos, end_old,
+						   end_new, i, probe_pass);
 		} else {
-			ret = bpf_adj_delta_to_off(insn, pos, delta, i,
-						   probe_pass);
+			ret = bpf_adj_delta_to_off(insn, pos, end_old,
+						   end_new, i, probe_pass);
 		}
 		if (ret)
 			break;
@@ -429,7 +431,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 	 * we afterwards may not fail anymore.
 	 */
 	if (insn_adj_cnt > cnt_max &&
-	    bpf_adj_branches(prog, off, insn_delta, true))
+	    bpf_adj_branches(prog, off, off + 1, off + len, true))
 		return NULL;
 
 	/* Several new instructions need to be inserted. Make room
@@ -461,7 +463,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 	 * the ship has sailed to reverse to the original state. An
 	 * overflow cannot happen at this point.
 	 */
-	BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
+	BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));
 
 	bpf_adj_linfo(prog_adj, off, insn_delta);

From 17549388657c5cae1984450612f8c7c473840260 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Tue, 22 Jan 2019 22:45:19 -0800
Subject: [PATCH 1022/1640] UPSTREAM: bpf: verifier: hard wire branches to
 dead code

Loading programs with dead code becomes more and more common, as people
begin to patch constants at load time. Turn conditional jumps into
unconditional ones, to avoid the potential branch misprediction
penalty.

This optimization is enabled for privileged users only.

For branches which simply fall through we could just mark them as not
seen and have dead code removal take care of them, but that seems less
clean.

v2:
 - don't call capable(CAP_SYS_ADMIN) twice (Jiong).
v3:
 - fix GCC warning;

Change-Id: Ie5e37f435e552ff93f434b038d6ae2eecc7ef057
Signed-off-by: Jakub Kicinski
Acked-by: Yonghong Song
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a92f4db1928d..a282797e194f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6739,6 +6739,40 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
 	}
 }
 
+static bool insn_is_cond_jump(u8 code)
+{
+	u8 op;
+
+	if (BPF_CLASS(code) != BPF_JMP)
+		return false;
+
+	op = BPF_OP(code);
+	return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
+}
+
+static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
+{
+	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+	struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+	struct bpf_insn *insn = env->prog->insnsi;
+	const int insn_cnt = env->prog->len;
+	int i;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (!insn_is_cond_jump(insn->code))
+			continue;
+
+		if (!aux_data[i + 1].seen)
+			ja.off = insn->off;
+		else if (!aux_data[i + 1 + insn->off].seen)
+			ja.off = 0;
+		else
+			continue;
+
+		memcpy(insn, &ja, sizeof(ja));
+	}
+}
+
 /* convert load instructions that access fields of a context type into a
  * sequence of instructions that access fields of the underlying structure:
  *     struct __sk_buff    -> struct sk_buff
@@ -7434,6 +7468,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	struct bpf_verifier_env *env;
 	struct bpf_verifier_log *log;
 	int ret = -EINVAL;
+	bool is_priv;
 
 	/* no program is valid */
 	if (ARRAY_SIZE(bpf_verifier_ops) == 0)
@@ -7481,6 +7516,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
 		env->strict_alignment = false;
 
+	is_priv = capable(CAP_SYS_ADMIN);
+	env->allow_ptr_leaks = is_priv;
+
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
 		goto skip_full_check;
@@ -7498,8 +7536,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	if (!env->explored_states)
 		goto skip_full_check;
 
-	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
 	ret = check_subprogs(env);
 	if (ret < 0)
 		goto skip_full_check;
@@ -7529,6 +7565,11 @@ skip_full_check:
 		ret = check_max_stack_depth(env);
 
 	/* instruction rewrites happen after this point */
+	if (is_priv) {
+		if (ret == 0)
+			opt_hard_wire_dead_code_branches(env);
+	}
+
 	if (ret == 0)
 		sanitize_dead_code(env);
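The two rewrite cases in opt_hard_wire_dead_code_branches() can be
shown with a toy model — simplified structures for illustration, not
the kernel's bpf_insn/bpf_insn_aux_data:

  #include <stdbool.h>
  #include <stdio.h>

  struct toy_insn {
          bool cond_jmp;          /* conditional jump? */
          bool uncond_jmp;        /* rewritten to a plain JA */
          int off;
  };

  /* seen[i] is the verifier's "this insn was explored" bit. */
  static void hard_wire(struct toy_insn *insn, const bool *seen, int cnt)
  {
          for (int i = 0; i < cnt; i++) {
                  int ja_off;

                  if (!insn[i].cond_jmp)
                          continue;
                  if (!seen[i + 1])               /* fall-through is dead */
                          ja_off = insn[i].off;   /* jump is always taken */
                  else if (!seen[i + 1 + insn[i].off]) /* target is dead */
                          ja_off = 0;             /* always falls through */
                  else
                          continue;
                  insn[i] = (struct toy_insn){ .uncond_jmp = true,
                                               .off = ja_off };
          }
  }

  int main(void)
  {
          /* insn 0 conditionally jumps +2 to insn 3, which the verifier
           * never reached: the branch can never be taken.
           */
          struct toy_insn prog[4] = { { .cond_jmp = true, .off = 2 } };
          bool seen[4] = { true, true, true, false };

          hard_wire(prog, seen, 1);
          printf("insn 0: uncond=%d off=%d\n",
                 prog[0].uncond_jmp, prog[0].off);  /* uncond=1 off=0 */
          return 0;
  }

Either way the result is an unconditional JA, which the following
patches can then delete outright.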
From e3abdfbb2a93d8a7f8847e3f766eacd45c5e26ce Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Tue, 22 Jan 2019 22:45:20 -0800
Subject: [PATCH 1023/1640] UPSTREAM: bpf: verifier: remove dead code

Instead of overwriting dead code with jmp -1 instructions, remove it
completely for root. Adjust the verifier state and line info
appropriately.

v2:
 - adjust func_info (Alexei);
 - make sure first instruction retains line info (Alexei).
v4: (Yonghong)
 - remove unnecessary if (!insn to remove) checks;
 - always keep last line info if first live instruction lacks one.
v5: (Martin Lau)
 - improve and clarify comments.

Change-Id: Ib9047b99b7941173450efdbad682a5986a3569bb
Signed-off-by: Jakub Kicinski
Acked-by: Yonghong Song
Signed-off-by: Alexei Starovoitov
---
 include/linux/filter.h |   1 +
 kernel/bpf/core.c      |  12 +++
 kernel/bpf/verifier.c  | 176 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 186 insertions(+), 3 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2a4d16122bec..9140dc4fe919 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -859,6 +859,7 @@ static inline bool bpf_dump_raw_ok(const struct cred *cred)
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
+int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);
 
 void bpf_clear_redirect_map(struct bpf_map *map);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 570f6a9740f5..47bc49667637 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -470,6 +470,18 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 	return prog_adj;
 }
 
+int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
+{
+	/* Branch offsets can't overflow when program is shrinking, no need
+	 * to call bpf_adj_branches(..., true) here
+	 */
+	memmove(prog->insnsi + off, prog->insnsi + off + cnt,
+		sizeof(struct bpf_insn) * (prog->len - off - cnt));
+	prog->len -= cnt;
+
+	return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
+}
+
 void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
 {
 	int i;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a282797e194f..82a3098135af 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6713,6 +6713,150 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
 	return new_prog;
 }
 
+static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
+					      u32 off, u32 cnt)
+{
+	int i, j;
+
+	/* find first prog starting at or after off (first to remove) */
+	for (i = 0; i < env->subprog_cnt; i++)
+		if (env->subprog_info[i].start >= off)
+			break;
+	/* find first prog starting at or after off + cnt (first to stay) */
+	for (j = i; j < env->subprog_cnt; j++)
+		if (env->subprog_info[j].start >= off + cnt)
+			break;
+	/* if j doesn't start exactly at off + cnt, we are just removing
+	 * the front of previous prog
+	 */
+	if (env->subprog_info[j].start != off + cnt)
+		j--;
+
+	if (j > i) {
+		struct bpf_prog_aux *aux = env->prog->aux;
+		int move;
+
+		/* move fake 'exit' subprog as well */
+		move = env->subprog_cnt + 1 - j;
+
+		memmove(env->subprog_info + i,
+			env->subprog_info + j,
+			sizeof(*env->subprog_info) * move);
+		env->subprog_cnt -= j - i;
+
+		/* remove func_info */
+		if (aux->func_info) {
+			move = aux->func_info_cnt - j;
+
+			memmove(aux->func_info + i,
+				aux->func_info + j,
+				sizeof(*aux->func_info) * move);
+			aux->func_info_cnt -= j - i;
+			/* func_info->insn_off is set after all code rewrites,
+			 * in adjust_btf_func() - no need to adjust
			 */
+		}
+	} else {
+		/* convert i from "first prog to remove" to "first to adjust" */
+		if (env->subprog_info[i].start == off)
+			i++;
+	}
+
+	/* update fake 'exit' subprog as well */
+	for (; i <= env->subprog_cnt; i++)
+		env->subprog_info[i].start -= cnt;
+
+	return 0;
+}
+
+static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
+				      u32 cnt)
+{
+	struct bpf_prog *prog = env->prog;
+	u32 i, l_off, l_cnt, nr_linfo;
+	struct bpf_line_info *linfo;
+
+	nr_linfo = prog->aux->nr_linfo;
+	if (!nr_linfo)
+		return 0;
+
+	linfo = prog->aux->linfo;
+
+	/* find first line info to remove, count lines to be removed */
+	for (i = 0; i < nr_linfo; i++)
+		if (linfo[i].insn_off >= off)
+			break;
+
+	l_off = i;
+	l_cnt = 0;
+	for (; i < nr_linfo; i++)
+		if (linfo[i].insn_off < off + cnt)
+			l_cnt++;
+		else
+			break;
+
+	/* First live insn doesn't match first live linfo, it needs to "inherit"
+	 * last removed linfo. prog is already modified, so prog->len == off
+	 * means no live instructions after (tail of the program was removed).
+	 */
+	if (prog->len != off && l_cnt &&
+	    (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
+		l_cnt--;
+		linfo[--i].insn_off = off + cnt;
+	}
+
+	/* remove the line info which refer to the removed instructions */
+	if (l_cnt) {
+		memmove(linfo + l_off, linfo + i,
+			sizeof(*linfo) * (nr_linfo - i));
+
+		prog->aux->nr_linfo -= l_cnt;
+		nr_linfo = prog->aux->nr_linfo;
+	}
+
+	/* pull all linfo[i].insn_off >= off + cnt in by cnt */
+	for (i = l_off; i < nr_linfo; i++)
+		linfo[i].insn_off -= cnt;
+
+	/* fix up all subprogs (incl. 'exit') which start >= off */
+	for (i = 0; i <= env->subprog_cnt; i++)
+		if (env->subprog_info[i].linfo_idx > l_off) {
+			/* program may have started in the removed region but
+			 * may not be fully removed
+			 */
+			if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
+				env->subprog_info[i].linfo_idx -= l_cnt;
+			else
+				env->subprog_info[i].linfo_idx = l_off;
+		}
+
+	return 0;
+}
+
+static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+	unsigned int orig_prog_len = env->prog->len;
+	int err;
+
+	err = bpf_remove_insns(env->prog, off, cnt);
+	if (err)
+		return err;
+
+	err = adjust_subprog_starts_after_remove(env, off, cnt);
+	if (err)
+		return err;
+
+	err = bpf_adj_linfo_after_remove(env, off, cnt);
+	if (err)
+		return err;
+
+	memmove(aux_data + off, aux_data + off + cnt,
+		sizeof(*aux_data) * (orig_prog_len - off - cnt));
+
+	return 0;
+}
+
 /* The verifier does more data flow analysis than llvm and will not
  * explore branches that are dead at run time. Malicious programs can
  * have dead code too. Therefore replace all dead at-run-time code
@@ -6773,6 +6917,30 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
 	}
 }
 
+static int opt_remove_dead_code(struct bpf_verifier_env *env)
+{
+	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+	int insn_cnt = env->prog->len;
+	int i, err;
+
+	for (i = 0; i < insn_cnt; i++) {
+		int j;
+
+		j = 0;
+		while (i + j < insn_cnt && !aux_data[i + j].seen)
+			j++;
+		if (!j)
+			continue;
+
+		err = verifier_remove_insns(env, i, j);
+		if (err)
+			return err;
+		insn_cnt = env->prog->len;
+	}
+
+	return 0;
+}
+
 /* convert load instructions that access fields of a context type into a
  * sequence of instructions that access fields of the underlying structure:
  *     struct __sk_buff    -> struct sk_buff
@@ -7568,11 +7736,13 @@ skip_full_check:
 	if (is_priv) {
 		if (ret == 0)
 			opt_hard_wire_dead_code_branches(env);
+		if (ret == 0)
+			ret = opt_remove_dead_code(env);
+	} else {
+		if (ret == 0)
+			sanitize_dead_code(env);
 	}
 
-	if (ret == 0)
-		sanitize_dead_code(env);
-
 	if (ret == 0)
 		/* program is valid, convert *(u32*)(ctx + off) accesses */
 		ret = convert_ctx_accesses(env);

From 5c143108f0e8ca4d51bfce31fecd49169f825f2e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Tue, 22 Jan 2019 22:45:21 -0800
Subject: [PATCH 1024/1640] UPSTREAM: bpf: verifier: remove unconditional
 branches by 0

Unconditional branches by 0 instructions are basically noops, but they
can result from earlier optimizations, e.g. a conditional jump which
would never be taken or a conditional jump around dead code. Remove
those branches.

v2:
 - s/opt_remove_dead_branches/opt_remove_nops/ (Jiong).

Change-Id: I01d2288965bcc3cab40447cf5fd618345140d3c8
Signed-off-by: Jakub Kicinski
Reviewed-by: Jiong Wang
Acked-by: Yonghong Song
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 82a3098135af..701f9ce0c8c6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6941,6 +6941,27 @@ static int opt_remove_dead_code(struct bpf_verifier_env *env)
 	return 0;
 }
 
+static int opt_remove_nops(struct bpf_verifier_env *env)
+{
+	const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	int i, err;
+
+	for (i = 0; i < insn_cnt; i++) {
+		if (memcmp(&insn[i], &ja, sizeof(ja)))
+			continue;
+
+		err = verifier_remove_insns(env, i, 1);
+		if (err)
+			return err;
+		insn_cnt--;
+		i--;
+	}
+
+	return 0;
+}
+
 /* convert load instructions that access fields of a context type into a
  * sequence of instructions that access fields of the underlying structure:
  *     struct __sk_buff    -> struct sk_buff
@@ -7738,6 +7759,8 @@ skip_full_check:
 			opt_hard_wire_dead_code_branches(env);
 		if (ret == 0)
 			ret = opt_remove_dead_code(env);
+		if (ret == 0)
+			ret = opt_remove_nops(env);
 	} else {
 		if (ret == 0)
 			sanitize_dead_code(env);

From 43c3655a89b66e5daf98e5dd0a387b8b07402624 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Tue, 22 Jan 2019 22:45:23 -0800
Subject: [PATCH 1025/1640] UPSTREAM: bpf: verifier: record original
 instruction index

The communication between the verifier and advanced JITs is based on
instruction indexes. We have to keep them stable throughout the
optimizations, otherwise referring to a particular instruction gets
messy quickly.
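The idea can be sketched in a few lines of standalone C (a hypothetical
toy_aux type, not the kernel's bpf_insn_aux_data): record each
instruction's pre-optimization index once, and it survives any later
memmove()-style deletions:

  #include <stdio.h>
  #include <string.h>

  struct toy_aux {
          unsigned int orig_idx;  /* index before any rewrites */
  };

  int main(void)
  {
          struct toy_aux aux[5];
          unsigned int len = 5;

          for (unsigned int i = 0; i < len; i++)
                  aux[i].orig_idx = i;

          /* drop instructions 1 and 2, as dead code removal would */
          memmove(&aux[1], &aux[3], sizeof(*aux) * (len - 3));
          len -= 2;

          for (unsigned int i = 0; i < len; i++)
                  printf("new idx %u -> orig idx %u\n", i, aux[i].orig_idx);
          /* prints 0->0, 1->3, 2->4: original indexes stay stable */
          return 0;
  }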
Change-Id: I8bf9a3b76945f5f6924bf3fb3fde4cad60063ebc
Signed-off-by: Jakub Kicinski
Reviewed-by: Quentin Monnet
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h | 1 +
 kernel/bpf/verifier.c        | 8 +++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c0c9bba96624..ff32a1cbea88 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -195,6 +195,7 @@ struct bpf_insn_aux_data {
 	bool seen; /* this insn was processed by the verifier */
 	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	u8 alu_state; /* used in combination with alu_limit */
+	unsigned int orig_idx; /* original instruction index */
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 701f9ce0c8c6..9c99814262ee 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7656,7 +7656,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 {
 	struct bpf_verifier_env *env;
 	struct bpf_verifier_log *log;
-	int ret = -EINVAL;
+	int i, len, ret = -EINVAL;
 	bool is_priv;
 
 	/* no program is valid */
@@ -7671,12 +7671,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 		return -ENOMEM;
 	log = &env->log;
 
+	len = (*prog)->len;
 	env->insn_aux_data =
-		vzalloc(array_size(sizeof(struct bpf_insn_aux_data),
-				   (*prog)->len));
+		vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
 	ret = -ENOMEM;
 	if (!env->insn_aux_data)
 		goto err_free_env;
+	for (i = 0; i < len; i++)
+		env->insn_aux_data[i].orig_idx = i;
 	env->prog = *prog;
 	env->ops = bpf_verifier_ops[env->prog->type];

From dcc3e61b1ed01b666b91d524db04962b21add89b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Tue, 22 Jan 2019 22:45:24 -0800
Subject: [PATCH 1026/1640] UPSTREAM: bpf: notify offload JITs about
 optimizations

Let offload JITs know when instructions are replaced and optimized out,
so they can update their state appropriately. The optimizations are
best effort: if the JIT returns an error from any callback, the
verifier will stop notifying it, as its state may now be out of sync,
but the verifier continues making progress.
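The "stop notifying after the first error" contract can be modeled with
a small standalone sketch — made-up toy_* types, not the kernel's
bpf_prog_offload structures — using optional callbacks plus a sticky
opt_failed flag:

  #include <stdbool.h>
  #include <stdio.h>

  struct toy_offload_ops {
          /* optional; absence is treated as failure (-EOPNOTSUPP) */
          int (*remove_insns)(void *dev, unsigned int off, unsigned int cnt);
  };

  struct toy_offload {
          const struct toy_offload_ops *ops;
          void *dev;
          bool opt_failed;
  };

  static void notify_remove(struct toy_offload *offload,
                            unsigned int off, unsigned int cnt)
  {
          int ret = -1;

          if (!offload->opt_failed && offload->ops->remove_insns)
                  ret = offload->ops->remove_insns(offload->dev, off, cnt);
          offload->opt_failed |= ret;   /* sticky: any error ends updates */
  }

  static int dev_remove(void *dev, unsigned int off, unsigned int cnt)
  {
          printf("device drops insns [%u, %u)\n", off, off + cnt);
          return 0;
  }

  int main(void)
  {
          const struct toy_offload_ops ops = { .remove_insns = dev_remove };
          struct toy_offload offload = { .ops = &ops };

          notify_remove(&offload, 2, 3);  /* delivered */
          offload.opt_failed = true;      /* pretend a callback failed */
          notify_remove(&offload, 0, 1);  /* silently skipped */
          return 0;
  }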
Change-Id: I329b7fe9164a72d8c0d85ff92169e06ba480a8f8
Signed-off-by: Jakub Kicinski
Reviewed-by: Quentin Monnet
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h          |  7 +++++++
 include/linux/bpf_verifier.h |  5 +++++
 kernel/bpf/offload.c         | 35 +++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c        |  6 ++++++
 4 files changed, 53 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c22de2495772..7e4fb56fadac 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -273,9 +273,15 @@ struct bpf_verifier_ops {
 };
 
 struct bpf_prog_offload_ops {
+	/* verifier basic callbacks */
 	int (*insn_hook)(struct bpf_verifier_env *env,
 			 int insn_idx, int prev_insn_idx);
 	int (*finalize)(struct bpf_verifier_env *env);
+	/* verifier optimization callbacks (called after .finalize) */
+	int (*replace_insn)(struct bpf_verifier_env *env, u32 off,
+			    struct bpf_insn *insn);
+	int (*remove_insns)(struct bpf_verifier_env *env, u32 off, u32 cnt);
+	/* program management callbacks */
 	int (*prepare)(struct bpf_prog *prog);
 	int (*translate)(struct bpf_prog *prog);
 	void (*destroy)(struct bpf_prog *prog);
@@ -288,6 +294,7 @@ struct bpf_prog_offload {
 	void *dev_priv;
 	struct list_head offloads;
 	bool dev_state;
+	bool opt_failed;
 	void *jited_image;
 	u32 jited_len;
 };
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index ff32a1cbea88..d60320c6e1f5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -276,5 +276,10 @@ int bpf_prog_offload_verifier_prep(struct bpf_prog *prog);
 int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
 				 int insn_idx, int prev_insn_idx);
 int bpf_prog_offload_finalize(struct bpf_verifier_env *env);
+void
+bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
+			      struct bpf_insn *insn);
+void
+bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt);
 
 #endif /* _LINUX_BPF_VERIFIER_H */
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 454736159480..24d855b384e9 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -173,6 +173,41 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
 	return ret;
 }
 
+void
+bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
+			      struct bpf_insn *insn)
+{
+	const struct bpf_prog_offload_ops *ops;
+	struct bpf_prog_offload *offload;
+	int ret = -EOPNOTSUPP;
+
+	down_read(&bpf_devs_lock);
+	offload = env->prog->aux->offload;
+	if (offload) {
+		ops = offload->offdev->ops;
+		if (!offload->opt_failed && ops->replace_insn)
+			ret = ops->replace_insn(env, off, insn);
+		offload->opt_failed |= ret;
+	}
+	up_read(&bpf_devs_lock);
+}
+
+void
+bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+	struct bpf_prog_offload *offload;
+	int ret = -EOPNOTSUPP;
+
+	down_read(&bpf_devs_lock);
+	offload = env->prog->aux->offload;
+	if (offload) {
+		if (!offload->opt_failed && offload->offdev->ops->remove_insns)
+			ret = offload->offdev->ops->remove_insns(env, off, cnt);
+		offload->opt_failed |= ret;
+	}
+	up_read(&bpf_devs_lock);
+}
+
 static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 {
 	struct bpf_prog_offload *offload = prog->aux->offload;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9c99814262ee..89de722633ce 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6839,6 +6839,9 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
 	unsigned int orig_prog_len = env->prog->len;
 	int err;
 
+	if (bpf_prog_is_dev_bound(env->prog->aux))
+		bpf_prog_offload_remove_insns(env, off, cnt);
+
 	err = bpf_remove_insns(env->prog, off, cnt);
 	if (err)
 		return err;
@@ -6913,6 +6916,9 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
 		else
 			continue;
 
+		if (bpf_prog_is_dev_bound(env->prog->aux))
+			bpf_prog_offload_replace_insn(env, i, &ja);
+
 		memcpy(insn, &ja, sizeof(ja));
 	}
 }

From d5c1cc78feb504afc5183a61301ff6f75198e22e Mon Sep 17 00:00:00 2001
From: Jiong Wang
Date: Sat, 26 Jan 2019 12:26:00 -0500
Subject: [PATCH 1027/1640] UPSTREAM: bpf: refactor verifier min/max code for
 condition jump

The current min/max code does both signed and unsigned comparisons
against the input argument "val", which is "u64", and there is explicit
type casting when the comparison is signed. As we will need slightly
more complex type casting when JMP32 is introduced, it is better to
hoist the signed type casting. This makes the code cleaner, with
negligible runtime overhead.

Also, the code for J*GE/GT/LT/LE and JEQ/JNE is very similar; this
patch combines them.

The main purpose of this refactor is to make sure the min/max code will
still be readable, and with minimal code duplication, after JMP32 is
introduced.

Reviewed-by: Jakub Kicinski
Change-Id: I42cc2f5562e9a591c8f9abe1b5caa7d9160f2e60
Signed-off-by: Jiong Wang
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 172 ++++++++++++++++++++++++------------------
 1 file changed, 99 insertions(+), 73 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 89de722633ce..f518b0dda642 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4299,9 +4299,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
  */
 static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
 {
+	s64 sval;
+
 	if (__is_pointer_value(false, reg))
 		return -1;
 
+	sval = (s64)val;
+
 	switch (opcode) {
 	case BPF_JEQ:
 		if (tnum_is_const(reg->var_off))
@@ -4324,9 +4328,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
 			return 0;
 		break;
 	case BPF_JSGT:
-		if (reg->smin_value > (s64)val)
+		if (reg->smin_value > sval)
 			return 1;
-		else if (reg->smax_value < (s64)val)
+		else if (reg->smax_value < sval)
 			return 0;
 		break;
 	case BPF_JLT:
@@ -4336,9 +4340,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
 			return 0;
 		break;
 	case BPF_JSLT:
-		if (reg->smax_value < (s64)val)
+		if (reg->smax_value < sval)
 			return 1;
-		else if (reg->smin_value >= (s64)val)
+		else if (reg->smin_value >= sval)
 			return 0;
 		break;
 	case BPF_JGE:
@@ -4348,9 +4352,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
 			return 0;
 		break;
 	case BPF_JSGE:
-		if (reg->smin_value >= (s64)val)
+		if (reg->smin_value >= sval)
 			return 1;
-		else if (reg->smax_value < (s64)val)
+		else if (reg->smax_value < sval)
 			return 0;
 		break;
 	case BPF_JLE:
@@ -4360,9 +4364,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
 			return 0;
 		break;
 	case BPF_JSLE:
-		if (reg->smax_value <= (s64)val)
+		if (reg->smax_value <= sval)
 			return 1;
-		else if (reg->smin_value > (s64)val)
+		else if (reg->smin_value > sval)
 			return 0;
 		break;
 	}
@@ -4379,6 +4383,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 			    struct bpf_reg_state *false_reg, u64 val,
 			    u8 opcode)
 {
+	s64 sval;
+
 	/* If the dst_reg is a pointer, we can't learn anything about its
 	 * variable offset from the compare (unless src_reg were a pointer into
 	 * the same object, but we don't bother with that.
@@ -4388,19 +4394,22 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 	if (__is_pointer_value(false, false_reg))
 		return;
 
+	sval = (s64)val;
+
 	switch (opcode) {
 	case BPF_JEQ:
-		/* If this is false then we know nothing Jon Snow, but if it is
-		 * true then we know for sure.
-		 */
-		__mark_reg_known(true_reg, val);
-		break;
 	case BPF_JNE:
-		/* If this is true we know nothing Jon Snow, but if it is false
-		 * we know the value for sure;
+	{
+		struct bpf_reg_state *reg =
+			opcode == BPF_JEQ ? true_reg : false_reg;
+
+		/* For BPF_JEQ, if this is false we know nothing Jon Snow, but
+		 * if it is true we know the value for sure. Likewise for
+		 * BPF_JNE.
 		 */
-		__mark_reg_known(false_reg, val);
+		__mark_reg_known(reg, val);
 		break;
+	}
 	case BPF_JSET:
 		false_reg->var_off = tnum_and(false_reg->var_off,
 					      tnum_const(~val));
@@ -4408,38 +4417,46 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 		true_reg->var_off = tnum_or(true_reg->var_off,
 					    tnum_const(val));
 		break;
-	case BPF_JGT:
-		false_reg->umax_value = min(false_reg->umax_value, val);
-		true_reg->umin_value = max(true_reg->umin_value, val + 1);
-		break;
-	case BPF_JSGT:
-		false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
-		true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
-		break;
-	case BPF_JLT:
-		false_reg->umin_value = max(false_reg->umin_value, val);
-		true_reg->umax_value = min(true_reg->umax_value, val - 1);
-		break;
-	case BPF_JSLT:
-		false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
-		true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
-		break;
 	case BPF_JGE:
-		false_reg->umax_value = min(false_reg->umax_value, val - 1);
-		true_reg->umin_value = max(true_reg->umin_value, val);
+	case BPF_JGT:
+	{
+		u64 false_umax = opcode == BPF_JGT ? val : val - 1;
+		u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
+
+		false_reg->umax_value = min(false_reg->umax_value, false_umax);
+		true_reg->umin_value = max(true_reg->umin_value, true_umin);
 		break;
+	}
 	case BPF_JSGE:
-		false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
-		true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+	case BPF_JSGT:
+	{
+		s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1;
+		s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
+
+		false_reg->smax_value = min(false_reg->smax_value, false_smax);
+		true_reg->smin_value = max(true_reg->smin_value, true_smin);
 		break;
+	}
 	case BPF_JLE:
-		false_reg->umin_value = max(false_reg->umin_value, val + 1);
-		true_reg->umax_value = min(true_reg->umax_value, val);
+	case BPF_JLT:
+	{
+		u64 false_umin = opcode == BPF_JLT ? val : val + 1;
+		u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
+
+		false_reg->umin_value = max(false_reg->umin_value, false_umin);
+		true_reg->umax_value = min(true_reg->umax_value, true_umax);
 		break;
+	}
 	case BPF_JSLE:
-		false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
-		true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
+	case BPF_JSLT:
+	{
+		s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1;
+		s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
+
+		false_reg->smin_value = max(false_reg->smin_value, false_smin);
+		true_reg->smax_value = min(true_reg->smax_value, true_smax);
 		break;
+	}
 	default:
 		break;
 	}
@@ -4464,22 +4481,23 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 				struct bpf_reg_state *false_reg, u64 val,
 				u8 opcode)
 {
+	s64 sval;
+
 	if (__is_pointer_value(false, false_reg))
 		return;
 
+	sval = (s64)val;
+
 	switch (opcode) {
 	case BPF_JEQ:
-		/* If this is false then we know nothing Jon Snow, but if it is
-		 * true then we know for sure.
-		 */
-		__mark_reg_known(true_reg, val);
-		break;
 	case BPF_JNE:
-		/* If this is true we know nothing Jon Snow, but if it is false
-		 * we know the value for sure;
-		 */
-		__mark_reg_known(false_reg, val);
+	{
+		struct bpf_reg_state *reg =
+			opcode == BPF_JEQ ? true_reg : false_reg;
+
+		__mark_reg_known(reg, val);
 		break;
+	}
 	case BPF_JSET:
 		false_reg->var_off = tnum_and(false_reg->var_off,
 					      tnum_const(~val));
@@ -4487,38 +4505,46 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 		true_reg->var_off = tnum_or(true_reg->var_off,
 					    tnum_const(val));
 		break;
-	case BPF_JGT:
-		true_reg->umax_value = min(true_reg->umax_value, val - 1);
-		false_reg->umin_value = max(false_reg->umin_value, val);
-		break;
-	case BPF_JSGT:
-		true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
-		false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
-		break;
-	case BPF_JLT:
-		true_reg->umin_value = max(true_reg->umin_value, val + 1);
-		false_reg->umax_value = min(false_reg->umax_value, val);
-		break;
-	case BPF_JSLT:
-		true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
-		false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
-		break;
 	case BPF_JGE:
-		true_reg->umax_value = min(true_reg->umax_value, val);
-		false_reg->umin_value = max(false_reg->umin_value, val + 1);
+	case BPF_JGT:
+	{
+		u64 false_umin = opcode == BPF_JGT ? val : val + 1;
+		u64 true_umax = opcode == BPF_JGT ? val - 1 : val;
+
+		false_reg->umin_value = max(false_reg->umin_value, false_umin);
+		true_reg->umax_value = min(true_reg->umax_value, true_umax);
 		break;
+	}
 	case BPF_JSGE:
-		true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
-		false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+	case BPF_JSGT:
+	{
+		s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1;
+		s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval;
+
+		false_reg->smin_value = max(false_reg->smin_value, false_smin);
+		true_reg->smax_value = min(true_reg->smax_value, true_smax);
 		break;
+	}
 	case BPF_JLE:
-		true_reg->umin_value = max(true_reg->umin_value, val);
-		false_reg->umax_value = min(false_reg->umax_value, val - 1);
+	case BPF_JLT:
+	{
+		u64 false_umax = opcode == BPF_JLT ? val : val - 1;
+		u64 true_umin = opcode == BPF_JLT ? val + 1 : val;
+
+		false_reg->umax_value = min(false_reg->umax_value, false_umax);
+		true_reg->umin_value = max(true_reg->umin_value, true_umin);
 		break;
+	}
 	case BPF_JSLE:
-		true_reg->smin_value = max(true_reg->smin_value, val);
-		false_reg->smax_value = min(false_reg->smax_value, val - 1);
+	case BPF_JSLT:
+	{
+		s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1;
+		s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval;
+
+		false_reg->smax_value = min(false_reg->smax_value, false_smax);
+		true_reg->smin_value = max(true_reg->smin_value, true_smin);
 		break;
+	}
 	default:
 		break;
 	}

From 366479893c264a97d7dc9940c86c50301b3a6cfd Mon Sep 17 00:00:00 2001
From: Jiong Wang
Date: Sat, 26 Jan 2019 12:25:59 -0500
Subject: [PATCH 1028/1640] UPSTREAM: bpf: allocate 0x06 to new eBPF
 instruction class JMP32

The new eBPF instruction class JMP32 uses the reserved class number
0x6. Kernel BPF ISA documentation updated accordingly.

Reviewed-by: Jakub Kicinski
Change-Id: Idfa9c7ef537b67b06ffea33db60785f754f9bfea
Signed-off-by: Jiong Wang
Signed-off-by: Alexei Starovoitov
---
 Documentation/networking/filter.txt | 15 ++++++++-------
 include/uapi/linux/bpf.h            |  1 +
 tools/include/uapi/linux/bpf.h      |  1 +
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 87814859cfc2..44293164244c 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -857,7 +857,7 @@ Three LSB bits store instruction class which is one of:
   BPF_STX   0x03          BPF_STX   0x03
   BPF_ALU   0x04          BPF_ALU   0x04
   BPF_JMP   0x05          BPF_JMP   0x05
-  BPF_RET   0x06          [ class 6 unused, for future if needed ]
+  BPF_RET   0x06          BPF_JMP32 0x06
   BPF_MISC  0x07          BPF_ALU64 0x07
 
 When BPF_CLASS(code) == BPF_ALU or BPF_JMP, 4th bit encodes source operand ...
@@ -894,9 +894,9 @@ If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of:
   BPF_ARSH  0xc0  /* eBPF only: sign extending shift right */
   BPF_END   0xd0  /* eBPF only: endianness conversion */
 
-If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of:
+If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of:
 
-  BPF_JA    0x00
+  BPF_JA    0x00  /* BPF_JMP only */
   BPF_JEQ   0x10
   BPF_JGT   0x20
   BPF_JGE   0x30
@@ -904,8 +904,8 @@ If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of:
   BPF_JNE   0x50  /* eBPF only: jump != */
   BPF_JSGT  0x60  /* eBPF only: signed '>' */
   BPF_JSGE  0x70  /* eBPF only: signed '>=' */
-  BPF_CALL  0x80  /* eBPF only: function call */
-  BPF_EXIT  0x90  /* eBPF only: function return */
+  BPF_CALL  0x80  /* eBPF BPF_JMP only: function call */
+  BPF_EXIT  0x90  /* eBPF BPF_JMP only: function return */
   BPF_JLT   0xa0  /* eBPF only: unsigned '<' */
   BPF_JLE   0xb0  /* eBPF only: unsigned '<=' */
   BPF_JSLT  0xc0  /* eBPF only: signed '<' */
@@ -928,8 +928,9 @@ Classic BPF wastes the whole BPF_RET class to represent a single 'ret'
 operation. Classic BPF_RET | BPF_K means copy imm32 into return register
 and perform function exit. eBPF is modeled to match CPU, so BPF_JMP | BPF_EXIT
 in eBPF means function exit only. The eBPF program needs to store return
-value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is currently
-unused and reserved for future use.
+value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is used as
+BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide
+operands for the comparisons instead.
For load and store instructions the 8-bit 'code' field is divided as:
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index adfea446bfc5..c5eb9df531c9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -14,6 +14,7 @@
 /* Extended instruction set based on top of classic BPF */
 
 /* instruction classes */
+#define BPF_JMP32	0x06	/* jmp mode in word width */
 #define BPF_ALU64	0x07	/* alu mode in double word width */
 
 /* ld/ldx fields */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3682622e129f..63f11d1b77a4 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -14,6 +14,7 @@
 /* Extended instruction set based on top of classic BPF */
 
 /* instruction classes */
+#define BPF_JMP32	0x06	/* jmp mode in word width */
 #define BPF_ALU64	0x07	/* alu mode in double word width */
 
 /* ld/ldx fields */

From e43e7e0161a0d5a54bfae941559179ba06e3d75b Mon Sep 17 00:00:00 2001
From: Jiong Wang
Date: Sat, 26 Jan 2019 12:26:01 -0500
Subject: [PATCH 1029/1640] BACKPORT: bpf: verifier support JMP32

This patch teaches the verifier about the new BPF_JMP32 instruction
class. The verifier needs to treat it similarly to the existing BPF_JMP
class: a BPF_JMP32 insn needs to go through all the checks that have
been done on BPF_JMP.

Also, the verifier performs runtime optimizations based on the extra
info a conditional jump instruction can offer, especially when the
comparison is between a constant and a register, in which case the
value range of the register can be improved based on the comparison
result. This code is updated accordingly.

Acked-by: Jakub Kicinski
Change-Id: Ifa4326721f253164a5610e150a09e6e28f4dec77
Signed-off-by: Jiong Wang
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/core.c     |   3 +-
 kernel/bpf/verifier.c | 205 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 174 insertions(+), 34 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 47bc49667637..776a96ef93f9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -370,7 +370,8 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
 			insn = prog->insnsi + end_old;
 		}
 		code = insn->code;
-		if (BPF_CLASS(code) != BPF_JMP ||
+		if ((BPF_CLASS(code) != BPF_JMP &&
+		     BPF_CLASS(code) != BPF_JMP32) ||
 		    BPF_OP(code) == BPF_EXIT)
 			continue;
 		/* Adjust offset of jmps if we cross patch boundaries. */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f518b0dda642..850963da3ea9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1094,7 +1094,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
 	for (i = 0; i < insn_cnt; i++) {
 		u8 code = insn[i].code;
 
-		if (BPF_CLASS(code) != BPF_JMP)
+		if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
 			goto next;
 		if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
 			goto next;
@@ -4297,14 +4297,49 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 *  0 - branch will not be taken and fall-through to next insn
 * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10]
 */
-static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
+static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
+			   bool is_jmp32)
 {
+	struct bpf_reg_state reg_lo;
 	s64 sval;
 
 	if (__is_pointer_value(false, reg))
 		return -1;
 
-	sval = (s64)val;
+	if (is_jmp32) {
+		reg_lo = *reg;
+		reg = &reg_lo;
+		/* For JMP32, only low 32 bits are compared, coerce_reg_to_size
+		 * could truncate high bits and update umin/umax according to
+		 * information of low bits.
+		 */
+		coerce_reg_to_size(reg, 4);
+		/* smin/smax need special handling. For example, after coerce,
+		 * if smin_value is 0x00000000ffffffffLL, the value is -1 when
+		 * used as operand to JMP32. It is a negative number from s32's
+		 * point of view, while it is a positive number when seen as
+		 * s64. The smin/smax are kept as s64, therefore, when used with
+		 * JMP32, they need to be transformed into s32, then sign
+		 * extended back to s64.
+		 *
+		 * Also, smin/smax were copied from umin/umax. If umin/umax has
+		 * different sign bit, then min/max relationship doesn't
+		 * maintain after casting into s32, for this case, set smin/smax
+		 * to safest range.
+		 */
+		if ((reg->umax_value ^ reg->umin_value) &
+		    (1ULL << 31)) {
+			reg->smin_value = S32_MIN;
+			reg->smax_value = S32_MAX;
+		}
+		reg->smin_value = (s64)(s32)reg->smin_value;
+		reg->smax_value = (s64)(s32)reg->smax_value;
+
+		val = (u32)val;
+		sval = (s64)(s32)val;
+	} else {
+		sval = (s64)val;
+	}
 
 	switch (opcode) {
 	case BPF_JEQ:
@@ -4374,6 +4409,29 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
 	return -1;
 }
 
+/* Generate min value of the high 32-bit from TNUM info. */
+static u64 gen_hi_min(struct tnum var)
+{
+	return var.value & ~0xffffffffULL;
+}
+
+/* Generate max value of the high 32-bit from TNUM info. */
+static u64 gen_hi_max(struct tnum var)
+{
+	return (var.value | var.mask) & ~0xffffffffULL;
+}
+
+/* Return true if VAL is compared with a s64 sign extended from s32, and they
+ * are with the same signedness.
+ */
+static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg)
+{
+	return ((s32)sval >= 0 &&
+		reg->smin_value >= 0 && reg->smax_value <= S32_MAX) ||
+	       ((s32)sval < 0 &&
+		reg->smax_value <= 0 && reg->smin_value >= S32_MIN);
+}
+
 /* Adjusts the register min/max values in the case that the dst_reg is the
  * variable register that we are working on, and src_reg is a constant or we're
  * simply doing a BPF_K check.
@@ -4381,7 +4439,7 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
  */
 static void reg_set_min_max(struct bpf_reg_state *true_reg,
 			    struct bpf_reg_state *false_reg, u64 val,
-			    u8 opcode)
+			    u8 opcode, bool is_jmp32)
 {
 	s64 sval;
 
@@ -4394,7 +4452,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 	if (__is_pointer_value(false, false_reg))
 		return;
 
-	sval = (s64)val;
+	val = is_jmp32 ? (u32)val : val;
+	sval = is_jmp32 ? (s64)(s32)val : (s64)val;
 
 	switch (opcode) {
 	case BPF_JEQ:
@@ -4407,7 +4466,15 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 		 * if it is true we know the value for sure. Likewise for
 		 * BPF_JNE.
		 */
-		__mark_reg_known(reg, val);
+		if (is_jmp32) {
+			u64 old_v = reg->var_off.value;
+			u64 hi_mask = ~0xffffffffULL;
+
+			reg->var_off.value = (old_v & hi_mask) | val;
+			reg->var_off.mask &= hi_mask;
+		} else {
+			__mark_reg_known(reg, val);
+		}
 		break;
 	}
 	case BPF_JSET:
@@ -4423,6 +4490,10 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 		u64 false_umax = opcode == BPF_JGT ? val : val - 1;
 		u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
 
+		if (is_jmp32) {
+			false_umax += gen_hi_max(false_reg->var_off);
+			true_umin += gen_hi_min(true_reg->var_off);
+		}
 		false_reg->umax_value = min(false_reg->umax_value, false_umax);
 		true_reg->umin_value = max(true_reg->umin_value, true_umin);
 		break;
@@ -4433,6 +4504,11 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 		s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1;
 		s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
 
+		/* If the full s64 was not sign-extended from s32 then don't
+		 * deduct further info.
+		 */
+		if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+			break;
 		false_reg->smax_value = min(false_reg->smax_value, false_smax);
 		true_reg->smin_value = max(true_reg->smin_value, true_smin);
 		break;
@@ -4443,6 +4519,10 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 		u64 false_umin = opcode == BPF_JLT ? val : val + 1;
 		u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
 
+		if (is_jmp32) {
+			false_umin += gen_hi_min(false_reg->var_off);
+			true_umax += gen_hi_max(true_reg->var_off);
+		}
 		false_reg->umin_value = max(false_reg->umin_value, false_umin);
 		true_reg->umax_value = min(true_reg->umax_value, true_umax);
 		break;
@@ -4453,6 +4533,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 		s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1;
 		s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
 
+		if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+			break;
 		false_reg->smin_value = max(false_reg->smin_value, false_smin);
 		true_reg->smax_value = min(true_reg->smax_value, true_smax);
 		break;
@@ -4479,14 +4561,15 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
  */
 static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 				struct bpf_reg_state *false_reg, u64 val,
-				u8 opcode)
+				u8 opcode, bool is_jmp32)
 {
 	s64 sval;
 
 	if (__is_pointer_value(false, false_reg))
 		return;
 
-	sval = (s64)val;
+	val = is_jmp32 ? (u32)val : val;
+	sval = is_jmp32 ? (s64)(s32)val : (s64)val;
 
 	switch (opcode) {
 	case BPF_JEQ:
@@ -4495,7 +4578,15 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 		struct bpf_reg_state *reg =
 			opcode == BPF_JEQ ? true_reg : false_reg;
 
-		__mark_reg_known(reg, val);
+		if (is_jmp32) {
+			u64 old_v = reg->var_off.value;
+			u64 hi_mask = ~0xffffffffULL;
+
+			reg->var_off.value = (old_v & hi_mask) | val;
+			reg->var_off.mask &= hi_mask;
+		} else {
+			__mark_reg_known(reg, val);
+		}
 		break;
 	}
 	case BPF_JSET:
@@ -4511,6 +4602,10 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 		u64 false_umin = opcode == BPF_JGT ? val : val + 1;
 		u64 true_umax = opcode == BPF_JGT ? val - 1 : val;
 
+		if (is_jmp32) {
+			false_umin += gen_hi_min(false_reg->var_off);
+			true_umax += gen_hi_max(true_reg->var_off);
+		}
 		false_reg->umin_value = max(false_reg->umin_value, false_umin);
 		true_reg->umax_value = min(true_reg->umax_value, true_umax);
 		break;
@@ -4521,6 +4616,8 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 		s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1;
 		s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval;
+		if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+			break;
 		false_reg->smin_value = max(false_reg->smin_value, false_smin);
 		true_reg->smax_value = min(true_reg->smax_value, true_smax);
 		break;
@@ -4531,6 +4628,10 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 		u64 false_umax = opcode == BPF_JLT ? val : val - 1;
 		u64 true_umin = opcode == BPF_JLT ? val + 1 : val;
 
+		if (is_jmp32) {
+			false_umax += gen_hi_max(false_reg->var_off);
+			true_umin += gen_hi_min(true_reg->var_off);
+		}
 		false_reg->umax_value = min(false_reg->umax_value, false_umax);
 		true_reg->umin_value = max(true_reg->umin_value, true_umin);
 		break;
@@ -4541,6 +4642,8 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 		s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1;
 		s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval;
 
+		if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+			break;
 		false_reg->smax_value = min(false_reg->smax_value, false_smax);
 		true_reg->smin_value = max(true_reg->smin_value, true_smin);
 		break;
@@ -4682,6 +4785,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
 	if (BPF_SRC(insn->code) != BPF_X)
 		return false;
 
+	/* Pointers are always 64-bit. */
+	if (BPF_CLASS(insn->code) == BPF_JMP32)
+		return false;
+
 	switch (BPF_OP(insn->code)) {
 	case BPF_JGT:
 		if ((dst_reg->type == PTR_TO_PACKET &&
@@ -4774,17 +4881,19 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
 	struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
 	u8 opcode = BPF_OP(insn->code);
+	bool is_jmp32;
 	int pred = -1;
 	int err;
 
-	if (opcode > BPF_JSLE) {
-		verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
+	/* Only conditional jumps are expected to reach here. */
+	if (opcode == BPF_JA || opcode > BPF_JSLE) {
+		verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
 		return -EINVAL;
 	}
 
 	if (BPF_SRC(insn->code) == BPF_X) {
 		if (insn->imm != 0) {
-			verbose(env, "BPF_JMP uses reserved fields\n");
+			verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
 			return -EINVAL;
 		}
 
@@ -4801,7 +4910,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		src_reg = &regs[insn->src_reg];
 	} else {
 		if (insn->src_reg != BPF_REG_0) {
-			verbose(env, "BPF_JMP uses reserved fields\n");
+			verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
 			return -EINVAL;
 		}
 	}
@@ -4812,13 +4921,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		return err;
 
 	dst_reg = &regs[insn->dst_reg];
+	is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
 
 	if (BPF_SRC(insn->code) == BPF_K)
-		pred = is_branch_taken(dst_reg, insn->imm, opcode);
+		pred = is_branch_taken(dst_reg, insn->imm,
+				       opcode, is_jmp32);
 	else if (src_reg->type == SCALAR_VALUE &&
 		 tnum_is_const(src_reg->var_off))
 		pred = is_branch_taken(dst_reg, src_reg->var_off.value,
-				       opcode);
+				       opcode, is_jmp32);
 
 	if (pred == 1) {
 		/* Only follow the goto, ignore fall-through. If needed, push
@@ -4858,30 +4969,51 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	 * comparable.
*/ if (BPF_SRC(insn->code) == BPF_X) { + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state lo_reg0 = *dst_reg; + struct bpf_reg_state lo_reg1 = *src_reg; + struct bpf_reg_state *src_lo, *dst_lo; + + dst_lo = &lo_reg0; + src_lo = &lo_reg1; + coerce_reg_to_size(dst_lo, 4); + coerce_reg_to_size(src_lo, 4); + if (dst_reg->type == SCALAR_VALUE && - regs[insn->src_reg].type == SCALAR_VALUE) { - if (tnum_is_const(regs[insn->src_reg].var_off)) + src_reg->type == SCALAR_VALUE) { + if (tnum_is_const(src_reg->var_off) || + (is_jmp32 && tnum_is_const(src_lo->var_off))) reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, regs[insn->src_reg].var_off.value, - opcode); - else if (tnum_is_const(dst_reg->var_off)) + dst_reg, + is_jmp32 + ? src_lo->var_off.value + : src_reg->var_off.value, + opcode, is_jmp32); + else if (tnum_is_const(dst_reg->var_off) || + (is_jmp32 && tnum_is_const(dst_lo->var_off))) reg_set_min_max_inv(&other_branch_regs[insn->src_reg], - ®s[insn->src_reg], - dst_reg->var_off.value, opcode); - else if (opcode == BPF_JEQ || opcode == BPF_JNE) + src_reg, + is_jmp32 + ? dst_lo->var_off.value + : dst_reg->var_off.value, + opcode, is_jmp32); + else if (!is_jmp32 && + (opcode == BPF_JEQ || opcode == BPF_JNE)) /* Comparing for equality, we can combine knowledge */ reg_combine_min_max(&other_branch_regs[insn->src_reg], &other_branch_regs[insn->dst_reg], - ®s[insn->src_reg], - ®s[insn->dst_reg], opcode); + src_reg, dst_reg, opcode); } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, insn->imm, opcode); + dst_reg, insn->imm, opcode, is_jmp32); } - /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ - if (BPF_SRC(insn->code) == BPF_K && + /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). + * NOTE: these optimizations below are related with pointer comparison + * which will never be JMP32. 
+	 */
+	if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
 	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
 	    reg_type_may_be_null(dst_reg->type)) {
 		/* Mark all identical registers in each branch as either
@@ -5219,7 +5351,8 @@ peek_stack:
 		goto check_state;
 	t = insn_stack[cur_stack - 1];
-	if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+	if (BPF_CLASS(insns[t].code) == BPF_JMP ||
+	    BPF_CLASS(insns[t].code) == BPF_JMP32) {
 		u8 opcode = BPF_OP(insns[t].code);
 		if (opcode == BPF_EXIT) {
@@ -6360,7 +6493,7 @@ static int do_check(struct bpf_verifier_env *env)
 			if (err)
 				return err;
-		} else if (class == BPF_JMP) {
+		} else if (class == BPF_JMP || class == BPF_JMP32) {
 			u8 opcode = BPF_OP(insn->code);
 			if (opcode == BPF_CALL) {
@@ -6368,7 +6501,8 @@ static int do_check(struct bpf_verifier_env *env)
 				    insn->off != 0 ||
 				    (insn->src_reg != BPF_REG_0 &&
 				     insn->src_reg != BPF_PSEUDO_CALL) ||
-				    insn->dst_reg != BPF_REG_0) {
+				    insn->dst_reg != BPF_REG_0 ||
+				    class == BPF_JMP32) {
 					verbose(env, "BPF_CALL uses reserved fields\n");
 					return -EINVAL;
 				}
@@ -6384,7 +6518,8 @@ static int do_check(struct bpf_verifier_env *env)
 				if (BPF_SRC(insn->code) != BPF_K ||
 				    insn->imm != 0 ||
 				    insn->src_reg != BPF_REG_0 ||
-				    insn->dst_reg != BPF_REG_0) {
+				    insn->dst_reg != BPF_REG_0 ||
+				    class == BPF_JMP32) {
 					verbose(env, "BPF_JA uses reserved fields\n");
 					return -EINVAL;
 				}
@@ -6396,7 +6531,8 @@ static int do_check(struct bpf_verifier_env *env)
 				if (BPF_SRC(insn->code) != BPF_K ||
 				    insn->imm != 0 ||
 				    insn->src_reg != BPF_REG_0 ||
-				    insn->dst_reg != BPF_REG_0) {
+				    insn->dst_reg != BPF_REG_0 ||
+				    class == BPF_JMP32) {
 					verbose(env, "BPF_EXIT uses reserved fields\n");
 					return -EINVAL;
 				}
@@ -6916,6 +7052,9 @@ static bool insn_is_cond_jump(u8 code)
 {
 	u8 op;
+	if (BPF_CLASS(code) == BPF_JMP32)
+		return true;
+
 	if (BPF_CLASS(code) != BPF_JMP)
 		return false;

From e8c280586cc7d3f08763d116f85ab92e5b6a856c Mon Sep 17 00:00:00 2001
From: Jiong Wang
Date: Sat, 26 Jan 2019 12:26:02 -0500
Subject: [PATCH 1030/1640] UPSTREAM: bpf: disassembler support JMP32

This patch teaches the disassembler about JMP32. There are two places
to update:

- Class 0x6 is now used by BPF_JMP32, not "unused".
- BPF_JMP32 needs to show comparison operands properly.

The disassembly format adds an extra "(32)" before the operands if the
comparison is on a sub-register. A better disassembly format for both
JMP32 and ALU32 just shows the register prefix as "w" instead of "r";
this is the format used by the LLVM assembler.

Reviewed-by: Jakub Kicinski
Change-Id: I74c6c319c6468581fcbebb427aa361f7704866c7
Signed-off-by: Jiong Wang
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/disasm.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index cbd75dd5992e..3016372d01c1 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -67,7 +67,7 @@ const char *const bpf_class_string[8] = {
 	[BPF_STX] = "stx",
 	[BPF_ALU] = "alu",
 	[BPF_JMP] = "jmp",
-	[BPF_RET] = "BUG",
+	[BPF_JMP32] = "jmp32",
 	[BPF_ALU64] = "alu64",
 };
@@ -136,23 +136,22 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 		else
 			print_bpf_end_insn(verbose, cbs->private_data, insn);
 	} else if (BPF_OP(insn->code) == BPF_NEG) {
-		verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n",
-			insn->code, insn->dst_reg,
-			class == BPF_ALU ? "(u32) " : "",
+		verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n",
+			insn->code, class == BPF_ALU ? 'w' : 'r',
+			insn->dst_reg, class == BPF_ALU ?
'w' : 'r', insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", + class == BPF_ALU ? 'w' : 'r', insn->src_reg); } else { - verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d %s %d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", insn->imm); } } else if (class == BPF_STX) { @@ -222,7 +221,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } - } else if (class == BPF_JMP) { + } else if (class == BPF_JMP32 || class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { @@ -246,13 +245,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_EXIT)) { verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n", - insn->code, insn->dst_reg, + verbose(cbs->private_data, + "(%02x) if %c%d %s %c%d goto pc%+d\n", + insn->code, class == BPF_JMP32 ? 'w' : 'r', + insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], + class == BPF_JMP32 ? 'w' : 'r', insn->src_reg, insn->off); } else { - verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n", - insn->code, insn->dst_reg, + verbose(cbs->private_data, + "(%02x) if %c%d %s 0x%x goto pc%+d\n", + insn->code, class == BPF_JMP32 ? 'w' : 'r', + insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } From 102fabddf7f32cf299dd2a8528b69668c452d7e0 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Sat, 26 Jan 2019 12:26:04 -0500 Subject: [PATCH 1031/1640] BACKPORT: bpf: interpreter support for JMP32 This patch implements interpreting new JMP32 instructions. Reviewed-by: Jakub Kicinski Change-Id: I1b193530b95332afeec324ea3898a475f90881c0 Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 197 +++++++++++++++------------------------------- 1 file changed, 63 insertions(+), 134 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 776a96ef93f9..de01c8034299 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1164,6 +1164,31 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_2(JMP, CALL), \ /* Exit instruction. */ \ INSN_2(JMP, EXIT), \ + /* 32-bit Jump instructions. */ \ + /* Register based. */ \ + INSN_3(JMP32, JEQ, X), \ + INSN_3(JMP32, JNE, X), \ + INSN_3(JMP32, JGT, X), \ + INSN_3(JMP32, JLT, X), \ + INSN_3(JMP32, JGE, X), \ + INSN_3(JMP32, JLE, X), \ + INSN_3(JMP32, JSGT, X), \ + INSN_3(JMP32, JSLT, X), \ + INSN_3(JMP32, JSGE, X), \ + INSN_3(JMP32, JSLE, X), \ + INSN_3(JMP32, JSET, X), \ + /* Immediate based. */ \ + INSN_3(JMP32, JEQ, K), \ + INSN_3(JMP32, JNE, K), \ + INSN_3(JMP32, JGT, K), \ + INSN_3(JMP32, JLT, K), \ + INSN_3(JMP32, JGE, K), \ + INSN_3(JMP32, JLE, K), \ + INSN_3(JMP32, JSGT, K), \ + INSN_3(JMP32, JSLT, K), \ + INSN_3(JMP32, JSGE, K), \ + INSN_3(JMP32, JSLE, K), \ + INSN_3(JMP32, JSET, K), \ /* Jump instructions. */ \ /* Register based. 
 */ \
	INSN_3(JMP, JEQ, X),			\
@@ -1426,145 +1451,49 @@ select_insn:
 out:
 		CONT;
 	}
-	/* JMP */
 	JMP_JA:
 		insn += insn->off;
 		CONT;
-	JMP_JEQ_X:
-		if (DST == SRC) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JEQ_K:
-		if (DST == IMM) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JNE_X:
-		if (DST != SRC) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JNE_K:
-		if (DST != IMM) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JGT_X:
-		if (DST > SRC) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JGT_K:
-		if (DST > IMM) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JLT_X:
-		if (DST < SRC) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JLT_K:
-		if (DST < IMM) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JGE_X:
-		if (DST >= SRC) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JGE_K:
-		if (DST >= IMM) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JLE_X:
-		if (DST <= SRC) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JLE_K:
-		if (DST <= IMM) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JSGT_X:
-		if (((s64) DST) > ((s64) SRC)) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JSGT_K:
-		if (((s64) DST) > ((s64) IMM)) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JSLT_X:
-		if (((s64) DST) < ((s64) SRC)) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JSLT_K:
-		if (((s64) DST) < ((s64) IMM)) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JSGE_X:
-		if (((s64) DST) >= ((s64) SRC)) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JSGE_K:
-		if (((s64) DST) >= ((s64) IMM)) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JSLE_X:
-		if (((s64) DST) <= ((s64) SRC)) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JSLE_K:
-		if (((s64) DST) <= ((s64) IMM)) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JSET_X:
-		if (DST & SRC) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
-	JMP_JSET_K:
-		if (DST & IMM) {
-			insn += insn->off;
-			CONT_JMP;
-		}
-		CONT;
 	JMP_EXIT:
 		return BPF_R0;
-
+	/* JMP */
+#define COND_JMP(SIGN, OPCODE, CMP_OP)				\
+	JMP_##OPCODE##_X:					\
+		if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) {	\
+			insn += insn->off;			\
+			CONT_JMP;				\
+		}						\
+		CONT;						\
+	JMP32_##OPCODE##_X:					\
+		if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) {	\
+			insn += insn->off;			\
+			CONT_JMP;				\
+		}						\
+		CONT;						\
+	JMP_##OPCODE##_K:					\
+		if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) {	\
+			insn += insn->off;			\
+			CONT_JMP;				\
+		}						\
+		CONT;						\
+	JMP32_##OPCODE##_K:					\
+		if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) {	\
+			insn += insn->off;			\
+			CONT_JMP;				\
+		}						\
+		CONT;
+	COND_JMP(u, JEQ, ==)
+	COND_JMP(u, JNE, !=)
+	COND_JMP(u, JGT, >)
+	COND_JMP(u, JLT, <)
+	COND_JMP(u, JGE, >=)
+	COND_JMP(u, JLE, <=)
+	COND_JMP(u, JSET, &)
+	COND_JMP(s, JSGT, >)
+	COND_JMP(s, JSLT, <)
+	COND_JMP(s, JSGE, >=)
+	COND_JMP(s, JSLE, <=)
+#undef COND_JMP
 	/* ST, STX and LDX*/
 	ST_NOSPEC:
 		/* Speculation barrier for mitigating Speculative Store Bypass.

From 890a23f3becb66f004035a7d9a876abdaf2466de Mon Sep 17 00:00:00 2001
From: Jiong Wang
Date: Sat, 26 Jan 2019 12:26:05 -0500
Subject: [PATCH 1032/1640] UPSTREAM: bpf: JIT blinds support JMP32

This patch adds JIT blinding support for JMP32.

Like BPF_JMP_REG/IMM, JMP32 versions are needed for building raw bpf
insns. They are added to both include/linux/filter.h and
tools/include/linux/filter.h.
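For illustration only (this snippet is not part of the patch): building
a raw 32-bit conditional jump with the new macro mirrors the 64-bit
variant, e.g. for "if ((u32)r1 == 7) goto pc+2":

	struct bpf_insn insn = BPF_JMP32_IMM(BPF_JEQ, BPF_REG_1, 7, 2);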
Reviewed-by: Jakub Kicinski Change-Id: I86ee0692437ff685811dadfc7bd911ca58aeb4bf Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 20 ++++++++++++++++++++ kernel/bpf/core.c | 21 +++++++++++++++++++++ tools/include/linux/filter.h | 20 ++++++++++++++++++++ 3 files changed, 61 insertions(+) diff --git a/include/linux/filter.h b/include/linux/filter.h index 9140dc4fe919..b8219d28a1c4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -306,6 +306,26 @@ struct sock_reuseport; .off = OFF, \ .imm = IMM }) +/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + /* Unconditional jumps, goto pc + off16 */ #define BPF_JMP_A(OFF) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index de01c8034299..82e6a6786a47 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -968,6 +968,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; + case BPF_JMP32 | BPF_JEQ | BPF_K: + case BPF_JMP32 | BPF_JNE | BPF_K: + case BPF_JMP32 | BPF_JGT | BPF_K: + case BPF_JMP32 | BPF_JLT | BPF_K: + case BPF_JMP32 | BPF_JGE | BPF_K: + case BPF_JMP32 | BPF_JLE | BPF_K: + case BPF_JMP32 | BPF_JSGT | BPF_K: + case BPF_JMP32 | BPF_JSLT | BPF_K: + case BPF_JMP32 | BPF_JSGE | BPF_K: + case BPF_JMP32 | BPF_JSLE | BPF_K: + case BPF_JMP32 | BPF_JSET | BPF_K: + /* Accommodate for extra offset in case of a backjump. */ + off = from->off; + if (off < 0) + off -= 2; + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, + off); + break; + case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index c5e512da8d8a..6beceae2dfc2 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -199,6 +199,16 @@ .off = OFF, \ .imm = 0 }) +/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ @@ -209,6 +219,16 @@ .off = OFF, \ .imm = IMM }) +/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + /* Unconditional jumps, goto pc + off16 */ #define BPF_JMP_A(OFF) \ From 9b483eac0470d5f527834d012b86bf06f59611de Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 29 Jan 2019 16:38:16 -0800 Subject: [PATCH 1033/1640] UPSTREAM: bpf: btf: allow typedef func_proto Current implementation does not allow typedef func_proto. 
But it is actually allowed.

-bash-4.4$ cat t.c
typedef int (f) (int);
f *g;
-bash-4.4$ clang -O2 -g -c -target bpf t.c -Xclang -target-feature -Xclang +dwarfris
-bash-4.4$ pahole -JV t.o
File t.o:
[1] PTR (anon) type_id=2
[2] TYPEDEF f type_id=3
[3] FUNC_PROTO (anon) return=4 args=(4 (anon))
[4] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED
-bash-4.4$

This patch relaxes the btf verifier to allow such (typedef func_proto)
patterns.

Fixes: 2667a2626f4d ("bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO")
Acked-by: Martin KaFai Lau
Change-Id: Ie01a95a742e55651541103785210a45ff7a90420
Signed-off-by: Yonghong Song
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/btf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ab01c1615c1d..c3bfa206bcc8 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1525,7 +1525,8 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
 	/* "typedef void new_void", "const void"...etc */
 	if (!btf_type_is_void(next_type) &&
-	    !btf_type_is_fwd(next_type)) {
+	    !btf_type_is_fwd(next_type) &&
+	    !btf_type_is_func_proto(next_type)) {
 		btf_verifier_log_type(env, v->t, "Invalid type_id");
 		return -EINVAL;
 	}

From b151f81d99a368c240faa59edd2501d19b3ea3fe Mon Sep 17 00:00:00 2001
From: Valdis Kletnieks
Date: Mon, 28 Jan 2019 23:04:46 -0500
Subject: [PATCH 1034/1640] UPSTREAM: bpf: fix bitrotted kerneldoc

Over the years, the function signature has changed, but the kerneldoc
block hasn't.

Change-Id: I48265c82a3cdb094a228491ece952bcbb740f16d
Signed-off-by: Valdis Kletnieks
Acked-by: Song Liu
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 82e6a6786a47..3399f696edea 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1282,8 +1282,9 @@ bool bpf_opcode_in_insntable(u8 code)
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 /**
  *	__bpf_prog_run - run eBPF program on a given context
- *	@ctx: is the data we are operating on
+ *	@regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
  *	@insn: is the array of eBPF instructions
+ *	@stack: is the eBPF storage stack
  *
  * Decode and execute eBPF instructions.
  */

From 74c041b81981156d815ebdebc8b15efba407fd84 Mon Sep 17 00:00:00 2001
From: Valdis Kletnieks
Date: Tue, 29 Jan 2019 01:47:06 -0500
Subject: [PATCH 1035/1640] UPSTREAM: bpf, cgroups: clean up kerneldoc warnings

Building with W=1 reveals some bitrot:

  CC kernel/bpf/cgroup.o
kernel/bpf/cgroup.c:238: warning: Function parameter or member 'flags' not described in '__cgroup_bpf_attach'
kernel/bpf/cgroup.c:367: warning: Function parameter or member 'unused_flags' not described in '__cgroup_bpf_detach'

Add a kerneldoc line for 'flags'. Fixing the warning for 'unused_flags'
is best approached by removing the unused parameter on the function
call.
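For reference, such warnings can be reproduced by building the single
object with extra warnings enabled, e.g.:

	$ make W=1 kernel/bpf/cgroup.o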
Change-Id: I5bcde82bd6bddb5cc1c1c58867070c6025e5ab8e Signed-off-by: Valdis Kletnieks Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 2 +- kernel/bpf/cgroup.c | 3 ++- kernel/cgroup/cgroup.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 96f5ae7b8f41..eb3266d39762 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -78,7 +78,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp); int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags); + enum bpf_attach_type type); int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ab612fe9862f..d78cfec5807d 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -230,6 +230,7 @@ cleanup: * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach * @type: Type of attach operation + * @flags: Option flags * * Must be called with cgroup_mutex held. */ @@ -363,7 +364,7 @@ cleanup: * Must be called with cgroup_mutex held. */ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 unused_flags) + enum bpf_attach_type type) { struct list_head *progs = &cgrp->bpf.progs[type]; enum bpf_cgroup_storage_type stype; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 604773a53d8b..97d6ee5b2f65 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6121,7 +6121,7 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_detach(cgrp, prog, type, flags); + ret = __cgroup_bpf_detach(cgrp, prog, type); mutex_unlock(&cgroup_mutex); return ret; } From 4df6ea73fac4c53b2afb003ffcbbf12d84fb36c7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 28 Jan 2019 18:43:34 -0800 Subject: [PATCH 1036/1640] UPSTREAM: bpf: run bpf programs with preemption disabled Disabled preemption is necessary for proper access to per-cpu maps from BPF programs. But the sender side of socket filters didn't have preemption disabled: unix_dgram_sendmsg->sk_filter->sk_filter_trim_cap->bpf_prog_run_save_cb->BPF_PROG_RUN and a combination of af_packet with tun device didn't disable either: tpacket_snd->packet_direct_xmit->packet_pick_tx_queue->ndo_select_queue-> tun_select_queue->tun_ebpf_select_queue->bpf_prog_run_clear_cb->BPF_PROG_RUN Disable preemption before executing BPF programs (both classic and extended). 
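In short, the fix wraps each program invocation like this (sketch; the
actual helpers are in the diff below):

	preempt_disable();
	res = BPF_PROG_RUN(prog, skb);
	preempt_enable();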
Reported-by: Jann Horn
Change-Id: Ib7bee5270af08dc887608ba7e67ef97426a3c554
Signed-off-by: Alexei Starovoitov
Acked-by: Song Liu
Signed-off-by: Daniel Borkmann
---
 include/linux/filter.h | 21 ++++++++++++++++++---
 kernel/bpf/cgroup.c    |  2 +-
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index b8219d28a1c4..069a13b39eb9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -692,8 +692,8 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb)
 	return qdisc_skb_cb(skb)->data;
 }

-static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
-				       struct sk_buff *skb)
+static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
+					 struct sk_buff *skb)
 {
 	u8 *cb_data = bpf_skb_cb(skb);
 	u8 cb_saved[BPF_SKB_CB_LEN];
@@ -712,15 +712,30 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
 	return res;
 }

+static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
+				       struct sk_buff *skb)
+{
+	u32 res;
+
+	preempt_disable();
+	res = __bpf_prog_run_save_cb(prog, skb);
+	preempt_enable();
+	return res;
+}
+
 static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
 					struct sk_buff *skb)
 {
 	u8 *cb_data = bpf_skb_cb(skb);
+	u32 res;

 	if (unlikely(prog->cb_access))
 		memset(cb_data, 0, BPF_SKB_CB_LEN);

-	return BPF_PROG_RUN(prog, skb);
+	preempt_disable();
+	res = BPF_PROG_RUN(prog, skb);
+	preempt_enable();
+	return res;
 }

 static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index d78cfec5807d..4e807973aa80 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -573,7 +573,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	bpf_compute_and_save_data_end(skb, &saved_data_end);

 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
-				 bpf_prog_run_save_cb);
+				 __bpf_prog_run_save_cb);
 	bpf_restore_data_end(skb, saved_data_end);
 	__skb_pull(skb, offset);
 	skb->sk = save_sk;

From eb0dfde54013ec9071cfb5eccd4de78f830a2204 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Thu, 31 Jan 2019 15:40:04 -0800
Subject: [PATCH 1037/1640] BACKPORT: bpf: introduce bpf_spin_lock

Introduce 'struct bpf_spin_lock' and bpf_spin_lock/unlock() helpers
to let a bpf program serialize access to other variables.

Example:
   struct hash_elem {
      int cnt;
      struct bpf_spin_lock lock;
   };
   struct hash_elem *val = bpf_map_lookup_elem(&hash_map, &key);
   if (val) {
      bpf_spin_lock(&val->lock);
      val->cnt++;
      bpf_spin_unlock(&val->lock);
   }

Restrictions and safety checks:
- bpf_spin_lock is only allowed inside HASH and ARRAY maps.
- BTF description of the map is mandatory for safety analysis.
- bpf program can take one bpf_spin_lock at a time, since two or more
  can cause deadlocks.
- only one 'struct bpf_spin_lock' is allowed per map element.
  It drastically simplifies implementation yet allows bpf program
  to use any number of bpf_spin_locks.
- when bpf_spin_lock is taken the calls (either bpf2bpf or helpers)
  are not allowed.
- bpf program must bpf_spin_unlock() before return.
- bpf program can access 'struct bpf_spin_lock' only via
  bpf_spin_lock()/bpf_spin_unlock() helpers.
- load/store into 'struct bpf_spin_lock lock;' field is not allowed.
- to use bpf_spin_lock() helper the BTF description of map value must
  be a struct and have 'struct bpf_spin_lock anyname;' field at the
  top level. Nested lock inside another struct is not allowed.
- syscall map_lookup doesn't copy bpf_spin_lock field to user space.
- syscall map_update and program map_update do not update bpf_spin_lock field. - bpf_spin_lock cannot be on the stack or inside networking packet. bpf_spin_lock can only be inside HASH or ARRAY map value. - bpf_spin_lock is available to root only and to all program types. - bpf_spin_lock is not allowed in inner maps of map-in-map. - ld_abs is not allowed inside spin_lock-ed region. - tracing progs and socket filter progs cannot use bpf_spin_lock due to insufficient preemption checks Implementation details: - cgroup-bpf class of programs can nest with xdp/tc programs. Hence bpf_spin_lock is equivalent to spin_lock_irqsave. Other solutions to avoid nested bpf_spin_lock are possible. Like making sure that all networking progs run with softirq disabled. spin_lock_irqsave is the simplest and doesn't add overhead to the programs that don't use it. - arch_spinlock_t is used when its implemented as queued_spin_lock - archs can force their own arch_spinlock_t - on architectures where queued_spin_lock is not available and sizeof(arch_spinlock_t) != sizeof(__u32) trivial lock is used. - presence of bpf_spin_lock inside map value could have been indicated via extra flag during map_create, but specifying it via BTF is cleaner. It provides introspection for map key/value and reduces user mistakes. Next steps: - allow bpf_spin_lock in other map types (like cgroup local storage) - introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper to request kernel to grab bpf_spin_lock before rewriting the value. That will serialize access to map elements. Acked-by: Peter Zijlstra (Intel) Change-Id: Id03322189a8f05c006a05479f7078b23c8c020ea Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 37 +++++++- include/linux/bpf_verifier.h | 1 + include/linux/btf.h | 1 + include/uapi/linux/bpf.h | 3 + kernel/Kconfig.locks | 3 + kernel/bpf/arraymap.c | 7 +- kernel/bpf/btf.c | 42 +++++++++ kernel/bpf/core.c | 2 + kernel/bpf/hashtab.c | 21 +++-- kernel/bpf/helpers.c | 80 +++++++++++++++++ kernel/bpf/map_in_map.c | 5 ++ kernel/bpf/syscall.c | 21 ++++- kernel/bpf/verifier.c | 169 ++++++++++++++++++++++++++++++++++- net/core/filter.c | 16 +++- 14 files changed, 383 insertions(+), 25 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7e4fb56fadac..d2fac19b51c7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -77,14 +77,15 @@ struct bpf_map { u32 value_size; u32 max_entries; u32 map_flags; - u32 pages; + int spin_lock_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; struct btf *btf; + u32 pages; bool unpriv_array; - /* 55 bytes hole */ + /* 51 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. 
@@ -96,6 +97,34 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; }; +static inline bool map_value_has_spin_lock(const struct bpf_map *map) +{ + return map->spin_lock_off >= 0; +} + +static inline void check_and_init_map_lock(struct bpf_map *map, void *dst) +{ + if (likely(!map_value_has_spin_lock(map))) + return; + *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = + (struct bpf_spin_lock){}; +} + +/* copy everything but bpf_spin_lock */ +static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +{ + if (unlikely(map_value_has_spin_lock(map))) { + u32 off = map->spin_lock_off; + + memcpy(dst, src, off); + memcpy(dst + off + sizeof(struct bpf_spin_lock), + src + off + sizeof(struct bpf_spin_lock), + map->value_size - off - sizeof(struct bpf_spin_lock)); + } else { + memcpy(dst, src, map->value_size); + } +} + struct bpf_offload_dev; struct bpf_offloaded_map; @@ -167,6 +196,7 @@ enum bpf_arg_type { ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock */ + ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ }; /* type of values returned from helper functions */ @@ -896,7 +926,8 @@ extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; extern const struct bpf_func_proto bpf_sk_redirect_map_proto; - +extern const struct bpf_func_proto bpf_spin_lock_proto; +extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; /* Shared helpers among cBPF and eBPF. */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d60320c6e1f5..33fea3502118 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -155,6 +155,7 @@ struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; u32 curframe; + u32 active_spin_lock; bool speculative; }; diff --git a/include/linux/btf.h b/include/linux/btf.h index 12502e25e767..455d31b55828 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -50,6 +50,7 @@ u32 btf_id(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); +int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c5eb9df531c9..296c0b29bae4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3035,4 +3035,7 @@ struct bpf_line_info { __u32 line_col; }; +struct bpf_spin_lock { + __u32 val; +}; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 621c296fe8f8..c4c8062429a3 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS def_bool y if ARCH_USE_QUEUED_SPINLOCKS depends on SMP +config BPF_ARCH_SPINLOCK + bool + config ARCH_USE_QUEUED_RWLOCKS bool diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 73ee1cbf0604..e971ee47c87f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -273,9 +273,10 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); else - memcpy(array->value 
+ - array->elem_size * (index & array->index_mask), - value, map->value_size); + copy_map_value(map, + array->value + + array->elem_size * (index & array->index_mask), + value); return 0; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c3bfa206bcc8..f0c2e33203e8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t) return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; } +static bool __btf_type_is_struct(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; +} + static bool btf_type_is_array(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; @@ -2046,6 +2051,43 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +/* find 'struct bpf_spin_lock' in map value. + * return >= 0 offset if found + * and < 0 in case of error + */ +int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +{ + const struct btf_member *member; + u32 i, off = -ENOENT; + + if (!__btf_type_is_struct(t)) + return -EINVAL; + + for_each_member(i, t, member) { + const struct btf_type *member_type = btf_type_by_id(btf, + member->type); + if (!__btf_type_is_struct(member_type)) + continue; + if (member_type->size != sizeof(struct bpf_spin_lock)) + continue; + if (strcmp(__btf_name_by_offset(btf, member_type->name_off), + "bpf_spin_lock")) + continue; + if (off != -ENOENT) + /* only one 'struct bpf_spin_lock' is allowed */ + return -E2BIG; + off = btf_member_bit_offset(t, member); + if (off % 8) + /* valid C code cannot generate such BTF */ + return -EINVAL; + off /= 8; + if (off % __alignof__(struct bpf_spin_lock)) + /* valid struct bpf_spin_lock will be 4 byte aligned */ + return -EINVAL; + } + return off; +} + static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3399f696edea..be30c4be3eb0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2035,6 +2035,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +const struct bpf_func_proto bpf_spin_lock_proto __weak; +const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index de1748d9eab0..731e26f6c281 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -734,21 +734,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) BITS_PER_LONG == 64; } -static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) -{ - u32 size = htab->map.value_size; - - if (percpu || fd_htab_map_needs_adjust(htab)) - size = round_up(size, 8); - return size; -} - static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { - u32 size = htab_size_value(htab, percpu); + u32 size = htab->map.value_size; bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; @@ -787,10 +778,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = 
ERR_PTR(-ENOMEM); goto dec_count; } + check_and_init_map_lock(&htab->map, + l_new->key + round_up(key_size, 8)); } memcpy(l_new->key, key, key_size); if (percpu) { + size = round_up(size, 8); if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { @@ -808,8 +802,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); - } else { + } else if (fd_htab_map_needs_adjust(htab)) { + size = round_up(size, 8); memcpy(l_new->key + round_up(key_size, 8), value, size); + } else { + copy_map_value(&htab->map, + l_new->key + round_up(key_size, 8), + value); } l_new->hash = hash; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 769f3377828d..fd6b192c3fb7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -233,6 +233,86 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .arg2_type = ARG_CONST_SIZE, }; +#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) + +static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +{ + arch_spinlock_t *l = (void *)lock; + union { + __u32 val; + arch_spinlock_t lock; + } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; + + compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); + BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); + BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); + arch_spin_lock(l); +} + +static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +{ + arch_spinlock_t *l = (void *)lock; + + arch_spin_unlock(l); +} + +#else + +static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +{ + atomic_t *l = (void *)lock; + + BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); + do { + atomic_cond_read_relaxed(l, !VAL); + } while (atomic_xchg(l, 1)); +} + +static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +{ + atomic_t *l = (void *)lock; + + atomic_set_release(l, 0); +} + +#endif + +static DEFINE_PER_CPU(unsigned long, irqsave_flags); + +notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +{ + unsigned long flags; + + local_irq_save(flags); + __bpf_spin_lock(lock); + __this_cpu_write(irqsave_flags, flags); + return 0; +} + +const struct bpf_func_proto bpf_spin_lock_proto = { + .func = bpf_spin_lock, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_SPIN_LOCK, +}; + +notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +{ + unsigned long flags; + + flags = __this_cpu_read(irqsave_flags); + __bpf_spin_unlock(lock); + local_irq_restore(flags); + return 0; +} + +const struct bpf_func_proto bpf_spin_unlock_proto = { + .func = bpf_spin_unlock, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_SPIN_LOCK, +}; + #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index c65fdc06600c..2cc43f053771 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-EINVAL); } + if (map_value_has_spin_lock(inner_map)) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. 
*/ if (inner_map->ops == &array_map_ops) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4dd510bb6ef3..5eb835e3bcbf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -464,7 +464,7 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_check_btf(const struct bpf_map *map, const struct btf *btf, +static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { const struct btf_type *key_type, *value_type; @@ -479,6 +479,21 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf, if (!value_type || value_size != map->value_size) return -EINVAL; + map->spin_lock_off = btf_find_spin_lock(btf, value_type); + + if (map_value_has_spin_lock(map)) { + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) + return -ENOTSUPP; + if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > + map->value_size) { + WARN_ONCE(1, + "verifier bug spin_lock_off %d value_size %d\n", + map->spin_lock_off, map->value_size); + return -EFAULT; + } + } + if (map->ops->map_check_btf) ret = map->ops->map_check_btf(map, btf, key_type, value_type); @@ -543,6 +558,8 @@ static int map_create(union bpf_attr *attr) map->btf = btf; map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; + } else { + map->spin_lock_off = -EINVAL; } err = security_bpf_map_alloc(map); @@ -750,7 +767,7 @@ static int map_lookup_elem(union bpf_attr *attr) err = -ENOENT; } else { err = 0; - memcpy(value, ptr, value_size); + copy_map_value(map, value, ptr); } rcu_read_unlock(); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 850963da3ea9..5cb631168409 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -212,6 +212,7 @@ struct bpf_call_arg_meta { int access_size; u64 msize_max_value; int ptr_id; + int func_id; }; static DEFINE_MUTEX(bpf_verifier_lock); @@ -350,6 +351,12 @@ static bool reg_is_refcounted(const struct bpf_reg_state *reg) return type_is_refcounted(reg->type); } +static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) +{ + return reg->type == PTR_TO_MAP_VALUE && + map_value_has_spin_lock(reg->map_ptr); +} + static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) { return type_is_refcounted_or_null(reg->type); @@ -711,6 +718,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; + dst_state->active_spin_lock = src->active_spin_lock; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -1492,6 +1500,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, if (err) verbose(env, "R%d max value is outside of the array range\n", regno); + + if (map_value_has_spin_lock(reg->map_ptr)) { + u32 lock = reg->map_ptr->spin_lock_off; + + /* if any part of struct bpf_spin_lock can be touched by + * load/store reject this program. + * To check that [x1, x2) overlaps with [y1, y2) + * it is sufficient to check x1 < y2 && y1 < x2. 
+ */ + if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && + lock < reg->umax_value + off + size) { + verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); + return -EACCES; + } + } return err; } @@ -2276,6 +2299,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +/* Implementation details: + * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL + * Two bpf_map_lookups (even with the same key) will have different reg->id. + * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after + * value_or_null->value transition, since the verifier only cares about + * the range of access to valid map value pointer and doesn't care about actual + * address of the map element. + * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps + * reg->id > 0 after value_or_null->value transition. By doing so + * two bpf_map_lookups will be considered two different pointers that + * point to different bpf_spin_locks. + * The verifier allows taking only one bpf_spin_lock at a time to avoid + * dead-locks. + * Since only one bpf_spin_lock is allowed the checks are simpler than + * reg_is_refcounted() logic. The verifier needs to remember only + * one spin_lock instead of array of acquired_refs. + * cur_state->active_spin_lock remembers which map value element got locked + * and clears it after bpf_spin_unlock. + */ +static int process_spin_lock(struct bpf_verifier_env *env, int regno, + bool is_lock) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_verifier_state *cur = env->cur_state; + bool is_const = tnum_is_const(reg->var_off); + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "R%d is not a pointer to map_value\n", regno); + return -EINVAL; + } + if (!is_const) { + verbose(env, + "R%d doesn't have constant offset. 
bpf_spin_lock has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map->btf) { + verbose(env, + "map '%s' has to have BTF in order to use bpf_spin_lock\n", + map->name); + return -EINVAL; + } + if (!map_value_has_spin_lock(map)) { + if (map->spin_lock_off == -E2BIG) + verbose(env, + "map '%s' has more than one 'struct bpf_spin_lock'\n", + map->name); + else if (map->spin_lock_off == -ENOENT) + verbose(env, + "map '%s' doesn't have 'struct bpf_spin_lock'\n", + map->name); + else + verbose(env, + "map '%s' is not a struct type or bpf_spin_lock is mangled\n", + map->name); + return -EINVAL; + } + if (map->spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", + val + reg->off); + return -EINVAL; + } + if (is_lock) { + if (cur->active_spin_lock) { + verbose(env, + "Locking two bpf_spin_locks are not allowed\n"); + return -EINVAL; + } + cur->active_spin_lock = reg->id; + } else { + if (!cur->active_spin_lock) { + verbose(env, "bpf_spin_unlock without taking a lock\n"); + return -EINVAL; + } + if (cur->active_spin_lock != reg->id) { + verbose(env, "bpf_spin_unlock of different lock\n"); + return -EINVAL; + } + cur->active_spin_lock = 0; + } + return 0; +} + static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_MEM || @@ -2352,6 +2460,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return -EFAULT; } meta->ptr_id = reg->id; + } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + if (meta->func_id == BPF_FUNC_spin_lock) { + if (process_spin_lock(env, regno, true)) + return -EACCES; + } else if (meta->func_id == BPF_FUNC_spin_unlock) { + if (process_spin_lock(env, regno, false)) + return -EACCES; + } else { + verbose(env, "verifier internal error\n"); + return -EFAULT; + } } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -2993,6 +3112,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return err; } + meta.func_id = func_id; /* check args */ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); if (err) @@ -4739,7 +4859,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { reg->type = PTR_TO_SOCKET; } - if (is_null || !reg_is_refcounted(reg)) { + if (is_null || !(reg_is_refcounted(reg) || + reg_may_point_to_spin_lock(reg))) { /* We don't need id from this point onwards anymore, * thus we should better reset it, so that state * pruning has chances to take effect. @@ -5156,6 +5277,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } + if (env->cur_state->active_spin_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); + return -EINVAL; + } + if (regs[BPF_REG_6].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -5895,8 +6021,11 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. 
- * We don't care about the 'id' value, because nothing - * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) + * 'id' is not compared, since it's only used for maps with + * bpf_spin_lock inside map element and in such cases if + * the rest of the prog is valid for one map element then + * it's valid for all map elements regardless of the key + * used in bpf_map_lookup() */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && @@ -6089,6 +6218,9 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->speculative && !cur->speculative) return false; + if (old->active_spin_lock != cur->active_spin_lock) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -6507,6 +6639,12 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + if (env->cur_state->active_spin_lock && + (insn->src_reg == BPF_PSEUDO_CALL || + insn->imm != BPF_FUNC_spin_unlock)) { + verbose(env, "function calls are not allowed while holding a lock\n"); + return -EINVAL; + } if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else @@ -6537,6 +6675,11 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + if (env->cur_state->active_spin_lock) { + verbose(env, "bpf_spin_unlock is missing\n"); + return -EINVAL; + } + if (state->curframe) { /* exit from nested function */ env->prev_insn_idx = env->insn_idx; @@ -6634,6 +6777,19 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } +static bool is_tracing_prog_type(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + return true; + default: + return false; + } +} + static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) @@ -6656,6 +6812,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if ((is_tracing_prog_type(prog->type) || + prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && + map_value_has_spin_lock(map)) { + verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); diff --git a/net/core/filter.c b/net/core/filter.c index 8ff14481a6af..492763d70adc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4872,10 +4872,20 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + default: + break; + } + + if (!capable(CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* else: fall through */ + return bpf_get_trace_printk_proto(); default: return NULL; } From 8e044459e5be583721a747de47ab2b7e01745300 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 31 Jan 2019 15:40:05 -0800 Subject: [PATCH 1038/1640] UPSTREAM: bpf: add support for bpf_spin_lock to cgroup local storage Allow 'struct bpf_spin_lock' to reside inside cgroup local storage. 
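For illustration, usage from a bpf program would mirror the hash map
example from the previous patch (hypothetical sketch; the map name and
value layout are made up for the example):

	struct elem {
		int cnt;
		struct bpf_spin_lock lock;
	};
	struct elem *ptr = bpf_get_local_storage(&cgrp_map, 0);

	bpf_spin_lock(&ptr->lock);
	ptr->cnt++;
	bpf_spin_unlock(&ptr->lock);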
Change-Id: If4d76516e7c0db862fb74c21a1e99a502c28813b
Signed-off-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/local_storage.c | 2 ++
 kernel/bpf/syscall.c       | 3 ++-
 kernel/bpf/verifier.c      | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 07a34ef562a0..0295427f06e2 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -147,6 +147,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
 		return -ENOMEM;

 	memcpy(&new->data[0], value, map->value_size);
+	check_and_init_map_lock(map, new->data);

 	new = xchg(&storage->buf, new);
 	kfree_rcu(new, rcu);
@@ -483,6 +484,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
 		storage->buf = kmalloc_node(size, flags, map->numa_node);
 		if (!storage->buf)
 			goto enomem;
+		check_and_init_map_lock(map, storage->buf->data);
 	} else {
 		storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
 		if (!storage->percpu_buf)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5eb835e3bcbf..c1d6fa7c153e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -483,7 +483,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 	if (map_value_has_spin_lock(map)) {
 		if (map->map_type != BPF_MAP_TYPE_HASH &&
-		    map->map_type != BPF_MAP_TYPE_ARRAY)
+		    map->map_type != BPF_MAP_TYPE_ARRAY &&
+		    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
 			return -ENOTSUPP;
 		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
 		    map->value_size) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5cb631168409..79706af2e85c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3195,6 +3195,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
 		if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
 			regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
+			if (map_value_has_spin_lock(meta.map_ptr))
+				regs[BPF_REG_0].id = ++env->id_gen;
 		} else {
 			regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
 			regs[BPF_REG_0].id = ++env->id_gen;

From eb322f919d7a7ba4a7973672424a322e0b857c04 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Thu, 31 Jan 2019 15:40:09 -0800
Subject: [PATCH 1039/1640] UPSTREAM: bpf: introduce BPF_F_LOCK flag

Introduce BPF_F_LOCK flag for map_lookup and map_update syscall
commands and for the map_update() helper function. In all these cases
take the lock of the existing element (which was provided in the BTF
description) before copying (in or out) the rest of the map value.

Implementation details that are part of uapi:

Array:
The array map takes the element lock for lookup/update.

Hash:
The hash map also takes the lock for lookup/update and tries to avoid
the bucket lock. If the old element exists it takes the element lock
and updates the element in place. If the element doesn't exist it
allocates a new one and inserts it into the hash table while holding
the bucket lock. In the rare case the hash map has to take both the
bucket lock and the element lock to update an old value in place.

Cgroup local storage:
It is similar to the array: update in place and lookup are done with
the lock taken.
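For illustration, from user space the flag would be used roughly like
this (sketch; assumes a libbpf version that exposes the flags-aware
lookup wrapper):

	/* update the element while holding its bpf_spin_lock */
	err = bpf_map_update_elem(map_fd, &key, &value, BPF_F_LOCK);

	/* read a consistent copy of everything but the lock */
	err = bpf_map_lookup_elem_flags(map_fd, &key, &value, BPF_F_LOCK);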
Change-Id: I76b13e23e1f6241c1f919a1c24650530f7705d9e Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/arraymap.c | 24 ++++++++++++++-------- kernel/bpf/hashtab.c | 42 +++++++++++++++++++++++++++++++++++--- kernel/bpf/helpers.c | 16 +++++++++++++++ kernel/bpf/local_storage.c | 14 ++++++++++++- kernel/bpf/syscall.c | 25 +++++++++++++++++++++-- 7 files changed, 110 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d2fac19b51c7..8baebbd77b65 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -124,6 +124,8 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) memcpy(dst, src, map->value_size); } } +void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, + bool lock_src); struct bpf_offload_dev; struct bpf_offloaded_map; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 296c0b29bae4..4f6522a5c3fc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -269,6 +269,7 @@ enum bpf_attach_type { #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ /* flags for BPF_MAP_CREATE command */ #define BPF_F_NO_PREALLOC (1U << 0) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e971ee47c87f..2a477256b333 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -256,8 +256,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; + char *val; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; @@ -265,18 +266,25 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (unlikely(map_flags == BPF_NOEXIST)) + if (unlikely(map_flags & BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + if (unlikely((map_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map))) + return -EINVAL; + + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); - else - copy_map_value(map, - array->value + - array->elem_size * (index & array->index_mask), - value); + } else { + val = array->value + + array->elem_size * (index & array->index_mask); + if (map_flags & BPF_F_LOCK) + copy_map_value_locked(map, val, value, false); + else + copy_map_value(map, val, value); + } return 0; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 731e26f6c281..65d588ed8049 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -821,11 +821,11 @@ dec_count: static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, u64 map_flags) { - if (l_old && map_flags == BPF_NOEXIST) + if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) /* elem already exists */ return -EEXIST; - if (!l_old && map_flags == BPF_EXIST) + if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; @@ -844,7 +844,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u32 key_size, hash; int ret; - 
if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; @@ -857,6 +857,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, b = __select_bucket(htab, hash); head = &b->head; + if (unlikely(map_flags & BPF_F_LOCK)) { + if (unlikely(!map_value_has_spin_lock(map))) + return -EINVAL; + /* find an element without taking the bucket lock */ + l_old = lookup_nulls_elem_raw(head, hash, key, key_size, + htab->n_buckets); + ret = check_flags(htab, l_old, map_flags); + if (ret) + return ret; + if (l_old) { + /* grab the element lock and update value in place */ + copy_map_value_locked(map, + l_old->key + round_up(key_size, 8), + value, false); + return 0; + } + /* fall through, grab the bucket lock and lookup again. + * 99.9% chance that the element won't be found, + * but second lookup under lock has to be done. + */ + } + /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&b->lock, flags); @@ -866,6 +888,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; + if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { + /* first lookup without the bucket lock didn't find the element, + * but second lookup with the bucket lock found it. + * This case is highly unlikely, but has to be dealt with: + * grab the element lock in addition to the bucket lock + * and update element in place + */ + copy_map_value_locked(map, + l_old->key + round_up(key_size, 8), + value, false); + ret = 0; + goto err; + } + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, l_old); if (IS_ERR(l_new)) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index fd6b192c3fb7..679c02914590 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -313,6 +313,22 @@ const struct bpf_func_proto bpf_spin_unlock_proto = { .arg1_type = ARG_PTR_TO_SPIN_LOCK, }; +void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, + bool lock_src) +{ + struct bpf_spin_lock *lock; + + if (lock_src) + lock = src + map->spin_lock_off; + else + lock = dst + map->spin_lock_off; + preempt_disable(); + ____bpf_spin_lock(lock); + copy_map_value(map, dst, src); + ____bpf_spin_unlock(lock); + preempt_enable(); +} + #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 0295427f06e2..6b572e2de7fb 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags != BPF_ANY && flags != BPF_EXIST) + if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) + return -EINVAL; + + if (unlikely(flags & BPF_NOEXIST)) + return -EINVAL; + + if (unlikely((flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, @@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, if (!storage) return -ENOENT; + if (flags & BPF_F_LOCK) { + copy_map_value_locked(map, storage->buf->data, value, false); + return 0; + } + new = kmalloc_node(sizeof(struct bpf_storage_buffer) + map->value_size, __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c1d6fa7c153e..2eefcb8b2953 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -683,7 +683,7 @@ 
static void *__bpf_copy_key(void __user *ukey, u64 key_size)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
 
 static int map_lookup_elem(union bpf_attr *attr)
 {
@@ -699,6 +699,9 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
 		return -EINVAL;
 
+	if (attr->flags & ~BPF_F_LOCK)
+		return -EINVAL;
+
 	f = fdget(ufd);
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
@@ -709,6 +712,12 @@ static int map_lookup_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
+	if ((attr->flags & BPF_F_LOCK) &&
+	    !map_value_has_spin_lock(map)) {
+		err = -EINVAL;
+		goto err_put;
+	}
+
 	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -768,7 +777,13 @@ static int map_lookup_elem(union bpf_attr *attr)
 			err = -ENOENT;
 		} else {
 			err = 0;
-			copy_map_value(map, value, ptr);
+			if (attr->flags & BPF_F_LOCK)
+				/* lock 'ptr' and copy everything but lock */
+				copy_map_value_locked(map, value, ptr, true);
+			else
+				copy_map_value(map, value, ptr);
+			/* mask lock, since value wasn't zero inited */
+			check_and_init_map_lock(map, value);
 		}
 		rcu_read_unlock();
 	}
@@ -831,6 +846,12 @@ static int map_update_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
+	if ((attr->flags & BPF_F_LOCK) &&
+	    !map_value_has_spin_lock(map)) {
+		err = -EINVAL;
+		goto err_put;
+	}
+
 	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);

From 81233a82f0c924b5c056fb648688e81252dfe939 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Fri, 8 Feb 2019 22:25:54 -0800
Subject: [PATCH 1040/1640] UPSTREAM: bpf: Fix narrow load on a bpf_sock
 returned from sk_lookup()

By adding this test to test_verifier:

{
	"reference tracking: access sk->src_ip4 (narrow load)",
	.insns = {
	BPF_SK_LOOKUP,
	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
	BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_0,
		    offsetof(struct bpf_sock, src_ip4) + 2),
	BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
	BPF_EMIT_CALL(BPF_FUNC_sk_release),
	BPF_EXIT_INSN(),
	},
	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
	.result = ACCEPT,
},

The above test loads 2 bytes from sk->src_ip4 where sk is obtained by
bpf_sk_lookup_tcp().

It hits an internal verifier error from convert_ctx_accesses():

[root@arch-fb-vm1 bpf]# ./test_verifier 665 665
Failed to load prog 'Invalid argument'!
0: (b7) r2 = 0
1: (63) *(u32 *)(r10 -8) = r2
2: (7b) *(u64 *)(r10 -16) = r2
3: (7b) *(u64 *)(r10 -24) = r2
4: (7b) *(u64 *)(r10 -32) = r2
5: (7b) *(u64 *)(r10 -40) = r2
6: (7b) *(u64 *)(r10 -48) = r2
7: (bf) r2 = r10
8: (07) r2 += -48
9: (b7) r3 = 36
10: (b7) r4 = 0
11: (b7) r5 = 0
12: (85) call bpf_sk_lookup_tcp#84
13: (bf) r6 = r0
14: (15) if r0 == 0x0 goto pc+3
 R0=sock(id=1,off=0,imm=0) R6=sock(id=1,off=0,imm=0) R10=fp0,call_-1 fp-8=????0000 fp-16=0000mmmm fp-24=mmmmmmmm fp-32=mmmmmmmm fp-40=mmmmmmmm fp-48=mmmmmmmm refs=1
15: (69) r2 = *(u16 *)(r0 +26)
16: (bf) r1 = r6
17: (85) call bpf_sk_release#86
18: (95) exit
from 14 to 18: safe
processed 20 insns (limit 131072), stack depth 48
bpf verifier is misconfigured
Summary: 0 PASSED, 0 SKIPPED, 1 FAILED

bpf_sock_is_valid_access() expects that src_ip4 can be narrowly loaded
(i.e. any 1 or 2 bytes of src_ip4 may be read) and marks
info->ctx_field_size accordingly. However, this marked ctx_field_size
was never used. This patch fixes it.

Due to the recent refactoring in test_verifier, this new test will be
added to the bpf-next branch (together with the bpf_tcp_sock patchset)
to avoid a merge conflict.
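As a rough C-level sketch of the access pattern this fix enables
(illustrative only, not part of this patch; assumes clang/libbpf
conventions, with the tuple normally filled in from the packet):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("classifier")
int narrow_src_ip4(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {}; /* would be filled from the packet */
	struct bpf_sock *sk;
	__u16 half;

	/* netns_id 0 selects the caller's netns in this kernel version */
	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4), 0, 0);
	if (!sk)
		return 0;
	/* 2-byte narrow load inside the 4-byte src_ip4 field; this is
	 * the load that previously tripped convert_ctx_accesses().
	 */
	half = *((__u16 *)&sk->src_ip4 + 1);
	bpf_sk_release(sk);
	return half ? 1 : 0;
}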
Fixes: c64b7983288e ("bpf: Add PTR_TO_SOCKET verifier type") Cc: Joe Stringer Change-Id: I105d05710ad29b156940e46d78fd916d3e80df58 Signed-off-by: Martin KaFai Lau Acked-by: Joe Stringer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 79706af2e85c..7f8b5ca18642 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1649,12 +1649,13 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, return 0; } -static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, enum bpf_access_type t) +static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, + u32 regno, int off, int size, + enum bpf_access_type t) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; - struct bpf_insn_access_aux info; + struct bpf_insn_access_aux info = {}; if (reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", @@ -1668,6 +1669,8 @@ static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, return -EACCES; } + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + return 0; } @@ -2064,7 +2067,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "cannot write into socket\n"); return -EACCES; } - err = check_sock_access(env, regno, off, size, t); + err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { From 0e83b90c43cf538d89bdbda7a3e92a2ac1ec3bb4 Mon Sep 17 00:00:00 2001 From: Vlad Dumitrescu Date: Thu, 22 Nov 2018 14:39:16 -0500 Subject: [PATCH 1041/1640] BACKPORT: bpf: add skb->tstamp r/w access from tc clsact and cg skb progs This could be used to rate limit egress traffic in concert with a qdisc which supports Earliest Departure Time, such as FQ. Write access from cg skb progs only with CAP_SYS_ADMIN, since the value will be used by downstream qdiscs. It might make sense to relax this. Changes v1 -> v2: - allow access from cg skb, write only with CAP_SYS_ADMIN Change-Id: I5e4e521bca04b4b402bfd7853cf11cbc285b6067 Signed-off-by: Vlad Dumitrescu Acked-by: Eric Dumazet Acked-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4f6522a5c3fc..a9408d9d68aa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2523,6 +2523,7 @@ struct __sk_buff { __u32 data_meta; struct bpf_flow_keys *flow_keys; + __u64 tstamp; }; struct bpf_tunnel_key { diff --git a/net/core/filter.c b/net/core/filter.c index 492763d70adc..9750fd15aa3b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5298,6 +5298,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != sizeof(struct bpf_flow_keys *)) return false; break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (size != sizeof(__u64)) + return false; + break; default: /* Only narrow read access allowed for now. 
*/ if (type == BPF_WRITE) { @@ -5325,6 +5329,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -5357,6 +5362,10 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (!capable(CAP_SYS_ADMIN)) + return false; + break; default: return false; } @@ -5384,6 +5393,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -5593,6 +5603,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + case bpf_ctx_range(struct __sk_buff, tstamp): break; default: return false; @@ -5814,6 +5825,7 @@ static bool sk_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -5900,6 +5912,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -6209,6 +6222,22 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; + + case offsetof(struct __sk_buff, tstamp): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); } return insn - insn_buf; From fc33db89f37b68074c9aa41e20c9490577ade56c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 1 Dec 2018 01:18:53 +0100 Subject: [PATCH 1042/1640] BACKPORT: bpf: fix pointer offsets in context for 32 bit Currently, pointer offsets in three BPF context structures are broken in two scenarios: i) 32 bit compiled applications running on 64 bit kernels, and ii) LLVM compiled BPF programs running on 32 bit kernels. The latter is due to BPF target machine being strictly 64 bit. So in each of the cases the offsets will mismatch in verifier when checking / rewriting context access. Fix this by providing a helper macro __bpf_md_ptr() that will enforce padding up to 64 bit and proper alignment, and for context access a macro bpf_ctx_range_ptr() which will cover full 64 bit member range on 32 bit archs. For flow_keys, we additionally need to force the size check to sizeof(__u64) as with other pointer types. 
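A minimal userspace illustration of the layout problem (a sketch, not
part of this patch; the anonymous 64-bit bit-field is a GCC/clang
extension, as used in the UAPI header, and md_ptr() is a stand-in name
for the real macro):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define md_ptr(type, name) \
union {                    \
	type name;         \
	uint64_t :64;      \
} __attribute__((aligned(8)))

struct ctx_bare   { uint32_t len; void *data; };
struct ctx_padded { uint32_t len; md_ptr(void *, data); };

int main(void)
{
	/* On an ILP32 build: ctx_bare puts data at offset 4 in an 8-byte
	 * struct, while the BPF target assumes the 64-bit layout. The
	 * padded union keeps data at offset 8 in a 16-byte struct on
	 * both 32-bit and 64-bit builds. */
	printf("bare:   off=%zu size=%zu\n",
	       offsetof(struct ctx_bare, data), sizeof(struct ctx_bare));
	printf("padded: off=%zu size=%zu\n",
	       offsetof(struct ctx_padded, data), sizeof(struct ctx_padded));
	return 0;
}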
Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Fixes: 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Reported-by: David S. Miller Change-Id: I388c9b13f024f536b90b7f3b92a2512318d797dc Signed-off-by: Daniel Borkmann Acked-by: David S. Miller Tested-by: David S. Miller Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 7 +++++++ include/uapi/linux/bpf.h | 17 ++++++++++++----- net/core/filter.c | 16 ++++++++-------- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 069a13b39eb9..2e15b0e7277f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -494,6 +494,13 @@ struct sock_reuseport; offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 +#if BITS_PER_LONG == 64 +# define bpf_ctx_range_ptr(TYPE, MEMBER) \ + offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 +#else +# define bpf_ctx_range_ptr(TYPE, MEMBER) \ + offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1 +#endif /* BITS_PER_LONG == 64 */ #define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ ({ \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9408d9d68aa..ea920d64f870 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2488,6 +2488,12 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6_INLINE }; +#define __bpf_md_ptr(type, name) \ +union { \ + type name; \ + __u64 :64; \ +} __attribute__((aligned(8))) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -2522,7 +2528,7 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; - struct bpf_flow_keys *flow_keys; + __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); __u64 tstamp; }; @@ -2639,8 +2645,8 @@ enum sk_action { * be added to the end of this structure */ struct sk_msg_md { - void *data; - void *data_end; + __bpf_md_ptr(void *, data); + __bpf_md_ptr(void *, data_end); __u32 family; __u32 remote_ip4; /* Stored in network byte order */ @@ -2656,8 +2662,9 @@ struct sk_reuseport_md { * Start of directly accessible data. It begins from * the tcp/udp header. */ - void *data; - void *data_end; /* End of directly accessible data */ + __bpf_md_ptr(void *, data); + /* End of directly accessible data */ + __bpf_md_ptr(void *, data_end); /* * Total length of packet (starting from the tcp/udp header). 
* Note that the directly accessible bytes (data_end - data)
diff --git a/net/core/filter.c b/net/core/filter.c
index 9750fd15aa3b..5d2fc13e006b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5294,8 +5294,8 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 		if (size != size_default)
 			return false;
 		break;
-	case bpf_ctx_range(struct __sk_buff, flow_keys):
-		if (size != sizeof(struct bpf_flow_keys *))
+	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
+		if (size != sizeof(__u64))
 			return false;
 		break;
 	case bpf_ctx_range(struct __sk_buff, tstamp):
@@ -5327,7 +5327,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
-	case bpf_ctx_range(struct __sk_buff, flow_keys):
+	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 	case bpf_ctx_range(struct __sk_buff, tstamp):
 		return false;
@@ -5353,7 +5353,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
-	case bpf_ctx_range(struct __sk_buff, flow_keys):
+	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
 		return false;
 	}
 	if (type == BPF_WRITE) {
@@ -5392,7 +5392,7 @@ static bool lwt_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
-	case bpf_ctx_range(struct __sk_buff, flow_keys):
+	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
 	case bpf_ctx_range(struct __sk_buff, tstamp):
 		return false;
 	}
@@ -5620,7 +5620,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
-	case bpf_ctx_range(struct __sk_buff, flow_keys):
+	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
 	}
@@ -5824,7 +5824,7 @@ static bool sk_skb_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
-	case bpf_ctx_range(struct __sk_buff, flow_keys):
+	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
 	case bpf_ctx_range(struct __sk_buff, tstamp):
 		return false;
 	}
@@ -5906,7 +5906,7 @@ static bool flow_dissector_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
-	case bpf_ctx_range(struct __sk_buff, flow_keys):
+	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
 		info->reg_type = PTR_TO_FLOW_KEYS;
 		break;
 	case bpf_ctx_range(struct __sk_buff, tc_classid):

From 7413469e7c6905b43ebcf36660951f129c24c3d3 Mon Sep 17 00:00:00 2001
From: Petar Penkov
Date: Sun, 2 Dec 2018 20:18:19 -0500
Subject: [PATCH 1043/1640] BACKPORT: bpf: allow BPF read access to qdisc
 pkt_len

The pkt_len field in qdisc_skb_cb stores the skb length as it will
appear on the wire after segmentation. For byte accounting, this value
is more accurate than skb->len. It is computed on entry to the TC
layer, so it is only valid there.

Allow read access to this field from BPF tc classifier and action
programs. The implementation is analogous to tc_classid, aside from
restricting to read access.

To distinguish it from skb->len, and to make it self-describing, the
field is exported as wire_len.
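For illustration, a minimal tc classifier that charges wire_len to a
counter (a sketch only, not part of this patch; assumes clang/libbpf
conventions and the older bpf_map_def map declaration style):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct bpf_map_def SEC("maps") byte_cnt = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u64),
	.max_entries = 1,
};

SEC("classifier")
int account_bytes(struct __sk_buff *skb)
{
	__u32 key = 0;
	__u64 *cnt = bpf_map_lookup_elem(&byte_cnt, &key);

	/* wire_len is qdisc_skb_cb(skb)->pkt_len: the post-segmentation
	 * on-wire length, which can differ from skb->len for GSO skbs. */
	if (cnt)
		*cnt += skb->wire_len;
	return 0;
}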
Changes v1->v2 - Rename pkt_len to wire_len Change-Id: Id019cfcb31a1325bfc13363f5f70148971807355 Signed-off-by: Petar Penkov Signed-off-by: Vlad Dumitrescu Signed-off-by: Willem de Bruijn Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ea920d64f870..22a5cd845cc3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2530,6 +2530,7 @@ struct __sk_buff { __u32 data_meta; __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); __u64 tstamp; + __u32 wire_len; }; struct bpf_tunnel_key { diff --git a/net/core/filter.c b/net/core/filter.c index 5d2fc13e006b..ad68a212220a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5330,6 +5330,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5354,6 +5355,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } if (type == BPF_WRITE) { @@ -5394,6 +5396,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5826,6 +5829,7 @@ static bool sk_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -5913,6 +5917,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): return false; } @@ -6238,6 +6243,17 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct sk_buff, tstamp, 8, target_size)); + break; + + case offsetof(struct __sk_buff, wire_len): + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, wire_len); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, pkt_len); + *target_size = 4; + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); } return insn - insn_buf; From b3e2b2c16a8116c87225900bedcd76c973d7363a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Jan 2019 09:22:27 -0800 Subject: [PATCH 1044/1640] BACKPORT: bpf: allow BPF programs access skb_shared_info->gso_segs field This adds the ability to read gso_segs from a BPF program. v3: Use BPF_REG_AX instead of BPF_REG_TMP for the temporary register, as suggested by Martin. v2: refined Eddie Hao patch to address Alexei feedback. 
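As a sketch of the new read access (illustrative only, not part of this
patch; assumes clang/libbpf conventions and TC_ACT_* from
linux/pkt_cls.h):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("classifier")
int cap_gso_segs(struct __sk_buff *skb)
{
	/* gso_segs comes from skb_shared_info; it reads as 0 for
	 * non-GSO skbs. */
	__u32 segs = skb->gso_segs;

	return segs > 64 ? TC_ACT_SHOT : TC_ACT_OK;
}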
Change-Id: Ibcd87e0dd74f6d89f919301e0c9dfd40395e1583
Signed-off-by: Eric Dumazet
Cc: Eddie Hao
Cc: Martin KaFai Lau
Acked-by: Martin KaFai Lau
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h |  1 +
 net/core/filter.c        | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 22a5cd845cc3..5c6b6bcb474d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2531,6 +2531,7 @@ struct __sk_buff {
 	__bpf_md_ptr(struct bpf_flow_keys *, flow_keys);
 	__u64 tstamp;
 	__u32 wire_len;
+	__u32 gso_segs;
 };
 
 struct bpf_tunnel_key {
diff --git a/net/core/filter.c b/net/core/filter.c
index ad68a212220a..b7eb13e19e2d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6245,6 +6245,27 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 						      target_size));
 		break;
 
+	case offsetof(struct __sk_buff, gso_segs):
+		/* si->dst_reg = skb_shinfo(SKB); */
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_buff, head));
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+				      BPF_REG_AX, si->src_reg,
+				      offsetof(struct sk_buff, end));
+		*insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+#else
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_buff, end));
+#endif
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
+				      si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct skb_shared_info,
+						     gso_segs, 2,
+						     target_size));
+		break;
 	case offsetof(struct __sk_buff, wire_len):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4);

From ed69b442e5c2b8b5614095a8fd9b733924c55fb6 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Sat, 9 Feb 2019 23:22:20 -0800
Subject: [PATCH 1045/1640] BACKPORT: bpf: Add a bpf_sock pointer to __sk_buff
 and a bpf_sk_fullsock helper

In the kernel, it is common to check "skb->sk && sk_fullsock(skb->sk)"
before accessing the fields in sock. For example, in __netdev_pick_tx:

static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
			    struct net_device *sb_dev)
{
	/* ... */
	struct sock *sk = skb->sk;

	if (queue_index != new_index && sk &&
	    sk_fullsock(sk) &&
	    rcu_access_pointer(sk->sk_dst_cache))
		sk_tx_queue_set(sk, new_index);

	/* ... */

	return queue_index;
}

This patch adds a "struct bpf_sock *sk" pointer to the
"struct __sk_buff", where a few of the convert_ctx_access() helpers in
filter.c have already been accessing the skb->sk sock_common's fields,
e.g. sock_ops_convert_ctx_access().

"__sk_buff->sk" is a PTR_TO_SOCK_COMMON_OR_NULL in the verifier.

Some of the fields in "bpf_sock" will not be directly accessible
through the "__sk_buff->sk" pointer. It is limited by the new
"bpf_sock_common_is_valid_access()", e.g. the existing "type",
"protocol", "mark" and "priority" in bpf_sock are not allowed.

The newly added "struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)"
can be used to get a sk with all accessible fields in "bpf_sock".
This helper is added to both cg_skb and sched_(cls|act).

int cg_skb_foo(struct __sk_buff *skb) {
	struct bpf_sock *sk;

	sk = skb->sk;
	if (!sk)
		return 1;

	sk = bpf_sk_fullsock(sk);
	if (!sk)
		return 1;

	if (sk->family != AF_INET6 || sk->protocol != IPPROTO_TCP)
		return 1;

	/* some_traffic_shaping(); */

	return 1;
}

(1) The sk is read-only.

(2) There is no new "struct bpf_sock_common" introduced.
(3) Future kernel sock's members could be added to bpf_sock only,
instead of being added repeatedly at multiple places like currently in
bpf_sock_ops_md, bpf_sock_addr_md, sk_reuseport_md...etc.

(4) After "sk = skb->sk", the reg holding sk is in type
PTR_TO_SOCK_COMMON_OR_NULL.

(5) After bpf_sk_fullsock(), the return type will be in type
PTR_TO_SOCKET_OR_NULL which is the same as the return type of
bpf_sk_lookup_xxx(). However, bpf_sk_fullsock() does not take refcnt.
Up to now, acquire_reference_state() has depended only on the return
type. To avoid taking a reference here, a new is_acquire_function()
is checked before calling acquire_reference_state().

(6) The condition that triggers the WARN_ON in
"release_reference_state()" is no longer an internal verifier bug.
When reg->id is not found in state->refs[], it means the bpf_prog does
something wrong like "bpf_sk_release(bpf_sk_fullsock(skb->sk))" where
a reference has never been acquired by calling
"bpf_sk_fullsock(skb->sk)". Instead of the WARN_ON, -EINVAL is
returned and a verbose message is printed. A test is added to the
test_verifier in a later patch.

Since the WARN_ON in "release_reference_state()" is no longer needed,
"__release_reference_state()" is also folded into
"release_reference_state()".

Acked-by: Alexei Starovoitov
Change-Id: Ifc10204de0224b07d9199ad1cfed6cc1b70b2ad8
Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h      |  12 ++++
 include/uapi/linux/bpf.h |   9 +++
 kernel/bpf/verifier.c    | 132 +++++++++++++++++++++++++++------------
 net/core/filter.c        |  42 +++++++++++++
 4 files changed, 155 insertions(+), 40 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8baebbd77b65..b96ed5ab186d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -199,6 +199,7 @@ enum bpf_arg_type {
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
 	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
+	ARG_PTR_TO_SOCK_COMMON,	/* pointer to sock_common */
 };
 
 /* type of values returned from helper functions */
@@ -261,6 +262,8 @@ enum bpf_reg_type {
 	PTR_TO_FLOW_KEYS,	 /* reg points to bpf_flow_keys */
 	PTR_TO_SOCKET,		 /* reg points to struct bpf_sock */
 	PTR_TO_SOCKET_OR_NULL,	 /* reg points to struct bpf_sock or NULL */
+	PTR_TO_SOCK_COMMON,	 /* reg points to sock_common */
+	PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -937,6 +940,9 @@ void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 #if defined(CONFIG_NET)
+bool bpf_sock_common_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     struct bpf_insn_access_aux *info);
 bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
 			      struct bpf_insn_access_aux *info);
 u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
@@ -945,6 +951,12 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 				struct bpf_prog *prog,
 				u32 *target_size);
 #else
+static inline bool bpf_sock_common_is_valid_access(int off, int size,
+						   enum bpf_access_type type,
+						   struct bpf_insn_access_aux *info)
+{
+	return false;
+}
 static inline bool bpf_sock_is_valid_access(int off, int size,
 					    enum bpf_access_type type,
 					    struct bpf_insn_access_aux *info)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5c6b6bcb474d..4b40731e532c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2288,6 +2288,14 @@ union bpf_attr {
 * 		See: clock_gettime(CLOCK_BOOTTIME)
 * 	Return
 * 		Current *ktime*.
+ * + * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in bpf_sock can be accessed. + * Return + * A **struct bpf_sock** pointer on success, or NULL in + * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2532,6 +2540,7 @@ struct __sk_buff { __u64 tstamp; __u32 wire_len; __u32 gso_segs; + __bpf_md_ptr(struct bpf_sock *, sk); }; struct bpf_tunnel_key { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7f8b5ca18642..f83bafc569d9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -330,10 +330,17 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) type == PTR_TO_PACKET_META; } +static bool type_is_sk_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCK_COMMON; +} + static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL; + type == PTR_TO_SOCKET_OR_NULL || + type == PTR_TO_SOCK_COMMON_OR_NULL; } static bool type_is_refcounted(enum bpf_reg_type type) @@ -376,6 +383,12 @@ static bool is_release_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_release; } +static bool is_acquire_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -391,6 +404,8 @@ static const char * const reg_type_str[] = { [PTR_TO_FLOW_KEYS] = "flow_keys", [PTR_TO_SOCKET] = "sock", [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", }; static char slot_type_char[] = { @@ -617,13 +632,10 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) } /* release function corresponding to acquire_reference_state(). Idempotent. */ -static int __release_reference_state(struct bpf_func_state *state, int ptr_id) +static int release_reference_state(struct bpf_func_state *state, int ptr_id) { int i, last_idx; - if (!ptr_id) - return -EFAULT; - last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].id == ptr_id) { @@ -635,21 +647,7 @@ static int __release_reference_state(struct bpf_func_state *state, int ptr_id) return 0; } } - return -EFAULT; -} - -/* variation on the above for cases where we expect that there must be an - * outstanding reference for the specified ptr_id. 
- */ -static int release_reference_state(struct bpf_verifier_env *env, int ptr_id) -{ - struct bpf_func_state *state = cur_func(env); - int err; - - err = __release_reference_state(state, ptr_id); - if (WARN_ON_ONCE(err != 0)) - verbose(env, "verifier internal error: can't release reference\n"); - return err; + return -EINVAL; } static int transfer_reference_state(struct bpf_func_state *dst, @@ -1208,6 +1206,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: return true; default: return false; @@ -1656,6 +1656,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; struct bpf_insn_access_aux info = {}; + bool valid; if (reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", @@ -1663,15 +1664,28 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, return -EACCES; } - if (!bpf_sock_is_valid_access(off, size, t, &info)) { - verbose(env, "invalid bpf_sock access off=%d size=%d\n", - off, size); - return -EACCES; + switch (reg->type) { + case PTR_TO_SOCK_COMMON: + valid = bpf_sock_common_is_valid_access(off, size, t, &info); + break; + case PTR_TO_SOCKET: + valid = bpf_sock_is_valid_access(off, size, t, &info); + break; + default: + valid = false; } - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; - return 0; + if (valid) { + env->insn_aux_data[insn_idx].ctx_field_size = + info.ctx_field_size; + return 0; + } + + verbose(env, "R%d invalid %s access off=%d size=%d\n", + regno, reg_type_str[reg->type], off, size); + + return -EACCES; } static bool __is_pointer_value(bool allow_ptr_leaks, @@ -1697,8 +1711,14 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = reg_state(env, regno); - return reg->type == PTR_TO_CTX || - reg->type == PTR_TO_SOCKET; + return reg->type == PTR_TO_CTX; +} + +static bool is_sk_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = reg_state(env, regno); + + return type_is_sk_pointer(reg->type); } static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) @@ -1809,6 +1829,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_SOCKET: pointer_desc = "sock "; break; + case PTR_TO_SOCK_COMMON: + pointer_desc = "sock_common "; + break; default: break; } @@ -2012,11 +2035,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero. 
*/ - if (reg_type == SCALAR_VALUE) + if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, value_regno); - else + } else { mark_reg_known_zero(env, regs, value_regno); + if (reg_type_may_be_null(reg_type)) + regs[value_regno].id = ++env->id_gen; + } regs[value_regno].type = reg_type; } @@ -2062,9 +2088,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_flow_keys_access(env, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_SOCKET) { + } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { - verbose(env, "cannot write into socket\n"); + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str[reg->type]); return -EACCES; } err = check_sock_access(env, insn_idx, regno, off, size, t); @@ -2111,7 +2138,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg)) { + is_flow_key_reg(env, insn->dst_reg) || + is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); @@ -2453,6 +2481,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_ctx_reg(env, reg, regno); if (err < 0) return err; + } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { + expected_type = PTR_TO_SOCK_COMMON; + /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ + if (!type_is_sk_pointer(type)) + goto err_type; } else if (arg_type == ARG_PTR_TO_SOCKET) { expected_type = PTR_TO_SOCKET; if (type != expected_type) @@ -2866,7 +2899,7 @@ static int release_reference(struct bpf_verifier_env *env, for (i = 0; i <= vstate->curframe; i++) release_reg_references(env, vstate->frame[i], meta->ptr_id); - return release_reference_state(env, meta->ptr_id); + return release_reference_state(cur_func(env), meta->ptr_id); } static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -3155,8 +3188,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } } else if (is_release_function(func_id)) { err = release_reference(env, &meta); - if (err) + if (err) { + verbose(env, "func %s#%d reference has not been acquired before\n", + func_id_name(func_id), func_id); return err; + } } regs = cur_regs(env); @@ -3205,12 +3241,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].id = ++env->id_gen; } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { - int id = acquire_reference_state(env, insn_idx); - if (id < 0) - return id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - regs[BPF_REG_0].id = id; + if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For release_reference() */ + regs[BPF_REG_0].id = id; + } else { + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = ++env->id_gen; + } } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -3646,6 +3689,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ 
-4863,6 +4908,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { reg->type = PTR_TO_SOCKET; + } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { + reg->type = PTR_TO_SOCK_COMMON; } if (is_null || !(reg_is_refcounted(reg) || reg_may_point_to_spin_lock(reg))) { @@ -4887,7 +4934,7 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, int i, j; if (reg_is_refcounted_or_null(®s[regno]) && is_null) - __release_reference_state(state, id); + release_reference_state(state, id); for (i = 0; i < MAX_BPF_REG; i++) mark_ptr_or_null_reg(state, ®s[i], id, is_null); @@ -6078,6 +6125,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -6388,6 +6437,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_CTX: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: return false; default: return true; @@ -7394,6 +7445,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = ops->convert_ctx_access; break; case PTR_TO_SOCKET: + case PTR_TO_SOCK_COMMON: convert_ctx_access = bpf_sock_convert_ctx_access; break; default: diff --git a/net/core/filter.c b/net/core/filter.c index b7eb13e19e2d..618816995cf7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1798,6 +1798,20 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) +{ + sk = sk_to_full_sk(sk); + + return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sk_fullsock_proto = { + .func = bpf_sk_fullsock, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { @@ -4956,6 +4970,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_sk_fullsock: + return &bpf_sk_fullsock_proto; default: return sk_filter_func_proto(func_id, prog); } @@ -5029,6 +5045,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; + case BPF_FUNC_sk_fullsock: + return &bpf_sk_fullsock_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; @@ -5302,6 +5320,11 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != sizeof(__u64)) return false; break; + case offsetof(struct __sk_buff, sk): + if (type == BPF_WRITE || size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + break; default: /* Only narrow read access allowed for now. 
*/ if (type == BPF_WRITE) { @@ -5482,6 +5505,18 @@ static bool __sock_filter_check_size(int off, int size, return size == size_default; } +bool bpf_sock_common_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range_till(struct bpf_sock, type, priority): + return false; + default: + return bpf_sock_is_valid_access(off, size, type, info); + } +} + bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -6275,6 +6310,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, off += offsetof(struct qdisc_skb_cb, pkt_len); *target_size = 4; *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); + break; + + case offsetof(struct __sk_buff, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + break; } return insn - insn_buf; From 270139751938b712c3b4bc8a9b688607cbf35084 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Oct 2018 11:30:01 -0700 Subject: [PATCH 1046/1640] UPSTREAM: bpf: fix building without CONFIG_INET The newly added TCP and UDP handling fails to link when CONFIG_INET is disabled: net/core/filter.o: In function `sk_lookup': filter.c:(.text+0x7ff8): undefined reference to `tcp_hashinfo' filter.c:(.text+0x7ffc): undefined reference to `tcp_hashinfo' filter.c:(.text+0x8020): undefined reference to `__inet_lookup_established' filter.c:(.text+0x8058): undefined reference to `__inet_lookup_listener' filter.c:(.text+0x8068): undefined reference to `udp_table' filter.c:(.text+0x8070): undefined reference to `udp_table' filter.c:(.text+0x808c): undefined reference to `__udp4_lib_lookup' net/core/filter.o: In function `bpf_sk_release': filter.c:(.text+0x82e8): undefined reference to `sock_gen_put' Wrap the related sections of code in #ifdefs for the config option. Furthermore, sk_lookup() should always have been marked 'static', this also avoids a warning about a missing prototype when building with 'make W=1'. 
Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") Change-Id: I224de16eb4e5a4ca281ad3ff072fbb1d99e4320b Signed-off-by: Arnd Bergmann Signed-off-by: Joe Stringer Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- net/core/filter.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 618816995cf7..1429073521cf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4692,8 +4692,9 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ -struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, - struct sk_buff *skb, u8 family, u8 proto) +#ifdef CONFIG_INET +static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, + struct sk_buff *skb, u8 family, u8 proto) { int dif = skb->dev->ifindex; bool refcounted = false; @@ -4826,6 +4827,7 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_SOCKET, }; +#endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) { @@ -5057,12 +5059,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif +#ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -5172,12 +5176,14 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_hash_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; +#ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } From 74a8a579327bc1ff9fe57f011e4e21f1061b05c3 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Mon, 15 Oct 2018 10:27:45 -0700 Subject: [PATCH 1047/1640] UPSTREAM: bpf: Allow sk_lookup with IPv6 module This is a more complete fix than d71019b54bff ("net: core: Fix build with CONFIG_IPV6=m"), so that IPv6 sockets may be looked up if the IPv6 module is loaded (not just if it's compiled in). 
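The stub indirection can be sketched generically as follows (simplified
illustration, not the kernel's actual types: built-in code keeps a
pointer that the module populates at init, and callers test it before
dispatching, so the call works whether the protocol is built in,
modular and loaded, or absent):

struct proto_stub {
	int (*lookup)(int ifindex);
};

/* NULL until the module registers itself */
static const struct proto_stub *ipv6_stub_ptr;

static int my_lookup(int ifindex) { return ifindex; }
static const struct proto_stub stub_impl = { .lookup = my_lookup };

/* called from the module's init path */
void module_init_register(void)
{
	ipv6_stub_ptr = &stub_impl;
}

/* built-in caller: compare likely(ipv6_bpf_stub) in the patch */
int do_lookup(int ifindex)
{
	if (ipv6_stub_ptr)
		return ipv6_stub_ptr->lookup(ifindex);
	return -1; /* IPv6 not available */
}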
Change-Id: I18c1f5d5e606d5c31f6fefca64676925ded3e38d Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov --- include/net/addrconf.h | 5 +++++ net/core/filter.c | 10 ++++++---- net/ipv6/af_inet6.c | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 0b07d5e0ab1b..ecff0e070291 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -256,6 +256,11 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly; struct ipv6_bpf_stub { int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, bool force_bind_address_no_port, bool with_lock); + struct sock *(*udp6_lib_lookup)(struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif, int sdif, struct udp_table *tbl, + struct sk_buff *skb); }; extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; diff --git a/net/core/filter.c b/net/core/filter.c index 1429073521cf..c359d843e7a2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4725,10 +4725,12 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, src6, tuple->ipv6.sport, dst6, tuple->ipv6.dport, dif, sdif, &refcounted); - else - sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport, - dst6, tuple->ipv6.dport, - dif, sdif, &udp_table, skb); + else if (likely(ipv6_bpf_stub)) + sk = ipv6_bpf_stub->udp6_lib_lookup(net, + src6, tuple->ipv6.sport, + dst6, tuple->ipv6.dport, + dif, sdif, + &udp_table, skb); #endif } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index efa8462cfae1..0947d4b06dcb 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -934,6 +934,7 @@ static const struct ipv6_stub ipv6_stub_impl = { static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { .inet6_bind = __inet6_bind, + .udp6_lib_lookup = __udp6_lib_lookup, }; static int __init inet6_init(void) From bb89111a23e5a12af54d01e0a8fd95d6b4ebe4c9 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Mon, 15 Oct 2018 10:27:46 -0700 Subject: [PATCH 1048/1640] UPSTREAM: bpf: Fix IPv6 dport byte-order in bpf_sk_lookup Commit 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") mistakenly passed the destination port in network byte-order to the IPv6 TCP/UDP socket lookup functions, which meant that BPF writers would need to either manually swap the byte-order of this field or otherwise IPv6 sockets could not be located via this helper. Fix the issue by swapping the byte-order appropriately in the helper. This also makes the API more consistent with the IPv4 version. 
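For reference, a tiny userspace illustration of the bug class (a
sketch, not part of this patch):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	uint16_t dport_net = htons(80);   /* as stored in bpf_sock_tuple */
	uint16_t hnum = ntohs(dport_net); /* host order, as the lookup
					   * functions expect */

	/* On a little-endian host this prints 20480 and 80: without the
	 * ntohs(), the hash-table lookup would search for the wrong
	 * port. */
	printf("raw: %u, host order: %u\n", dport_net, hnum);
	return 0;
}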
Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF")
Change-Id: I1d2304cd5a37ef8eb1736674f960899bfcd8099b
Signed-off-by: Joe Stringer
Signed-off-by: Alexei Starovoitov
---
 net/core/filter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index c359d843e7a2..daea80bd4a3c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4718,17 +4718,18 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
 	} else {
 		struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
 		struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
+		u16 hnum = ntohs(tuple->ipv6.dport);
 		int sdif = inet6_sdif(skb);
 
 		if (proto == IPPROTO_TCP)
 			sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0,
 					    src6, tuple->ipv6.sport,
-					    dst6, tuple->ipv6.dport,
+					    dst6, hnum,
 					    dif, sdif, &refcounted);
 		else if (likely(ipv6_bpf_stub))
 			sk = ipv6_bpf_stub->udp6_lib_lookup(net,
 							    src6, tuple->ipv6.sport,
-							    dst6, tuple->ipv6.dport,
+							    dst6, hnum,
 							    dif, sdif,
 							    &udp_table, skb);
 #endif

From 02e70446a7b94f22b5ef9f8e1f6ec0ac813e8c55 Mon Sep 17 00:00:00 2001
From: Nitin Hande
Date: Sun, 28 Oct 2018 21:02:45 -0700
Subject: [PATCH 1049/1640] UPSTREAM: bpf: Extend the sk_lookup() helper to
 XDP hookpoint.

This patch extends the sk_lookup() BPF API to the XDP hookpoint. The
sk_lookup() helper supports a lookup on an incoming packet to find the
corresponding socket that will receive this packet. Current support
for this BPF API is at the tc hookpoint. With this patch, an XDP
program can map the incoming packet to the 5-tuple parameter and
invoke the API to find the corresponding socket structure.

Change-Id: I1280b4b0f499fd8648d75398d5f1e84a9b4ec411
Signed-off-by: Nitin Hande
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h |   4 ++
 net/core/filter.c        | 102 +++++++++++++++++++++++++++++++++------
 2 files changed, 90 insertions(+), 16 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4b40731e532c..ba0a2f6e4ce9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2241,6 +2241,8 @@ union bpf_attr {
 * 		**CONFIG_NET** configuration option.
 * 	Return
 * 		Pointer to *struct bpf_sock*, or NULL in case of failure.
+* 		For sockets with reuseport option, *struct bpf_sock*
+* 		return is from reuse->socks[] using hash of the packet.
 *
 * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
 * 	Description
@@ -2273,6 +2275,8 @@ union bpf_attr {
 * 		**CONFIG_NET** configuration option.
 * 	Return
 * 		Pointer to *struct bpf_sock*, or NULL in case of failure.
+* 		For sockets with reuseport option, *struct bpf_sock*
+* 		return is from reuse->socks[] using hash of the packet.
* * int bpf_sk_release(struct bpf_sock *sk) * Description diff --git a/net/core/filter.c b/net/core/filter.c index daea80bd4a3c..d3006947d9d8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4694,35 +4694,32 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, - struct sk_buff *skb, u8 family, u8 proto) + int dif, int sdif, u8 family, u8 proto) { - int dif = skb->dev->ifindex; bool refcounted = false; struct sock *sk = NULL; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; - int sdif = inet_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, - dif, sdif, &udp_table, skb); + dif, sdif, &udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; u16 hnum = ntohs(tuple->ipv6.dport); - int sdif = inet6_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, src6, tuple->ipv6.sport, dst6, hnum, dif, sdif, &refcounted); @@ -4731,7 +4728,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, src6, tuple->ipv6.sport, dst6, hnum, dif, sdif, - &udp_table, skb); + &udp_table, NULL); #endif } @@ -4748,31 +4745,33 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, * callers to satisfy BPF_CALL declarations. */ static unsigned long -bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - u8 proto, u64 netns_id, u64 flags) +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) { - struct net *caller_net; struct sock *sk = NULL; u8 family = AF_UNSPEC; struct net *net; + int sdif; family = len == sizeof(tuple->ipv4) ? 
AF_INET : AF_INET6; if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) goto out; - if (skb->dev) - caller_net = dev_net(skb->dev); + if (family == AF_INET) + sdif = inet_sdif(skb); else - caller_net = sock_net(skb->sk); + sdif = inet6_sdif(skb); + if (netns_id) { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); } else { net = caller_net; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); } if (sk) @@ -4781,6 +4780,25 @@ out: return (unsigned long) sk; } +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + int ifindex; + + if (skb->dev) { + caller_net = dev_net(skb->dev); + ifindex = skb->dev->ifindex; + } else { + caller_net = sock_net(skb->sk); + ifindex = 0; + } + + return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, + proto, netns_id, flags); +} + BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { @@ -4830,6 +4848,50 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_SOCKET, }; + +BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { + .func = bpf_xdp_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { + .func = bpf_xdp_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5097,6 +5159,14 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_udp: + return &bpf_xdp_sk_lookup_udp_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_xdp_sk_lookup_tcp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } From 7367e5e8aafca0c3fe944082150cf016fab62115 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 9 Nov 2018 10:54:01 -0800 Subject: [PATCH 1050/1640] UPSTREAM: bpf: Support socket lookup in CGROUP_SOCK_ADDR progs Make bpf_sk_lookup_tcp, bpf_sk_lookup_udp and bpf_sk_release helpers available in 
programs of type BPF_PROG_TYPE_CGROUP_SOCK_ADDR.

Such programs operate on sockets and have access to the socket and the
struct sockaddr passed by user space to system calls such as sys_bind,
sys_connect, sys_sendmsg.

It's useful to be able to look up other sockets from these programs.
E.g. sys_connect may look up an IP:port endpoint and, if there is a
server socket bound to that endpoint ("server" can be defined by saddr
& sport being zero), redirect the client connection to it by rewriting
IP:port in the sockaddr passed to sys_connect.

Change-Id: Ia8ba060aed40e7e633f7449636513efac06a2810
Signed-off-by: Andrey Ignatov
Acked-by: Alexei Starovoitov
Acked-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
---
 net/core/filter.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index d3006947d9d8..fefc94137b20 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4892,6 +4892,43 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
 	.arg4_type	= ARG_ANYTHING,
 	.arg5_type	= ARG_ANYTHING,
 };
+
+BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
+	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+	return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0,
+			       IPPROTO_TCP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
+	.func		= bpf_sock_addr_sk_lookup_tcp,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
+	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+	return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0,
+			       IPPROTO_UDP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
+	.func		= bpf_sock_addr_sk_lookup_udp,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 #endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
@@ -5009,6 +5046,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_cookie_sock_addr_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
+#ifdef CONFIG_INET
+	case BPF_FUNC_sk_lookup_tcp:
+		return &bpf_sock_addr_sk_lookup_tcp_proto;
+	case BPF_FUNC_sk_lookup_udp:
+		return &bpf_sock_addr_sk_lookup_udp_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
+#endif /* CONFIG_INET */
 	default:
 		return bpf_base_func_proto(func_id);
 	}

From a1461096b90e3de8275d46eaafed2b332367c1dd Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Sat, 9 Feb 2019 23:22:23 -0800
Subject: [PATCH 1051/1640] UPSTREAM: bpf: Refactor sock_ops_convert_ctx_access

The next patch will introduce a new "struct bpf_tcp_sock" which exposes
the same tcp_sock's fields already exposed in "struct bpf_sock_ops".

This patch refactors the existing convert_ctx_access() code for
"struct bpf_sock_ops" to get it ready to be reused for
"struct bpf_tcp_sock". The "rtt_min" is not refactored in this patch
because its handling is different from the other fields.

The SOCK_OPS_GET_TCP_SOCK_FIELD is new. All other SOCK_OPS_XXX_FIELD
changes are code moves only.
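For illustration, a minimal sock_ops program whose field read is
rewritten by SOCK_OPS_GET_TCP_SOCK_FIELD (a sketch only, not part of
this patch; assumes clang/libbpf conventions):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int watch_cwnd(struct bpf_sock_ops *skops)
{
	/* The verifier rewrites this load into: check is_fullsock,
	 * load skops->sk, then load tp->snd_cwnd; the read yields 0
	 * when the context does not hold a full socket. */
	__u32 cwnd = skops->snd_cwnd;

	if (cwnd > 1000)
		skops->reply = 1; /* reply is a writable field */
	return 1;
}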
Acked-by: Alexei Starovoitov Change-Id: Ib922da49dc630f623fc5233d0049e304ab9888a2 Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 287 ++++++++++++++++++++-------------------------- 1 file changed, 127 insertions(+), 160 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index fefc94137b20..e9c2ddc33b79 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4692,6 +4692,54 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ +#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT) \ +do { \ + switch (si->off) { \ + case offsetof(md_type, snd_cwnd): \ + CONVERT(snd_cwnd); break; \ + case offsetof(md_type, srtt_us): \ + CONVERT(srtt_us); break; \ + case offsetof(md_type, snd_ssthresh): \ + CONVERT(snd_ssthresh); break; \ + case offsetof(md_type, rcv_nxt): \ + CONVERT(rcv_nxt); break; \ + case offsetof(md_type, snd_nxt): \ + CONVERT(snd_nxt); break; \ + case offsetof(md_type, snd_una): \ + CONVERT(snd_una); break; \ + case offsetof(md_type, mss_cache): \ + CONVERT(mss_cache); break; \ + case offsetof(md_type, ecn_flags): \ + CONVERT(ecn_flags); break; \ + case offsetof(md_type, rate_delivered): \ + CONVERT(rate_delivered); break; \ + case offsetof(md_type, rate_interval_us): \ + CONVERT(rate_interval_us); break; \ + case offsetof(md_type, packets_out): \ + CONVERT(packets_out); break; \ + case offsetof(md_type, retrans_out): \ + CONVERT(retrans_out); break; \ + case offsetof(md_type, total_retrans): \ + CONVERT(total_retrans); break; \ + case offsetof(md_type, segs_in): \ + CONVERT(segs_in); break; \ + case offsetof(md_type, data_segs_in): \ + CONVERT(data_segs_in); break; \ + case offsetof(md_type, segs_out): \ + CONVERT(segs_out); break; \ + case offsetof(md_type, data_segs_out): \ + CONVERT(data_segs_out); break; \ + case offsetof(md_type, lost_out): \ + CONVERT(lost_out); break; \ + case offsetof(md_type, sacked_out): \ + CONVERT(sacked_out); break; \ + case offsetof(md_type, bytes_received): \ + CONVERT(bytes_received); break; \ + case offsetof(md_type, bytes_acked): \ + CONVERT(bytes_acked); break; \ + } \ +} while (0) + #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) @@ -6792,6 +6840,85 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn = insn_buf; int off; +/* Helper macro for adding read access to tcp_sock or sock fields. */ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + } while (0) + +#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ + SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) + +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. 
However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other register. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. + */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + } while (0) + + CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops, + SOCK_OPS_GET_TCP_SOCK_FIELD); + + if (insn > insn_buf) + return insn - insn_buf; + switch (si->off) { case offsetof(struct bpf_sock_ops, op) ... offsetof(struct bpf_sock_ops, replylong[3]): @@ -6949,175 +7076,15 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, FIELD_SIZEOF(struct minmax_sample, t)); break; -/* Helper macro for adding read access to tcp_sock or sock fields. */ -#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ - do { \ - BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ - FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ - struct bpf_sock_ops_kern, \ - is_fullsock), \ - si->dst_reg, si->src_reg, \ - offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ - *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ - struct bpf_sock_ops_kern, sk),\ - si->dst_reg, si->src_reg, \ - offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ - OBJ_FIELD), \ - si->dst_reg, si->dst_reg, \ - offsetof(OBJ, OBJ_FIELD)); \ - } while (0) - -/* Helper macro for adding write access to tcp_sock or sock fields. - * The macro is called with two registers, dst_reg which contains a pointer - * to ctx (context) and src_reg which contains the value that should be - * stored. However, we need an additional register since we cannot overwrite - * dst_reg because it may be used later in the program. - * Instead we "borrow" one of the other register. We first save its value - * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore - * it at the end of the macro. 
- */ -#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ - do { \ - int reg = BPF_REG_9; \ - BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ - FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ - if (si->dst_reg == reg || si->src_reg == reg) \ - reg--; \ - if (si->dst_reg == reg || si->src_reg == reg) \ - reg--; \ - *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ - offsetof(struct bpf_sock_ops_kern, \ - temp)); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ - struct bpf_sock_ops_kern, \ - is_fullsock), \ - reg, si->dst_reg, \ - offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ - *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ - struct bpf_sock_ops_kern, sk),\ - reg, si->dst_reg, \ - offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ - reg, si->src_reg, \ - offsetof(OBJ, OBJ_FIELD)); \ - *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ - offsetof(struct bpf_sock_ops_kern, \ - temp)); \ - } while (0) - -#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ - do { \ - if (TYPE == BPF_WRITE) \ - SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ - else \ - SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ - } while (0) - - case offsetof(struct bpf_sock_ops, snd_cwnd): - SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, srtt_us): - SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); - break; - case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; - case offsetof(struct bpf_sock_ops, snd_ssthresh): - SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, rcv_nxt): - SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, snd_nxt): - SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, snd_una): - SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, mss_cache): - SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, ecn_flags): - SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, rate_delivered): - SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, rate_interval_us): - SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, packets_out): - SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, retrans_out): - SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, total_retrans): - SOCK_OPS_GET_FIELD(total_retrans, total_retrans, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, segs_in): - SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, data_segs_in): - SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, segs_out): - SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, data_segs_out): - SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, 
lost_out): - SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, sacked_out): - SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); - break; - case offsetof(struct bpf_sock_ops, sk_txhash): SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; - - case offsetof(struct bpf_sock_ops, bytes_received): - SOCK_OPS_GET_FIELD(bytes_received, bytes_received, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, bytes_acked): - SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); - break; - } return insn - insn_buf; } From 9ebb87cc5bc62dbe9b5c4cfbe8a734a850baa5a8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 9 Feb 2019 23:22:24 -0800 Subject: [PATCH 1052/1640] BACKPORT: bpf: Add struct bpf_tcp_sock and BPF_FUNC_tcp_sock This patch adds a helper function BPF_FUNC_tcp_sock, and it is currently available for cg_skb and sched_(cls|act): struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk); int cg_skb_foo(struct __sk_buff *skb) { struct bpf_tcp_sock *tp; struct bpf_sock *sk; __u32 snd_cwnd; sk = skb->sk; if (!sk) return 1; tp = bpf_tcp_sock(sk); if (!tp) return 1; snd_cwnd = tp->snd_cwnd; /* ... */ return 1; } A 'struct bpf_tcp_sock' is also added to the uapi bpf.h to provide read-only access. bpf_tcp_sock has all the existing tcp_sock fields that have already been exposed by bpf_sock_ops, i.e. no new tcp_sock fields are exposed in bpf.h. This helper returns a pointer to the tcp_sock. If it is not a tcp_sock or it cannot be traced back to a tcp_sock by sk_to_full_sk(), it returns NULL. Hence, the caller needs to check for NULL before accessing it. The current use case is to expose members from tcp_sock to allow a cg_skb_bpf_prog to provide per-cgroup traffic policing/shaping.
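A minimal sketch of that policing use case (illustrative only; the section name follows libbpf conventions and the in-flight threshold is made up):

	SEC("cgroup_skb/egress")
	int cg_skb_police(struct __sk_buff *skb)
	{
		struct bpf_tcp_sock *tp;
		struct bpf_sock *sk;

		sk = skb->sk;
		if (!sk)
			return 1;
		tp = bpf_tcp_sock(sk);
		if (!tp)
			return 1;
		/* 1 = allow, 0 = drop: shed egress while too much data
		 * from this cgroup's socket is already in flight.
		 */
		return tp->packets_out > 64 ? 0 : 1;
	}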
Acked-by: Alexei Starovoitov Change-Id: I6efe1fd67f02a3a1c489ed5b7f4e86990b9d225a Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 30 +++++++++++++++ include/uapi/linux/bpf.h | 48 ++++++++++++++++++++++++ kernel/bpf/verifier.c | 31 +++++++++++++++- net/core/filter.c | 79 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 186 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b96ed5ab186d..6ecee0424d8a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -209,6 +209,7 @@ enum bpf_return_type { RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ + RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -264,6 +265,8 @@ enum bpf_reg_type { PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ + PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ + PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -973,4 +976,31 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, } #endif +#ifdef CONFIG_INET +bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info); + +u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size); +#else +static inline bool bpf_tcp_sock_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + return false; +} + +static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + return 0; +} +#endif /* CONFIG_INET */ + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ba0a2f6e4ce9..54f387e243ac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2300,6 +2300,15 @@ union bpf_attr { * Return * A **struct bpf_sock** pointer on success, or NULL in * case of failure. + * + * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * + * Return + * A **struct bpf_tcp_sock** pointer on success, or NULL in + * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2607,6 +2616,45 @@ struct bpf_sock { */ }; +struct bpf_tcp_sock { + __u32 snd_cwnd; /* Sending congestion window */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 rtt_min; + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 ecn_flags; /* ECN status bits. 
*/ + __u32 rate_delivered; /* saved rate sample: packets delivered */ + __u32 rate_interval_us; /* saved rate sample: time elapsed */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. + */ + __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. + */ + __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ + __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. + */ +}; + struct bpf_sock_tuple { union { struct { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f83bafc569d9..f29a61eb52e1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -333,14 +333,16 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) static bool type_is_sk_pointer(enum bpf_reg_type type) { return type == PTR_TO_SOCKET || - type == PTR_TO_SOCK_COMMON; + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_TCP_SOCK; } static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_SOCK_COMMON_OR_NULL; + type == PTR_TO_SOCK_COMMON_OR_NULL || + type == PTR_TO_TCP_SOCK_OR_NULL; } static bool type_is_refcounted(enum bpf_reg_type type) @@ -406,6 +408,8 @@ static const char * const reg_type_str[] = { [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", [PTR_TO_SOCK_COMMON] = "sock_common", [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", }; static char slot_type_char[] = { @@ -1208,6 +1212,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: return true; default: return false; @@ -1671,6 +1677,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, case PTR_TO_SOCKET: valid = bpf_sock_is_valid_access(off, size, t, &info); break; + case PTR_TO_TCP_SOCK: + valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); + break; default: valid = false; } @@ -1832,6 +1841,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_SOCK_COMMON: pointer_desc = "sock_common "; break; + case PTR_TO_TCP_SOCK: + pointer_desc = "tcp_sock "; + break; default: break; } @@ -3254,6 +3266,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn /* For mark_ptr_or_null_reg() */ regs[BPF_REG_0].id = ++env->id_gen; } + } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -3691,6 +3707,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + 
case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -4910,6 +4928,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_SOCKET; } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { reg->type = PTR_TO_SOCK_COMMON; + } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { + reg->type = PTR_TO_TCP_SOCK; } if (is_null || !(reg_is_refcounted(reg) || reg_may_point_to_spin_lock(reg))) { @@ -6127,6 +6147,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -6439,6 +6461,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: return false; default: return true; @@ -7448,6 +7472,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_SOCK_COMMON: convert_ctx_access = bpf_sock_convert_ctx_access; break; + case PTR_TO_TCP_SOCK: + convert_ctx_access = bpf_tcp_sock_convert_ctx_access; + break; default: continue; } diff --git a/net/core/filter.c b/net/core/filter.c index e9c2ddc33b79..1aba772312f5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4977,6 +4977,79 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .arg5_type = ARG_ANYTHING, }; +bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + case offsetof(struct bpf_tcp_sock, bytes_received): + case offsetof(struct bpf_tcp_sock, bytes_acked): + return size == sizeof(__u64); + default: + return size == sizeof(__u32); + } +} + +u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_TCP_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) > \ + FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct tcp_sock, FIELD)); \ + } while (0) + + CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock, + BPF_TCP_SOCK_GET_COMMON); + + if (insn > insn_buf) + return insn - insn_buf; + + switch (si->off) { + case offsetof(struct bpf_tcp_sock, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct tcp_sock, rtt_min) + + offsetof(struct minmax_sample, v)); + break; + } + + return insn - insn_buf; +} + +BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) +{ + sk = sk_to_full_sk(sk); + + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_tcp_sock_proto = { + .func = bpf_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + #endif /* 
CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5132,6 +5205,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif default: return sk_filter_func_proto(func_id, prog); } @@ -5224,6 +5301,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; #endif default: return bpf_base_func_proto(func_id); From d84c881552c32a6cd3970ffe6b10e6bb3a41301b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 12 Feb 2019 00:20:39 -0800 Subject: [PATCH 1053/1640] BACKPORT: bpf: offload: add priv field for drivers Currently bpf_offload_dev does not have any priv pointer, forcing the drivers to work backwards from the netdev in program metadata. This is not great given programs are conceptually associated with the offload device, and it means one or two unnecessary deferences. Add a priv pointer to bpf_offload_dev. Change-Id: I4e592fd1286dc91834bcf6e5134ef52d7231bc40 Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 3 ++- kernel/bpf/offload.c | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6ecee0424d8a..1148e30f0295 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -787,8 +787,9 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); struct bpf_offload_dev * -bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops); +bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv); void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev); +void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev); int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 24d855b384e9..3668a0bc18ec 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -35,6 +35,7 @@ static DECLARE_RWSEM(bpf_devs_lock); struct bpf_offload_dev { const struct bpf_prog_offload_ops *ops; struct list_head netdevs; + void *priv; }; struct bpf_offload_netdev { @@ -669,7 +670,7 @@ unlock: EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); struct bpf_offload_dev * -bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops) +bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) { struct bpf_offload_dev *offdev; int err; @@ -690,6 +691,7 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops) return ERR_PTR(-ENOMEM); offdev->ops = ops; + offdev->priv = priv; INIT_LIST_HEAD(&offdev->netdevs); return offdev; @@ -702,3 +704,9 @@ void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) kfree(offdev); } EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); + +void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev) +{ + return offdev->priv; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_priv); From 9e48f78dfa41eb21b294e92a1a011a038994b460 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Jan 2019 17:21:52 -0800 Subject: [PATCH 1054/1640] UPSTREAM: bpf: check that BPF programs run with preemption disabled 
Introduce cant_sleep() macro for annotation of functions that cannot sleep. Use it in BPF_PROG_RUN to catch execution of BPF programs in preemptable context. Suggested-by: Jann Horn Change-Id: If46f95bfbb77de85daf1db948927a72460010d07 Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 2 +- include/linux/kernel.h | 14 ++++++++++++-- kernel/sched/core.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 2e15b0e7277f..4e5a6f4b0e05 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -621,7 +621,7 @@ static inline void bpf_jit_set_header_magic(struct bpf_binary_header *hdr) } #endif -#define BPF_PROG_RUN(filter, ctx) bpf_call_func(filter, ctx) +#define BPF_PROG_RUN(filter, ctx) ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); }) #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2114e3e5abd1..6766c845da8b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -212,8 +212,10 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP - void ___might_sleep(const char *file, int line, int preempt_offset); - void __might_sleep(const char *file, int line, int preempt_offset); +extern void ___might_sleep(const char *file, int line, int preempt_offset); +extern void __might_sleep(const char *file, int line, int preempt_offset); +extern void __cant_sleep(const char *file, int line, int preempt_offset); + /** * might_sleep - annotation for functions that can sleep * @@ -226,6 +228,13 @@ extern int _cond_resched(void); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) +/** + * cant_sleep - annotation for functions that cannot sleep + * + * this macro will print a stack trace if it is executed with preemption enabled + */ +# define cant_sleep() \ + do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) # define sched_annotate_sleep() (current->task_state_change = 0) #else static inline void ___might_sleep(const char *file, int line, @@ -233,6 +242,7 @@ extern int _cond_resched(void); static inline void __might_sleep(const char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) +# define cant_sleep() do { } while (0) # define sched_annotate_sleep() do { } while (0) #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ae615d26c61c..c85b870325c2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6658,6 +6658,34 @@ void ___might_sleep(const char *file, int line, int preempt_offset) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL(___might_sleep); + +void __cant_sleep(const char *file, int line, int preempt_offset) +{ + static unsigned long prev_jiffy; + + if (irqs_disabled()) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + if (preempt_count() > preempt_offset) + return; + + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); + printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} +EXPORT_SYMBOL_GPL(__cant_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ From 
f3a5dc8b00a6500a0b150a86f568eaed620a5910 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:39 -0800 Subject: [PATCH 1055/1640] BACKPORT: bpf: enable program stats JITed BPF programs are indistinguishable from kernel functions, but unlike kernel code BPF code can be changed often. The typical "perf record" + "perf report" approach to profiling and tuning kernel code works just as well for BPF programs, but kernel code doesn't need to be monitored whereas BPF programs do. Users load and run a large number of BPF programs. These BPF stats allow tools to monitor the usage of BPF on the server. The monitoring tools will turn sysctl kernel.bpf_stats_enabled on and off for a few seconds to sample the average cost of the programs. Aggregated data over hours and days will provide an insight into the cost of BPF, and alarms can trigger in case a given program suddenly gets more expensive. The cost of two sched_clock() calls per program invocation adds ~20 nsec. Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down from ~10 nsec to ~30 nsec. static_key minimizes the cost of the stats collection. There is no measurable difference before/after this patch with kernel.bpf_stats_enabled=0. Change-Id: If2f51ea469ce232eb67b806514a0a158847ae302 Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 9 +++++++++ include/linux/filter.h | 20 +++++++++++++++++++- kernel/bpf/core.c | 31 +++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++-- kernel/bpf/verifier.c | 7 ++++++- kernel/sysctl.c | 34 ++++++++++++++++++++++++++++++++++ 6 files changed, 129 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1148e30f0295..92cee04b3215 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -16,6 +16,7 @@ #include #include #include +#include <linux/u64_stats_sync.h> struct bpf_verifier_env; struct perf_event; @@ -345,6 +346,12 @@ enum bpf_cgroup_storage_type { #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX +struct bpf_prog_stats { + u64 cnt; + u64 nsecs; + struct u64_stats_sync syncp; +}; + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -394,6 +401,7 @@ struct bpf_prog_aux { * main prog always has linfo_idx == 0 */ u32 linfo_idx; + struct bpf_prog_stats __percpu *stats; union { struct work_struct work; struct rcu_head rcu; @@ -562,6 +570,7 @@ void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); extern int sysctl_unprivileged_bpf_disabled; +extern int sysctl_bpf_stats_enabled; int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); diff --git a/include/linux/filter.h b/include/linux/filter.h index 4e5a6f4b0e05..036903dfea6f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -621,7 +621,24 @@ static inline void bpf_jit_set_header_magic(struct bpf_binary_header *hdr) } #endif -#define BPF_PROG_RUN(filter, ctx) ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); }) +DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); + +#define BPF_PROG_RUN(prog, ctx) ({ \ + u32 ret; \ + cant_sleep(); \ + if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ + struct bpf_prog_stats *stats; \ + u64 start = sched_clock(); \ + ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \ + stats = this_cpu_ptr(prog->aux->stats); \ + u64_stats_update_begin(&stats->syncp); \ + stats->cnt++; \ + stats->nsecs += sched_clock() - start; \ + u64_stats_update_end(&stats->syncp); \ + } else { \ + ret =
(*(prog)->bpf_func)(ctx, (prog)->insnsi); \ + } \ + ret; }) #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN @@ -852,6 +869,7 @@ void bpf_prog_free_jited_linfo(struct bpf_prog *prog); void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index be30c4be3eb0..e4bb91ab7842 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -86,7 +86,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } -struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; @@ -112,6 +112,26 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) return fp; } + +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + struct bpf_prog *prog; + + prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); + if (!prog) + return NULL; + + prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->aux->stats) { + kfree(prog->aux); + vfree(prog); + return NULL; + } + + u64_stats_init(&prog->aux->stats->syncp); + return prog; +} EXPORT_SYMBOL_GPL(bpf_prog_alloc); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) @@ -239,7 +259,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->aux); + if (fp->aux) { + free_percpu(fp->aux->stats); + kfree(fp->aux); + } vfree(fp); } @@ -2103,6 +2126,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } +DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +EXPORT_SYMBOL(bpf_stats_enabled_key); +int sysctl_bpf_stats_enabled __read_mostly; + /* All definitions of tracepoints related to BPF. 
*/ #define CREATE_TRACE_POINTS #include diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2eefcb8b2953..7e56c66ef3a0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1288,24 +1288,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +static void bpf_prog_get_stats(const struct bpf_prog *prog, + struct bpf_prog_stats *stats) +{ + u64 nsecs = 0, cnt = 0; + int cpu; + + for_each_possible_cpu(cpu) { + const struct bpf_prog_stats *st; + unsigned int start; + u64 tnsecs, tcnt; + + st = per_cpu_ptr(prog->aux->stats, cpu); + do { + start = u64_stats_fetch_begin_irq(&st->syncp); + tnsecs = st->nsecs; + tcnt = st->cnt; + } while (u64_stats_fetch_retry_irq(&st->syncp, start)); + nsecs += tnsecs; + cnt += tcnt; + } + stats->nsecs = nsecs; + stats->cnt = cnt; +} + #ifdef CONFIG_PROC_FS static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; + struct bpf_prog_stats stats; + bpf_prog_get_stats(prog, &stats); bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" "memlock:\t%llu\n" - "prog_id:\t%u\n", + "prog_id:\t%u\n" + "run_time_ns:\t%llu\n" + "run_cnt:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, - prog->aux->id); + prog->aux->id, + stats.nsecs, + stats.cnt); } #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f29a61eb52e1..628eb475e6ea 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7602,7 +7602,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; - func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + /* BPF_PROG_RUN doesn't call subprogs directly, + * hence main prog stats include the runtime of subprogs. 
+ * subprogs don't have IDs and not reachable via prog_get_next_id + * func[i]->aux->stats will never be accessed and stays NULL + */ + func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) goto out_free; memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8af069d2fc53..16d4b91cd991 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -234,6 +234,9 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ @@ -1428,6 +1431,15 @@ static struct ctl_table kern_table[] = { .extra2 = &two, }, #endif + { + .procname = "bpf_stats_enabled", + .data = &sysctl_bpf_stats_enabled, + .maxlen = sizeof(sysctl_bpf_stats_enabled), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_bpf_stats, + .extra1 = &zero, + .extra2 = &one, + }, #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) { .procname = "panic_on_rcu_stall", @@ -3533,6 +3545,28 @@ int proc_douintvec_capacity(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, bpf_stats = *(int *)table->data; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &bpf_stats; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + *(int *)table->data = bpf_stats; + if (bpf_stats) + static_branch_enable(&bpf_stats_enabled_key); + else + static_branch_disable(&bpf_stats_enabled_key); + } + return ret; +} + /* * No sense putting this after each symbol definition, twice, * exception granted :-) From c6be4f6e86617accfb8feb7e69ec374b7d505d63 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:40 -0800 Subject: [PATCH 1056/1640] UPSTREAM: bpf: expose program stats via bpf_prog_info Return bpf program run_time_ns and run_cnt via bpf_prog_info Change-Id: I76830ceaabd2de88a6d76c04c3e5a4c356c29ee6 Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54f387e243ac..6b795a95ec89 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2781,6 +2781,8 @@ struct bpf_prog_info { __u32 jited_line_info_rec_size; __u32 nr_prog_tags; __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7e56c66ef3a0..bc8c0fc0edcc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2173,6 +2173,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_prog_info info; u32 info_len = attr->info.info_len; + struct bpf_prog_stats stats; char __user *uinsns; u32 ulen; int err; @@ -2213,6 +2214,10 @@ static int bpf_prog_get_info_by_fd(struct file *file, if (err) return err; + bpf_prog_get_stats(prog, &stats); + info.run_time_ns = stats.nsecs; + info.run_cnt = 
stats.cnt; + if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; From 427863a41c3e977aa40ce2651201f0bbf0cfb707 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 27 Feb 2019 13:22:56 -0800 Subject: [PATCH 1057/1640] UPSTREAM: bpf: set inner_map_meta->spin_lock_off correctly Commit d83525ca62cf ("bpf: introduce bpf_spin_lock") introduced bpf_spin_lock, and the field spin_lock_off in the kernel-internal structure bpf_map has the following meaning: >=0 valid offset, <0 error For every map created, the kernel will ensure spin_lock_off has a correct value. Currently, bpf_map->spin_lock_off is not copied from the inner map to the map_in_map inner_map_meta during a map_in_map type map creation, so inner_map_meta->spin_lock_off = 0. This gives the verifier the wrong information that inner_map has a bpf_spin_lock and that the bpf_spin_lock is defined at offset 0. An access to offset 0 of a value pointer will then trigger the following error: bpf_spin_lock cannot be accessed directly by load/store This patch fixes the issue by copying the inner map's spin_lock_off value to inner_map_meta->spin_lock_off. Fixes: d83525ca62cf ("bpf: introduce bpf_spin_lock") Change-Id: I6b6b24301a09c6ca696658ca19a25691612be2d7 Signed-off-by: Yonghong Song Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- kernel/bpf/map_in_map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 2cc43f053771..830c3187828b 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -58,6 +58,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; + inner_map_meta->spin_lock_off = inner_map->spin_lock_off; /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; From 78be9b2f5df69a79bc1411bd5e7d960464294ecb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Mar 2019 14:33:11 -0800 Subject: [PATCH 1058/1640] UPSTREAM: bpf: fix u64_stats_init() usage in bpf_prog_alloc() We need to iterate through all possible CPUs.
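u64_stats_init() only initializes the one instance it is handed, so initializing just the first per-CPU copy leaves every other CPU's bpf_prog_stats seqcount untouched; each copy needs its own init, in the spirit of this sketch (the hunk below does exactly this):

	int cpu;

	for_each_possible_cpu(cpu)
		u64_stats_init(&per_cpu_ptr(prog->aux->stats, cpu)->syncp);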
Fixes: 492ecee892c2 ("bpf: enable program stats") Change-Id: I6428056a575315e7e6e4c52f38e1842925a27c66 Signed-off-by: Eric Dumazet Reported-by: Guenter Roeck Tested-by: Guenter Roeck Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e4bb91ab7842..06ffea1a3646 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -117,6 +117,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *prog; + int cpu; prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); if (!prog) @@ -129,7 +130,12 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) return NULL; } - u64_stats_init(&prog->aux->stats->syncp); + for_each_possible_cpu(cpu) { + struct bpf_prog_stats *pstats; + + pstats = per_cpu_ptr(prog->aux->stats, cpu); + u64_stats_init(&pstats->syncp); + } return prog; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); From 95ec82a0a69ca8ba71c50ee7a444ec199b820f02 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 4 Mar 2019 21:08:53 +0100 Subject: [PATCH 1059/1640] UPSTREAM: bpf: fix replace_map_fd_with_map_ptr's ldimm64 second imm field Non-zero imm value in the second part of the ldimm64 instruction for BPF_PSEUDO_MAP_FD is invalid, and thus must be rejected. The map fd only ever sits in the first instructions' imm field. None of the BPF loaders known to us are using it, so risk of regression is minimal. For clarity and consistency, the few insn->{src_reg,imm} occurrences are rewritten into insn[0].{src_reg,imm}. Add a test case to the BPF selftest suite as well. Fixes: 0246e64d9a5f ("bpf: handle pseudo BPF_LD_IMM64 insn") Change-Id: I89f089d2cfcf6fc92d609d18a403c3222f76b2a6 Signed-off-by: Daniel Borkmann Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 628eb475e6ea..80d6837a201d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6956,17 +6956,17 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* valid generic load 64-bit imm */ goto next_insn; - if (insn->src_reg != BPF_PSEUDO_MAP_FD) { - verbose(env, - "unrecognized bpf_ld_imm64 insn\n"); + if (insn[0].src_reg != BPF_PSEUDO_MAP_FD || + insn[1].imm != 0) { + verbose(env, "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } - f = fdget(insn->imm); + f = fdget(insn[0].imm); map = __bpf_map_get(f); if (IS_ERR(map)) { verbose(env, "fd %d is not pointing to valid bpf_map\n", - insn->imm); + insn[0].imm); return PTR_ERR(map); } From 91b521d48383176169639edddcab36dafa3c08a6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 12 Mar 2019 10:23:02 -0700 Subject: [PATCH 1060/1640] BACKPORT: bpf: Fix bpf_tcp_sock and bpf_sk_fullsock issue related to bpf_sk_release Lorenz Bauer [thanks!] reported that a ptr returned by bpf_tcp_sock(sk) can still be accessed after bpf_sk_release(sk). Both bpf_tcp_sock() and bpf_sk_fullsock() have the same issue. This patch addresses them together. A simple reproducer looks like this: sk = bpf_sk_lookup_tcp(); /* if (!sk) ... */ tp = bpf_tcp_sock(sk); /* if (!tp) ... */ bpf_sk_release(sk); snd_cwnd = tp->snd_cwnd; /* oops! The verifier does not complain. */ The problem is the verifier did not scrub the register's states of the tcp_sock ptr (tp) after bpf_sk_release(sk). 
[ Note that when calling bpf_tcp_sock(sk), the sk is not always refcount-acquired. e.g. bpf_tcp_sock(skb->sk). The verifier works fine for this case. ] Currently, the verifier does not track if a helper's return ptr (in REG_0) is "carry"-ing one of its argument's refcount status. To carry this info, the reg1->id needs to be stored in reg0. One approach was tried, like "reg0->id = reg1->id", when calling "bpf_tcp_sock()". The main idea was to avoid adding another "ref_obj_id" for the same reg. However, overlapping the NULL marking and ref tracking purpose in one "id" does not work well: ref_sk = bpf_sk_lookup_tcp(); fullsock = bpf_sk_fullsock(ref_sk); tp = bpf_tcp_sock(ref_sk); if (!fullsock) { bpf_sk_release(ref_sk); return 0; } /* fullsock_reg->id is marked for NOT-NULL. * Same for tp_reg->id because they have the same id. */ /* oops. verifier did not complain about the missing !tp check */ snd_cwnd = tp->snd_cwnd; Hence, a new "ref_obj_id" is needed in "struct bpf_reg_state". With a new ref_obj_id, when bpf_sk_release(sk) is called, the verifier can scrub all reg states which has a ref_obj_id match. It is done with the changes in release_reg_references() in this patch. While fixing it, sk_to_full_sk() is removed from bpf_tcp_sock() and bpf_sk_fullsock() to avoid these helpers from returning another ptr. It will make bpf_sk_release(tp) possible: sk = bpf_sk_lookup_tcp(); /* if (!sk) ... */ tp = bpf_tcp_sock(sk); /* if (!tp) ... */ bpf_sk_release(tp); A separate helper "bpf_get_listener_sock()" will be added in a later patch to do sk_to_full_sk(). Misc change notes: - To allow bpf_sk_release(tp), the arg of bpf_sk_release() is changed from ARG_PTR_TO_SOCKET to ARG_PTR_TO_SOCK_COMMON. ARG_PTR_TO_SOCKET is removed from bpf.h since no helper is using it. - arg_type_is_refcounted() is renamed to arg_type_may_be_refcounted() because ARG_PTR_TO_SOCK_COMMON is the only one and skb->sk is not refcounted. All bpf_sk_release(), bpf_sk_fullsock() and bpf_tcp_sock() take ARG_PTR_TO_SOCK_COMMON. - check_refcount_ok() ensures is_acquire_function() cannot take arg_type_may_be_refcounted() as its argument. - The check_func_arg() can only allow one refcount-ed arg. It is guaranteed by check_refcount_ok() which ensures at most one arg can be refcounted. Hence, it is a verifier internal error if >1 refcount arg found in check_func_arg(). - In release_reference(), release_reference_state() is called first to ensure a match on "reg->ref_obj_id" can be found before scrubbing the reg states with release_reg_references(). - reg_is_refcounted() is no longer needed. 1. In mark_ptr_or_null_regs(), its usage is replaced by "ref_obj_id && ref_obj_id == id" because, when is_null == true, release_reference_state() should only be called on the ref_obj_id obtained by a acquire helper (i.e. is_acquire_function() == true). Otherwise, the following would happen: sk = bpf_sk_lookup_tcp(); /* if (!sk) { ... } */ fullsock = bpf_sk_fullsock(sk); if (!fullsock) { /* * release_reference_state(fullsock_reg->ref_obj_id) * where fullsock_reg->ref_obj_id == sk_reg->ref_obj_id. * * Hence, the following bpf_sk_release(sk) will fail * because the ref state has already been released in the * earlier release_reference_state(fullsock_reg->ref_obj_id). */ bpf_sk_release(sk); } 2. In release_reg_references(), the current reg_is_refcounted() call is unnecessary because the id check is enough. - The type_is_refcounted() and type_is_refcounted_or_null() are no longer needed also because reg_is_refcounted() is removed. 
Fixes: 655a51e536c0 ("bpf: Add struct bpf_tcp_sock and BPF_FUNC_tcp_sock") Reported-by: Lorenz Bauer Change-Id: I8190c2d2ec5f092f3b715d002c5270580a4a4fff Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 - include/linux/bpf_verifier.h | 40 +++++++++++ kernel/bpf/verifier.c | 133 ++++++++++++++++++++--------------- net/core/filter.c | 6 +- 4 files changed, 116 insertions(+), 64 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 92cee04b3215..cabc4fcd6bf0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -198,7 +198,6 @@ enum bpf_arg_type { ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ - ARG_PTR_TO_SOCKET, /* pointer to bpf_sock */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 33fea3502118..ecf3bb76790a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -66,6 +66,46 @@ struct bpf_reg_state { * same reference to the socket, to determine proper reference freeing. */ u32 id; + /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned + * from a pointer-cast helper, bpf_sk_fullsock() and + * bpf_tcp_sock(). + * + * Consider the following where "sk" is a reference counted + * pointer returned from "sk = bpf_sk_lookup_tcp();": + * + * 1: sk = bpf_sk_lookup_tcp(); + * 2: if (!sk) { return 0; } + * 3: fullsock = bpf_sk_fullsock(sk); + * 4: if (!fullsock) { bpf_sk_release(sk); return 0; } + * 5: tp = bpf_tcp_sock(fullsock); + * 6: if (!tp) { bpf_sk_release(sk); return 0; } + * 7: bpf_sk_release(sk); + * 8: snd_cwnd = tp->snd_cwnd; // verifier will complain + * + * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and + * "tp" ptr should be invalidated also. In order to do that, + * the reg holding "fullsock" and "sk" need to remember + * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id + * such that the verifier can reset all regs which have + * ref_obj_id matching the sk_reg->id. + * + * sk_reg->ref_obj_id is set to sk_reg->id at line 1. + * sk_reg->id will stay as NULL-marking purpose only. + * After NULL-marking is done, sk_reg->id can be reset to 0. + * + * After "fullsock = bpf_sk_fullsock(sk);" at line 3, + * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id. + * + * After "tp = bpf_tcp_sock(fullsock);" at line 5, + * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id + * which is the same as sk_reg->ref_obj_id. + * + * From the verifier perspective, if sk, fullsock and tp + * are not NULL, they are the same ptr with different + * reg->type. In particular, bpf_sk_release(tp) is also + * allowed and has the same effect as bpf_sk_release(sk). + */ + u32 ref_obj_id; /* For scalar types (SCALAR_VALUE), this represents our knowledge of * the actual value. 
* For pointer types, this represents the variable part of the offset diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 80d6837a201d..d29e30b1579c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -211,7 +211,7 @@ struct bpf_call_arg_meta { int regno; int access_size; u64 msize_max_value; - int ptr_id; + int ref_obj_id; int func_id; }; @@ -345,35 +345,15 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) type == PTR_TO_TCP_SOCK_OR_NULL; } -static bool type_is_refcounted(enum bpf_reg_type type) -{ - return type == PTR_TO_SOCKET; -} - -static bool type_is_refcounted_or_null(enum bpf_reg_type type) -{ - return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL; -} - -static bool reg_is_refcounted(const struct bpf_reg_state *reg) -{ - return type_is_refcounted(reg->type); -} - static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return reg->type == PTR_TO_MAP_VALUE && map_value_has_spin_lock(reg->map_ptr); } -static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) +static bool arg_type_may_be_refcounted(enum bpf_arg_type type) { - return type_is_refcounted_or_null(reg->type); -} - -static bool arg_type_is_refcounted(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_SOCKET; + return type == ARG_PTR_TO_SOCK_COMMON; } /* Determine whether the function releases some resources allocated by another @@ -391,6 +371,12 @@ static bool is_acquire_function(enum bpf_func_id func_id) func_id == BPF_FUNC_sk_lookup_udp; } +static bool is_ptr_cast_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_tcp_sock || + func_id == BPF_FUNC_sk_fullsock; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -464,7 +450,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (t == PTR_TO_STACK) verbose(env, ",call_%d", func(env, reg)->callsite); } else { - verbose(env, "(id=%d", reg->id); + verbose(env, "(id=%d ref_obj_id=%d", reg->id, + reg->ref_obj_id); if (t != SCALAR_VALUE) verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) @@ -2498,16 +2485,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ if (!type_is_sk_pointer(type)) goto err_type; - } else if (arg_type == ARG_PTR_TO_SOCKET) { - expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; - if (meta->ptr_id || !reg->id) { - verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n", - meta->ptr_id, reg->id); - return -EFAULT; + if (reg->ref_obj_id) { + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; } - meta->ptr_id = reg->id; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -2823,32 +2809,38 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) return true; } -static bool check_refcount_ok(const struct bpf_func_proto *fn) +static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) { int count = 0; - if (arg_type_is_refcounted(fn->arg1_type)) + if (arg_type_may_be_refcounted(fn->arg1_type)) count++; - if (arg_type_is_refcounted(fn->arg2_type)) + if (arg_type_may_be_refcounted(fn->arg2_type)) count++; - if (arg_type_is_refcounted(fn->arg3_type)) + if 
(arg_type_may_be_refcounted(fn->arg3_type)) count++; - if (arg_type_is_refcounted(fn->arg4_type)) + if (arg_type_may_be_refcounted(fn->arg4_type)) count++; - if (arg_type_is_refcounted(fn->arg5_type)) + if (arg_type_may_be_refcounted(fn->arg5_type)) count++; + /* A reference acquiring function cannot acquire + * another refcounted ptr. + */ + if (is_acquire_function(func_id) && count) + return false; + /* We only support one arg being unreferenced at the moment, * which is sufficient for the helper functions we have right now. */ return count <= 1; } -static int check_func_proto(const struct bpf_func_proto *fn) +static int check_func_proto(const struct bpf_func_proto *fn, int func_id) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && - check_refcount_ok(fn) ? 0 : -EINVAL; + check_refcount_ok(fn, func_id) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2882,19 +2874,20 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) } static void release_reg_references(struct bpf_verifier_env *env, - struct bpf_func_state *state, int id) + struct bpf_func_state *state, + int ref_obj_id) { struct bpf_reg_state *regs = state->regs, *reg; int i; for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].id == id) + if (regs[i].ref_obj_id == ref_obj_id) mark_reg_unknown(env, regs, i); bpf_for_each_spilled_reg(i, state, reg) { if (!reg) continue; - if (reg_is_refcounted(reg) && reg->id == id) + if (reg->ref_obj_id == ref_obj_id) __mark_reg_unknown(reg); } } @@ -2903,15 +2896,20 @@ static void release_reg_references(struct bpf_verifier_env *env, * resources. Identify all copies of the same pointer and clear the reference. */ static int release_reference(struct bpf_verifier_env *env, - struct bpf_call_arg_meta *meta) + int ref_obj_id) { struct bpf_verifier_state *vstate = env->cur_state; + int err; int i; - for (i = 0; i <= vstate->curframe; i++) - release_reg_references(env, vstate->frame[i], meta->ptr_id); + err = release_reference_state(cur_func(env), ref_obj_id); + if (err) + return err; - return release_reference_state(cur_func(env), meta->ptr_id); + for (i = 0; i <= vstate->curframe; i++) + release_reg_references(env, vstate->frame[i], ref_obj_id); + + return 0; } static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -3153,7 +3151,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn); + err = check_func_proto(fn, func_id); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); @@ -3199,7 +3197,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return err; } } else if (is_release_function(func_id)) { - err = release_reference(env, &meta); + err = release_reference(env, meta.ref_obj_id); if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); @@ -3260,8 +3258,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (id < 0) return id; - /* For release_reference() */ + /* For mark_ptr_or_null_reg() */ regs[BPF_REG_0].id = id; + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = id; } else { /* For mark_ptr_or_null_reg() */ regs[BPF_REG_0].id = ++env->id_gen; @@ -3276,6 +3276,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (is_ptr_cast_function(func_id)) + /* 
For release_reference() */ + regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); if (err) return err; @@ -4931,11 +4935,19 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { reg->type = PTR_TO_TCP_SOCK; } - if (is_null || !(reg_is_refcounted(reg) || - reg_may_point_to_spin_lock(reg))) { - /* We don't need id from this point onwards anymore, - * thus we should better reset it, so that state - * pruning has chances to take effect. + if (is_null) { + /* We don't need id and ref_obj_id from this point + * onwards anymore, thus we should better reset it, + * so that state pruning has chances to take effect. + */ + reg->id = 0; + reg->ref_obj_id = 0; + } else if (!reg_may_point_to_spin_lock(reg)) { + /* For not-NULL ptr, reg->ref_obj_id will be reset + * in release_reg_references(). + * + * reg->id is still used by spin_lock ptr. Other + * than spin_lock ptr type, reg->id can be reset. */ reg->id = 0; } @@ -4950,11 +4962,16 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, { struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg, *regs = state->regs; + u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; int i, j; - if (reg_is_refcounted_or_null(®s[regno]) && is_null) - release_reference_state(state, id); + if (ref_obj_id && ref_obj_id == id && is_null) + /* regs[regno] is in the " == NULL" branch. + * No one could have freed the reference state before + * doing the NULL check. + */ + WARN_ON_ONCE(release_reference_state(state, id)); for (i = 0; i < MAX_BPF_REG; i++) mark_ptr_or_null_reg(state, ®s[i], id, is_null); diff --git a/net/core/filter.c b/net/core/filter.c index 1aba772312f5..dd3372f5c740 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1800,8 +1800,6 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = { BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) { - sk = sk_to_full_sk(sk); - return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; } @@ -4894,7 +4892,7 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .func = bpf_sk_release, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCKET, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, @@ -5035,8 +5033,6 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) { - sk = sk_to_full_sk(sk); - if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) return (unsigned long)sk; From 1343dda9aab4e5f2bbdca77f114cce387436a585 Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Mon, 18 Mar 2019 16:10:26 +0100 Subject: [PATCH 1061/1640] UPSTREAM: bpf: Try harder when allocating memory for large maps It has been observed that sometimes a higher order memory allocation for BPF maps fails when there is no obvious memory pressure in a system. E.g. the map (BPF_MAP_TYPE_LRU_HASH, key=38, value=56, max_elems=524288) could not be created due to vmalloc unable to allocate 75497472B, when the system's memory consumption (in MB) was the following: Total: 3942 Used: 837 (21.24%) Free: 138 Buffers: 239 Cached: 2727 Later analysis [1] by Michal Hocko showed that the vmalloc was not trying to reclaim memory from the page cache and was failing prematurely due to __GFP_NORETRY. 
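The failing request above can be reproduced from user space; a minimal sketch using the bpf_create_map() wrapper that libbpf provided at the time (the sizes are the ones quoted above, everything else is illustrative):

    /* hypothetical reproducer for the map creation failure described above */
    int map_fd = bpf_create_map(BPF_MAP_TYPE_LRU_HASH,
                                38,      /* key_size */
                                56,      /* value_size */
                                524288,  /* max_entries */
                                0);      /* map_flags */
    if (map_fd < 0)
            /* fails despite plenty of reclaimable page cache */
            return -1;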
Considering dcda9b0471 ("mm, tree wide: replace __GFP_REPEAT by __GFP_RETRY_MAYFAIL with more useful semantic") and [1], we can replace __GFP_NORETRY with __GFP_RETRY_MAYFAIL, as it won't invoke OOM killer and will try harder to fulfil allocation requests. Unfortunately, replacing the body of the BPF map memory allocation function with the kvmalloc_node helper function is not an option at this point in time, given 1) kmalloc is non-optional for higher order allocations, and 2) passing __GFP_RETRY_MAYFAIL to the kmalloc would stress the slab allocator too much for large requests. The change has been tested with the workloads mentioned above and by observing oom_kill value from /proc/vmstat. [1]: https://lore.kernel.org/bpf/20190310071318.GW5232@dhcp22.suse.cz/ Change-Id: I956bb011e85c5ad9a375df4c27f7ca4c5a5c4395 Signed-off-by: Martynas Pumputis Acked-by: Yonghong Song Cc: Michal Hocko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20190318153940.GL8924@dhcp22.suse.cz/ --- kernel/bpf/syscall.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bc8c0fc0edcc..0f83af8a6971 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -137,21 +137,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) void *bpf_map_area_alloc(size_t size, int numa_node) { - /* We definitely need __GFP_NORETRY, so OOM killer doesn't - * trigger under memory pressure as we really just want to - * fail instead. + /* We really just want to fail instead of triggering OOM killer + * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, + * which is used for lower order allocation requests. + * + * It has been observed that higher order allocation requests done by + * vmalloc with __GFP_NORETRY being set might fail due to not trying + * to reclaim memory from the page cache, thus we set + * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO; + + const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; void *area; if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | flags, numa_node); + area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + numa_node); if (area != NULL) return area; } - return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags, - __builtin_return_address(0)); + return __vmalloc_node_flags_caller(size, numa_node, + GFP_KERNEL | __GFP_RETRY_MAYFAIL | + flags, __builtin_return_address(0)); } void bpf_map_area_free(void *area) From 25e0e453c7cd57ff7bb56dc4c805505225044360 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 18 Mar 2019 10:37:13 -0700 Subject: [PATCH 1062/1640] UPSTREAM: bpf: Only print ref_obj_id for refcounted reg Naresh reported that test_align fails because of the mismatch at the verbose printout of the register states. The reason is due to the newly added ref_obj_id. ref_obj_id is only useful for refcounted reg. Thus, this patch fixes it by only printing ref_obj_id for refcounted reg. While at it, it also uses comma instead of space to separate between "id" and "ref_obj_id". 
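For a refcounted register, the state line printed by the fixed code below shows both fields, comma-separated; a hypothetical example for a socket returned by bpf_sk_lookup_tcp() (register name and values are illustrative, the shape follows the verbose() format strings in the diff):

    R0_w=sock(id=1,ref_obj_id=1,off=0,imm=0)

Non-refcounted pointer types keep printing only id=.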
Fixes: 1b986589680a ("bpf: Fix bpf_tcp_sock and bpf_sk_fullsock issue related to bpf_sk_release") Reported-by: Naresh Kamboju Change-Id: Ib3f3623a59918de2e3711300739aea716bdd237d Signed-off-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d29e30b1579c..4f8cae6a863d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -351,6 +351,14 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) map_value_has_spin_lock(reg->map_ptr); } +static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCKET_OR_NULL || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_TCP_SOCK_OR_NULL; +} + static bool arg_type_may_be_refcounted(enum bpf_arg_type type) { return type == ARG_PTR_TO_SOCK_COMMON; @@ -450,8 +458,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (t == PTR_TO_STACK) verbose(env, ",call_%d", func(env, reg)->callsite); } else { - verbose(env, "(id=%d ref_obj_id=%d", reg->id, - reg->ref_obj_id); + verbose(env, "(id=%d", reg->id); + if (reg_type_may_be_refcounted_or_null(t)) + verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); if (t != SCALAR_VALUE) verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) From 03c8683361ffc6a54411acaafc4bfcbe52152237 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:53:59 +0800 Subject: [PATCH 1063/1640] UPSTREAM: bpf: track references based on is_acquire_func So far, the verifier only acquires reference tracking state for RET_PTR_TO_SOCKET_OR_NULL. Instead of extending this for every new return type which desires these semantics, acquire reference tracking state iff the called helper is an acquire function. 
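From the program side the acquire/release contract looks as follows; a minimal BPF C sketch (assuming the usual helper declarations are in scope, and using -1 as the "current netns" value for the netns argument):

    struct bpf_sock_tuple tuple = {};  /* filled from the packet in real code */
    struct bpf_sock *sk;

    /* acquire function: the verifier attaches reference state to R0 */
    sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4), -1, 0);
    if (sk)
            /* release function: the reference must be dropped on every path */
            bpf_sk_release(sk);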
Change-Id: I5d45e64bdcc6ec8213c1dca4cdcbdc1160b67f4f Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4f8cae6a863d..c3f2c82ba356 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3262,19 +3262,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - if (is_acquire_function(func_id)) { - int id = acquire_reference_state(env, insn_idx); - - if (id < 0) - return id; - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = id; - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = id; - } else { - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = ++env->id_gen; - } + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; @@ -3285,9 +3273,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } - if (is_ptr_cast_function(func_id)) + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = id; + } err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); if (err) From d2fe840de9ca89558a3b9af40a1dd06e073dfb35 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:00 +0800 Subject: [PATCH 1064/1640] UPSTREAM: bpf: allow helpers to return PTR_TO_SOCK_COMMON It's currently not possible to access timewait or request sockets from eBPF, since there is no way to return a PTR_TO_SOCK_COMMON from a helper. Introduce RET_PTR_TO_SOCK_COMMON to enable this behaviour. 
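A helper opts into this purely through its prototype's ret_type; a sketch of such a proto (this is what the skc_lookup_tcp patch later in this series adds as bpf_skc_lookup_tcp_proto):

    static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
            .func       = bpf_skc_lookup_tcp,
            .gpl_only   = false,
            .pkt_access = true,
            /* verifier marks R0 as PTR_TO_SOCK_COMMON_OR_NULL on return */
            .ret_type   = RET_PTR_TO_SOCK_COMMON_OR_NULL,
            .arg1_type  = ARG_PTR_TO_CTX,
            .arg2_type  = ARG_PTR_TO_MEM,
            .arg3_type  = ARG_CONST_SIZE,
            .arg4_type  = ARG_ANYTHING,
            .arg5_type  = ARG_ANYTHING,
    };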
Change-Id: I3dd6ff716a441ba6dd4d0208b3597f2f5017a3a2 Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cabc4fcd6bf0..d214d8b85ab2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -210,6 +210,7 @@ enum bpf_return_type { RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ + RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c3f2c82ba356..c40255066b53 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3263,6 +3263,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; From 4b36bb7ab06204cebbf867bce34a7f0687c1c90a Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:46 -0800 Subject: [PATCH 1065/1640] BACKPORT: bpf: add bpf helper bpf_skb_ecn_set_ce This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce "int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can be attached to the ingress and egress path. The helper is needed because this type of bpf_prog cannot modify the skb directly. This helper is used to set the ECN field of ECN capable IP packets to ce (congestion encountered) in the IPv6 or IPv4 header of the skb. It can be used by a bpf_prog to manage egress or ingress network bandwidth limit per cgroupv2 by inducing an ECN response in the TCP sender. This works best when using DCTCP. Change-Id: Ib8d55cf5c26c0ae6733e992a70a6ff829a64258e Signed-off-by: Lawrence Brakmo Signed-off-by: Martin KaFai Lau Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 7 +++++++ net/core/filter.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6b795a95ec89..72211dfdbf7d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2309,6 +2309,13 @@ union bpf_attr { * Return * A **struct bpf_tcp_sock** pointer on success, or NULL in * case of failure. + * + * int bpf_skb_ecn_set_ce(struct sk_buff *skb) + * Description + * Sets ECN of IP header to ce (congestion encountered) if + * current value is ect (ECN capable). Works with IPv6 and IPv4. + * Return + * 1 if set, 0 if not set.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/net/core/filter.c b/net/core/filter.c index dd3372f5c740..7de792b9368d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5046,6 +5046,32 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = { .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) +{ + unsigned int iphdr_len; + + if (skb->protocol == cpu_to_be16(ETH_P_IP)) + iphdr_len = sizeof(struct iphdr); + else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) + iphdr_len = sizeof(struct ipv6hdr); + else + return 0; + + if (skb_headlen(skb) < iphdr_len) + return 0; + + if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) + return 0; + + return INET_ECN_set_ce(skb); +} + +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { + .func = bpf_skb_ecn_set_ce, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5204,6 +5230,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; #endif default: return sk_filter_func_proto(func_id, prog); From 18245479aa39838bbf6fe9b3d55b8777e5ba8fa8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 12 Mar 2019 10:23:04 -0700 Subject: [PATCH 1066/1640] BACKPORT: bpf: Add bpf_get_listener_sock(struct bpf_sock *sk) helper Add a new helper "struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)" which returns a bpf_sock in TCP_LISTEN state. It will trace back to the listener sk from a request_sock if possible. It returns NULL for all other cases. No reference is taken because the helper ensures the sk is in SOCK_RCU_FREE (where the TCP_LISTEN sock should be in). Hence, bpf_sk_release() is unnecessary and the verifier does not allow bpf_sk_release(listen_sk) to be called either. The following is also allowed because the bpf_prog is run under rcu_read_lock(): sk = bpf_sk_lookup_tcp(); /* if (!sk) { ... } */ listen_sk = bpf_get_listener_sock(sk); /* if (!listen_sk) { ... } */ bpf_sk_release(sk); src_port = listen_sk->src_port; /* Allowed */ Change-Id: I143d228735c2561dab63e58ed404e2ec8052bade Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 ++++++++ net/core/filter.c | 21 +++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 72211dfdbf7d..b1ee6bfa8058 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2316,6 +2316,14 @@ union bpf_attr { * current value is ect (ECN capable). Works with IPv6 and IPv4. * Return * 1 if set, 0 if not set. + * + * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) + * Description + * Return a **struct bpf_sock** pointer in TCP_LISTEN state. + * bpf_sk_release() is unnecessary and not allowed. + * Return + * A **struct bpf_sock** pointer on success, or NULL in + * case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/net/core/filter.c b/net/core/filter.c index 7de792b9368d..7ffba60b3ad8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5046,6 +5046,23 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = { .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; +BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) +{ + sk = sk_to_full_sk(sk); + + if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_get_listener_sock_proto = { + .func = bpf_get_listener_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) { unsigned int iphdr_len; @@ -5230,6 +5247,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; + case BPF_FUNC_get_listener_sock: + return &bpf_get_listener_sock_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; #endif @@ -5327,6 +5346,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_release_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; + case BPF_FUNC_get_listener_sock: + return &bpf_get_listener_sock_proto; #endif default: return bpf_base_func_proto(func_id); From 997a6891b7f5619cbfab596e22af2963d0fe15ec Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:01 +0800 Subject: [PATCH 1067/1640] BACKPORT: bpf: add skc_lookup_tcp helper Allow looking up a sock_common. This gives eBPF programs access to timewait and request sockets. Change-Id: I123e56d7a5c9ca59ea6ff2c9bb2fb4f4bef6fdda Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 17 +++++ kernel/bpf/verifier.c | 3 +- net/core/filter.c | 146 +++++++++++++++++++++++++++++++++------ 3 files changed, 142 insertions(+), 24 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b1ee6bfa8058..b19372e4fb05 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2324,6 +2324,23 @@ union bpf_attr { * Return * A **struct bpf_sock** pointer on success, or NULL in * case of failure. + * + * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to bpf_sk_lookup_tcp, except that it + * also returns timewait or request sockets. Use bpf_sk_fullsock + * or bpf_tcp_sock to access the full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from **reuse->socks**\ [] using the hash of the tuple.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c40255066b53..0e551d071115 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -376,7 +376,8 @@ static bool is_release_function(enum bpf_func_id func_id) static bool is_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || - func_id == BPF_FUNC_sk_lookup_udp; + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp; } static bool is_ptr_cast_function(enum bpf_func_id func_id) diff --git a/net/core/filter.c b/net/core/filter.c index 7ffba60b3ad8..e7783a2d841b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4785,15 +4785,15 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, return sk; } -/* bpf_sk_lookup performs the core lookup for different types of sockets, +/* bpf_skc_lookup performs the core lookup for different types of sockets, * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. * Returns the socket as an 'unsigned long' to simplify the casting in the * callers to satisfy BPF_CALL declarations. */ -static unsigned long -__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, - u64 flags) +static struct sock * +__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) { struct sock *sk = NULL; u8 family = AF_UNSPEC; @@ -4820,15 +4820,27 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); } - if (sk) - sk = sk_to_full_sk(sk); out: - return (unsigned long) sk; + return sk; } -static unsigned long -bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - u8 proto, u64 netns_id, u64 flags) +static struct sock * +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) +{ + struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, + ifindex, proto, netns_id, flags); + + if (sk) + sk = sk_to_full_sk(sk); + + return sk; +} + +static struct sock * +bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) { struct net *caller_net; int ifindex; @@ -4841,14 +4853,47 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, ifindex = 0; } - return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, - proto, netns_id, flags); + return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, + netns_id, flags); } +static struct sock * +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, + flags); + + if (sk) + sk = sk_to_full_sk(sk); + + return sk; +} + +BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { + .func = bpf_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, 
+ .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); + return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { @@ -4866,7 +4911,8 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); + return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, + netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { @@ -4901,8 +4947,9 @@ BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, struct net *caller_net = dev_net(ctx->rxq->dev); int ifindex = ctx->rxq->dev->ifindex; - return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, - IPPROTO_UDP, netns_id, flags); + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_UDP, netns_id, + flags); } static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { @@ -4917,14 +4964,38 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags); +} + +static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { + .func = bpf_xdp_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net *caller_net = dev_net(ctx->rxq->dev); int ifindex = ctx->rxq->dev->ifindex; - return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, - IPPROTO_TCP, netns_id, flags); + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags); } static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { @@ -4939,11 +5010,31 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { + .func = bpf_sock_addr_skc_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, - IPPROTO_TCP, netns_id, flags); + 
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, IPPROTO_TCP, + netns_id, flags); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { @@ -4960,8 +5051,9 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, - IPPROTO_UDP, netns_id, flags); + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, IPPROTO_UDP, + netns_id, flags); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { @@ -5213,6 +5305,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_addr_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_sock_addr_skc_lookup_tcp_proto; #endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); @@ -5348,6 +5442,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; #endif default: return bpf_base_func_proto(func_id); @@ -5383,6 +5479,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_sk_lookup_tcp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_xdp_skc_lookup_tcp_proto; #endif default: return bpf_base_func_proto(func_id); @@ -5473,6 +5571,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; #endif default: return bpf_base_func_proto(func_id); From b688dbe16182be936e89a45036ff1a7555c72135 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 21 Mar 2019 14:34:36 -0700 Subject: [PATCH 1068/1640] UPSTREAM: bpf: verifier: propagate liveness on all frames Commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") connected up parentage chains of all frames of the stack. It didn't, however, ensure propagate_liveness() propagates all liveness information along those chains. This means pruning happening in the callee may generate explored states with incomplete liveness for the chains in lower frames of the stack. The included selftest is similar to the prior one from commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences"), where callee would prune regardless of the difference in r8 state. Now we also initialize r9 to 0 or 1 based on a result from get_random(). r9 is never read so the walk with r9 = 0 gets pruned (correctly) after the walk with r9 = 1 completes. The selftest is so arranged that the pruning will happen in the callee. Since callee does not propagate read marks of r8, the explored state at the pruning point prior to the callee will now ignore r8. Propagate liveness on all frames of the stack when pruning. 
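A rough C-level analogue of the selftest shape (hypothetical; the real selftest is written in BPF assembly so it can pin values to r8/r9 directly):

    static __attribute__((noinline)) int callee(unsigned long r8, unsigned long r9)
    {
            /* r9 is never read, so the walk with r9 = 0 prunes against the
             * completed walk with r9 = 1 inside this frame; the read of r8
             * must still be propagated to the caller's frame, otherwise the
             * caller's explored state is recorded with incomplete liveness.
             */
            return r8 == 42;
    }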
Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Change-Id: I5c32a48fe8b7950b40ea262d02c343c95540b05d Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e551d071115..6a14a3d4f750 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6359,15 +6359,17 @@ static int propagate_liveness(struct bpf_verifier_env *env, } /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - /* We don't need to worry about FP liveness because it's read-only */ - for (i = 0; i < BPF_REG_FP; i++) { - if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) - continue; - if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], - &vparent->frame[vstate->curframe]->regs[i]); - if (err) - return err; + for (frame = 0; frame <= vstate->curframe; frame++) { + /* We don't need to worry about FP liveness, it's read-only */ + for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { + if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ) + continue; + if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) { + err = mark_reg_read(env, &vstate->frame[frame]->regs[i], + &vparent->frame[frame]->regs[i]); + if (err) + return err; + } } } From 634563f7e5a7480630722be7bfc44edbeb485b5e Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 20 Mar 2019 13:58:27 +0100 Subject: [PATCH 1069/1640] UPSTREAM: bpf: remove incorrect 'verifier bug' warning The BPF verifier checks the maximum number of call stack frames twice, first in the main CFG traversal (do_check) and then in a subsequent traversal (check_max_stack_depth). If the second check fails, it logs a 'verifier bug' warning and errors out, as the number of call stack frames should have been verified already. However, the second check may fail without indicating a verifier bug: if the excessive function calls reside in dead code, the main CFG traversal may not visit them; the subsequent traversal visits all instructions, including dead code. This case raises the question of how invalid dead code should be treated. This patch implements the conservative option and rejects such code. Change-Id: Iafd1e41066b0efc4e9f364ba3f82d6bc64b94212 Signed-off-by: Paul Chaignon Tested-by: Xiao Han Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a14a3d4f750..dc6cca6c49a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1907,8 +1907,9 @@ continue_func: } frame++; if (frame >= MAX_CALL_FRAMES) { - WARN_ONCE(1, "verifier bug. 
Call stack is too deep\n"); - return -EFAULT; + verbose(env, "the call stack of %d frames is too deep !\n", + frame); + return -E2BIG; } goto process_func; } From ff7f79fe683e9ee62990ccc7753beeeaf3f27951 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:40 -0700 Subject: [PATCH 1070/1640] UPSTREAM: bpf: add verifier stats and log_level bit 2 In order to understand the verifier bottlenecks add various stats and extend log_level: log_level 1 and 2 are kept as-is: bit 0 - level=1 - print every insn and verifier state at branch points bit 1 - level=2 - print every insn and verifier state at every insn bit 2 - level=4 - print verifier error and stats at the end of verification When verifier rejects the program the libbpf is trying to load the program twice. Once with log_level=0 (no messages, only error code is reported to user space) and second time with log_level=1 to tell the user why the verifier rejected it. With introduction of bit 2 - level=4 the libbpf can choose to always use that level and load programs once, since the verification speed is not affected and in case of error the verbose message will be available. Note that the verifier stats are not part of uapi just like all other verbose messages. They're expected to change in the future. Change-Id: I615a7066f2822777129ad8efb7c90e32e5827f28 Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 21 ++++++++++ kernel/bpf/verifier.c | 76 ++++++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 24 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ecf3bb76790a..0dec7dfd5395 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -256,6 +256,12 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) return log->len_used >= log->len_total - 1; } +#define BPF_LOG_LEVEL1 1 +#define BPF_LOG_LEVEL2 2 +#define BPF_LOG_STATS 4 +#define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) +#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) + static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log->level && log->ubuf && !bpf_verifier_log_full(log); @@ -294,6 +300,21 @@ struct bpf_verifier_env { struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; struct bpf_id_pair idmap_scratch[BPF_ID_MAP_SIZE]; u32 subprog_cnt; + /* number of instructions analyzed by the verifier */ + u32 insn_processed; + /* total verification time */ + u64 verification_time; + /* maximum number of verifier states kept in 'branching' instructions */ + u32 max_states_per_insn; + /* total number of allocated verifier states */ + u32 total_states; + /* some states are freed during program analysis. + * this is peak number of states. 
this number dominates kernel + * memory consumption during verification + */ + u32 peak_states; + /* longest register parentage chain walked for liveness marking */ + u32 longest_mark_read_walk; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dc6cca6c49a9..67e2f9821be7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1091,7 +1091,7 @@ static int check_subprogs(struct bpf_verifier_env *env) */ subprog[env->subprog_cnt].start = insn_cnt; - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); @@ -1138,6 +1138,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, struct bpf_reg_state *parent) { bool writes = parent == state->parent; /* Observe write marks */ + int cnt = 0; while (parent) { /* if read wasn't screened by an earlier write ... */ @@ -1154,7 +1155,11 @@ static int mark_reg_read(struct bpf_verifier_env *env, state = parent; parent = state->parent; writes = true; + cnt++; } + + if (env->longest_mark_read_walk < cnt) + env->longest_mark_read_walk = cnt; return 0; } @@ -1464,7 +1469,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, state); /* The minimum value is only important with signed @@ -2989,7 +2994,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* and go analyze first insn of the callee */ *insn_idx = target_insn; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); print_verifier_state(env, caller); verbose(env, "callee:\n"); @@ -3029,7 +3034,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; *insn_idx = callee->callsite + 1; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); print_verifier_state(env, callee); verbose(env, "to caller at %d:\n", *insn_idx); @@ -5253,7 +5258,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->dst_reg); return -EACCES; } - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -6427,6 +6432,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) states_cnt++; } + if (env->max_states_per_insn < states_cnt) + env->max_states_per_insn = states_cnt; + if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return 0; @@ -6440,6 +6448,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; + env->total_states++; + env->peak_states++; /* add new state to the head of linked list */ new = &new_sl->state; @@ -6524,8 +6534,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len, i; - int insn_processed = 0; + int insn_cnt = env->prog->len; bool do_print_state = false; env->prev_linfo = NULL; @@ -6560,10 +6569,10 @@ static int do_check(struct bpf_verifier_env *env) insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); - if (++insn_processed > 
BPF_COMPLEXITY_LIMIT_INSNS) { + if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, "BPF program is too large. Processed %d insn\n", - insn_processed); + env->insn_processed); return -E2BIG; } @@ -6572,7 +6581,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { if (do_print_state) verbose(env, "\nfrom %d to %d%s: safe\n", env->prev_insn_idx, env->insn_idx, @@ -6590,8 +6599,9 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (env->log.level > 1 || (env->log.level && do_print_state)) { - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2 || + (env->log.level & BPF_LOG_LEVEL && do_print_state)) { + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "%d:", env->insn_idx); else verbose(env, "\nfrom %d to %d%s:", @@ -6602,7 +6612,7 @@ static int do_check(struct bpf_verifier_env *env) do_print_state = false; } - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { const struct bpf_insn_cbs cbs = { .cb_print = verbose, .private_data = env, @@ -6867,16 +6877,6 @@ process_bpf_exit: env->insn_idx++; } - verbose(env, "processed %d insns (limit %d), stack depth ", - insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); - for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_info[i].stack_depth; - - verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt) - verbose(env, "+"); - } - verbose(env, "\n"); env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -8106,9 +8106,34 @@ static void free_states(struct bpf_verifier_env *env) kfree(env->explored_states); } +static void print_verification_stats(struct bpf_verifier_env *env) +{ + int i; + + if (env->log.level & BPF_LOG_STATS) { + verbose(env, "verification time %lld usec\n", + div_u64(env->verification_time, 1000)); + verbose(env, "stack depth "); + for (i = 0; i < env->subprog_cnt; i++) { + u32 depth = env->subprog_info[i].stack_depth; + + verbose(env, "%d", depth); + if (i + 1 < env->subprog_cnt) + verbose(env, "+"); + } + verbose(env, "\n"); + } + verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " + "total_states %d peak_states %d mark_read %d\n", + env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS, + env->max_states_per_insn, env->total_states, + env->peak_states, env->longest_mark_read_walk); +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, union bpf_attr __user *uattr) { + u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; struct bpf_verifier_log *log; int i, len, ret = -EINVAL; @@ -8151,7 +8176,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, ret = -EINVAL; /* log attributes have to be sane */ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || - !log->level || !log->ubuf) + !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) goto err_unlock; } @@ -8233,6 +8258,9 @@ skip_full_check: if (ret == 0) ret = fixup_call_args(env); + env->verification_time = ktime_get_ns() - start_time; + print_verification_stats(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { From 6dfd37bb6cf5385588e7c8a0227568131c5f41d8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:41 -0700 Subject: [PATCH 1071/1640] UPSTREAM: bpf: improve verification speed by droping states Branch instructions, branch targets and calls in a bpf program are the places where the verifier 
remembers states that led to successful verification of the program. These states are used to prune brute force program analysis. For unprivileged programs there is a limit of 64 states per such 'branching' instruction (the maximum is tracked by the max_states_per_insn counter introduced in the previous patch). Simply reducing this threshold to 32 or lower increases the insn_processed metric to the point that small valid programs get rejected. For root programs there is no limit, and cilium programs can have max_states_per_insn of 100 or higher. Walking 100+ states multiplied by the number of 'branching' insns during verification consumes a significant amount of cpu time. It turned out a simple LRU-like mechanism can be used to remove states that are unlikely to be helpful in future search pruning. This patch introduces hit_cnt and miss_cnt counters:
hit_cnt - this many times this state successfully pruned the search
miss_cnt - this many times this state was not equivalent to other states (and that other states were added to state list)
The heuristic introduced in this patch is:
if (sl->miss_cnt > sl->hit_cnt * 3 + 3) /* drop this state from future considerations */
Higher numbers increase max_states_per_insn (allow more states to be considered for pruning) and slow verification speed, but do not meaningfully reduce the insn_processed metric. Lower numbers drop too many states and insn_processed increases too much. Many different formulas were considered. This one is simple and works well enough in practice. (the analysis was done on selftests/progs/* and on cilium programs) The end result is this heuristic improves verification speed by 10 times. Large synthetic programs that used to take a second or more now take 1/10 of a second. In cases where max_states_per_insn used to be 100 or more, now it's ~10. There is a slight increase in insn_processed for cilium progs:

                     before   after
bpf_lb-DLB_L3.o        1831    1838
bpf_lb-DLB_L4.o        3029    3218
bpf_lb-DUNKNOWN.o      1064    1064
bpf_lxc-DDROP_ALL.o   26309   26935
bpf_lxc-DUNKNOWN.o    33517   34439
bpf_netdev.o           9713    9721
bpf_overlay.o          6184    6184
bpf_lxc_jit.o         37335   39389

And 2-3 times improvement in the verification speed. Change-Id: I436f6e52750b1c0b63ba81dc640550ad3e79821e Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0dec7dfd5395..a91abfed05b9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -214,6 +214,7 @@ struct bpf_verifier_state { struct bpf_verifier_state_list { struct bpf_verifier_state state; struct bpf_verifier_state_list *next; + int miss_cnt, hit_cnt; }; /* Possible states for alu_state member.
*/ @@ -288,6 +289,7 @@ struct bpf_verifier_env { bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ + struct bpf_verifier_state_list *free_list; struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 67e2f9821be7..e9ff8d2ac995 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6398,11 +6398,13 @@ static int propagate_liveness(struct bpf_verifier_env *env, static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; - struct bpf_verifier_state_list *sl; + struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - sl = env->explored_states[insn_idx]; + pprev = &env->explored_states[insn_idx]; + sl = *pprev; + if (!sl) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here @@ -6413,6 +6415,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) while (sl != STATE_LIST_MARK) { if (states_equal(env, &sl->state, cur)) { + sl->hit_cnt++; /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. @@ -6428,8 +6431,35 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return err; return 1; } - sl = sl->next; states_cnt++; + sl->miss_cnt++; + /* heuristic to determine whether this state is beneficial + * to keep checking from state equivalence point of view. + * Higher numbers increase max_states_per_insn and verification time, + * but do not meaningfully decrease insn_processed. + */ + if (sl->miss_cnt > sl->hit_cnt * 3 + 3) { + /* the state is unlikely to be useful. Remove it to + * speed up verification + */ + *pprev = sl->next; + if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + free_verifier_state(&sl->state, false); + kfree(sl); + env->peak_states--; + } else { + /* cannot free this state, since parentage chain may + * walk it later. Add it for free_list instead to + * be freed at the end of verification + */ + sl->next = env->free_list; + env->free_list = sl; + } + sl = *pprev; + continue; + } + pprev = &sl->next; + sl = *pprev; } if (env->max_states_per_insn < states_cnt) @@ -8088,6 +8118,14 @@ static void free_states(struct bpf_verifier_env *env) struct bpf_verifier_state_list *sl, *sln; int i; + sl = env->free_list; + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } + if (!env->explored_states) return; From c30087b22a8b19e25296a64fb2e1d057dfe33f9a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:42 -0700 Subject: [PATCH 1072/1640] UPSTREAM: bpf: improve verification speed by not remarking live_read With large verifier speed improvement brought by the previous patch mark_reg_read() becomes the hottest function during verification. On a typical program it consumes 40% of cpu. mark_reg_read() walks parentage chain of registers to mark parents as LIVE_READ. Once the register is marked there is no need to remark it again in the future. Hence stop walking the chain once first LIVE_READ is seen. 
This optimization drops mark_reg_read() time from 40% of cpu to <1% and overall 2x improvement of verification speed. For some programs the longest_mark_read_walk counter improves from ~500 to ~5 Change-Id: Ia83a5d42831f6bf9e8423edb38a37cea37555f5f Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Reviewed-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e9ff8d2ac995..7c25aee329de 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1150,6 +1150,15 @@ static int mark_reg_read(struct bpf_verifier_env *env, parent->var_off.value, parent->off); return -EFAULT; } + if (parent->live & REG_LIVE_READ) + /* The parentage chain never changes and + * this parent was already marked as LIVE_READ. + * There is no need to keep walking the chain again and + * keep re-marking all parents as LIVE_READ. + * This case happens when the same register is read + * multiple times without writes into it in-between. + */ + break; /* ... then we depend on parent's value */ parent->live |= REG_LIVE_READ; state = parent; From 7ee30cb93060782ebb7d7fa7e5f387a34b65c5f8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:43 -0700 Subject: [PATCH 1073/1640] UPSTREAM: bpf: convert temp arrays to kvcalloc Temporary arrays used during program verification need to be vmalloc-ed to support large bpf programs. Change-Id: I6672ac648d02acb5c00f4cd6b386938b523a64a4 Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7c25aee329de..8138f3f78c7a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5574,13 +5574,13 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; - insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; - insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_stack) { - kfree(insn_state); + kvfree(insn_state); return -ENOMEM; } @@ -5678,8 +5678,8 @@ check_state: ret = 0; /* cfg looks good */ err_free: - kfree(insn_state); - kfree(insn_stack); + kvfree(insn_state); + kvfree(insn_stack); return ret; } @@ -8150,7 +8150,7 @@ static void free_states(struct bpf_verifier_env *env) } } - kfree(env->explored_states); + kvfree(env->explored_states); } static void print_verification_stats(struct bpf_verifier_env *env) @@ -8247,7 +8247,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - env->explored_states = kcalloc(env->prog->len, + env->explored_states = kvcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; From ce68dae0f687c22ce995fdfe4995d441efa1297b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:44 -0700 Subject: [PATCH 1074/1640] UPSTREAM: bpf: verbose jump offset overflow check Larger programs may trigger 16-bit jump offset overflow check during instruction patching. Make this error verbose otherwise users cannot decipher error code without printks in the verifier. 
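With this change the failure is reported in the verifier log rather than as a bare error code; per the format string in the diff below, the message has the shape:

    insn 1234 cannot be patched due to 16-bit range

where the index comes from insn_aux_data[off].orig_idx (1234 is an arbitrary example).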
Change-Id: I56e4d82085b211f646f5b049ede0f5fe3d7fa66e Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 11 ++++++----- kernel/bpf/verifier.c | 7 ++++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 06ffea1a3646..d88766257711 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -446,6 +446,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; + int err; /* Since our patchlet doesn't expand the image, we're done. */ if (insn_delta == 0) { @@ -461,8 +462,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && - bpf_adj_branches(prog, off, off + 1, off + len, true)) - return NULL; + (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) + return ERR_PTR(err); /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as @@ -471,7 +472,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), GFP_USER); if (!prog_adj) - return NULL; + return ERR_PTR(-ENOMEM); prog_adj->len = insn_adj_cnt; @@ -1115,13 +1116,13 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) continue; tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); - if (!tmp) { + if (IS_ERR(tmp)) { /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. */ bpf_jit_prog_release_other(prog, clone); - return ERR_PTR(-ENOMEM); + return tmp; } clone = tmp; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8138f3f78c7a..44e5267af562 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7181,8 +7181,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of struct bpf_prog *new_prog; new_prog = bpf_patch_insn_single(env->prog, off, patch, len); - if (!new_prog) + if (IS_ERR(new_prog)) { + if (PTR_ERR(new_prog) == -ERANGE) + verbose(env, + "insn %d cannot be patched due to 16-bit range\n", + env->insn_aux_data[off].orig_idx); return NULL; + } if (adjust_insn_aux_data(env, new_prog->len, off, len)) return NULL; adjust_subprog_starts(env, off, len); From 6094b21faec2a208739f133d92d551ef1cb7d1b7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:45 -0700 Subject: [PATCH 1075/1640] UPSTREAM: bpf: increase complexity limit and maximum program size Large verifier speed improvements allow to increase verifier complexity limit. Now regardless of the program composition and its size it takes little time for the verifier to hit insn_processed limit. On typical x86 machine non-debug kernel processes 1M instructions in 1/10 of a second. (before these speed improvements specially crafted programs could be hitting multi-second verification times) Full kasan kernel with debug takes ~1 second for the same 1M insns. Hence bump the BPF_COMPLEXITY_LIMIT_INSNS limit to 1M. Also increase the number of instructions per program from 4k to internal BPF_COMPLEXITY_LIMIT_INSNS limit. 4k limit was confusing to users, since small programs with hundreds of insns could be hitting BPF_COMPLEXITY_LIMIT_INSNS limit. 
Sometimes adding more insns and bpf_trace_printk debug statements would make the verifier accept the program while removing code would make the verifier reject it. Some user space applications started to add #define MAX_FOO to their programs and do:

  MAX_FOO=100;
again:
  compile with MAX_FOO;
  try to load;
  if (fails_to_load) {
    reduce MAX_FOO;
    goto again;
  }

to be able to fit the maximum amount of processing into a single program. Other users artificially split their single program into a set of programs and use all 32 iterations of tail_calls to increase compute limits. And the most advanced folks used the unlimited tc-bpf filter list to execute many bpf programs. Essentially the users managed to work around the 4k insn limit. This patch removes the limit for root programs from uapi. BPF_COMPLEXITY_LIMIT_INSNS is the kernel internal limit, and success in loading a program no longer depends on program size, but only on the 'smartness' of the verifier. The verifier will continue to get smarter with every kernel release. Change-Id: Ied2ef4464ec98a710182a8ffafba773d4d93564e Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 3 ++- kernel/bpf/verifier.c | 1 - 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d214d8b85ab2..71fbf2e64946 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -426,6 +426,7 @@ struct bpf_array { }; }; +#define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 32 struct bpf_event_entry { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0f83af8a6971..2c39d2891e0c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1563,7 +1563,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) /* eBPF programs must be GPL compatible to use GPL-ed functions */ is_gpl = license_is_gpl_compatible(license); - if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) + if (attr->insn_cnt == 0 || + attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 44e5267af562..6a844664118c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -176,7 +176,6 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 #define BPF_COMPLEXITY_LIMIT_STATES 64 From 1052ffdd896635c45df953bd8a22c4336e81b245 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:46 -0700 Subject: [PATCH 1076/1640] UPSTREAM: bpf: increase verifier log limit The existing 16Mbyte verifier log limit is not enough for log_level=2 even for small programs. Increase it to 1Gbyte. Note it's not a kernel memory limit. It's an amount of memory user space provides to store the verifier log. The kernel populates it 1k at a time.
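Both limits are exercised through the bpf(2) load attributes; a hedged user-space sketch (raw syscall, real union bpf_attr field names; the 64MB log buffer is an arbitrary size that the old 16MB cap would have rejected):

    union bpf_attr attr = {};
    static char vlog[64 << 20];      /* above the old 16MB cap, below UINT_MAX >> 2 */

    attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
    attr.insns     = (__u64)(unsigned long)insns;
    attr.insn_cnt  = insn_cnt;       /* may exceed BPF_MAXINSNS with CAP_SYS_ADMIN */
    attr.license   = (__u64)(unsigned long)"GPL";
    attr.log_buf   = (__u64)(unsigned long)vlog;
    attr.log_size  = sizeof(vlog);
    attr.log_level = 1 | 2 | 4;      /* request the new stats output as well */

    int prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));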
Change-Id: Ib43c8c4c444fff5ad40ac9369727b00d722fdb16 Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a844664118c..9a032b31f553 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8226,7 +8226,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, ret = -EINVAL; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) goto err_unlock; } From 54894742939282edb28575a500e91063f2c37897 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:03 +0200 Subject: [PATCH 1077/1640] BACKPORT: bpf: implement lookup-free direct value access for maps This generic extension to BPF maps allows for directly loading an address residing inside a BPF map value as a single BPF ldimm64 instruction! The idea is similar to what BPF_PSEUDO_MAP_FD does today, which is a special src_reg flag for the ldimm64 instruction indicating that the first part of the double insn's imm field is a file descriptor, which the verifier then replaces with the full 64bit address of the map spread across both imm parts. For the newly added BPF_PSEUDO_MAP_VALUE src_reg flag, the idea is the following: the first part of the double insn's imm field is again a file descriptor corresponding to the map, and the second part of the imm field is an offset into the value. The verifier will then replace both imm parts with an address that points into the BPF map value at the given value offset for maps that support this operation. Currently supported is the array map with a single entry. It is possible to support more than just a single map element by reusing both 16bit off fields of the insns as a map index, so a full array map lookup could be expressed that way. It hasn't been implemented here due to lack of a concrete use case, but could easily be done in future in a compatible way, since both off fields right now have to be 0 and would correctly denote a map index 0. BPF_PSEUDO_MAP_VALUE is a distinct flag as otherwise, with BPF_PSEUDO_MAP_FD, we could not distinguish at offset 0 between a load of the map pointer and a load of the map's value at offset 0; and changing BPF_PSEUDO_MAP_FD's encoding into off by one to differentiate between a regular map pointer and a map value pointer would add unnecessary complexity and raise the barrier for debuggability, making it less suitable. Using the second part of the imm field as an offset into the value does /not/ come with limitations since the maximum possible value size is in the u32 universe anyway. This optimization allows for efficiently retrieving an address to a map value memory area without having to issue a helper call, which needs to prepare registers according to the calling convention, etc., without needing the extra NULL test, and without having to add the offset to the value base pointer in an additional instruction. The verifier then treats the destination register as PTR_TO_MAP_VALUE with constant reg->off from the user-passed offset in the second imm field, and guarantees that this is within bounds of the map value. Any subsequent operations are normally treated as typical map value handling without anything extra needed from the verification side. The two map operations for direct value access have been added to the array map for now.
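For illustration (a sketch, not taken from this patch; map_fd and value_off are placeholders), a loader would emit the pseudo instruction pair roughly like:

  /* rd = address of map[0] + value_off, rewritten by the verifier */
  struct bpf_insn insns[2] = {
	{ .code    = BPF_LD | BPF_DW | BPF_IMM,
	  .dst_reg = BPF_REG_1,
	  .src_reg = BPF_PSEUDO_MAP_VALUE,
	  .imm     = map_fd },		/* insn[0].imm: fd of the map */
	{ .imm     = value_off },	/* insn[1].imm: offset into the value */
  };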
In future other types could be supported as well depending on the use case. The main use case for this commit is to allow for BPF loader support for global variables that reside in .data/.rodata/.bss sections such that we can directly load the address of them with minimal additional infrastructure required. Loader support has been added in subsequent commits for libbpf library. Change-Id: I51974f2fe227ba837b338b8b3ebb44c145583673 Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++ include/linux/bpf_verifier.h | 4 ++ include/uapi/linux/bpf.h | 13 +++++- kernel/bpf/arraymap.c | 32 ++++++++++++++ kernel/bpf/core.c | 3 +- kernel/bpf/disasm.c | 5 ++- kernel/bpf/syscall.c | 28 +++++++++--- kernel/bpf/verifier.c | 86 +++++++++++++++++++++++++++--------- 8 files changed, 146 insertions(+), 31 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 71fbf2e64946..678057fba291 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -62,6 +62,12 @@ struct bpf_map_ops { const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); + + /* Direct value access helpers. */ + int (*map_direct_value_addr)(const struct bpf_map *map, + u64 *imm, u32 off); + int (*map_direct_value_meta)(const struct bpf_map *map, + u64 imm, u32 *off); }; struct bpf_map { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index a91abfed05b9..54210d179948 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -232,6 +232,10 @@ struct bpf_insn_aux_data { unsigned long map_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ + struct { + u32 map_index; /* index into used_maps[] */ + u32 map_off; /* offset from value base address */ + }; }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ bool seen; /* this insn was processed by the verifier */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b19372e4fb05..fe43e376933c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -257,8 +257,19 @@ enum bpf_attach_type { */ #define BPF_F_ANY_ALIGNMENT (1U << 1) -/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have + * two extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd map fd + * insn[1].imm: 0 offset into value + * insn[0].off: 0 0 + * insn[1].off: 0 0 + * ldimm64 rewrite: address of map address of map[0]+offset + * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_FD 1 +#define BPF_PSEUDO_MAP_VALUE 2 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2a477256b333..2a4950c1fc41 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -163,6 +163,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * (index & array->index_mask); } +static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, + u32 off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + if (map->max_entries != 1) + return -ENOTSUPP; + if (off >= map->value_size) + return -EINVAL; + + *imm = (unsigned long)array->value; + return 0; +} + +static int array_map_direct_value_meta(const struct 
bpf_map *map, u64 imm, + u32 *off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u64 base = (unsigned long)array->value; + u64 range = array->elem_size; + + if (map->max_entries != 1) + return -ENOTSUPP; + if (imm < base || imm >= base + range) + return -ENOENT; + + *off = imm - base; + return 0; +} + /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -422,6 +452,8 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_direct_value_addr = array_map_direct_value_addr, + .map_direct_value_meta = array_map_direct_value_meta, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d88766257711..d8537a5d45bf 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -300,7 +300,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && - dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + (dst[i].src_reg == BPF_PSEUDO_MAP_FD || + dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 3016372d01c1..6929928be5d7 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -207,10 +207,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, * part of the ldimm64 insn is accessible. */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || + insn->src_reg == BPF_PSEUDO_MAP_VALUE; char tmp[64]; - if (map_ptr && !allow_ptr_leaks) + if (is_ptr && !allow_ptr_leaks) imm = 0; verbose(cbs->private_data, "(%02x) r%d = %s\n", diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c39d2891e0c..4d3a7bbdd2a1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2089,13 +2089,26 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) } static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, - unsigned long addr) + unsigned long addr, u32 *off, + u32 *type) { + const struct bpf_map *map; int i; - for (i = 0; i < prog->aux->used_map_cnt; i++) - if (prog->aux->used_maps[i] == (void *)addr) - return prog->aux->used_maps[i]; + for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (map == (void *)addr) { + *type = BPF_PSEUDO_MAP_FD; + return map; + } + if (!map->ops->map_direct_value_meta) + continue; + if (!map->ops->map_direct_value_meta(map, addr, off)) { + *type = BPF_PSEUDO_MAP_VALUE; + return map; + } + } + return NULL; } @@ -2104,6 +2117,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, { const struct bpf_map *map; struct bpf_insn *insns; + u32 off, type; u64 imm; int i; @@ -2131,11 +2145,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; - map = bpf_map_from_imm(prog, imm); + map = bpf_map_from_imm(prog, imm, &off, &type); if (map) { - insns[i].src_reg = BPF_PSEUDO_MAP_FD; + insns[i].src_reg = type; insns[i].imm = map->id; - insns[i + 1].imm = 0; + insns[i + 1].imm = off; continue; } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 
9a032b31f553..251d31fc65b1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5271,18 +5271,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return 0; } -/* return the map pointer stored inside BPF_LD_IMM64 instruction */ -static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) -{ - u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; - - return (struct bpf_map *) (unsigned long) imm64; -} - /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { + struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map; int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -5306,11 +5300,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ - BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + map = env->used_maps[aux->map_index]; + mark_reg_known_zero(env, regs, insn->dst_reg); + regs[insn->dst_reg].map_ptr = map; + + if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].off = aux->map_off; + if (map_value_has_spin_lock(map)) + regs[insn->dst_reg].id = ++env->id_gen; + } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + } else { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; - regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); return 0; } @@ -7012,8 +7017,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_insn_aux_data *aux; struct bpf_map *map; struct fd f; + u64 addr; if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || @@ -7022,13 +7029,19 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return -EINVAL; } - if (insn->src_reg == 0) + if (insn[0].src_reg == 0) /* valid generic load 64-bit imm */ goto next_insn; - if (insn[0].src_reg != BPF_PSEUDO_MAP_FD || - insn[1].imm != 0) { - verbose(env, "unrecognized bpf_ld_imm64 insn\n"); + /* In final convert_pseudo_ld_imm64() step, this is + * converted into regular 64-bit imm load insn. 
+ */ + if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD && + insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) || + (insn[0].src_reg == BPF_PSEUDO_MAP_FD && + insn[1].imm != 0)) { + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -7046,16 +7059,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return err; } - /* store map pointer inside BPF_LD_IMM64 instruction */ - insn[0].imm = (u32) (unsigned long) map; - insn[1].imm = ((u64) (unsigned long) map) >> 32; + aux = &env->insn_aux_data[i]; + if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + addr = (unsigned long)map; + } else { + u32 off = insn[1].imm; + + if (off >= BPF_MAX_VAR_OFF) { + verbose(env, "direct value offset of %u is not allowed\n", off); + fdput(f); + return -EINVAL; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + fdput(f); + return -EINVAL; + } + + err = map->ops->map_direct_value_addr(map, &addr, off); + if (err) { + verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", + map->value_size, off); + fdput(f); + return err; + } + + aux->map_off = off; + addr += off; + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; /* check whether we recorded this map already */ - for (j = 0; j < env->used_map_cnt; j++) + for (j = 0; j < env->used_map_cnt; j++) { if (env->used_maps[j] == map) { + aux->map_index = j; fdput(f); goto next_insn; } + } if (env->used_map_cnt >= MAX_USED_MAPS) { fdput(f); @@ -7072,6 +7116,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) fdput(f); return PTR_ERR(map); } + + aux->map_index = env->used_map_cnt; env->used_maps[env->used_map_cnt++] = map; if (bpf_map_is_cgroup_storage(map) && From 3acb1677c2980c81118de9363d106e4f68460ad0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:04 +0200 Subject: [PATCH 1078/1640] UPSTREAM: bpf: do not retain flags that are not tied to map lifetime Both BPF_F_WRONLY / BPF_F_RDONLY flags are tied to the map file descriptor, but not to the map object itself! Meaning, at map creation time BPF_F_RDONLY can be set to make the map read-only from syscall side, but this holds only for the returned fd, so any other fd either retrieved via bpf file system or via map id for the very same underlying map object can have read-write access instead. Given that, keeping the two flags around in the map_flags attribute and exposing them to user space upon map dump is misleading and may lead to false conclusions. Since these two flags are not tied to the map object lets also not store them as map property. Change-Id: I9e15aaba3ea5ef57bb97001d9b85cee8824fd412 Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4d3a7bbdd2a1..2442ba79efb7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -167,13 +167,25 @@ void bpf_map_area_free(void *area) kvfree(area); } +static u32 bpf_map_flags_retain_permanent(u32 flags) +{ + /* Some map creation flags are not tied to the map object but + * rather to the map fd instead, so they have no meaning upon + * map object inspection since multiple file descriptors with + * different (access) properties can exist here. Thus, given + * this has zero meaning for the map itself, lets clear these + * from here. 
+ */ + return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); +} + void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) { map->map_type = attr->map_type; map->key_size = attr->key_size; map->value_size = attr->value_size; map->max_entries = attr->max_entries; - map->map_flags = attr->map_flags; + map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); map->numa_node = bpf_map_attr_numa_node(attr); } From 225f12d83c2297fcf5e937dc016b93f17f052280 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 16 Nov 2018 11:41:09 +0000 Subject: [PATCH 1079/1640] UPSTREAM: bpf: move BPF_F_QUERY_EFFECTIVE after map flags BPF_F_QUERY_EFFECTIVE is in the middle of the flags valid for BPF_MAP_CREATE. Move it to its own section to reduce confusion. Change-Id: I271ee60c5f96b9a4d410e05e025712a177fed02c Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fe43e376933c..2bc24b611187 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -294,9 +294,6 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) -/* flags for BPF_PROG_QUERY */ -#define BPF_F_QUERY_EFFECTIVE (1U << 0) - #define BPF_OBJ_NAME_LEN 16U /* Flags for accessing BPF object */ #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) @@ -309,6 +306,9 @@ enum bpf_attach_type { /* Zero-initialize hash function seed. This should only be used for testing. */ #define BPF_F_ZERO_SEED (1U << 6) +/* flags for BPF_PROG_QUERY */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + enum bpf_stack_build_id_status { /* user space need an empty entry to identify end of a trace */ BPF_STACK_BUILD_ID_EMPTY = 0, From 6851cbec4183664e31089e58c8ee5381e7b64f90 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:05 +0200 Subject: [PATCH 1080/1640] UPSTREAM: bpf: add program side {rd, wr}only support for maps This work adds two new map creation flags BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG in order to allow for read-only or write-only BPF maps from the BPF program side. Today we have BPF_F_RDONLY and BPF_F_WRONLY, but these only apply to the system call side, meaning the BPF program has full read/write access to the map as usual while bpf(2) calls with the map fd can either only read or write into the map depending on the flags. BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG allow for the exact opposite, such that the verifier is going to reject program loads if a write into a read-only map or a read from a write-only map is detected. For the read-only map case, helpers that would alter the map state, such as map deletion, update, etc., are also forbidden for programs. As opposed to the two BPF_F_RDONLY / BPF_F_WRONLY flags, BPF_F_RDONLY_PROG as well as BPF_F_WRONLY_PROG really do correspond to the map lifetime. We've enabled this generic map extension to various non-special maps holding normal user data: array, hash, lru, lpm, local storage, queue and stack. Further generic map types could follow in future depending on use case. The main use case here is to forbid writes into .rodata map values from the verifier side.
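A hedged user-space sketch (illustrative, not part of this patch) of creating an array map the program may only read:

  union bpf_attr attr = {};

  attr.map_type    = BPF_MAP_TYPE_ARRAY;
  attr.key_size    = 4;
  attr.value_size  = 64;
  attr.max_entries = 1;
  attr.map_flags   = BPF_F_RDONLY_PROG;	/* program-side writes get rejected at load time */

  map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));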
Change-Id: Iad96790cec92137902fe3ad12f53f1a94d58bc61 Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 29 ++++++++++++++++++++++ include/uapi/linux/bpf.h | 6 ++++- kernel/bpf/arraymap.c | 6 ++++- kernel/bpf/hashtab.c | 6 ++--- kernel/bpf/local_storage.c | 6 ++--- kernel/bpf/lpm_trie.c | 3 ++- kernel/bpf/queue_stack_maps.c | 6 ++--- kernel/bpf/syscall.c | 2 ++ kernel/bpf/verifier.c | 46 +++++++++++++++++++++++++++++++++-- 9 files changed, 96 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 678057fba291..f7039ce8418f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -435,6 +435,35 @@ struct bpf_array { #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 32 +#define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ + BPF_F_RDONLY_PROG | \ + BPF_F_WRONLY | \ + BPF_F_WRONLY_PROG) + +#define BPF_MAP_CAN_READ BIT(0) +#define BPF_MAP_CAN_WRITE BIT(1) + +static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) +{ + u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); + + /* Combination of BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG is + * not possible. + */ + if (access_flags & BPF_F_RDONLY_PROG) + return BPF_MAP_CAN_READ; + else if (access_flags & BPF_F_WRONLY_PROG) + return BPF_MAP_CAN_WRITE; + else + return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE; +} + +static inline bool bpf_map_flags_access_ok(u32 access_flags) +{ + return (access_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) != + (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); +} + struct bpf_event_entry { struct perf_event *event; struct file *perf_file; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2bc24b611187..45d4436b894a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -296,7 +296,7 @@ enum bpf_attach_type { #define BPF_OBJ_NAME_LEN 16U -/* Flags for accessing BPF object */ +/* Flags for accessing BPF object from syscall side. */ #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) @@ -306,6 +306,10 @@ enum bpf_attach_type { /* Zero-initialize hash function seed. This should only be used for testing. */ #define BPF_F_ZERO_SEED (1U << 6) +/* Flags for accessing BPF object from program side. */ +#define BPF_F_RDONLY_PROG (1U << 7) +#define BPF_F_WRONLY_PROG (1U << 8) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2a4950c1fc41..cfbd7bc60acc 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -22,7 +22,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -63,6 +63,7 @@ int array_map_alloc_check(union bpf_attr *attr) if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; @@ -475,6 +476,9 @@ static int fd_array_map_alloc_check(union bpf_attr *attr) /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return -EINVAL; + /* Program read-only/write-only not supported for special maps yet. 
*/ + if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) + return -EINVAL; return array_map_alloc_check(attr); } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 65d588ed8049..681c9ec8d44e 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -23,7 +23,7 @@ #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED) + BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) struct bucket { struct hlist_nulls_head head; @@ -262,8 +262,8 @@ static int htab_map_alloc_check(union bpf_attr *attr) /* Guard against local DoS, and discourage production use. */ return -EPERM; - if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (!lru && percpu_lru) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 6b572e2de7fb..980e8f1f6cb5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO #ifdef CONFIG_CGROUP_BPF #define LOCAL_STORAGE_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_cgroup_storage_map { struct bpf_map map; @@ -282,8 +282,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); - if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return ERR_PTR(-EINVAL); if (attr->max_entries) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 5373298670a5..612b8e126012 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -541,7 +541,7 @@ out: #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) #define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY) + BPF_F_ACCESS_MASK) static struct bpf_map *trie_alloc(union bpf_attr *attr) { @@ -556,6 +556,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) if (attr->max_entries == 0 || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->map_flags & ~LPM_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || attr->key_size < LPM_KEY_SIZE_MIN || attr->key_size > LPM_KEY_SIZE_MAX || attr->value_size < LPM_VAL_SIZE_MIN || diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index b384ea9f3254..0b140d236889 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -11,8 +11,7 @@ #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_queue_stack { struct bpf_map map; @@ -52,7 +51,8 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || attr->value_size == 0 || - attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2442ba79efb7..620ac4b99f67 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -502,6 +502,8 @@ static int map_check_btf(struct 
bpf_map *map, const struct btf *btf, map->spin_lock_off = btf_find_spin_lock(btf, value_type); if (map_value_has_spin_lock(map)) { + if (map->map_flags & BPF_F_RDONLY_PROG) + return -EACCES; if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 251d31fc65b1..7b346636a0c3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1448,6 +1448,28 @@ static int check_stack_access(struct bpf_verifier_env *env, return 0; } +static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, + int off, int size, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; + u32 cap = bpf_map_flags_to_cap(map); + + if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { + verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { + verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -2034,7 +2056,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } - + err = check_map_access_type(env, regno, off, size, t); + if (err) + return err; err = check_map_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); @@ -2345,6 +2369,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_VALUE: + if (check_map_access_type(env, regno, reg->off, access_size, + meta && meta->raw_mode ? BPF_WRITE : + BPF_READ)) + return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ @@ -3099,6 +3127,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map = meta->map_ptr; if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && @@ -3109,11 +3138,24 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_peek_elem) return 0; - if (meta->map_ptr == NULL) { + if (map == NULL) { verbose(env, "kernel subsystem misconfigured verifier\n"); return -EINVAL; } + /* In case of read-only, some additional restrictions + * need to be applied in order to prevent altering the + * state of the map from program side. 
+ */ + if ((map->map_flags & BPF_F_RDONLY_PROG) && + (func_id == BPF_FUNC_map_delete_elem || + func_id == BPF_FUNC_map_update_elem || + func_id == BPF_FUNC_map_push_elem || + func_id == BPF_FUNC_map_pop_elem)) { + verbose(env, "write into map forbidden\n"); + return -EACCES; + } + if (!BPF_MAP_PTR(aux->map_state)) bpf_map_ptr_store(aux, meta->map_ptr, meta->map_ptr->unpriv_array); From 097221481d40323f40e6511bbed84d6d862c988f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:06 +0200 Subject: [PATCH 1081/1640] UPSTREAM: bpf: add syscall side map freeze support This patch adds a new BPF_MAP_FREEZE command which allows "freezing" the map globally as read-only / immutable from the syscall side. Map permission handling has been refactored into map_get_sys_perms() and drops FMODE_CAN_WRITE in the case of a locked map. The main use case is to allow for setting up .rodata sections from the BPF ELF which are loaded into the kernel, meaning the BPF loader first allocates the map, sets up the map value by copying the .rodata section into it, and once complete calls BPF_MAP_FREEZE on the map fd to prevent further modifications. Right now BPF_MAP_FREEZE only takes the map fd as argument while the remaining bpf_attr members are required to be zero. I didn't add write-only locking here as a counterpart since I don't have a concrete use-case for it on my side, and I think it probably makes more sense to wait until there actually is one. In that case bpf_attr can be extended as usual with a flag field and/or others, where flag 0 means that we lock the map read-only, hence this doesn't prevent adding further extensions to BPF_MAP_FREEZE upon need. A map creation flag like BPF_F_WRONCE was not considered for a couple of reasons: i) in case of a generic implementation, a map can consist of more than just one element, thus there could be multiple map updates needed to set the map into a state where it can then be made immutable, ii) WRONCE indicates exactly one write before the map is then set immutable. A generic implementation would set a bit atomically on map update entry (if unset), indicating that every subsequent update from then onwards will need to bail out there. However, map updates can fail, so upon failure that flag would need to be unset again and the update attempt would need to be repeated for it to be eventually made immutable. While this can be made race-free, this approach feels less clean and in combination with reason i), it's not generic enough. A dedicated BPF_MAP_FREEZE command directly sets the flag and the caller has the guarantee that the map is immutable from the syscall side upon successful return for any future syscall invocations that would alter the map state, which is also more intuitive from an API point of view. A command name such as BPF_MAP_LOCK has been avoided as it's too close to BPF map spin locks (which already have the BPF_F_LOCK flag). BPF_MAP_FREEZE is so far only enabled for privileged users.
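An illustrative user-space sketch (not from this patch) of the new command; map_fd is assumed to come from a prior BPF_MAP_CREATE:

  union bpf_attr attr = {};

  attr.map_fd = map_fd;	/* all other attr members must remain zero */

  err = syscall(__NR_bpf, BPF_MAP_FREEZE, &attr, sizeof(attr));
  /* on success, later syscall-side writes to the map fail with -EPERM */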
Change-Id: I42eb839aaa947b3269beb34f7588c338ac056d50 Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 66 ++++++++++++++++++++++++++++++++-------- 3 files changed, 57 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f7039ce8418f..01d300f9ac5e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -92,7 +92,8 @@ struct bpf_map { struct btf *btf; u32 pages; bool unpriv_array; - /* 51 bytes hole */ + bool frozen; /* write-once */ + /* 48 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 45d4436b894a..03c05ed68839 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -105,6 +105,7 @@ enum bpf_cmd { BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, BPF_MAP_LOOKUP_AND_DELETE_ELEM, + BPF_MAP_FREEZE, }; enum bpf_map_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 620ac4b99f67..e53deecd4d95 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -356,6 +356,18 @@ static int bpf_map_release(struct inode *inode, struct file *filp) return 0; } +static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) +{ + fmode_t mode = f.file->f_mode; + + /* Our file permissions may have been overridden by global + * map permissions facing syscall side. + */ + if (READ_ONCE(map->frozen)) + mode &= ~FMODE_CAN_WRITE; + return mode; +} + #ifdef CONFIG_PROC_FS static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { @@ -377,14 +389,16 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "max_entries:\t%u\n" "map_flags:\t%#x\n" "memlock:\t%llu\n" - "map_id:\t%u\n", + "map_id:\t%u\n" + "frozen:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, map->pages * 1ULL << PAGE_SHIFT, - map->id); + map->id, + READ_ONCE(map->frozen)); if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", @@ -728,8 +742,7 @@ static int map_lookup_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -862,8 +875,7 @@ static int map_update_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -974,8 +986,7 @@ static int map_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1026,8 +1037,7 @@ static int map_get_next_key(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -1094,8 +1104,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1137,6 +1146,36 @@ err_put: return err; } +#define BPF_MAP_FREEZE_LAST_FIELD map_fd + +static 
int map_freeze(const union bpf_attr *attr) +{ + int err = 0, ufd = attr->map_fd; + struct bpf_map *map; + struct fd f; + + if (CHECK_ATTR(BPF_MAP_FREEZE)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + if (READ_ONCE(map->frozen)) { + err = -EBUSY; + goto err_put; + } + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto err_put; + } + + WRITE_ONCE(map->frozen, true); +err_put: + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -2761,6 +2800,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_MAP_FREEZE: + err = map_freeze(&attr); + break; case BPF_PROG_LOAD: err = bpf_prog_load(&attr, uattr); break; From f0939cfa3ce2b215f19047e304c0a39e2c36c278 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:07 +0200 Subject: [PATCH 1082/1640] UPSTREAM: bpf: allow . char as part of the object name Trivial addition to allow '.' aside from '_' as "special" characters in the object name. Used to allow for substrings in maps from loader side such as ".bss", ".data", ".rodata", but could also be useful for other purposes. Change-Id: Ic7a2aa981e73847b187edd10b3b04e4e538fe433 Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e53deecd4d95..0a1a87b564d8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -475,10 +475,10 @@ static int bpf_obj_name_cpy(char *dst, const char *src) const char *end = src + BPF_OBJ_NAME_LEN; memset(dst, 0, BPF_OBJ_NAME_LEN); - - /* Copy all isalnum() and '_' char */ + /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { - if (!isalnum(*src) && *src != '_') + if (!isalnum(*src) && + *src != '_' && *src != '.') return -EINVAL; *dst++ = *src++; } From 6106f050b33756867066bf0e027f1c812ab562fb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:09 +0200 Subject: [PATCH 1083/1640] UPSTREAM: bpf: kernel side support for BTF Var and DataSec This work adds kernel-side verification, logging and seq_show dumping of BTF Var and DataSec kinds which are emitted with latest LLVM. The following constraints apply: BTF Var must have: - Its kind_flag is 0 - Its vlen is 0 - Must point to a valid type - Type must not resolve to a forward type - Size of underlying type must be > 0 - Must have a valid name - Can only be a source type, not sink or intermediate one - Name may include dots (e.g. in case of static variables inside functions) - Cannot be a member of a struct/union - Linkage so far can either only be static or global/allocated BTF DataSec must have: - Its kind_flag is 0 - Its vlen cannot be 0 - Its size cannot be 0 - Must have a valid name - Can only be a source type, not sink or intermediate one - Name may include dots (e.g. 
to represent .bss, .data, .rodata etc) - Cannot be a member of a struct/union - Inner btf_var_secinfo array with {type,offset,size} triple must be sorted by offset in ascending order - Type must always point to BTF Var - BTF resolved size of Var must be <= size provided by triple - DataSec size must be >= sum of triple sizes (thus holes are allowed) btf_var_resolve(), btf_ptr_resolve() and btf_modifier_resolve() are on a high level quite similar but each come with slight, subtle differences. They could potentially be a bit refactored in future which hasn't been done here to ease review. Change-Id: I7e0f14d86eb71277a754533fe27a3bd0c117441f Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 417 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 397 insertions(+), 20 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f0c2e33203e8..7910c7e29ed7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -185,6 +185,16 @@ i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_vsi(i, struct_type, member) \ + for (i = 0, member = btf_type_var_secinfo(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +#define for_each_vsi_from(i, from, struct_type, member) \ + for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ + i < btf_type_vlen(struct_type); \ + i++, member++) + static DEFINE_IDR(btf_idr); static DEFINE_SPINLOCK(btf_idr_lock); @@ -262,6 +272,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_RESTRICT] = "RESTRICT", [BTF_KIND_FUNC] = "FUNC", [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", + [BTF_KIND_VAR] = "VAR", + [BTF_KIND_DATASEC] = "DATASEC", }; struct btf_kind_operations { @@ -375,13 +387,36 @@ static bool btf_type_is_int(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_INT; } +static bool btf_type_is_var(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +static bool btf_type_is_datasec(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; +} + +/* Types that act only as a source, not sink or intermediate + * type when resolving. + */ +static bool btf_type_is_resolve_source_only(const struct btf_type *t) +{ + return btf_type_is_var(t) || + btf_type_is_datasec(t); +} + /* What types need to be resolved? * * btf_type_is_modifier() is an obvious one. * * btf_type_is_struct() because its member refers to * another type (through member->type). - + * + * btf_type_is_var() because the variable refers to + * another type. btf_type_is_datasec() holds multiple + * btf_type_is_var() types that need resolving. + * * btf_type_is_array() because its element (array->type) * refers to another type. 
Array can be thought of a * special case of struct while array just has the same @@ -390,9 +425,11 @@ static bool btf_type_is_int(const struct btf_type *t) static bool btf_type_needs_resolve(const struct btf_type *t) { return btf_type_is_modifier(t) || - btf_type_is_ptr(t) || - btf_type_is_struct(t) || - btf_type_is_array(t); + btf_type_is_ptr(t) || + btf_type_is_struct(t) || + btf_type_is_array(t) || + btf_type_is_var(t) || + btf_type_is_datasec(t); } /* t->size can be used */ @@ -403,6 +440,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_DATASEC: return true; } @@ -467,6 +505,16 @@ static const struct btf_enum *btf_type_enum(const struct btf_type *t) return (const struct btf_enum *)(t + 1); } +static const struct btf_var *btf_type_var(const struct btf_type *t) +{ + return (const struct btf_var *)(t + 1); +} + +static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) +{ + return (const struct btf_var_secinfo *)(t + 1); +} + static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -478,23 +526,31 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) offset < btf->hdr.str_len; } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. - */ -static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +{ + if ((first ? !isalpha(c) : + !isalnum(c)) && + c != '_' && + ((c == '.' && !dot_ok) || + c != '.')) + return false; + return true; +} + +static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) { /* offset must be valid */ const char *src = &btf->strings[offset]; const char *src_limit; - if (!isalpha(*src) && *src != '_') + if (!__btf_name_char_ok(*src, true, dot_ok)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { - if (!isalnum(*src) && *src != '_') + if (!__btf_name_char_ok(*src, false, dot_ok)) return false; src++; } @@ -502,6 +558,19 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return !*src; } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ + return __btf_name_valid(btf, offset, false); +} + +static bool btf_name_valid_section(const struct btf *btf, u32 offset) +{ + return __btf_name_valid(btf, offset, true); +} + static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) @@ -697,6 +766,32 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, __btf_verifier_log(log, "\n"); } +__printf(4, 5) +static void btf_verifier_log_vsi(struct btf_verifier_env *env, + const struct btf_type *datasec_type, + const struct btf_var_secinfo *vsi, + const char *fmt, ...) 
+{ + struct bpf_verifier_log *log = &env->log; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + if (env->phase != CHECK_META) + btf_verifier_log_type(env, datasec_type, NULL); + + __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u", + vsi->type, vsi->offset, vsi->size); + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + static void btf_verifier_log_hdr(struct btf_verifier_env *env, u32 btf_data_size) { @@ -974,7 +1069,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { - if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) + if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && + !btf_type_is_var(size_type))) return NULL; size = btf->resolved_sizes[size_type_id]; @@ -1509,7 +1605,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, u32 next_type_size = 0; next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { + if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1542,6 +1638,53 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, return 0; } +static int btf_var_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *next_type; + const struct btf_type *t = v->t; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size; + + next_type = btf_type_by_id(btf, next_type_id); + if (!next_type || btf_type_is_resolve_source_only(next_type)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, next_type) && + !env_type_is_resolved(env, next_type_id)) + return env_stack_push(env, next_type, next_type_id); + + if (btf_type_is_modifier(next_type)) { + const struct btf_type *resolved_type; + u32 resolved_type_id; + + resolved_type_id = next_type_id; + resolved_type = btf_type_id_resolve(btf, &resolved_type_id); + + if (btf_type_is_ptr(resolved_type) && + !env_type_is_resolve_sink(env, resolved_type) && + !env_type_is_resolved(env, resolved_type_id)) + return env_stack_push(env, resolved_type, + resolved_type_id); + } + + /* We must resolve to something concrete at this point, no + * forward types or similar that would resolve to size of + * zero is allowed. 
+ */ + if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + env_stack_pop_resolved(env, next_type_id, next_type_size); + + return 0; +} + static int btf_ptr_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { @@ -1551,7 +1694,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { + if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1609,6 +1752,15 @@ static void btf_modifier_seq_show(const struct btf *btf, btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); } +static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + t = btf_type_id_resolve(btf, &type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +} + static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) @@ -1776,7 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_nosize_or_null(index_type)) { + if (btf_type_is_resolve_source_only(index_type) || + btf_type_nosize_or_null(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1795,7 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_nosize_or_null(elem_type)) { + if (btf_type_is_resolve_source_only(elem_type) || + btf_type_nosize_or_null(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -2016,7 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_nosize_or_null(member_type)) { + if (btf_type_is_resolve_source_only(member_type) || + btf_type_nosize_or_null(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; @@ -2411,6 +2566,222 @@ static struct btf_kind_operations func_ops = { .seq_show = btf_df_seq_show, }; +static s32 btf_var_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_var *var; + u32 meta_needed = sizeof(*var); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (!t->name_off || + !__btf_name_valid(env->btf, t->name_off, true)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + /* A var cannot be in type void */ + if (!t->type || !BTF_TYPE_ID_VALID(t->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + var = btf_type_var(t); + if (var->linkage != BTF_VAR_STATIC && + var->linkage != BTF_VAR_GLOBAL_ALLOCATED) { + btf_verifier_log_type(env, t, "Linkage not supported"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + 
+static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t) +{ + const struct btf_var *var = btf_type_var(t); + + btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage); +} + +static const struct btf_kind_operations var_ops = { + .check_meta = btf_var_check_meta, + .resolve = btf_var_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_var_log, + .seq_show = btf_var_seq_show, +}; + +static s32 btf_datasec_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_var_secinfo *vsi; + u64 last_vsi_end_off = 0, sum = 0; + u32 i, meta_needed; + + meta_needed = btf_type_vlen(t) * sizeof(*vsi); + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (!btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen == 0"); + return -EINVAL; + } + + if (!t->size) { + btf_verifier_log_type(env, t, "size == 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (!t->name_off || + !btf_name_valid_section(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for_each_vsi(i, t, vsi) { + /* A var cannot be in type void */ + if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid type_id"); + return -EINVAL; + } + + if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid offset"); + return -EINVAL; + } + + if (!vsi->size || vsi->size > t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid size"); + return -EINVAL; + } + + last_vsi_end_off = vsi->offset + vsi->size; + if (last_vsi_end_off > t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid offset+size"); + return -EINVAL; + } + + btf_verifier_log_vsi(env, t, vsi, NULL); + sum += vsi->size; + } + + if (t->size < sum) { + btf_verifier_log_type(env, t, "Invalid btf_info size"); + return -EINVAL; + } + + return meta_needed; +} + +static int btf_datasec_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_var_secinfo *vsi; + struct btf *btf = env->btf; + u16 i; + + for_each_vsi_from(i, v->next_member, v->t, vsi) { + u32 var_type_id = vsi->type, type_id, type_size = 0; + const struct btf_type *var_type = btf_type_by_id(env->btf, + var_type_id); + if (!var_type || !btf_type_is_var(var_type)) { + btf_verifier_log_vsi(env, v->t, vsi, + "Not a VAR kind member"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, var_type) && + !env_type_is_resolved(env, var_type_id)) { + env_stack_set_next_member(env, i + 1); + return env_stack_push(env, var_type, var_type_id); + } + + type_id = var_type->type; + if (!btf_type_id_size(btf, &type_id, &type_size)) { + btf_verifier_log_vsi(env, v->t, vsi, "Invalid type"); + return -EINVAL; + } + + if (vsi->size < type_size) { + btf_verifier_log_vsi(env, v->t, vsi, "Invalid size"); + return -EINVAL; + } + } + + env_stack_pop_resolved(env, 0, 0); + return 0; +} + +static void btf_datasec_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static void btf_datasec_seq_show(const struct btf *btf, + const struct btf_type *t, u32 type_id, + void *data, u8 bits_offset, + struct 
seq_file *m) +{ + const struct btf_var_secinfo *vsi; + const struct btf_type *var; + u32 i; + + seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); + for_each_vsi(i, t, vsi) { + var = btf_type_by_id(btf, vsi->type); + if (i) + seq_puts(m, ","); + btf_type_ops(var)->seq_show(btf, var, vsi->type, + data + vsi->offset, bits_offset, m); + } + seq_puts(m, "}"); +} + +static const struct btf_kind_operations datasec_ops = { + .check_meta = btf_datasec_check_meta, + .resolve = btf_datasec_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_datasec_log, + .seq_show = btf_datasec_seq_show, +}; + static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { @@ -2542,6 +2913,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_RESTRICT] = &modifier_ops, [BTF_KIND_FUNC] = &func_ops, [BTF_KIND_FUNC_PROTO] = &func_proto_ops, + [BTF_KIND_VAR] = &var_ops, + [BTF_KIND_DATASEC] = &datasec_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -2622,13 +2995,17 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, if (!env_type_is_resolved(env, type_id)) return false; - if (btf_type_is_struct(t)) + if (btf_type_is_struct(t) || btf_type_is_datasec(t)) return !btf->resolved_ids[type_id] && - !btf->resolved_sizes[type_id]; + !btf->resolved_sizes[type_id]; - if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) { + if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || + btf_type_is_var(t)) { t = btf_type_id_resolve(btf, &type_id); - return t && !btf_type_is_modifier(t); + return t && + !btf_type_is_modifier(t) && + !btf_type_is_var(t) && + !btf_type_is_datasec(t); } if (btf_type_is_array(t)) { From 8911d09bd5fb79e3181abb165eeea79b069d38dc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:08 +0200 Subject: [PATCH 1084/1640] UPSTREAM: bpf: add specification for BTF Var and DataSec kinds This adds the BTF specification and UAPI bits for supporting BTF Var and DataSec kinds. This is following LLVM upstream commit ac4082b77e07 ("[BPF] Add BTF Var and DataSec Support") which has been merged recently. Var itself is for describing a global variable and DataSec to describe ELF sections e.g. data/bss/rodata sections that hold one or multiple global variables. Change-Id: Ie3785c9d7ecf64d36721f4200cc8bf1394382c18 Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/uapi/linux/btf.h | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 7b7475ef2f17..9310652ca4f9 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -39,11 +39,11 @@ struct btf_type { * struct, union and fwd */ __u32 info; - /* "size" is used by INT, ENUM, STRUCT and UNION. + /* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC. * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC and FUNC_PROTO. + * FUNC, FUNC_PROTO and VAR. * "type" is a type_id referring to another type. 
*/ union { @@ -70,8 +70,10 @@ struct btf_type { #define BTF_KIND_RESTRICT 11 /* Restrict */ #define BTF_KIND_FUNC 12 /* Function */ #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ -#define BTF_KIND_MAX 13 -#define NR_BTF_KINDS 14 +#define BTF_KIND_VAR 14 /* Variable */ +#define BTF_KIND_DATASEC 15 /* Section */ +#define BTF_KIND_MAX BTF_KIND_DATASEC +#define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. @@ -138,4 +140,26 @@ struct btf_param { __u32 type; }; +enum { + BTF_VAR_STATIC = 0, + BTF_VAR_GLOBAL_ALLOCATED, +}; + +/* BTF_KIND_VAR is followed by a single "struct btf_var" to describe + * additional information related to the variable such as its linkage. + */ +struct btf_var { + __u32 linkage; +}; + +/* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo" + * to describe all BTF_KIND_VAR types it contains along with it's + * in-section offset as well as size. + */ +struct btf_var_secinfo { + __u32 type; + __u32 offset; + __u32 size; +}; + #endif /* _UAPI__LINUX_BTF_H__ */ From b0c33de60f00f36400c48f578de725106495fa7a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:10 +0200 Subject: [PATCH 1085/1640] UPSTREAM: bpf: allow for key-less BTF in array map Given we'll be reusing BPF array maps for global data/bss/rodata sections, we need a way to associate BTF DataSec type as its map value type. In usual cases we have this ugly BPF_ANNOTATE_KV_PAIR() macro hack e.g. via 38d5d3b3d5db ("bpf: Introduce BPF_ANNOTATE_KV_PAIR") to get initial map to type association going. While more use cases for it are discouraged, this also won't work for global data since the use of array map is a BPF loader detail and therefore unknown at compilation time. For array maps with just a single entry we make an exception in terms of BTF in that key type is declared optional if value type is of DataSec type. The latter LLVM is guaranteed to emit and it also aligns with how we regard global data maps as just a plain buffer area reusing existing map facilities for allowing things like introspection with existing tools. 
Change-Id: I6fd7e20b453529e07aa1c77beacff4e62c7500bd Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + kernel/bpf/arraymap.c | 15 ++++++++++++++- kernel/bpf/btf.c | 2 +- kernel/bpf/syscall.c | 15 +++++++++++---- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index 455d31b55828..64cdf2a23d42 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -51,6 +51,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); +bool btf_type_is_void(const struct btf_type *t); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index cfbd7bc60acc..3ed5784f476a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -394,7 +394,8 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, return; } - seq_printf(m, "%u: ", *(u32 *)key); + if (map->btf_key_type_id) + seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_puts(m, "\n"); @@ -431,6 +432,18 @@ static int array_map_check_btf(const struct bpf_map *map, { u32 int_data; + /* One exception for keyless BTF: .bss/.data/.rodata map */ + if (btf_type_is_void(key_type)) { + if (map->map_type != BPF_MAP_TYPE_ARRAY || + map->max_entries != 1) + return -EINVAL; + + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) + return -EINVAL; + + return 0; + } + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7910c7e29ed7..d890db3452fa 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -326,7 +326,7 @@ static bool btf_type_is_modifier(const struct btf_type *t) return false; } -static bool btf_type_is_void(const struct btf_type *t) +bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0a1a87b564d8..8e6645d1aec5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -505,9 +505,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 key_size, value_size; int ret = 0; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || key_size != map->key_size) - return -EINVAL; + /* Some maps allow key to be unspecified. 
*/ + if (btf_key_id) { + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + } else { + key_type = btf_type_by_id(btf, 0); + if (!map->ops->map_check_btf) + return -EINVAL; + } value_type = btf_type_id_size(btf, &btf_value_id, &value_size); if (!value_type || value_size != map->value_size) @@ -574,7 +581,7 @@ static int map_create(union bpf_attr *attr) if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; - if (!attr->btf_key_type_id || !attr->btf_value_type_id) { + if (!attr->btf_value_type_id) { err = -EINVAL; goto free_map_nouncharge; } From 563cbcce0ca0a7f1d1c999cd4995903a9489eb33 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 19 Oct 2018 09:57:58 -0700 Subject: [PATCH 1086/1640] BACKPORT: bpf: add tests for direct packet access from CGROUP_SKB Tests are added to make sure CGROUP_SKB cannot access: tc_classid, data_meta, flow_keys and can read and write: mark, priority, and cb[0-4] and can read other fields. To make selftest with skb->sk work, a dummy sk is added in bpf_prog_test_run_skb(). Change-Id: I788ec54e782b923c99ab4b719232dc7cb0454495 Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index e5510dc51a8c..2a085d9f0ffa 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) @@ -116,6 +118,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; + struct sock *sk; void *data; int ret; @@ -138,11 +141,21 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, break; } - skb = build_skb(data, 0); - if (!skb) { + sk = kzalloc(sizeof(struct sock), GFP_USER); + if (!sk) { kfree(data); return -ENOMEM; } + sock_net_set(sk, current->nsproxy->net_ns); + sock_init_data(NULL, sk); + + skb = build_skb(data, 0); + if (!skb) { + kfree(data); + kfree(sk); + return -ENOMEM; + } + skb->sk = sk; skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); @@ -160,6 +173,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { kfree_skb(skb); + kfree(sk); return -ENOMEM; } } @@ -172,6 +186,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, size = skb_headlen(skb); ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); kfree_skb(skb); + kfree(sk); return ret; } From dc2727d4d1348099ea16bae3e977c59bb2eba7bf Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 1 Dec 2018 10:39:44 -0800 Subject: [PATCH 1087/1640] UPSTREAM: bpf: refactor bpf_test_run() to separate own failures and test program result After commit f42ee093be29 ("bpf/test_run: support cgroup local storage") the bpf_test_run() function may fail with -ENOMEM, if it's not possible to allocate memory for a cgroup local storage. This error shouldn't be mixed with the return value of the testing program. Let's add an additional argument with a pointer where to store the testing program's result; and make bpf_test_run() return either 0 or -ENOMEM.
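Sketched in isolation (simplified from the diff below), the new calling convention is:

	int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
			 u32 *retval, u32 *time);

	err = bpf_test_run(prog, skb, repeat, &retval, &duration);
	if (err)	/* own failure, e.g. -ENOMEM */
		goto out;
	/* retval now holds the test program's return code */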
Fixes: f42ee093be29 ("bpf/test_run: support cgroup local storage") Reported-by: Dan Carpenter Suggested-by: Alexei Starovoitov Change-Id: I55188ea781e8289489af0e850ce96ce4450f98e2 Signed-off-by: Roman Gushchin Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2a085d9f0ffa..0c0383093b8f 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -28,12 +28,13 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, return ret; } -static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) +static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *ret, + u32 *time) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 }; enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; - u32 ret = 0, i; + u32 i; for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); @@ -49,7 +50,7 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) repeat = 1; time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - ret = bpf_test_run_one(prog, ctx, storage); + *ret = bpf_test_run_one(prog, ctx, storage); if (need_resched()) { if (signal_pending(current)) break; @@ -65,7 +66,7 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); - return ret; + return 0; } static int bpf_test_finish(const union bpf_attr *kattr, @@ -166,7 +167,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); - retval = bpf_test_run(prog, skb, repeat, &duration); + ret = bpf_test_run(prog, skb, repeat, &retval, &duration); + if (ret) { + kfree_skb(skb); + kfree(sk); + return ret; + } if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); @@ -213,11 +219,14 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; - retval = bpf_test_run(prog, &xdp, repeat, &duration); + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration); + if (ret) + goto out; if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || xdp.data_end != xdp.data + size) size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); +out: kfree(data); return ret; } From a5eb5f35bdb95551c7a21ee52ce7bb3bc1ac3f7d Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 9 Apr 2019 11:49:09 -0700 Subject: [PATCH 1088/1640] UPSTREAM: bpf: support input __sk_buff context in BPF_PROG_TEST_RUN Add new set of arguments to bpf_attr for BPF_PROG_TEST_RUN: * ctx_in/ctx_size_in - input context * ctx_out/ctx_size_out - output context The intended use case is to pass some metadata to the test runs that operate on skb (this has been brought up at a recent LPC). For programs that use bpf_prog_test_run_skb, support __sk_buff input and output. Initially, from input __sk_buff, copy _only_ cb and priority into skb, all other non-zero fields are prohibited (with EINVAL). If the user has set ctx_out/ctx_size_out, copy the potentially modified __sk_buff back to the userspace.
We require all fields of input __sk_buff except the ones we explicitly support to be set to zero. The expectation is that in the future we might add support for more fields and we want to fail explicitly if the user runs the program on the kernel where we don't yet support them. The API is intentionally vague (i.e. we don't explicitly add __sk_buff to bpf_attr, but ctx_in) to potentially let other test_run types use this interface in the future (this can be xdp_md for xdp types for example). v4: * don't copy more than allowed in bpf_ctx_init [Martin] v3: * handle case where ctx_in is NULL, but ctx_out is not [Martin] * convert size==0 checks to ptr==NULL checks and add some extra ptr checks [Martin] v2: * Addressed comments from Martin Lau Change-Id: Ic4b5cfef2528eb6280083ac96759d48fbf03aeec Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 7 ++ kernel/bpf/syscall.c | 10 ++- net/bpf/test_run.c | 143 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 151 insertions(+), 9 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 03c05ed68839..015ad0ad3fa7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -411,6 +411,13 @@ union bpf_attr { __aligned_u64 data_out; __u32 repeat; __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8e6645d1aec5..58d36c2a91fd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2026,7 +2026,7 @@ static int bpf_prog_query(const union bpf_attr *attr, return cgroup_bpf_prog_query(attr, uattr); } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration +#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -2039,6 +2039,14 @@ static int bpf_prog_test_run(const union bpf_attr *attr, if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; + if ((attr->test.ctx_size_in && !attr->test.ctx_in) || + (!attr->test.ctx_size_in && attr->test.ctx_in)) + return -EINVAL; + + if ((attr->test.ctx_size_out && !attr->test.ctx_out) || + (!attr->test.ctx_size_out && attr->test.ctx_out)) + return -EINVAL; + prog = bpf_prog_get(attr->test.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 0c0383093b8f..59067cda3250 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -110,12 +110,126 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, return data; } +static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) +{ + void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + u32 size = kattr->test.ctx_size_in; + void *data; + int err; + + if (!data_in && !data_out) + return NULL; + + data = kzalloc(max_size, GFP_USER); + if (!data) + return ERR_PTR(-ENOMEM); + + if (data_in) { + err = bpf_check_uarg_tail_zero(data_in, max_size, size); + if (err) { + kfree(data); + return ERR_PTR(err); + } + + size = min_t(u32, max_size, size); + if (copy_from_user(data, data_in, size)) { + kfree(data); + return ERR_PTR(-EFAULT); + } + } + return data; +} + +static int bpf_ctx_finish(const union bpf_attr *kattr, + union bpf_attr __user *uattr, const void 
*data, + u32 size) +{ + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + int err = -EFAULT; + u32 copy_size = size; + + if (!data || !data_out) + return 0; + + if (copy_size > kattr->test.ctx_size_out) { + copy_size = kattr->test.ctx_size_out; + err = -ENOSPC; + } + + if (copy_to_user(data_out, data, copy_size)) + goto out; + if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size))) + goto out; + if (err != -ENOSPC) + err = 0; +out: + return err; +} + +/** + * range_is_zero - test whether buffer is initialized + * @buf: buffer to check + * @from: check from this position + * @to: check up until (excluding) this position + * + * This function returns true if the there is a non-zero byte + * in the buf in the range [from,to). + */ +static inline bool range_is_zero(void *buf, size_t from, size_t to) +{ + return !memchr_inv((u8 *)buf + from, 0, to - from); +} + +static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return 0; + + /* make sure the fields we don't use are zeroed */ + if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority))) + return -EINVAL; + + /* priority is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) + + FIELD_SIZEOF(struct __sk_buff, priority), + offsetof(struct __sk_buff, cb))) + return -EINVAL; + + /* cb is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + + FIELD_SIZEOF(struct __sk_buff, cb), + sizeof(struct __sk_buff))) + return -EINVAL; + + skb->priority = __skb->priority; + memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); + + return 0; +} + +static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return; + + __skb->priority = skb->priority; + memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); +} + int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool is_l2 = false, is_direct_pkt_access = false; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; + struct __sk_buff *ctx = NULL; u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; @@ -128,6 +242,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(data)) return PTR_ERR(data); + ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); + if (IS_ERR(ctx)) { + kfree(data); + return PTR_ERR(ctx); + } + switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: @@ -145,6 +265,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, sk = kzalloc(sizeof(struct sock), GFP_USER); if (!sk) { kfree(data); + kfree(ctx); return -ENOMEM; } sock_net_set(sk, current->nsproxy->net_ns); @@ -153,6 +274,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, skb = build_skb(data, 0); if (!skb) { kfree(data); + kfree(ctx); kfree(sk); return -ENOMEM; } @@ -167,32 +289,37 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); + ret = convert___skb_to_skb(skb, ctx); + if (ret) + goto out; ret = bpf_test_run(prog, skb, repeat, &retval, &duration); - if (ret) { - kfree_skb(skb); - kfree(sk); - return ret; - } + if (ret) + goto out; if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if 
(pskb_expand_head(skb, nhead, 0, GFP_USER)) { - kfree_skb(skb); - kfree(sk); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } memset(__skb_push(skb, hh_len), 0, hh_len); } + convert_skb_to___skb(skb, ctx); size = skb->len; /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) size = skb_headlen(skb); ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, ctx, + sizeof(struct __sk_buff)); +out: kfree_skb(skb); kfree(sk); + kfree(ctx); return ret; } From 602b7898b303c3c6253ef27c30da30895b7c7971 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Tue, 12 Mar 2019 09:27:09 -0700 Subject: [PATCH 1089/1640] UPSTREAM: bpf: Add base proto function for cgroup-bpf programs Currently kernel/bpf/cgroup.c contains only one program type and one proto function cgroup_dev_func_proto(). It'd be useful to have base proto function that can be reused for new cgroup-bpf program types coming soon. Introduce cgroup_base_func_proto(). Change-Id: Id6f272a0a343dc569a08a12b2e39d3c651eaf9ef Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4e807973aa80..f6cd38746df2 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -701,7 +701,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -725,6 +725,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return cgroup_base_func_proto(func_id, prog); +} + static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, From 51e21da97e922de6291092379dd420efaf068fc2 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 27 Feb 2019 12:59:24 -0800 Subject: [PATCH 1090/1640] BACKPORT: bpf: Sysctl hook Containerized applications may run as root and it may create problems for whole host. Specifically such applications may change a sysctl and affect applications in other containers. Furthermore in existing infrastructure it may not be possible to just completely disable writing to sysctl, instead such a process should be gradual with ability to log what sysctl are being changed by a container, investigate, limit the set of writable sysctl to currently used ones (so that new ones can not be changed) and eventually reduce this set to zero. The patch introduces new program type BPF_PROG_TYPE_CGROUP_SYSCTL and attach type BPF_CGROUP_SYSCTL to solve these problems on cgroup basis. New program type has access to following minimal context: struct bpf_sysctl { __u32 write; }; Where @write indicates whether sysctl is being read (= 0) or written (= 1). Helpers to access sysctl name and value will be introduced separately. BPF_CGROUP_SYSCTL attach point is added to sysctl code right before passing control to ctl_table->proc_handler so that BPF program can either allow or deny access to sysctl. 
Suggested-by: Roman Gushchin Change-Id: I1cefe648cac50683d02ca33cbbbd865bd4522dfe Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- fs/proc/proc_sysctl.c | 5 +++ include/linux/bpf-cgroup.h | 18 ++++++++ include/linux/bpf_types.h | 1 + include/linux/filter.h | 8 ++++ include/uapi/linux/bpf.h | 11 ++++- kernel/bpf/cgroup.c | 92 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 7 +++ kernel/bpf/verifier.c | 1 + 8 files changed, 142 insertions(+), 1 deletion(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f4b46f796901..b74d74d9b275 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -594,6 +595,10 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write); + if (error) + goto out; + /* careful: calling conventions are nasty here */ res = count; error = table->proc_handler(table, write, buf, &res, ppos); diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index eb3266d39762..17340d242af7 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -17,6 +17,8 @@ struct bpf_map; struct bpf_prog; struct bpf_sock_ops_kern; struct bpf_cgroup_storage; +struct ctl_table; +struct ctl_table_header; #ifdef CONFIG_CGROUP_BPF @@ -109,6 +111,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, + struct ctl_table *table, int write, + enum bpf_attach_type type); + static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { @@ -259,6 +265,17 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, \ __ret; \ }) + + +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ + BPF_CGROUP_SYSCTL); \ + __ret; \ +}) + int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, @@ -329,6 +346,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index c72ac19a9b2a..805db999815e 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -25,6 +25,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl) #endif #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) diff --git a/include/linux/filter.h b/include/linux/filter.h index 036903dfea6f..a2818cfa5200 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -33,6 +33,8 @@ struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; struct sock_reuseport; +struct ctl_table; +struct 
ctl_table_header; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -1259,4 +1261,10 @@ struct bpf_sock_ops_kern { */ }; +struct bpf_sysctl_kern { + struct ctl_table_header *head; + struct ctl_table *table; + int write; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 015ad0ad3fa7..f2dab143f40d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -167,6 +167,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_PROG_TYPE_CGROUP_SYSCTL, }; enum bpf_attach_type { @@ -188,7 +189,8 @@ enum bpf_attach_type { BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, BPF_FLOW_DISSECTOR, - BPF_CGROUP_UDP4_RECVMSG = 19, + BPF_CGROUP_SYSCTL, + BPF_CGROUP_UDP4_RECVMSG, BPF_CGROUP_UDP6_RECVMSG, __MAX_BPF_ATTACH_TYPE }; @@ -3167,4 +3169,11 @@ struct bpf_line_info { struct bpf_spin_lock { __u32 val; }; + +struct bpf_sysctl { + __u32 write; /* Sysctl is being read (= 0) or written (= 1). + * Allows 1,2,4-byte read, but no write. + */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index f6cd38746df2..610491b5f0aa 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -768,3 +770,93 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { .get_func_proto = cgroup_dev_func_proto, .is_valid_access = cgroup_dev_is_valid_access, }; + +/** + * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl + * + * @head: sysctl table header + * @table: sysctl table + * @write: sysctl is being read (= 0) or written (= 1) + * @type: type of program to be executed + * + * Program is run when sysctl is being accessed, either read or written, and + * can allow or deny such access. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases 0 is returned. + */ +int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, + struct ctl_table *table, int write, + enum bpf_attach_type type) +{ + struct bpf_sysctl_kern ctx = { + .head = head, + .table = table, + .write = write, + }; + struct cgroup *cgrp; + int ret; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + rcu_read_unlock(); + + return ret == 1 ? 
0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); + +static const struct bpf_func_proto * +sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return cgroup_base_func_proto(func_id, prog); +} + +static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off + size > sizeof(struct bpf_sysctl) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sysctl, write): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + default: + return false; + } +} + +static u32 sysctl_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sysctl, write): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sysctl_kern, write, + FIELD_SIZEOF(struct bpf_sysctl_kern, + write), + target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops cg_sysctl_verifier_ops = { + .get_func_proto = sysctl_func_proto, + .is_valid_access = sysctl_is_valid_access, + .convert_ctx_access = sysctl_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sysctl_prog_ops = { +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 58d36c2a91fd..8685d2bcd99c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1901,6 +1901,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_FLOW_DISSECTOR: ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; break; + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; default: return -EINVAL; } @@ -1981,6 +1984,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) return lirc_prog_detach(attr); case BPF_FLOW_DISSECTOR: return skb_flow_dissector_bpf_prog_detach(attr); + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; default: return -EINVAL; } @@ -2016,6 +2022,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: + case BPF_CGROUP_SYSCTL: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b346636a0c3..49533749f375 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5489,6 +5489,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: break; default: return 0; From b9c0c6297d67a5b8ef74c1f1c923fda70913d0c3 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Fri, 30 Nov 2018 15:32:20 -0800 Subject: [PATCH 1091/1640] BACKPORT: bpf: Support sk lookup in netns with id 0 David Ahern and Nicolas Dichtel report that the handling of the netns id 0 is incorrect for the BPF socket lookup helpers: rather than finding the netns with id 0, it is resolving to the current netns. This renders the netns_id 0 inaccessible. 
To fix this, adjust the API for the netns to treat all negative s32 values as a lookup in the current netns (including u64 values which when truncated to s32 become negative), while any values with a positive value in the signed 32-bit integer space would result in a lookup for a socket in the netns corresponding to that id. As before, if the netns with that ID does not exist, no socket will be found. Any netns outside of these ranges will fail to find a corresponding socket, as those values are reserved for future usage. Change-Id: Ib10a6977dbb38065ef7199b33aa99b96e020c9bb Signed-off-by: Joe Stringer Acked-by: Nicolas Dichtel Acked-by: Joey Pabalinas Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 35 +++++++++++++++++++++-------------- net/core/filter.c | 11 ++++++----- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f2dab143f40d..085e33584a65 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2235,7 +2235,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, @@ -2252,12 +2252,14 @@ union bpf_attr { * **sizeof**\ (*tuple*\ **->ipv6**) * Look for an IPv6 socket. * - * If the *netns* is zero, then the socket lookup table in the - * netns associated with the *ctx* will be used. For the TC hooks, - * this in the netns of the device in the skb. For socket hooks, - * this in the netns of the socket. If *netns* is non-zero, then - * it specifies the ID of the netns relative to the netns - * associated with the *ctx*. + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* will + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. * * All values for *flags* are reserved for future usage, and must * be left at zero. @@ -2269,7 +2271,7 @@ union bpf_attr { * For sockets with reuseport option, *struct bpf_sock* * return is from reuse->socks[] using hash of the packet. * - * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for UDP socket matching *tuple*, optionally in a child * network namespace *netns*. The return value must be checked, @@ -2286,12 +2288,14 @@ union bpf_attr { * **sizeof**\ (*tuple*\ **->ipv6**) * Look for an IPv6 socket. * - * If the *netns* is zero, then the socket lookup table in the - * netns associated with the *ctx* will be used. For the TC hooks, - * this in the netns of the device in the skb. For socket hooks, - * this in the netns of the socket. 
If *netns* is non-zero, then - * it specifies the ID of the netns relative to the netns - * associated with the *ctx*. + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* will + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. * * All values for *flags* are reserved for future usage, and must * be left at zero. @@ -2549,6 +2553,9 @@ enum bpf_func_id { /* BPF_FUNC_perf_event_output for sk_buff input context. */ #define BPF_F_CTXLEN_MASK (0xfffffULL << 32) +/* Current network namespace */ +#define BPF_F_CURRENT_NETNS (-1L) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/net/core/filter.c b/net/core/filter.c index e7783a2d841b..f15d6b5bcc29 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4801,7 +4801,8 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, int sdif; family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; - if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) + if (unlikely(family == AF_UNSPEC || flags || + !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; if (family == AF_INET) @@ -4809,15 +4810,15 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, else sdif = inet6_sdif(skb); - if (netns_id) { + if ((s32)netns_id < 0) { + net = caller_net; + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); + } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); - } else { - net = caller_net; - sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); } out: From 83bd63756b722a17ba3deca5b2c947756d3e5b05 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:54 -0400 Subject: [PATCH 1092/1640] UPSTREAM: bpf: add bpf_skb_adjust_room mode BPF_ADJ_ROOM_MAC bpf_skb_adjust_room net allows inserting room in an skb. Existing mode BPF_ADJ_ROOM_NET inserts room after the network header by pulling the skb, moving the network header forward and zeroing the new space. Add new mode BPF_ADJUST_ROOM_MAC that inserts room after the mac header. This allows inserting tunnel headers in front of the network header without having to recreate the network header in the original space, avoiding two copies. Change-Id: I93d50dfbba0864cc7e41351087ec66cbe0d2ea6a Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 6 +++++- net/core/filter.c | 38 ++++++++++++++++++++------------------ 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 085e33584a65..5ab664b5855c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1528,7 +1528,10 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * - * There is a single supported mode at this time: + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed below the layer 2 header). 
* * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). @@ -2559,6 +2562,7 @@ enum bpf_func_id { /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, }; /* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ diff --git a/net/core/filter.c b/net/core/filter.c index f15d6b5bcc29..793aae18f38b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2672,9 +2672,8 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) } } -static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) +static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) { - u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) @@ -2701,9 +2700,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) return 0; } -static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff) { - u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) @@ -2732,7 +2730,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) #define BPF_SKB_MAX_LEN SKB_MAX_ALLOC -static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) +BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) { bool trans_same = skb->transport_header == skb->network_header; u32 len_cur, len_diff_abs = abs(len_diff); @@ -2740,14 +2739,28 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) u32 len_max = BPF_SKB_MAX_LEN; __be16 proto = skb_protocol(skb, true); bool shrink = len_diff < 0; + u32 off; int ret; + if (unlikely(flags)) + return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (unlikely(proto != htons(ETH_P_IP) && proto != htons(ETH_P_IPV6))) return -ENOTSUPP; + off = skb_mac_header_len(skb); + switch (mode) { + case BPF_ADJ_ROOM_NET: + off += bpf_skb_net_base_len(skb); + break; + case BPF_ADJ_ROOM_MAC: + break; + default: + return -ENOTSUPP; + } + len_cur = skb->len - skb_network_offset(skb); if (skb_transport_header_was_set(skb) && !trans_same) len_cur = skb_network_header_len(skb); @@ -2757,24 +2770,13 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) !skb_is_gso(skb)))) return -ENOTSUPP; - ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) : - bpf_skb_net_grow(skb, len_diff_abs); + ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs) : + bpf_skb_net_grow(skb, off, len_diff_abs); bpf_compute_data_pointers(skb); return ret; } -BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, - u32, mode, u64, flags) -{ - if (unlikely(flags)) - return -EINVAL; - if (likely(mode == BPF_ADJ_ROOM_NET)) - return bpf_skb_adjust_net(skb, len_diff); - - return -ENOTSUPP; -} - static const struct bpf_func_proto bpf_skb_adjust_room_proto = { .func = bpf_skb_adjust_room, .gpl_only = false, From 796acead7cfc9416b59cced9fd4f212b3c64fc03 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:55 -0400 Subject: [PATCH 1093/1640] UPSTREAM: bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_FIXED_GSO bpf_skb_adjust_room adjusts gso_size of gso packets to account for the pushed or popped header room. This is not allowed with UDP, where gso_size delineates datagrams. Add an option to avoid these updates and allow this call for datagrams. 
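For instance, inside a tc BPF program the new flag would be passed roughly as in this sketch, where skb is the program's struct __sk_buff context and the 8-byte length is arbitrary:

	/* Grow 8 bytes of room at the network layer without touching
	 * gso_size, which keeps the call legal for UDP GSO datagrams.
	 */
	ret = bpf_skb_adjust_room(skb, 8, BPF_ADJ_ROOM_NET,
				  BPF_F_ADJ_ROOM_FIXED_GSO);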
It can also be used with TCP, when MSS is known to allow headroom, e.g., through MSS clamping or route MTU. Changes v1->v2: - document flag BPF_F_ADJ_ROOM_FIXED_GSO - do not expose BPF_F_ADJ_ROOM_MASK through uapi, as it may change. Link: https://patchwork.ozlabs.org/patch/1052497/ Change-Id: Ifa86028c428e178b0b0fbaf6045d855c614ab423 Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 +++++++-- net/core/filter.c | 38 +++++++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5ab664b5855c..7e048478406b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1536,8 +1536,10 @@ union bpf_attr { * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * All values for *flags* are reserved for future usage, and must - * be left at zero. + * There is one supported flag at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2559,6 +2561,9 @@ enum bpf_func_id { /* Current network namespace */ #define BPF_F_CURRENT_NETNS (-1L) +/* BPF_FUNC_skb_adjust_room flags. */ +#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/net/core/filter.c b/net/core/filter.c index 793aae18f38b..53fbc6e35502 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2672,12 +2672,19 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) } } -static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) +#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO) + +static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, + u64 flags) { int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { + /* udp gso_size delineates datagrams, only allow if fixed */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || + !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + return -ENOTSUPP; + } ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) @@ -2691,7 +2698,9 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff) struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header grow, MSS needs to be downgraded. */ - skb_decrease_gso_size(shinfo, len_diff); + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_decrease_gso_size(shinfo, len_diff); + /* Header must be checked, and gso_segs recomputed. 
*/ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -2700,12 +2709,17 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, return 0; } -static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff) +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, + u64 flags) { int ret; - if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) - return -ENOTSUPP; + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { + /* udp gso_size delineates datagrams, only allow if fixed */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || + !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + return -ENOTSUPP; + } ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) @@ -2719,7 +2733,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff) struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header shrink, MSS can be upgraded. */ - skb_increase_gso_size(shinfo, len_diff); + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_increase_gso_size(shinfo, len_diff); + /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; @@ -2742,7 +2758,7 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) u32 off; int ret; - if (unlikely(flags)) + if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -2770,8 +2786,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, !skb_is_gso(skb)))) return -ENOTSUPP; - ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs) : - bpf_skb_net_grow(skb, off, len_diff_abs); + ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : + bpf_skb_net_grow(skb, off, len_diff_abs, flags); bpf_compute_data_pointers(skb); return ret; From 0134e563c0f98bb485a5246697592ef3dce88c15 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:56 -0400 Subject: [PATCH 1094/1640] UPSTREAM: bpf: add bpf_skb_adjust_room encap flags When pushing tunnel headers, annotate skbs in the same way as tunnel devices. For GSO packets, the network stack requires certain fields set to segment packets with tunnel headers. gre_gso_segment depends on transport and inner mac header, for instance. Add an option to pass this information. Remove the restriction on len_diff to network header length, which is too short, e.g., for GRE protocols. Changes v1->v2: - document new flags - BPF_F_ADJ_ROOM_MASK moved v2->v3: - BPF_F_ADJ_ROOM_ENCAP_L3_MASK moved Change-Id: Ied347f8e7cc2fe5e0806cc8cc9f1ce6e26c69a49 Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 16 +++++++++- net/core/filter.c | 66 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 76 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7e048478406b..44b7eeaca424 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1536,11 +1536,20 @@ union bpf_attr { * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * There is one supported flag at this time: + * The following flags are supported at this time: * * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. * Adjusting mss in this way is not allowed for datagrams. * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **: + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly.
+ * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **: + * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2564,6 +2573,11 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) +#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) +#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/net/core/filter.c b/net/core/filter.c index 53fbc6e35502..22770ef5bf0b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2672,11 +2672,20 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) } } -#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO) +#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + +#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ + BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ + BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ + BPF_F_ADJ_ROOM_ENCAP_L4_UDP) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { + bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; + unsigned int gso_type = SKB_GSO_DODGY; + u16 mac_len, inner_net, inner_trans; int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -2690,10 +2699,60 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, if (unlikely(ret < 0)) return ret; + if (encap) { + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -ENOTSUPP; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 && + flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + return -EINVAL; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE && + flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + return -EINVAL; + + if (skb->encapsulation) + return -EALREADY; + + mac_len = skb->network_header - skb->mac_header; + inner_net = skb->network_header; + inner_trans = skb->transport_header; + } + ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; + if (encap) { + /* inner mac == inner_net on l3 encap */ + skb->inner_mac_header = inner_net; + skb->inner_network_header = inner_net; + skb->inner_transport_header = inner_trans; + skb_set_inner_protocol(skb, skb->protocol); + + skb->encapsulation = 1; + skb_set_network_header(skb, mac_len); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + gso_type |= SKB_GSO_UDP_TUNNEL; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) + gso_type |= SKB_GSO_GRE; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + gso_type |= SKB_GSO_IPXIP6; + else + gso_type |= SKB_GSO_IPXIP4; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || + flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) { + int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ? + sizeof(struct ipv6hdr) : + sizeof(struct iphdr); + + skb_set_transport_header(skb, mac_len + nh_len); + } + } + if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -2702,7 +2761,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. 
*/ - shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; } @@ -2749,7 +2808,6 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { - bool trans_same = skb->transport_header == skb->network_header; u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); u32 len_max = BPF_SKB_MAX_LEN; @@ -2778,8 +2836,6 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, } len_cur = skb->len - skb_network_offset(skb); - if (skb_transport_header_was_set(skb) && !trans_same) - len_cur = skb_network_header_len(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || (!shrink && (skb->len + len_diff_abs > len_max && From 5bd7e6db64e71ec56fceb3f5d0ca702661a21714 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 27 Feb 2019 13:28:48 -0800 Subject: [PATCH 1095/1640] BACKPORT: bpf: Introduce bpf_sysctl_get_name helper Add bpf_sysctl_get_name() helper to copy sysctl name (/proc/sys/ entry) into provided by BPF_PROG_TYPE_CGROUP_SYSCTL program buffer. By default full name (w/o /proc/sys/) is copied, e.g. "net/ipv4/tcp_mem". If BPF_F_SYSCTL_BASE_NAME flag is set, only base name will be copied, e.g. "tcp_mem". Documentation for the new helper is provided in bpf.h UAPI. Change-Id: Icbe00bfd09e7914393adde88a9cc053b27151e34 Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 19 +++++++++++ kernel/bpf/cgroup.c | 70 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 44b7eeaca424..ea8bc6322a74 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2384,6 +2384,22 @@ union bpf_attr { * Pointer to **struct bpf_sock**, or **NULL** in case of failure. * For sockets with reuseport option, the **struct bpf_sock** * result is from **reuse->socks**\ [] using the hash of the tuple. + * + * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * Description + * Get name of sysctl in /proc/sys/ and copy it into provided by + * program buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2578,6 +2594,9 @@ enum bpf_func_id { #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) +/* BPF_FUNC_sysctl_get_name flags. */ +#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. 
*/ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 610491b5f0aa..a68387043244 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -806,10 +807,77 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, + size_t *lenp) +{ + ssize_t tmp_ret = 0, ret; + + if (dir->header.parent) { + tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); + if (ret < 0) + return ret; + *bufp += ret; + *lenp -= ret; + ret += tmp_ret; + + /* Avoid leading slash. */ + if (!ret) + return ret; + + tmp_ret = strscpy(*bufp, "/", *lenp); + if (tmp_ret < 0) + return tmp_ret; + *bufp += tmp_ret; + *lenp -= tmp_ret; + + return ret + tmp_ret; +} + +BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len, u64, flags) +{ + ssize_t tmp_ret = 0, ret; + + if (!buf) + return -EINVAL; + + if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { + if (!ctx->head) + return -EINVAL; + tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(buf, ctx->table->procname, buf_len); + + return ret < 0 ? ret : tmp_ret + ret; +} + +static const struct bpf_func_proto bpf_sysctl_get_name_proto = { + .func = bpf_sysctl_get_name, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return cgroup_base_func_proto(func_id, prog); + switch (func_id) { + case BPF_FUNC_sysctl_get_name: + return &bpf_sysctl_get_name_proto; + default: + return cgroup_base_func_proto(func_id, prog); + } } static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, From 293588879a388f079ac909d3c5823cf7b704206a Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 28 Feb 2019 19:22:15 -0800 Subject: [PATCH 1096/1640] BACKPORT: bpf: Introduce bpf_sysctl_get_current_value helper Add bpf_sysctl_get_current_value() helper to copy current sysctl value into provided by BPF_PROG_TYPE_CGROUP_SYSCTL program buffer. It provides same string as user space can see by reading corresponding file in /proc/sys/, including new line, etc. Documentation for the new helper is provided in bpf.h UAPI. Since current value is kept in ctl_table->data in a parsed form, ctl_table->proc_handler() with write=0 is called to read that data and convert it to a string. Such a string can later be parsed by a program using helpers that will be introduced separately. Unfortunately it's not trivial to provide API to access parsed data due to variety of data representations (string, intvec, uintvec, ulongvec, custom structures, even NULL, etc). Instead it's assumed that user know how to handle specific sysctl they're interested in and appropriate helpers can be used. Since ctl_table->proc_handler() expects __user buffer, conversion to __user happens for kernel allocated one where the value is stored. 
Change-Id: I8e6c6834f500bdd88ecfa054b4655bf2059168e7 Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 ++ include/uapi/linux/bpf.h | 19 ++++++++++++ kernel/bpf/cgroup.c | 65 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+) diff --git a/include/linux/filter.h b/include/linux/filter.h index a2818cfa5200..26ac5095a07e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1264,6 +1264,8 @@ struct bpf_sock_ops_kern { struct bpf_sysctl_kern { struct ctl_table_header *head; struct ctl_table *table; + void *cur_val; + size_t cur_len; int write; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ea8bc6322a74..d2e73573d371 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2400,6 +2400,25 @@ union bpf_attr { * * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). + * + * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into provided + * by program buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if current value was unavailable, e.g. because + * sysctl is uninitialized and read returns -EIO for it. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a68387043244..c6b2cf29a54b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -794,15 +794,37 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .head = head, .table = table, .write = write, + .cur_val = NULL, + .cur_len = PAGE_SIZE, }; struct cgroup *cgrp; int ret; + ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); + if (ctx.cur_val) { + mm_segment_t old_fs; + loff_t pos = 0; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, + &ctx.cur_len, &pos)) { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + set_fs(old_fs); + } else { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); rcu_read_unlock(); + kfree(ctx.cur_val); + return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); @@ -869,12 +891,55 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .arg4_type = ARG_ANYTHING, }; +static int copy_sysctl_value(char *dst, size_t dst_len, char *src, + size_t src_len) +{ + if (!dst) + return -EINVAL; + + if (!dst_len) + return -E2BIG; + + if (!src || !src_len) { + memset(dst, 0, dst_len); + return -EINVAL; + } + + memcpy(dst, src, min(dst_len, src_len)); + + if (dst_len > src_len) { + memset(dst + src_len, '\0', dst_len - src_len); + return src_len; + } + + dst[dst_len - 1] = '\0'; + + return -E2BIG; +} + +BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, + char *, buf, size_t, buf_len) +{ + return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { + .func = bpf_sysctl_get_current_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; + case BPF_FUNC_sysctl_get_current_value: + return &bpf_sysctl_get_current_value_proto; default: return cgroup_base_func_proto(func_id, prog); } From 4649001f8a4491c022f5b0e9fcdf89c2c401053a Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 7 Mar 2019 18:38:43 -0800 Subject: [PATCH 1097/1640] BACKPORT: bpf: Introduce bpf_sysctl_{get,set}_new_value helpers Add helpers to work with new value being written to sysctl by user space. bpf_sysctl_get_new_value() copies value being written to sysctl into provided buffer. bpf_sysctl_set_new_value() overrides new value being written by user space with a one from provided buffer. Buffer should contain string representation of the value, similar to what can be seen in /proc/sys/. Both helpers can be used only on sysctl write. File position matters and can be managed by an interface that will be introduced separately. E.g. if user space calls sys_write to a file in /proc/sys/ at file position = X, where X > 0, then the value set by bpf_sysctl_set_new_value() will be written starting from X. If program wants to override whole value with specified buffer, file position has to be set to zero. Documentation for the new helpers is provided in bpf.h UAPI. 
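For illustration only (not part of the patch): a sketch that pins a sysctl to a fixed value on every write. Which sysctl the program is attached for and the chosen replacement value are assumptions of this sketch.

	SEC("cgroup/sysctl")
	int sysctl_clamp(struct bpf_sysctl *ctx)
	{
		char new[32] = {};
		char fixed[] = "1\n";
		int ret;

		if (!ctx->write)
			return 1;		/* reads pass through */

		/* Peek at what user space is about to write. */
		ret = bpf_sysctl_get_new_value(ctx, new, sizeof(new));
		if (ret < 0 && ret != -E2BIG)
			return 0;		/* reject malformed writes */

		/* Override it. Note the override lands at the caller's file
		 * position; forcing a whole-value override needs a file
		 * position of zero (made writable by the next patch).
		 */
		if (bpf_sysctl_set_new_value(ctx, fixed, sizeof(fixed) - 1))
			return 0;
		return 1;
	}
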
Change-Id: Ibd8931b2ebb9b88563424141a66269ddb0d193b4 Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- fs/proc/proc_sysctl.c | 22 ++++++++--- include/linux/bpf-cgroup.h | 8 ++-- include/linux/filter.h | 3 ++ include/uapi/linux/bpf.h | 34 ++++++++++++++++ kernel/bpf/cgroup.c | 81 +++++++++++++++++++++++++++++++++++++- 5 files changed, 139 insertions(+), 9 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b74d74d9b275..fba36a0118e5 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -576,8 +576,8 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; + void *new_buf = NULL; ssize_t error; - size_t res; if (IS_ERR(head)) return PTR_ERR(head); @@ -595,15 +595,27 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; - error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write); + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count, + &new_buf); if (error) goto out; /* careful: calling conventions are nasty here */ - res = count; - error = table->proc_handler(table, write, buf, &res, ppos); + if (new_buf) { + mm_segment_t old_fs; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + error = table->proc_handler(table, write, (void __user *)new_buf, + &count, ppos); + set_fs(old_fs); + kfree(new_buf); + } else { + error = table->proc_handler(table, write, buf, &count, ppos); + } + if (!error) - error = res; + error = count; out: sysctl_head_finish(head); diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 17340d242af7..0522cc46af0e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -113,7 +113,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - enum bpf_attach_type type); + void __user *buf, size_t *pcount, + void **new_buf, enum bpf_attach_type type); static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) @@ -267,11 +268,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) \ +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, nbuf) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ + buf, count, nbuf, \ BPF_CGROUP_SYSCTL); \ __ret; \ }) @@ -346,7 +348,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,nbuf) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/filter.h b/include/linux/filter.h index 26ac5095a07e..8bddb00592ac 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1266,6 +1266,9 @@ struct bpf_sysctl_kern { struct ctl_table *table; void *cur_val; size_t cur_len; + void *new_val; + size_t new_len; + int new_updated; int write; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d2e73573d371..8ae40b3916f2 100644 --- 
a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2419,6 +2419,40 @@ union bpf_attr { * * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. + * + * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into + * provided by program buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if sysctl is being read. + * + * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * Description + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. + * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value file position should be set to zero. + * Return + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c6b2cf29a54b..ba4e21986760 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -778,6 +778,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) + * @buf: pointer to buffer passed by user space + * @pcount: value-result argument: value is size of buffer pointed to by @buf, + * result is size of @new_buf if program set new value, initial value + * otherwise + * @new_buf: pointer to pointer to new buffer that will be allocated if program + * overrides new value provided by user space on sysctl write + * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -788,7 +795,8 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - enum bpf_attach_type type) + void __user *buf, size_t *pcount, + void **new_buf, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { .head = head, @@ -796,6 +804,9 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .write = write, .cur_val = NULL, .cur_len = PAGE_SIZE, + .new_val = NULL, + .new_len = 0, + .new_updated = 0, }; struct cgroup *cgrp; int ret; @@ -818,6 +829,18 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, ctx.cur_len = 0; } + if (write && buf && *pcount) { + /* BPF program should be able to override new value with a + * buffer bigger than provided by user. + */ + ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); + ctx.new_len = min(PAGE_SIZE, *pcount); + if (!ctx.new_val || + copy_from_user(ctx.new_val, buf, ctx.new_len)) + /* Let BPF program decide how to proceed. 
*/ + ctx.new_len = 0; + } + rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); @@ -825,6 +848,13 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); + if (ret == 1 && ctx.new_updated) { + *new_buf = ctx.new_val; + *pcount = ctx.new_len; + } else { + kfree(ctx.new_val); + } + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); @@ -932,6 +962,51 @@ static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { .arg3_type = ARG_CONST_SIZE, }; +BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len) +{ + if (!ctx->write) { + if (buf && buf_len) + memset(buf, '\0', buf_len); + return -EINVAL; + } + return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { + .func = bpf_sysctl_get_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, + const char *, buf, size_t, buf_len) +{ + if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) + return -EINVAL; + + if (buf_len > PAGE_SIZE - 1) + return -E2BIG; + + memcpy(ctx->new_val, buf, buf_len); + ctx->new_len = buf_len; + ctx->new_updated = 1; + + return 0; +} + +static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { + .func = bpf_sysctl_set_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -940,6 +1015,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: return &bpf_sysctl_get_current_value_proto; + case BPF_FUNC_sysctl_get_new_value: + return &bpf_sysctl_get_new_value_proto; + case BPF_FUNC_sysctl_set_new_value: + return &bpf_sysctl_set_new_value_proto; default: return cgroup_base_func_proto(func_id, prog); } From eefaf9fd44a0b94212b1af90ad928a367b8791b3 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 7 Mar 2019 18:50:52 -0800 Subject: [PATCH 1098/1640] UPSTREAM: bpf: Add file_pos field to bpf_sysctl ctx Add file_pos field to bpf_sysctl context to read and write sysctl file position at which sysctl is being accessed (read or written). The field can be used to e.g. override whole sysctl value on write to sysctl even when sys_write is called by user space with file_pos > 0. Or BPF program may reject such accesses. 
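For illustration only (not part of the patch), both policies in one sketch:

	SEC("cgroup/sysctl")
	int sysctl_whole_writes(struct bpf_sysctl *ctx)
	{
		if (!ctx->write)
			return 1;

		/* Policy A: reject partial writes outright ... */
		if (ctx->file_pos != 0)
			return 0;

		/* ... or policy B: force the write to start at offset zero so
		 * a later bpf_sysctl_set_new_value() replaces the whole value:
		 *
		 *	ctx->file_pos = 0;
		 */
		return 1;
	}
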
Change-Id: Ia2674d14b24b99c5081522f8b3025c52f6228bfa Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- fs/proc/proc_sysctl.c | 2 +- include/linux/bpf-cgroup.h | 9 ++++--- include/linux/filter.h | 3 +++ include/uapi/linux/bpf.h | 3 +++ kernel/bpf/cgroup.c | 54 +++++++++++++++++++++++++++++++++++--- 5 files changed, 63 insertions(+), 8 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fba36a0118e5..481d4a91cd69 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -596,7 +596,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, goto out; error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count, - &new_buf); + ppos, &new_buf); if (error) goto out; diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 0522cc46af0e..665893e0f07b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -114,7 +114,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, void __user *buf, size_t *pcount, - void **new_buf, enum bpf_attach_type type); + loff_t *ppos, void **new_buf, + enum bpf_attach_type type); static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) @@ -268,12 +269,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, nbuf) \ +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos, nbuf) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ - buf, count, nbuf, \ + buf, count, pos, nbuf, \ BPF_CGROUP_SYSCTL); \ __ret; \ }) @@ -348,7 +349,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,nbuf) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/filter.h b/include/linux/filter.h index 8bddb00592ac..52221169b1a0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1270,6 +1270,9 @@ struct bpf_sysctl_kern { size_t new_len; int new_updated; int write; + loff_t *ppos; + /* Temporary "register" for indirect stores to ppos. */ + u64 tmp_reg; }; #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8ae40b3916f2..c73b57a3d4ec 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3276,6 +3276,9 @@ struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. */ + __u32 file_pos; /* Sysctl file position to read from, write to. + * Allows 1,2,4-byte read an 4-byte write. 
+ */ }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ba4e21986760..b2adf22139b3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -782,6 +782,9 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise + * @ppos: value-result argument: value is position at which read from or write + * to sysctl is happening, result is new position if program overrode it, + * initial value otherwise * @new_buf: pointer to pointer to new buffer that will be allocated if program * overrides new value provided by user space on sysctl write * NOTE: it's caller responsibility to free *new_buf if it was set @@ -796,12 +799,14 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, void __user *buf, size_t *pcount, - void **new_buf, enum bpf_attach_type type) + loff_t *ppos, void **new_buf, + enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { .head = head, .table = table, .write = write, + .ppos = ppos, .cur_val = NULL, .cur_len = PAGE_SIZE, .new_val = NULL, @@ -1030,14 +1035,22 @@ static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, { const int size_default = sizeof(__u32); - if (off < 0 || off + size > sizeof(struct bpf_sysctl) || - off % size || type != BPF_READ) + if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) return false; switch (off) { case offsetof(struct bpf_sysctl, write): + if (type != BPF_READ) + return false; bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case offsetof(struct bpf_sysctl, file_pos): + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } else { + return size == size_default; + } default: return false; } @@ -1059,6 +1072,41 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, write), target_size)); break; + case offsetof(struct bpf_sysctl, file_pos): + /* ppos is a pointer so it should be accessed via indirect + * loads and stores. Also for stores additional temporary + * register is used since neither src_reg nor dst_reg can be + * overridden. 
+ */ + if (type == BPF_WRITE) { + int treg = BPF_REG_9; + + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + *insn++ = BPF_STX_MEM( + BPF_DW, si->dst_reg, treg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_STX_MEM( + BPF_SIZEOF(u32), treg, si->src_reg, 0); + *insn++ = BPF_LDX_MEM( + BPF_DW, treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + } else { + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); + } + *target_size = sizeof(u32); + break; } return insn - insn_buf; From 60f423699998cb39a216042117d113cf30b4e8d8 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 18 Mar 2019 16:57:10 -0700 Subject: [PATCH 1099/1640] UPSTREAM: bpf: Introduce ARG_PTR_TO_{INT,LONG} arg types Currently the way to pass result from BPF helper to BPF program is to provide memory area defined by pointer and size: func(void *, size_t). It works great for generic use-case, but for simple types, such as int, it's overkill and consumes two arguments when it could use just one. Introduce new argument types ARG_PTR_TO_INT and ARG_PTR_TO_LONG to be able to pass result from helper to program via pointer to int and long correspondingly: func(int *) or func(long *). New argument types are similar to ARG_PTR_TO_MEM with the following differences: * they don't require corresponding ARG_CONST_SIZE argument, predefined access sizes are used instead (32bit for int, 64bit for long); * it's possible to use more than one such an argument in a helper; * provided pointers have to be aligned. It's easy to introduce similar ARG_PTR_TO_CHAR and ARG_PTR_TO_SHORT argument types. It's not done due to lack of use-case though. 
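For reference, a proto using the new types needs no companion size argument. The sketch below declares a hypothetical helper (the real users, bpf_strtol()/bpf_strtoul() with .arg4_type = ARG_PTR_TO_LONG, follow later in this series):

	/* Hypothetical helper: returns a value through an 8-byte stack
	 * pointer. The verifier now checks that R2 points to an aligned,
	 * writable 8-byte stack area.
	 */
	BPF_CALL_2(bpf_example_read_long, u64, key, long *, res)
	{
		*res = (long)key;	/* placeholder computation */
		return 0;
	}

	static const struct bpf_func_proto bpf_example_read_long_proto = {
		.func		= bpf_example_read_long,
		.gpl_only	= false,
		.ret_type	= RET_INTEGER,
		.arg1_type	= ARG_ANYTHING,
		.arg2_type	= ARG_PTR_TO_LONG,
	};
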
Change-Id: I3b1a89f1be6ad4701810058ae3de9f81f9ee31cd Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/verifier.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 01d300f9ac5e..cd24d0fb1896 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -207,6 +207,8 @@ enum bpf_arg_type { ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ + ARG_PTR_TO_INT, /* pointer to int */ + ARG_PTR_TO_LONG, /* pointer to long */ }; /* type of values returned from helper functions */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 49533749f375..ca6a9c28441b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2479,6 +2479,22 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_int_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_INT || + type == ARG_PTR_TO_LONG; +} + +static int int_ptr_type_to_size(enum bpf_arg_type type) +{ + if (type == ARG_PTR_TO_INT) + return sizeof(u32); + else if (type == ARG_PTR_TO_LONG) + return sizeof(u64); + + return -EINVAL; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -2571,6 +2587,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_int_ptr(arg_type)) { + expected_type = PTR_TO_STACK; + if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) + goto err_type; } else { verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -2651,6 +2673,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno - 1, reg->umax_value, zero_size_allowed, meta); + } else if (arg_type_is_int_ptr(arg_type)) { + int size = int_ptr_type_to_size(arg_type); + + err = check_helper_mem_access(env, regno, size, false, meta); + if (err) + return err; + err = check_ptr_alignment(env, reg, 0, size, true); } return err; From 42221a2f30bd62c9a0e3486507c3a90f02e3fa87 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 18 Mar 2019 17:55:26 -0700 Subject: [PATCH 1100/1640] BACKPORT: bpf: Introduce bpf_strtol and bpf_strtoul helpers Add bpf_strtol and bpf_strtoul to convert a string to long and unsigned long correspondingly. It's similar to user space strtol(3) and strtoul(3) with a few changes to the API: * instead of NUL-terminated C string the helpers expect buffer and buffer length; * resulting long or unsigned long is returned in a separate result-argument; * return value is used to indicate success or failure, on success number of consumed bytes is returned that can be used to identify position to read next if the buffer is expected to contain multiple integers; * instead of *base* argument, *flags* is used that provides base in 5 LSB, other bits are reserved for future use; * number of supported bases is limited. Documentation for the new helpers is provided in bpf.h UAPI. The helpers are made available to BPF_PROG_TYPE_CGROUP_SYSCTL programs to be able to convert string input to e.g. "ulongvec" output. E.g. "net/ipv4/tcp_mem" consists of three ulong integers. They can be parsed by calling to bpf_strtoul three times. 
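As a sketch of that tcp_mem use-case from the program side (illustrative only; verifier bounds handling and error reporting are simplified):

	/* Parse three unsigned longs, e.g. a tcp_mem-style "min pressure max"
	 * string obtained via bpf_sysctl_get_new_value().
	 */
	static __always_inline int parse_tcp_mem(const char *buf,
						 unsigned long buf_len,
						 unsigned long vals[3])
	{
		unsigned long off = 0;
		int i, ret;

	#pragma unroll
		for (i = 0; i < 3; i++) {
			/* flags = 0: auto-detect base, like strtoul(3) */
			ret = bpf_strtoul(buf + off, buf_len - off, 0, &vals[i]);
			if (ret <= 0)
				return -1;
			off += ret;	/* bytes consumed, incl. leading spaces */
		}
		return 0;
	}
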
Implementation notes: Implementation includes "../../lib/kstrtox.h" to reuse integer parsing functions. It's done exactly same way as fs/proc/base.c already does. Unfortunately existing kstrtoX function can't be used directly since they fail if any invalid character is present right after integer in the string. Existing simple_strtoX functions can't be used either since they're obsolete and don't handle overflow properly. Change-Id: I852fe274aaaa43265cb94fe230099a8171b01c40 Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 + include/uapi/linux/bpf.h | 47 ++++++++++++++ kernel/bpf/cgroup.c | 4 ++ kernel/bpf/helpers.c | 131 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 184 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd24d0fb1896..50816a195b42 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -986,6 +986,8 @@ extern const struct bpf_func_proto bpf_sk_redirect_map_proto; extern const struct bpf_func_proto bpf_spin_lock_proto; extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; +extern const struct bpf_func_proto bpf_strtol_proto; +extern const struct bpf_func_proto bpf_strtoul_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c73b57a3d4ec..7331d527fbbc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2453,6 +2453,53 @@ union bpf_attr { * **-E2BIG** if the *buf_len* is too big. * * **-EINVAL** if sysctl is being read. + * + * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by isspace(3)) followed by a single optional '-' + * sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space strtol(3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than buf_len. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by isspace(3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space strtoul(3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than buf_len. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b2adf22139b3..789d4ab2336e 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1016,6 +1016,10 @@ static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { + case BPF_FUNC_strtol: + return &bpf_strtol_proto; + case BPF_FUNC_strtoul: + return &bpf_strtoul_proto; case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 679c02914590..8bf07a9596b3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -18,6 +18,9 @@ #include #include #include +#include + +#include "../../lib/kstrtox.h" /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -375,4 +378,132 @@ const struct bpf_func_proto bpf_get_local_storage_proto = { .arg2_type = ARG_ANYTHING, }; #endif + +#define BPF_STRTOX_BASE_MASK 0x1F + +static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, + unsigned long long *res, bool *is_negative) +{ + unsigned int base = flags & BPF_STRTOX_BASE_MASK; + const char *cur_buf = buf; + size_t cur_len = buf_len; + unsigned int consumed; + size_t val_len; + char str[64]; + + if (!buf || !buf_len || !res || !is_negative) + return -EINVAL; + + if (base != 0 && base != 8 && base != 10 && base != 16) + return -EINVAL; + + if (flags & ~BPF_STRTOX_BASE_MASK) + return -EINVAL; + + while (cur_buf < buf + buf_len && isspace(*cur_buf)) + ++cur_buf; + + *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); + if (*is_negative) + ++cur_buf; + + consumed = cur_buf - buf; + cur_len -= consumed; + if (!cur_len) + return -EINVAL; + + cur_len = min(cur_len, sizeof(str) - 1); + memcpy(str, cur_buf, cur_len); + str[cur_len] = '\0'; + cur_buf = str; + + cur_buf = _parse_integer_fixup_radix(cur_buf, &base); + val_len = _parse_integer(cur_buf, base, res); + + if (val_len & KSTRTOX_OVERFLOW) + return -ERANGE; + + if (val_len == 0) + return -EINVAL; + + cur_buf += val_len; + consumed += cur_buf - str; + + return consumed; +} + +static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, + long long *res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) { + if ((long long)-_res > 0) + return -ERANGE; + *res = -_res; + } else { + if ((long long)_res < 0) + return -ERANGE; + *res = _res; + } + return err; +} + +BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, + long *, res) +{ + long long _res; + int err; + + err = __bpf_strtoll(buf, buf_len, flags, &_res); + if (err < 0) + return err; + if (_res != (long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtol_proto = { + .func = bpf_strtol, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, + unsigned long *, res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) + return -EINVAL; + if (_res != (unsigned long)_res) + return -ERANGE; + *res = 
_res; + return err; +} + +const struct bpf_func_proto bpf_strtoul_proto = { + .func = bpf_strtoul, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; #endif From 5ffdfa87d575124e7e73b5f1cb0d3b45d05b9bf8 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 12 Apr 2019 16:01:01 -0700 Subject: [PATCH 1101/1640] UPSTREAM: bpf: Fix distinct pointer types warning for ARCH=i386 Fix a new warning reported by kbuild for make ARCH=i386: In file included from kernel/bpf/cgroup.c:11:0: kernel/bpf/cgroup.c: In function '__cgroup_bpf_run_filter_sysctl': include/linux/kernel.h:827:29: warning: comparison of distinct pointer types lacks a cast (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) ^ include/linux/kernel.h:841:4: note: in expansion of macro '__typecheck' (__typecheck(x, y) && __no_side_effects(x, y)) ^~~~~~~~~~~ include/linux/kernel.h:851:24: note: in expansion of macro '__safe_cmp' __builtin_choose_expr(__safe_cmp(x, y), \ ^~~~~~~~~~ include/linux/kernel.h:860:19: note: in expansion of macro '__careful_cmp' #define min(x, y) __careful_cmp(x, y, <) ^~~~~~~~~~~~~ >> kernel/bpf/cgroup.c:837:17: note: in expansion of macro 'min' ctx.new_len = min(PAGE_SIZE, *pcount); ^~~ Fixes: 4e63acdff864 ("bpf: Introduce bpf_sysctl_{get,set}_new_value helpers") Change-Id: I1a1391f03baf081a108bb0fde4690bd7dd445856 Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 789d4ab2336e..e58a6c247f56 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -839,7 +839,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, * buffer bigger than provided by user. */ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); - ctx.new_len = min(PAGE_SIZE, *pcount); + ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); if (!ctx.new_val || copy_from_user(ctx.new_val, buf, ctx.new_len)) /* Let BPF program decide how to proceed. */ From 98f084de535c2039c358878b80bfe9928111ec90 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:34 +0100 Subject: [PATCH 1102/1640] UPSTREAM: bpf: refactor propagate_liveness to eliminate duplicated for loop Propagation for register and stack slot are finished in separate for loop, while they are perfect to be put into a single loop. This could also let them share some common variables in later patches. Change-Id: I4a9488b9f313f46b6d84772d3d142fdfbc2f7ef2 Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ca6a9c28441b..e3b7dad3efd5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6462,10 +6462,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, return err; } } - } - /* ... and stack slots */ - for (frame = 0; frame <= vstate->curframe; frame++) { + /* Propagate stack slots. 
*/ state = vstate->frame[frame]; parent = vparent->frame[frame]; for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && From 82d254300d8842080248da3c545a04b46864e255 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:35 +0100 Subject: [PATCH 1103/1640] UPSTREAM: bpf: refactor propagate_liveness to eliminate code redundance Access to reg states were not factored out, the consequence is long code for dereferencing them which made the indentation not good for reading. This patch factor out these code so the core code in the loop could be easier to follow. Reviewed-by: Jakub Kicinski Change-Id: I5560821475352b73362e073ac5ac3cd403528571 Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e3b7dad3efd5..d4349b513d26 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6440,8 +6440,9 @@ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, struct bpf_verifier_state *vparent) { - int i, frame, err = 0; + struct bpf_reg_state *state_reg, *parent_reg; struct bpf_func_state *state, *parent; + int i, frame, err = 0; if (vparent->curframe != vstate->curframe) { WARN(1, "propagate_live: parent frame %d current frame %d\n", @@ -6451,28 +6452,33 @@ static int propagate_liveness(struct bpf_verifier_env *env, /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); for (frame = 0; frame <= vstate->curframe; frame++) { + parent = vparent->frame[frame]; + state = vstate->frame[frame]; + parent_reg = parent->regs; + state_reg = state->regs; /* We don't need to worry about FP liveness, it's read-only */ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ) + if (parent_reg[i].live & REG_LIVE_READ) continue; - if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, &vstate->frame[frame]->regs[i], - &vparent->frame[frame]->regs[i]); - if (err) - return err; - } + if (!(state_reg[i].live & REG_LIVE_READ)) + continue; + err = mark_reg_read(env, &state_reg[i], &parent_reg[i]); + if (err) + return err; } /* Propagate stack slots. */ - state = vstate->frame[frame]; - parent = vparent->frame[frame]; for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) + parent_reg = &parent->stack[i].spilled_ptr; + state_reg = &state->stack[i].spilled_ptr; + if (parent_reg->live & REG_LIVE_READ) continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) - mark_reg_read(env, &state->stack[i].spilled_ptr, - &parent->stack[i].spilled_ptr); + if (!(state_reg->live & REG_LIVE_READ)) + continue; + err = mark_reg_read(env, state_reg, parent_reg); + if (err) + return err; } } return err; From 52ab911c8b9dfb7bb5c5deb755fd91509490e1a6 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:36 +0100 Subject: [PATCH 1104/1640] UPSTREAM: bpf: factor out reg and stack slot propagation into "propagate_liveness_reg" After code refactor in previous patches, the propagation logic inside the for loop in "propagate_liveness" becomes clear that they are good enough to be factored out into a common function "propagate_liveness_reg". 
Reviewed-by: Jakub Kicinski Change-Id: I5d23d0c20b99b09d1c61849f12c3d6b5023adbb7 Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4349b513d26..e63081d5a8fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6429,6 +6429,22 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } +static int propagate_liveness_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + struct bpf_reg_state *parent_reg) +{ + int err; + + if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) + return 0; + + err = mark_reg_read(env, reg, parent_reg); + if (err) + return err; + + return 0; +} + /* A write screens off any subsequent reads; but write marks come from the * straight-line code between a state and its parent. When we arrive at an * equivalent state (jump target or such) we didn't arrive by the straight-line @@ -6458,11 +6474,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, state_reg = state->regs; /* We don't need to worry about FP liveness, it's read-only */ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - if (parent_reg[i].live & REG_LIVE_READ) - continue; - if (!(state_reg[i].live & REG_LIVE_READ)) - continue; - err = mark_reg_read(env, &state_reg[i], &parent_reg[i]); + err = propagate_liveness_reg(env, &state_reg[i], + &parent_reg[i]); if (err) return err; } @@ -6472,11 +6485,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, i < parent->allocated_stack / BPF_REG_SIZE; i++) { parent_reg = &parent->stack[i].spilled_ptr; state_reg = &state->stack[i].spilled_ptr; - if (parent_reg->live & REG_LIVE_READ) - continue; - if (!(state_reg->live & REG_LIVE_READ)) - continue; - err = mark_reg_read(env, state_reg, parent_reg); + err = propagate_liveness_reg(env, state_reg, + parent_reg); if (err) return err; } From 2c8d1d3519d6449caceec96a19fd8a2f6f2f6c80 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:37 +0100 Subject: [PATCH 1105/1640] UPSTREAM: bpf: refactor "check_reg_arg" to eliminate code redundancy There are a few "regs[regno]" here are there across "check_reg_arg", this patch factor it out into a simple "reg" pointer. The intention is to simplify code indentation and make the later patches in this set look cleaner. 
Reviewed-by: Jakub Kicinski Change-Id: I4a05e62555e06d40a25837ea8e261d17a5d09e95 Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e63081d5a8fd..7b2d9fb54b21 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1176,30 +1176,32 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg, *regs = state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); return -EINVAL; } + reg = ®s[regno]; if (t == SRC_OP) { /* check whether register used as source operand can be read */ - if (regs[regno].type == NOT_INIT) { + if (reg->type == NOT_INIT) { verbose(env, "R%d !read_ok\n", regno); return -EACCES; } /* We don't need to worry about FP liveness because it's read-only */ - if (regno != BPF_REG_FP) - return mark_reg_read(env, ®s[regno], - regs[regno].parent); + if (regno == BPF_REG_FP) + return 0; + + return mark_reg_read(env, reg, reg->parent); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } - regs[regno].live |= REG_LIVE_WRITTEN; + reg->live |= REG_LIVE_WRITTEN; if (t == DST_OP) mark_reg_unknown(env, regs, regno); } From cab00d97f67d5bb7cbc5053ed844d8731b6efb9f Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Sun, 14 Apr 2019 18:58:46 +0200 Subject: [PATCH 1106/1640] UPSTREAM: bpf: add map helper functions push, pop, peek in more BPF programs commit f1a2e44a3aec ("bpf: add queue and stack maps") introduced new BPF helper functions: - BPF_FUNC_map_push_elem - BPF_FUNC_map_pop_elem - BPF_FUNC_map_peek_elem but they were made available only for network BPF programs. This patch makes them available for tracepoint, cgroup and lirc programs. 
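For illustration (not part of the patch), a tracepoint program can now feed a queue map directly. The map definition style follows the selftests of this era; the tracepoint and sizes are arbitrary choices of this sketch:

	struct bpf_map_def SEC("maps") pid_queue = {
		.type		= BPF_MAP_TYPE_QUEUE,
		.key_size	= 0,		/* queue/stack maps take no key */
		.value_size	= sizeof(__u64),
		.max_entries	= 1024,
	};

	SEC("tracepoint/syscalls/sys_enter_execve")
	int trace_execve(void *ctx)
	{
		__u64 id = bpf_get_current_pid_tgid();

		/* Previously only network program types could do this. */
		bpf_map_push_elem(&pid_queue, &id, BPF_ANY);
		return 0;
	}
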
Change-Id: Id49274e5be3ab81f6eb60b6890834247d68487e6 Signed-off-by: Alban Crequy Cc: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- drivers/media/rc/bpf-lirc.c | 6 ++++++ kernel/bpf/cgroup.c | 6 ++++++ kernel/trace/bpf_trace.c | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index ca00c6cc110d..a3fc3812eaee 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -73,6 +73,12 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e58a6c247f56..fcde0f7b2585 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -713,6 +713,12 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 60e6e2a7ba24..e0f72f057346 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -576,6 +576,12 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_probe_read: return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: From e3953d3ef60f0e44c77454bda43ecb8b0b533a67 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Tue, 16 Apr 2019 18:13:01 +0900 Subject: [PATCH 1107/1640] UPSTREAM: bpf: use BPF_CAST_CALL for casting bpf call verifier.c uses BPF_CAST_CALL for casting bpf call except at one place in jit_subprogs(). Let's use the macro for consistency. 
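For reference, the macro being applied here (as defined in include/linux/filter.h):

	#define BPF_CAST_CALL(x)					\
			((u64 (*)(u64, u64, u64, u64, u64))(x))

i.e. exactly the open-coded five-u64 function-pointer cast that jit_subprogs() was spelling out by hand.
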
Change-Id: I94ba41658661adb7bbea64c5c6e19ffe9f0b95c6 Signed-off-by: Prashant Bhole Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b2d9fb54b21..a51c2d3b0601 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7859,9 +7859,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->src_reg != BPF_PSEUDO_CALL) continue; subprog = insn->off; - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) - func[subprog]->bpf_func - - __bpf_call_base; + insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - + __bpf_call_base; } /* we use the aux data to keep a list of the start addresses From 28ff139aa3738d324a36a4375f9aae315992ed03 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:32 +0200 Subject: [PATCH 1108/1640] UPSTREAM: bpf: cpumap use ptr_ring_consume_batched Move ptr_ring dequeue outside loop, that allocate SKBs and calls network stack, as these operations that can take some time. The ptr_ring is a communication channel between CPUs, where we want to reduce/limit any cacheline bouncing. Do a concentrated bulk dequeue via ptr_ring_consume_batched, to shorten the period and times the remote cacheline in ptr_ring is read Batch size 8 is both to (1) limit BH-disable period, and (2) consume one cacheline on 64-bit archs. After reducing the BH-disable section further then we can consider changing this, while still thinking about L1 cacheline size being active. Change-Id: I2cb8f3dca618462ed76278d63e82c52d8140a27c Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 61fbcae82f0a..2aea0f6ce70b 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -240,6 +240,8 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +#define CPUMAP_BATCH 8 + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -252,8 +254,9 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { - unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_frame *xdpf; + unsigned int drops = 0, sched = 0; + void *frames[CPUMAP_BATCH]; + int i, n; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -269,14 +272,16 @@ static int cpu_map_kthread_run(void *data) sched = cond_resched(); } - /* Process packets in rcpu->queue */ - local_bh_disable(); /* * The bpf_cpu_map_entry is single consumer, with this * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. 
*/ - while ((xdpf = __ptr_ring_consume(rcpu->queue))) { + n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + + local_bh_disable(); + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb; int ret; @@ -290,13 +295,9 @@ static int cpu_map_kthread_run(void *data) ret = netif_receive_skb_core(skb); if (ret == NET_RX_DROP) drops++; - - /* Limit BH-disable period */ - if (++processed == 8) - break; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); local_bh_enable(); /* resched point, may call do_softirq() */ } From d3afe4d86fe157f3c814a86ca2ac35016f62eaf2 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:37 +0200 Subject: [PATCH 1109/1640] UPSTREAM: net: core: introduce build_skb_around The function build_skb() also have the responsibility to allocate and clear the SKB structure. Introduce a new function build_skb_around(), that moves the responsibility of allocation and clearing to the caller. This allows caller to use kmem_cache (slab/slub) bulk allocation API. Next patch use this function combined with kmem_cache_alloc_bulk. Change-Id: I8f0ed067a322551d2877bb42ab1971a49634adcf Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Acked-by: Eric Dumazet Signed-off-by: Alexei Starovoitov --- include/linux/skbuff.h | 10 ++++++ net/core/skbuff.c | 71 +++++++++++++++++++++++++++++++----------- 2 files changed, 62 insertions(+), 19 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 005f1b732504..745cb9ce08cf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -989,6 +989,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); +struct sk_buff *build_skb_around(struct sk_buff *skb, + void *data, unsigned int frag_size); + +/** + * alloc_skb - allocate a network buffer + * @size: size to allocate + * @priority: allocation mask + * + * This function is a convenient wrapper around __alloc_skb(). + */ static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4baf28959bd1..4dff86beb55d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -256,6 +256,33 @@ nodata: } EXPORT_SYMBOL(__alloc_skb); +/* Caller must provide SKB that is memset cleared */ +static struct sk_buff *__build_skb_around(struct sk_buff *skb, + void *data, unsigned int frag_size) +{ + struct skb_shared_info *shinfo; + unsigned int size = frag_size ? 
: ksize(data); + + size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + /* Assumes caller memset cleared SKB */ + skb->truesize = SKB_TRUESIZE(size); + refcount_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; + + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + + return skb; +} + /** * __build_skb - build a network buffer * @data: data buffer provided by caller @@ -277,32 +304,15 @@ EXPORT_SYMBOL(__alloc_skb); */ struct sk_buff *__build_skb(void *data, unsigned int frag_size) { - struct skb_shared_info *shinfo; struct sk_buff *skb; - unsigned int size = frag_size ? : ksize(data); skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); - if (!skb) + if (unlikely(!skb)) return NULL; - size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->truesize = SKB_TRUESIZE(size); - refcount_set(&skb->users, 1); - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->mac_header = (typeof(skb->mac_header))~0U; - skb->transport_header = (typeof(skb->transport_header))~0U; - /* make sure we initialize shinfo sequentially */ - shinfo = skb_shinfo(skb); - memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); - atomic_set(&shinfo->dataref, 1); - - return skb; + return __build_skb_around(skb, data, frag_size); } /* build_skb() is wrapper over __build_skb(), that specifically @@ -323,6 +333,29 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(build_skb); +/** + * build_skb_around - build a network buffer around provided skb + * @skb: sk_buff provide by caller, must be memset cleared + * @data: data buffer provided by caller + * @frag_size: size of data, or 0 if head was kmalloced + */ +struct sk_buff *build_skb_around(struct sk_buff *skb, + void *data, unsigned int frag_size) +{ + if (unlikely(!skb)) + return NULL; + + skb = __build_skb_around(skb, data, frag_size); + + if (skb && frag_size) { + skb->head_frag = 1; + if (page_is_pfmemalloc(virt_to_head_page(data))) + skb->pfmemalloc = 1; + } + return skb; +} +EXPORT_SYMBOL(build_skb_around); + #define NAPI_SKB_CACHE_SIZE 64 struct napi_alloc_cache { From 5c1a2efb7c08eda6f0050762c2bc7ca1b1ec58df Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:43 +0200 Subject: [PATCH 1110/1640] UPSTREAM: bpf: cpumap do bulk allocation of SKBs As cpumap now batch consume xdp_frame's from the ptr_ring, it knows how many SKBs it need to allocate. Thus, lets bulk allocate these SKBs via kmem_cache_alloc_bulk() API, and use the previously introduced function build_skb_around(). Notice that the flag __GFP_ZERO asks the slab/slub allocator to clear the memory for us. This does clear a larger area than needed, but my micro benchmarks on Intel CPUs show that this is slightly faster due to being a cacheline aligned area is cleared for the SKBs. (For SLUB allocator, there is a future optimization potential, because SKBs will with high probability originate from same page. If we can find/identify continuous memory areas then the Intel CPU memset rep stos will have a real performance gain.) 
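A note on the API used below: kmem_cache_alloc_bulk() is effectively all-or-nothing; if it cannot allocate all n objects it frees any partial allocation itself and returns 0, which is why a single m == 0 check suffices. A sketch of the shape (names as in the patch; n is the batch count):

	void *skbs[CPUMAP_BATCH];
	gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;	/* slab clears the SKBs */
	int m;

	m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs);
	if (unlikely(m == 0)) {
		/* all-failed: fall back, e.g. return every xdp_frame */
	}
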
Change-Id: I5efed7a1ddf7904230df3ba291bd4feb10d46f84 Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 2aea0f6ce70b..0407ac179f72 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -160,12 +160,12 @@ static void cpu_map_kthread_stop(struct work_struct *work) } static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_frame *xdpf) + struct xdp_frame *xdpf, + struct sk_buff *skb) { unsigned int hard_start_headroom; unsigned int frame_size; void *pkt_data_start; - struct sk_buff *skb; /* Part of headroom was reserved to xdpf */ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; @@ -191,8 +191,8 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); pkt_data_start = xdpf->data - hard_start_headroom; - skb = build_skb(pkt_data_start, frame_size); - if (!skb) + skb = build_skb_around(skb, pkt_data_start, frame_size); + if (unlikely(!skb)) return NULL; skb_reserve(skb, hard_start_headroom); @@ -256,7 +256,9 @@ static int cpu_map_kthread_run(void *data) while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; - int i, n; + void *skbs[CPUMAP_BATCH]; + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; + int i, n, m; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -278,14 +280,20 @@ static int cpu_map_kthread_run(void *data) * consume side valid as no-resize allowed of queue. */ n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < n; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops = n; + } local_bh_disable(); for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; - struct sk_buff *skb; + struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(rcpu, xdpf); + skb = cpu_map_build_skb(rcpu, xdpf, skb); if (!skb) { xdp_return_frame(xdpf); continue; From 66e1469d96bd684d857777eef63869ef53b80a28 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:48 +0200 Subject: [PATCH 1111/1640] UPSTREAM: bpf: cpumap memory prefetchw optimizations for struct page A lot of the performance gain comes from this patch. While analysing performance overhead it was found that the largest CPU stalls were caused when touching the struct page area. It is first read with a READ_ONCE from build_skb_around via page_is_pfmemalloc(), and when freed written by page_frag_free() call. Measurements show that the prefetchw (W) variant operation is needed to achieve the performance gain. We believe this optimization it two fold, first the W-variant saves one step in the cache-coherency protocol, and second it helps us to avoid the non-temporal prefetch HW optimizations and bring this into all cache-levels. It might be worth investigating if prefetch into L2 will have the same benefit. 
Change-Id: I194bf256f3577392f59f2dda4d3fa26c80a612f3 Signed-off-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 0407ac179f72..62f2c295034d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -280,6 +280,18 @@ static int cpu_map_kthread_run(void *data) * consume side valid as no-resize allowed of queue. */ n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + + for (i = 0; i < n; i++) { + void *f = frames[i]; + struct page *page = virt_to_page(f); + + /* Bring struct page memory area to curr CPU. Read by + * build_skb_around via page_is_pfmemalloc(), and when + * freed written by page_frag_free call. + */ + prefetchw(page); + } + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); if (unlikely(m == 0)) { for (i = 0; i < n; i++) From 503239cd1830e79c65775a461c364374a7aa9d97 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 19 Apr 2019 07:44:54 -0700 Subject: [PATCH 1112/1640] UPSTREAM: bpf: remove global variables Move three global variables protected by bpf_verifier_lock into 'struct bpf_verifier_env' to allow parallel verification. Change-Id: I281b67b100214063830e17f30ca116f6d6c8ed62 Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 5 +++++ kernel/bpf/verifier.c | 25 +++++++++++++------------ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 54210d179948..1bedaea66284 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -305,6 +305,11 @@ struct bpf_verifier_env { struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; struct bpf_id_pair idmap_scratch[BPF_ID_MAP_SIZE]; + struct { + int *insn_state; + int *insn_stack; + int cur_stack; + } cfg; u32 subprog_cnt; /* number of instructions analyzed by the verifier */ u32 insn_processed; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a51c2d3b0601..88619066a354 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5592,10 +5592,6 @@ enum { #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) -static int *insn_stack; /* stack of insns to process */ -static int cur_stack; /* current stack index */ -static int *insn_state; - /* t, w, e - match pseudo-code above: * t - index of current instruction * w - next instruction @@ -5603,6 +5599,9 @@ static int *insn_state; */ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return 0; @@ -5623,9 +5622,9 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) /* tree-edge */ insn_state[t] = DISCOVERED | e; insn_state[w] = DISCOVERED; - if (cur_stack >= env->prog->len) + if (env->cfg.cur_stack >= env->prog->len) return -E2BIG; - insn_stack[cur_stack++] = w; + insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { verbose_linfo(env, t, "%d: ", t); @@ -5649,14 +5648,15 @@ static int check_cfg(struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; + int *insn_stack, *insn_state; int ret = 0; int i, t; - insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + 
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; - insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_stack) { kvfree(insn_state); return -ENOMEM; @@ -5664,12 +5664,12 @@ static int check_cfg(struct bpf_verifier_env *env) insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ - cur_stack = 1; + env->cfg.cur_stack = 1; peek_stack: - if (cur_stack == 0) + if (env->cfg.cur_stack == 0) goto check_state; - t = insn_stack[cur_stack - 1]; + t = insn_stack[env->cfg.cur_stack - 1]; if (BPF_CLASS(insns[t].code) == BPF_JMP || BPF_CLASS(insns[t].code) == BPF_JMP32) { @@ -5738,7 +5738,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; - if (cur_stack-- <= 0) { + if (env->cfg.cur_stack-- <= 0) { verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; @@ -5758,6 +5758,7 @@ check_state: err_free: kvfree(insn_state); kvfree(insn_stack); + env->cfg.insn_state = env->cfg.insn_stack = NULL; return ret; } From f6ad6f7e7804b8ad12256e38a20f68987f982990 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 19 Apr 2019 07:44:55 -0700 Subject: [PATCH 1113/1640] UPSTREAM: bpf: drop bpf_verifier_lock Drop bpf_verifier_lock for root to avoid being DoS-ed by unprivileged. The BPF verifier is now fully parallel. All unpriv users are still serialized by bpf_verifier_lock to avoid exhausting kernel memory by running N parallel verifications. Change-Id: I3debaa6e0d9dd338cdd1ce17f28cf30e0d4fb0c1 Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 88619066a354..e788579dfdf1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8346,9 +8346,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; + is_priv = capable(CAP_SYS_ADMIN); /* grab the mutex to protect few globals used by verifier */ - mutex_lock(&bpf_verifier_lock); + if (!is_priv) + mutex_lock(&bpf_verifier_lock); if (attr->log_level || attr->log_buf || attr->log_size) { /* user requested verbose verifier output @@ -8372,7 +8374,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - is_priv = capable(CAP_SYS_ADMIN); env->allow_ptr_leaks = is_priv; ret = replace_map_fd_with_map_ptr(env); @@ -8485,7 +8486,8 @@ err_release_maps: release_maps(env); *prog = env->prog; err_unlock: - mutex_unlock(&bpf_verifier_lock); + if (!is_priv) + mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: kvfree(env); From a139ced4bf56bf6c018222e145c5c62db92b321d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 18 Sep 2018 16:20:18 -0400 Subject: [PATCH 1114/1640] UPSTREAM: flow_dissector: fix build failure without CONFIG_NET If boolean CONFIG_BPF_SYSCALL is enabled, kernel/bpf/syscall.c will call flow_dissector functions from net/core/flow_dissector.c. 
This causes the following build failure if CONFIG_NET is disabled:

kernel/bpf/syscall.o: In function `__x64_sys_bpf':
syscall.c:(.text+0x3278): undefined reference to `skb_flow_dissector_bpf_prog_attach'
syscall.c:(.text+0x3310): undefined reference to `skb_flow_dissector_bpf_prog_detach'
kernel/bpf/syscall.o:(.rodata+0x3f0): undefined reference to `flow_dissector_prog_ops'
kernel/bpf/verifier.o:(.rodata+0x250): undefined reference to `flow_dissector_verifier_ops'

Analogous to other optional BPF program types in syscall.c, add stubs
if the relevant functions are not compiled and move the BPF_PROG_TYPE
definition into the #ifdef CONFIG_NET block.

Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook")
Reported-by: Randy Dunlap
Change-Id: I3c06753658e2d06c1cef5158106ba2fe5ee41b34
Signed-off-by: Willem de Bruijn
Acked-by: Randy Dunlap # build-tested
Acked-by: Yonghong Song
Signed-off-by: Daniel Borkmann
---
 include/linux/bpf_types.h |  2 +-
 include/linux/skbuff.h    | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 805db999815e..48fa6480fc48 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -16,6 +16,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
+BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
 #endif
 #ifdef CONFIG_BPF_EVENTS
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
@@ -33,7 +34,6 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #ifdef CONFIG_INET
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
 #endif
-BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 745cb9ce08cf..a1d0dd432600 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1201,10 +1201,23 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 			     const struct flow_dissector_key *key,
 			     unsigned int key_count);
 
+#ifdef CONFIG_NET
 int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
 				       struct bpf_prog *prog);
 
 int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
+#else
+static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
+						     struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+#endif
 
 bool __skb_flow_dissect(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,

From 9ff286303a38b947040806a7b545ee6fa4f41cee Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Thu, 25 Apr 2019 14:37:23 -0700
Subject: [PATCH 1115/1640] UPSTREAM: bpf: support BPF_PROG_QUERY for
 BPF_FLOW_DISSECTOR attach_type

target_fd is the target network namespace. If there is a flow dissector
BPF program attached to that namespace, its (single) id is returned.
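For illustration, a minimal userspace query could look like the sketch
below (hedged: a raw bpf(2) syscall fragment, not part of this patch;
libbpf's bpf_prog_query() wraps the same attribute layout):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/bpf.h>

	int main(void)
	{
		union bpf_attr attr = {};
		__u32 prog_id = 0;
		/* target_fd is a netns fd here, not a cgroup fd */
		int netns_fd = open("/proc/self/ns/net", O_RDONLY);

		attr.query.target_fd = netns_fd;
		attr.query.attach_type = BPF_FLOW_DISSECTOR;
		attr.query.prog_ids = (__u64)(unsigned long)&prog_id;
		attr.query.prog_cnt = 1;

		/* on success the kernel writes back prog_cnt and prog_ids */
		if (syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr)) == 0 &&
		    attr.query.prog_cnt == 1)
			printf("flow dissector prog id: %u\n", prog_id);
		return 0;
	}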
v5: * drop net ref right after rcu unlock (Daniel Borkmann) v4: * add missing put_net (Jann Horn) v3: * add missing inline to skb_flow_dissector_prog_query static def (kbuild test robot ) v2: * don't sleep in rcu critical section (Jakub Kicinski) * check input prog_cnt (exit early) Cc: Jann Horn Change-Id: I0ee56753ab9d515f2369c00b4470051ab2e12322 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 8 ++++++++ kernel/bpf/syscall.c | 2 ++ net/core/flow_dissector.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a1d0dd432600..c210e316e07f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1202,11 +1202,19 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, unsigned int key_count); #ifdef CONFIG_NET +int skb_flow_dissector_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); #else +static inline int skb_flow_dissector_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EOPNOTSUPP; +} + static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8685d2bcd99c..453f21c12ca6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2026,6 +2026,8 @@ static int bpf_prog_query(const union bpf_attr *attr, break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); + case BPF_FLOW_DISSECTOR: + return skb_flow_dissector_prog_query(attr, uattr); default: return -EINVAL; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 974993f0c533..adc7d081941e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -62,6 +62,45 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); +int skb_flow_dissector_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_id, prog_cnt = 0, flags = 0; + struct bpf_prog *attached; + struct net *net; + + if (attr->query.query_flags) + return -EINVAL; + + net = get_net_ns_by_fd(attr->query.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + rcu_read_lock(); + attached = rcu_dereference(net->flow_dissector_prog); + if (attached) { + prog_cnt = 1; + prog_id = attached->aux->id; + } + rcu_read_unlock(); + + put_net(net); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) + return -EFAULT; + + return 0; +} + int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { From 3a6bdb5d2391de191e283eebf165bcfc21633136 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 24 Apr 2019 21:50:42 +0200 Subject: [PATCH 1116/1640] UPSTREAM: bpf: mark registers in all frames after pkt/null checks In case of a null check on a pointer inside a subprog, we should mark all registers with this pointer as either safe or unknown, in both the current and previous frames. 
Currently, only spilled registers and registers in the current frame
are marked. Packet bound checks in subprogs have the same issue. This
patch fixes it to mark registers in previous frames as well.

A good reproducer for null checks looks as follows:

1: ptr = bpf_map_lookup_elem(map, &key);
2: ret = subprog(ptr) {
3:     return ptr != NULL;
4: }
5: if (ret)
6:     value = *ptr;

With the above, the verifier will complain on line 6 because it sees
ptr as map_value_or_null despite the null check in subprog 1.

Note that this patch fixes another resulting bug when using
bpf_sk_release():

1: sk = bpf_sk_lookup_tcp(...);
2: subprog(sk) {
3:     if (sk)
4:         bpf_sk_release(sk);
5: }
6: if (!sk)
7:     return 0;
8: return 1;

In the above, mark_ptr_or_null_regs will warn on line 6 because it
will try to free the reference state, even though it was already freed
on line 3.

Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)")
Change-Id: I82e0b1f307bd9c7f8a7b6cab98f7dcaacef2ffa4
Signed-off-by: Paul Chaignon
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 76 ++++++++++++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 30 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e788579dfdf1..63e47deaae51 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4493,15 +4493,35 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static void __find_good_pkt_pointers(struct bpf_func_state *state,
+				     struct bpf_reg_state *dst_reg,
+				     enum bpf_reg_type type, u16 new_range)
+{
+	struct bpf_reg_state *reg;
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++) {
+		reg = &state->regs[i];
+		if (reg->type == type && reg->id == dst_reg->id)
+			/* keep the maximum range already checked */
+			reg->range = max(reg->range, new_range);
+	}
+
+	bpf_for_each_spilled_reg(i, state, reg) {
+		if (!reg)
+			continue;
+		if (reg->type == type && reg->id == dst_reg->id)
+			reg->range = max(reg->range, new_range);
+	}
+}
+
 static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 				   struct bpf_reg_state *dst_reg,
 				   enum bpf_reg_type type,
 				   bool range_right_open)
 {
-	struct bpf_func_state *state = vstate->frame[vstate->curframe];
-	struct bpf_reg_state *regs = state->regs, *reg;
 	u16 new_range;
-	int i, j;
+	int i;
 
 	if (dst_reg->off < 0 ||
 	    (dst_reg->off == 0 && range_right_open))
@@ -4566,20 +4586,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 	 * the range won't allow anything.
	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
	 */
-	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == type && regs[i].id == dst_reg->id)
-			/* keep the maximum range already checked */
-			regs[i].range = max(regs[i].range, new_range);
-
-	for (j = 0; j <= vstate->curframe; j++) {
-		state = vstate->frame[j];
-		bpf_for_each_spilled_reg(i, state, reg) {
-			if (!reg)
-				continue;
-			if (reg->type == type && reg->id == dst_reg->id)
-				reg->range = max(reg->range, new_range);
-		}
-	}
+	for (i = 0; i <= vstate->curframe; i++)
+		__find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
+					 new_range);
 }
 
 /* compute branch direction of the expression "if (reg opcode val) goto target;"
@@ -5053,6 +5062,22 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 	}
 }
 
+static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
+				    bool is_null)
+{
+	struct bpf_reg_state *reg;
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++)
+		mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
+
+	bpf_for_each_spilled_reg(i, state, reg) {
+		if (!reg)
+			continue;
+		mark_ptr_or_null_reg(state, reg, id, is_null);
+	}
+}
+
 /* The logic is similar to find_good_pkt_pointers(), both could eventually
  * be folded together at some point.
  */
@@ -5060,10 +5085,10 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
 				  bool is_null)
 {
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
-	struct bpf_reg_state *reg, *regs = state->regs;
+	struct bpf_reg_state *regs = state->regs;
 	u32 ref_obj_id = regs[regno].ref_obj_id;
 	u32 id = regs[regno].id;
-	int i, j;
+	int i;
 
 	if (ref_obj_id && ref_obj_id == id && is_null)
 		/* regs[regno] is in the " == NULL" branch.
@@ -5072,17 +5097,8 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
 		 */
 		WARN_ON_ONCE(release_reference_state(state, id));
 
-	for (i = 0; i < MAX_BPF_REG; i++)
-		mark_ptr_or_null_reg(state, &regs[i], id, is_null);
-
-	for (j = 0; j <= vstate->curframe; j++) {
-		state = vstate->frame[j];
-		bpf_for_each_spilled_reg(i, state, reg) {
-			if (!reg)
-				continue;
-			mark_ptr_or_null_reg(state, reg, id, is_null);
-		}
-	}
+	for (i = 0; i <= vstate->curframe; i++)
+		__mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
 }
 
 static bool try_match_pkt_pointers(const struct bpf_insn *insn,

From 55e35647119b263c5cacf6625064d02959ac1218 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Tue, 29 Jun 2021 09:40:10 -0400
Subject: [PATCH 1117/1640] BACKPORT: tracepoint: Add
 tracepoint_probe_register_may_exist() for BPF tracing

commit 9913d5745bd720c4266805c8d29952a3702e4eca upstream.

All internal use cases for tracepoint_probe_register() are set to not
ever be called with the same function and data. If they are, it is
considered a bug, as that means the accounting of handling tracepoints
is corrupted. If the function and data for a tracepoint are already
registered when tracepoint_probe_register() is called, it will call
WARN_ON_ONCE() and return with -EEXIST.

The BPF system call can end up calling tracepoint_probe_register()
with the same data, which now means that this can trigger the warning
because of a user space process. As WARN_ON_ONCE() should not be
called because user space called a system call with bad data, there
needs to be a way to register a tracepoint without triggering a
warning.

Enter tracepoint_probe_register_may_exist(), which can be called, but
will not cause a WARN_ON() if the probe already exists. It will still
error out with -EEXIST, which will then be sent to the user space that
performed the BPF system call.
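As a sketch of the resulting caller-side contract (illustrative; the
actual BPF-side change is in the kernel/trace/bpf_trace.c hunk below):

	ret = tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
						  prog);
	if (ret == -EEXIST)
		/* duplicate probe/data from user space: a plain error
		 * returned to the syscall, no WARN_ON_ONCE() splat
		 */
		return ret;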
This keeps the previous testing for issues with other users of the tracepoint code, while letting BPF call it with duplicated data and not warn about it. Link: https://lore.kernel.org/lkml/20210626135845.4080-1-penguin-kernel@I-love.SAKURA.ne.jp/ Link: https://syzkaller.appspot.com/bug?id=41f4318cf01762389f4d1c1c459da4f542fe5153 Cc: stable@vger.kernel.org Fixes: c4f6699dfcb85 ("bpf: introduce BPF_RAW_TRACEPOINT") Reported-by: syzbot Reported-by: Tetsuo Handa Tested-by: syzbot+721aa903751db87aa244@syzkaller.appspotmail.com Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- include/linux/tracepoint.h | 10 ++++++++++ kernel/trace/bpf_trace.c | 3 ++- kernel/tracepoint.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 06cb9184a42a..f2ebb94c8a71 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -39,7 +39,17 @@ extern int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, void *data, int prio); extern int +tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe, void *data, + int prio); +extern int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data); +static inline int +tracepoint_probe_register_may_exist(struct tracepoint *tp, void *probe, + void *data) +{ + return tracepoint_probe_register_prio_may_exist(tp, probe, data, + TRACEPOINT_DEFAULT_PRIO); +} extern void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv), void *priv); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e0f72f057346..f635a5c1d378 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1200,7 +1200,8 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL; - return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); + return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, + prog); } int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index b65b2e7fd850..8a8f0ea3bc6f 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -238,7 +238,8 @@ static void *func_remove(struct tracepoint_func **funcs, * Add the probe function to a tracepoint. */ static int tracepoint_add_func(struct tracepoint *tp, - struct tracepoint_func *func, int prio) + struct tracepoint_func *func, int prio, + bool warn) { struct tracepoint_func *old, *tp_funcs; int ret; @@ -253,7 +254,7 @@ static int tracepoint_add_func(struct tracepoint *tp, lockdep_is_held(&tracepoints_mutex)); old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { - WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM); + WARN_ON_ONCE(warn && PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } @@ -305,6 +306,32 @@ static int tracepoint_remove_func(struct tracepoint *tp, return 0; } +/** + * tracepoint_probe_register_prio_may_exist - Connect a probe to a tracepoint with priority + * @tp: tracepoint + * @probe: probe handler + * @data: tracepoint data + * @prio: priority of this function over other registered functions + * + * Same as tracepoint_probe_register_prio() except that it will not warn + * if the tracepoint is already registered. 
+ */ +int tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe, + void *data, int prio) +{ + struct tracepoint_func tp_func; + int ret; + + mutex_lock(&tracepoints_mutex); + tp_func.func = probe; + tp_func.data = data; + tp_func.prio = prio; + ret = tracepoint_add_func(tp, &tp_func, prio, false); + mutex_unlock(&tracepoints_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio_may_exist); + /** * tracepoint_probe_register - Connect a probe to a tracepoint * @tp: tracepoint @@ -328,7 +355,7 @@ int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, tp_func.func = probe; tp_func.data = data; tp_func.prio = prio; - ret = tracepoint_add_func(tp, &tp_func, prio); + ret = tracepoint_add_func(tp, &tp_func, prio, true); mutex_unlock(&tracepoints_mutex); return ret; } From 6ef8f481729c127ed78539ebcb3668e569d39ac7 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 26 Apr 2019 11:49:47 -0700 Subject: [PATCH 1118/1640] BACKPORT: bpf: add writable context for raw tracepoints This is an opt-in interface that allows a tracepoint to provide a safe buffer that can be written from a BPF_PROG_TYPE_RAW_TRACEPOINT program. The size of the buffer must be a compile-time constant, and is checked before allowing a BPF program to attach to a tracepoint that uses this feature. The pointer to this buffer will be the first argument of tracepoints that opt in; the pointer is valid and can be bpf_probe_read() by both BPF_PROG_TYPE_RAW_TRACEPOINT and BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE programs that attach to such a tracepoint, but the buffer to which it points may only be written by the latter. Change-Id: I9f96e1c0e7ae90fe32795d537a100f26e388af2d Signed-off-by: Matt Mullins Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/bpf_types.h | 1 + include/linux/tracepoint-defs.h | 1 + include/trace/bpf_probe.h | 27 +++++++++++++++++++++++++-- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 8 ++++++-- kernel/bpf/verifier.c | 31 +++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 25 +++++++++++++++++++++++++ 8 files changed, 92 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50816a195b42..aebe71e2e971 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -277,6 +277,7 @@ enum bpf_reg_type { PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ + PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ }; /* The information passed from prog-specific *_is_valid_access @@ -366,6 +367,7 @@ struct bpf_prog_aux { u32 used_map_cnt; u32 max_ctx_offset; u32 max_pkt_offset; + u32 max_tp_access; u32 stack_depth; u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 48fa6480fc48..acfc3ce63978 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -23,6 +23,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 
22c5a46e9693..ad6ce90b8ad3 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -39,6 +39,7 @@ struct bpf_raw_event_map { struct tracepoint *tp; void *bpf_func; u32 num_args; + u32 writable_size; } __aligned(32); #endif diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 505dae0bed80..d6e556c0a085 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -69,8 +69,7 @@ __bpf_trace_##call(void *__data, proto) \ * to make sure that if the tracepoint handling changes, the * bpf probe will fail to compile unless it too is updated. */ -#undef DEFINE_EVENT -#define DEFINE_EVENT(template, call, proto, args) \ +#define __DEFINE_EVENT(template, call, proto, args, size) \ static inline void bpf_test_probe_##call(void) \ { \ check_trace_callback_type_##call(__bpf_trace_##template); \ @@ -81,12 +80,36 @@ __bpf_trace_tp_map_##call = { \ .tp = &__tracepoint_##call, \ .bpf_func = (void *)__bpf_trace_##template, \ .num_args = COUNT_ARGS(args), \ + .writable_size = size, \ }; +#define FIRST(x, ...) x + +#undef DEFINE_EVENT_WRITABLE +#define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \ +static inline void bpf_test_buffer_##call(void) \ +{ \ + /* BUILD_BUG_ON() is ignored if the code is completely eliminated, but \ + * BUILD_BUG_ON_ZERO() uses a different mechanism that is not \ + * dead-code-eliminated. \ + */ \ + FIRST(proto); \ + (void)BUILD_BUG_ON_ZERO(size != sizeof(*FIRST(args))); \ +} \ +__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) + +#undef DEFINE_EVENT +#define DEFINE_EVENT(template, call, proto, args) \ + __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0) #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef DEFINE_EVENT_WRITABLE +#undef __DEFINE_EVENT +#undef FIRST + #endif /* CONFIG_BPF_EVENTS */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7331d527fbbc..b62d977c355a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -168,6 +168,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, }; enum bpf_attach_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 453f21c12ca6..6e779e13e2fc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1800,12 +1800,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) } raw_tp->btp = btp; - prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, - BPF_PROG_TYPE_RAW_TRACEPOINT); + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out_free_tp; } + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { + err = -EINVAL; + goto out_put_prog; + } err = bpf_probe_register(raw_tp->btp, prog); if (err) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 63e47deaae51..a3e7ee4322f8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -404,6 +404,7 @@ static const char * const reg_type_str[] = { [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", + [PTR_TO_TP_BUFFER] = "tp_buffer", }; static char slot_type_char[] = { @@ -2002,6 +2003,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env, return 0; } +static int 
check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + if (off < 0) { + verbose(env, + "R%d invalid tracepoint buffer access: off=%d, size=%d", + regno, off, size); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, + "R%d invalid variable buffer offset: off=%d, var_off=%s", + regno, off, tn_buf); + return -EACCES; + } + if (off + size > env->prog->aux->max_tp_access) + env->prog->aux->max_tp_access = off + size; + + return 0; +} + + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -2146,6 +2173,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_TP_BUFFER) { + err = check_tp_buffer_access(env, reg, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f635a5c1d378..6122e7c48688 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -922,6 +922,27 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; +static bool raw_tp_writable_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off == 0) { + if (size != sizeof(u64) || type != BPF_READ) + return false; + info->reg_type = PTR_TO_TP_BUFFER; + } + return raw_tp_prog_is_valid_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_writable_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -1200,8 +1221,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL; + if (prog->aux->max_tp_access > btp->writable_size) + return -EINVAL; + return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, prog); + } int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) From c9dcea7d1b7c48719b03114f357fc99d4a906750 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 26 Apr 2019 16:39:39 -0700 Subject: [PATCH 1119/1640] BACKPORT: bpf: Introduce bpf sk local storage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After allowing a bpf prog to - directly read the skb->sk ptr - get the fullsock bpf_sock by "bpf_sk_fullsock()" - get the bpf_tcp_sock by "bpf_tcp_sock()" - get the listener sock by "bpf_get_listener_sock()" - avoid duplicating the fields of "(bpf_)sock" and "(bpf_)tcp_sock" into different bpf running context. this patch is another effort to make bpf's network programming more intuitive to do (together with memory and performance benefit). 
When a bpf prog needs to store data for a sk, the current practice is
to define a map with the usual 4-tuple (src/dst ip/port) as the key.
If multiple bpf progs need to store different sk data, multiple maps
have to be defined. Hence, memory is wasted storing the duplicated
keys (i.e. the 4 tuples here) in each of the bpf maps.
[ The smallest key could be the sk pointer itself, which requires some
enhancement in the verifier and is a separate topic. ]

Also, the bpf prog needs to clean up the elem when the sk is freed.
Otherwise, the bpf map will quickly become full and unusable. The
sk-free tracking currently could be done during sk state transition
(e.g. BPF_SOCK_OPS_STATE_CB).

The size of the map needs to be predefined, which usually ends up with
an over-provisioned map in production. Even if the map were
re-sizable, since the sk naturally comes and goes away already, this
potential re-size operation is arguably redundant if the data can be
directly connected to the sk itself instead of proxy-ing through a bpf
map.

This patch introduces sk->sk_bpf_storage to provide local storage
space at the sk for bpf progs to use. The space will be allocated when
the first bpf prog has created data for this particular sk.

The design optimizes the bpf prog's lookup (optionally followed by an
inline update). bpf_spin_lock should be used if the inline update
needs to be protected.

BPF_MAP_TYPE_SK_STORAGE:
-----------------------
To define a bpf "sk-local-storage", a BPF_MAP_TYPE_SK_STORAGE map (new
in this patch) needs to be created. Multiple BPF_MAP_TYPE_SK_STORAGE
maps can be created to fit different bpf progs' needs. The map
enforces BTF to allow printing the sk-local-storage during a
system-wise sk dump (e.g. "ss -ta") in the future.

The purpose of a BPF_MAP_TYPE_SK_STORAGE map is not to
lookup/update/delete a "sk-local-storage" data from a particular sk.
Think of the map as a meta-data (or "type") of a "sk-local-storage".
This particular "type" of "sk-local-storage" data can then be stored
in any sk.

The main purposes of this map are mostly:
1. Define the size of a "sk-local-storage" type.
2. Provide a similar syscall userspace API as the map (e.g.
   lookup/update, map-id, map-btf...etc.)
3. Keep track of all sk's storages of this "type" and clean them up
   when the map is freed.

sk->sk_bpf_storage:
------------------
The main lookup/update/delete is done on sk->sk_bpf_storage (which is
a "struct bpf_sk_storage"). When doing a lookup, the "map" pointer is
now used as the "key" to search in the sk_storage->list. The "map"
pointer is actually serving as the "type" of the "sk-local-storage"
that is being requested.

To allow a very fast lookup, it should be as fast as looking up an
array at a stable offset. At the same time, it is not ideal to set a
hard limit on the number of sk-local-storage "types" that the system
can have. Hence, this patch takes a cache approach. The last search
result from sk_storage->list is cached in sk_storage->cache[], which
is a stable-sized array. Each "sk-local-storage" type has a stable
offset into the cache[] array. In the future, a map's flag could be
introduced to do cache opt-out/enforcement if it became necessary.

The cache size is 16 (i.e. 16 types of "sk-local-storage"). Programs
can share a map. On the program side, having a few bpf_progs running
in the networking hotpath is already a lot. The bpf_prog should have
already consolidated the existing sock-key-ed map usage to minimize
the map lookup penalty. 16 has enough runway to grow.
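For illustration, bpf prog side usage could look like the sketch below
(hedged: written in the later BTF-defined map style of libbpf for
brevity; the map, section and function names are made up, while the
helper names and flags are the ones added by this patch):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct sk_stg {
		__u32 cnt;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_SK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, struct sk_stg);
	} sk_stg_map SEC(".maps");

	SEC("cgroup_skb/egress")
	int egress_cnt(struct __sk_buff *skb)
	{
		struct bpf_sock *sk = skb->sk;
		struct sk_stg *stg;

		sk = sk ? bpf_sk_fullsock(sk) : NULL;
		if (!sk)
			return 1; /* let the packet through */

		/* lookup keyed by (map, sk); create a zeroed elem on miss */
		stg = bpf_sk_storage_get(&sk_stg_map, sk, NULL,
					 BPF_SK_STORAGE_GET_F_CREATE);
		if (stg)
			stg->cnt++; /* per-sk counter, no 4-tuple key needed */
		return 1;
	}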
All sk-local-storage data will be removed from sk->sk_bpf_storage
during sk destruction.

bpf_sk_storage_get() and bpf_sk_storage_delete():
------------------------------------------------
Instead of using bpf_map_(lookup|update|delete)_elem(), the bpf prog
needs to use the new helpers bpf_sk_storage_get() and
bpf_sk_storage_delete(). The verifier can then enforce the
ARG_PTR_TO_SOCKET argument. bpf_sk_storage_get() also allows the bpf
prog to "create" a new elem if one does not exist in the sk. It is
done by the new BPF_SK_STORAGE_GET_F_CREATE flag. An optional value
can also be provided as the initial value during
BPF_SK_STORAGE_GET_F_CREATE. The BPF_MAP_TYPE_SK_STORAGE also supports
bpf_spin_lock. Together, this has eliminated the potential use cases
for an equivalent bpf_map_update_elem() API (for bpf_prog) in this
patch.

Misc notes:
----------
1. map_get_next_key is not supported. From the userspace syscall
   perspective, the map has the socket fd as the key while the map can
   be shared by pinned-file or map-id. Since btf is enforced, the
   existing "ss" could be enhanced to pretty print the local-storage.
   Supporting a kernel-defined btf with 4 tuples as the return key
   could be explored later also.

2. The sk->sk_lock cannot be acquired. Atomic operations are used
   instead, e.g. cmpxchg is done on the sk->sk_bpf_storage ptr. Please
   refer to the source code comments for the details of the
   synchronization cases and considerations.

3. The mem is charged to the sk->sk_omem_alloc as the sk filter does.

Benchmark:
---------
Here is the benchmark data collected by turning on the
"kernel.bpf_stats_enabled" sysctl. Two bpf progs are tested:

One bpf prog with the usual bpf hashmap (max_entries = 8192) with the
sk ptr as the key. (The verifier is modified to support the sk ptr as
the key. That should have shortened the key lookup time.)

Another bpf prog is with the new BPF_MAP_TYPE_SK_STORAGE.

Both are storing a "u32 cnt", do a lookup on "egress_skb/cgroup" for
each egress skb and then bump the cnt. netperf is used to drive data
with 4096 connected UDP sockets.
BPF_MAP_TYPE_HASH with a modifier verifier (152ns per bpf run) 27: cgroup_skb name egress_sk_map tag 74f56e832918070b run_time_ns 58280107540 run_cnt 381347633 loaded_at 2019-04-15T13:46:39-0700 uid 0 xlated 344B jited 258B memlock 4096B map_ids 16 btf_id 5 BPF_MAP_TYPE_SK_STORAGE in this patch (66ns per bpf run) 30: cgroup_skb name egress_sk_stora tag d4aa70984cc7bbf6 run_time_ns 25617093319 run_cnt 390989739 loaded_at 2019-04-15T13:47:54-0700 uid 0 xlated 168B jited 156B memlock 4096B map_ids 17 btf_id 6 Here is a high-level picture on how are the objects organized: sk ┌──────┐ │ │ │ │ │ │ │*sk_bpf_storage─────▶ bpf_sk_storage └──────┘ ┌───────┐ ┌───────────┤ list │ │ │ │ │ │ │ │ │ │ │ └───────┘ │ │ elem │ ┌────────┐ ├─▶│ snode │ │ ├────────┤ │ │ data │ bpf_map │ ├────────┤ ┌─────────┐ │ │map_node│◀─┬─────┤ list │ │ └────────┘ │ │ │ │ │ │ │ │ elem │ │ │ │ ┌────────┐ │ └─────────┘ └─▶│ snode │ │ ├────────┤ │ bpf_map │ data │ │ ┌─────────┐ ├────────┤ │ │ list ├───────▶│map_node│ │ │ │ └────────┘ │ │ │ │ │ │ elem │ └─────────┘ ┌────────┐ │ ┌─▶│ snode │ │ │ ├────────┤ │ │ │ data │ │ │ ├────────┤ │ │ │map_node│◀─┘ │ └────────┘ │ │ │ ┌───────┐ sk └──────────│ list │ ┌──────┐ │ │ │ │ │ │ │ │ │ │ │ │ └───────┘ │*sk_bpf_storage───────▶bpf_sk_storage └──────┘ Change-Id: Idb39432e4982b2343299b70904235d5644522c7c Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 + include/linux/bpf_types.h | 1 + include/net/bpf_sk_storage.h | 13 + include/net/sock.h | 5 + include/uapi/linux/bpf.h | 40 ++ kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 27 +- net/bpf/test_run.c | 2 + net/core/Makefile | 1 + net/core/bpf_sk_storage.c | 804 +++++++++++++++++++++++++++++++++++ net/core/filter.c | 13 + net/core/sock.c | 5 + 12 files changed, 912 insertions(+), 4 deletions(-) create mode 100644 include/net/bpf_sk_storage.h create mode 100644 net/core/bpf_sk_storage.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aebe71e2e971..4c177b2af7b7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -189,6 +189,7 @@ enum bpf_arg_type { ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ + ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ /* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack @@ -209,6 +210,7 @@ enum bpf_arg_type { ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ + ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ }; /* type of values returned from helper functions */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index acfc3ce63978..8c562302e511 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -59,6 +59,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) #if defined(CONFIG_BPF_STREAM_PARSER) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h new file mode 100644 index 000000000000..b9dcb02e756b --- /dev/null +++ b/include/net/bpf_sk_storage.h @@ -0,0 
+1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2019 Facebook */ +#ifndef _BPF_SK_STORAGE_H +#define _BPF_SK_STORAGE_H + +struct sock; + +void bpf_sk_storage_free(struct sock *sk); + +extern const struct bpf_func_proto bpf_sk_storage_get_proto; +extern const struct bpf_func_proto bpf_sk_storage_delete_proto; + +#endif /* _BPF_SK_STORAGE_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 5741ff62cbe2..9f58e77fe19a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -236,6 +236,8 @@ struct sock_common { /* public: */ }; +struct bpf_sk_storage; + /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock @@ -502,6 +504,9 @@ struct sock { struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; +#ifdef CONFIG_BPF_SYSCALL + struct bpf_sk_storage __rcu *sk_bpf_storage; +#endif struct rcu_head sk_rcu; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b62d977c355a..64673fcc2c1e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, + BPF_MAP_TYPE_SK_STORAGE, }; /* Note that tracing related programs such as @@ -2501,6 +2502,42 @@ union bpf_attr { * was provided. * * **-ERANGE** if resulting value was out of range. + * + * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * Description + * Get a bpf-local-storage from a sk. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem(map, &sk)** except this + * helper enforces the key must be a **bpf_fullsock()** + * and the map must be a BPF_MAP_TYPE_SK_STORAGE also. + * + * Underneath, the value is stored locally at *sk* instead of + * the map. The *map* is used as the bpf-local-storage **type**. + * The bpf-local-storage **type** (i.e. the *map*) is searched + * against all bpf-local-storages residing at sk. + * + * An optional *flags* (BPF_SK_STORAGE_GET_F_CREATE) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with BPF_SK_STORAGE_GET_F_CREATE to specify + * the initial value of a bpf-local-storage. If *value* is + * NULL, the new bpf-local-storage will be zero initialized. + * Return + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + * + * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * Description + * Delete a bpf-local-storage from a sk. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2698,6 +2735,9 @@ enum bpf_func_id { /* BPF_FUNC_sysctl_get_name flags. */ #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +/* BPF_FUNC_sk_storage_get flags */ +#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. 
*/ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6e779e13e2fc..5e013bb61ef2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -527,7 +527,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EACCES; if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a3e7ee4322f8..65044ce53926 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2560,10 +2560,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && - type != expected_type) + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) + /* final test in check_stack_boundary() */; + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -2595,6 +2600,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } meta->ref_obj_id = reg->ref_obj_id; } + } else if (arg_type == ARG_PTR_TO_SOCKET) { + expected_type = PTR_TO_SOCKET; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -2652,6 +2661,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta->map_ptr->key_size, false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE || + (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && + !register_is_null(reg)) || arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -2800,6 +2811,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_SK_STORAGE: + if (func_id != BPF_FUNC_sk_storage_get && + func_id != BPF_FUNC_sk_storage_delete) + goto error; + break; default: break; } @@ -2863,6 +2879,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_STACK) goto error; break; + case BPF_FUNC_sk_storage_get: + case BPF_FUNC_sk_storage_delete: + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + goto error; + break; default: break; } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 59067cda3250..5ec08a0081b0 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -318,6 +319,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, sizeof(struct __sk_buff)); out: kfree_skb(skb); + bpf_sk_storage_free(sk); kfree(sk); kfree(ctx); return ret; diff --git a/net/core/Makefile b/net/core/Makefile index 40a5ddefc003..2931e5c23359 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -34,3 +34,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o 
obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c new file mode 100644 index 000000000000..a8e9ac71b22d --- /dev/null +++ b/net/core/bpf_sk_storage.c @@ -0,0 +1,804 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static atomic_t cache_idx; + +struct bucket { + struct hlist_head list; + raw_spinlock_t lock; +}; + +/* Thp map is not the primary owner of a bpf_sk_storage_elem. + * Instead, the sk->sk_bpf_storage is. + * + * The map (bpf_sk_storage_map) is for two purposes + * 1. Define the size of the "sk local storage". It is + * the map's value_size. + * + * 2. Maintain a list to keep track of all elems such + * that they can be cleaned up during the map destruction. + * + * When a bpf local storage is being looked up for a + * particular sk, the "bpf_map" pointer is actually used + * as the "key" to search in the list of elem in + * sk->sk_bpf_storage. + * + * Hence, consider sk->sk_bpf_storage is the mini-map + * with the "bpf_map" pointer as the searching key. + */ +struct bpf_sk_storage_map { + struct bpf_map map; + /* Lookup elem does not require accessing the map. + * + * Updating/Deleting requires a bucket lock to + * link/unlink the elem from the map. Having + * multiple buckets to improve contention. + */ + struct bucket *buckets; + u32 bucket_log; + u16 elem_size; + u16 cache_idx; +}; + +struct bpf_sk_storage_data { + /* smap is used as the searching key when looking up + * from sk->sk_bpf_storage. + * + * Put it in the same cacheline as the data to minimize + * the number of cachelines access during the cache hit case. + */ + struct bpf_sk_storage_map __rcu *smap; + u8 data[0] __aligned(8); +}; + +/* Linked to bpf_sk_storage and bpf_sk_storage_map */ +struct bpf_sk_storage_elem { + struct hlist_node map_node; /* Linked to bpf_sk_storage_map */ + struct hlist_node snode; /* Linked to bpf_sk_storage */ + struct bpf_sk_storage __rcu *sk_storage; + struct rcu_head rcu; + /* 8 bytes hole */ + /* The data is stored in aother cacheline to minimize + * the number of cachelines access during a cache hit. + */ + struct bpf_sk_storage_data sdata ____cacheline_aligned; +}; + +#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata) +#define SDATA(_SELEM) (&(_SELEM)->sdata) +#define BPF_SK_STORAGE_CACHE_SIZE 16 + +struct bpf_sk_storage { + struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE]; + struct hlist_head list; /* List of bpf_sk_storage_elem */ + struct sock *sk; /* The sk that owns the the above "list" of + * bpf_sk_storage_elem. 
+ */ + struct rcu_head rcu; + raw_spinlock_t lock; /* Protect adding/removing from the "list" */ +}; + +static struct bucket *select_bucket(struct bpf_sk_storage_map *smap, + struct bpf_sk_storage_elem *selem) +{ + return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; +} + +static int omem_charge(struct sock *sk, unsigned int size) +{ + /* same check as in sock_kmalloc() */ + if (size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + atomic_add(size, &sk->sk_omem_alloc); + return 0; + } + + return -ENOMEM; +} + +static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem) +{ + return !hlist_unhashed(&selem->snode); +} + +static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem) +{ + return !hlist_unhashed(&selem->map_node); +} + +static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap, + struct sock *sk, void *value, + bool charge_omem) +{ + struct bpf_sk_storage_elem *selem; + + if (charge_omem && omem_charge(sk, smap->elem_size)) + return NULL; + + selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (selem) { + if (value) + memcpy(SDATA(selem)->data, value, smap->map.value_size); + return selem; + } + + if (charge_omem) + atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + + return NULL; +} + +/* sk_storage->lock must be held and selem->sk_storage == sk_storage. + * The caller must ensure selem->smap is still valid to be + * dereferenced for its smap->elem_size and smap->cache_idx. + */ +static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage, + struct bpf_sk_storage_elem *selem, + bool uncharge_omem) +{ + struct bpf_sk_storage_map *smap; + bool free_sk_storage; + struct sock *sk; + + smap = rcu_dereference(SDATA(selem)->smap); + sk = sk_storage->sk; + + /* All uncharging on sk->sk_omem_alloc must be done first. + * sk may be freed once the last selem is unlinked from sk_storage. + */ + if (uncharge_omem) + atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + + free_sk_storage = hlist_is_singular_node(&selem->snode, + &sk_storage->list); + if (free_sk_storage) { + atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc); + sk_storage->sk = NULL; + /* After this RCU_INIT, sk may be freed and cannot be used */ + RCU_INIT_POINTER(sk->sk_bpf_storage, NULL); + + /* sk_storage is not freed now. sk_storage->lock is + * still held and raw_spin_unlock_bh(&sk_storage->lock) + * will be done by the caller. + * + * Although the unlock will be done under + * rcu_read_lock(), it is more intutivie to + * read if kfree_rcu(sk_storage, rcu) is done + * after the raw_spin_unlock_bh(&sk_storage->lock). + * + * Hence, a "bool free_sk_storage" is returned + * to the caller which then calls the kfree_rcu() + * after unlock. 
+ */ + } + hlist_del_init_rcu(&selem->snode); + if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) == + SDATA(selem)) + RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL); + + kfree_rcu(selem, rcu); + + return free_sk_storage; +} + +static void selem_unlink_sk(struct bpf_sk_storage_elem *selem) +{ + struct bpf_sk_storage *sk_storage; + bool free_sk_storage = false; + + if (unlikely(!selem_linked_to_sk(selem))) + /* selem has already been unlinked from sk */ + return; + + sk_storage = rcu_dereference(selem->sk_storage); + raw_spin_lock_bh(&sk_storage->lock); + if (likely(selem_linked_to_sk(selem))) + free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); + raw_spin_unlock_bh(&sk_storage->lock); + + if (free_sk_storage) + kfree_rcu(sk_storage, rcu); +} + +/* sk_storage->lock must be held and sk_storage->list cannot be empty */ +static void __selem_link_sk(struct bpf_sk_storage *sk_storage, + struct bpf_sk_storage_elem *selem) +{ + RCU_INIT_POINTER(selem->sk_storage, sk_storage); + hlist_add_head(&selem->snode, &sk_storage->list); +} + +static void selem_unlink_map(struct bpf_sk_storage_elem *selem) +{ + struct bpf_sk_storage_map *smap; + struct bucket *b; + + if (unlikely(!selem_linked_to_map(selem))) + /* selem has already be unlinked from smap */ + return; + + smap = rcu_dereference(SDATA(selem)->smap); + b = select_bucket(smap, selem); + raw_spin_lock_bh(&b->lock); + if (likely(selem_linked_to_map(selem))) + hlist_del_init_rcu(&selem->map_node); + raw_spin_unlock_bh(&b->lock); +} + +static void selem_link_map(struct bpf_sk_storage_map *smap, + struct bpf_sk_storage_elem *selem) +{ + struct bucket *b = select_bucket(smap, selem); + + raw_spin_lock_bh(&b->lock); + RCU_INIT_POINTER(SDATA(selem)->smap, smap); + hlist_add_head_rcu(&selem->map_node, &b->list); + raw_spin_unlock_bh(&b->lock); +} + +static void selem_unlink(struct bpf_sk_storage_elem *selem) +{ + /* Always unlink from map before unlinking from sk_storage + * because selem will be freed after successfully unlinked from + * the sk_storage. + */ + selem_unlink_map(selem); + selem_unlink_sk(selem); +} + +static struct bpf_sk_storage_data * +__sk_storage_lookup(struct bpf_sk_storage *sk_storage, + struct bpf_sk_storage_map *smap, + bool cacheit_lockit) +{ + struct bpf_sk_storage_data *sdata; + struct bpf_sk_storage_elem *selem; + + /* Fast path (cache hit) */ + sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]); + if (sdata && rcu_access_pointer(sdata->smap) == smap) + return sdata; + + /* Slow path (cache miss) */ + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) + if (rcu_access_pointer(SDATA(selem)->smap) == smap) + break; + + if (!selem) + return NULL; + + sdata = SDATA(selem); + if (cacheit_lockit) { + /* spinlock is needed to avoid racing with the + * parallel delete. Otherwise, publishing an already + * deleted sdata to the cache will become a use-after-free + * problem in the next __sk_storage_lookup(). 
+ */
+		raw_spin_lock_bh(&sk_storage->lock);
+		if (selem_linked_to_sk(selem))
+			rcu_assign_pointer(sk_storage->cache[smap->cache_idx],
+					   sdata);
+		raw_spin_unlock_bh(&sk_storage->lock);
+	}
+
+	return sdata;
+}
+
+static struct bpf_sk_storage_data *
+sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
+{
+	struct bpf_sk_storage *sk_storage;
+	struct bpf_sk_storage_map *smap;
+
+	sk_storage = rcu_dereference(sk->sk_bpf_storage);
+	if (!sk_storage)
+		return NULL;
+
+	smap = (struct bpf_sk_storage_map *)map;
+	return __sk_storage_lookup(sk_storage, smap, cacheit_lockit);
+}
+
+static int check_flags(const struct bpf_sk_storage_data *old_sdata,
+		       u64 map_flags)
+{
+	if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
+		/* elem already exists */
+		return -EEXIST;
+
+	if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
+		/* elem doesn't exist, cannot update it */
+		return -ENOENT;
+
+	return 0;
+}
+
+static int sk_storage_alloc(struct sock *sk,
+			    struct bpf_sk_storage_map *smap,
+			    struct bpf_sk_storage_elem *first_selem)
+{
+	struct bpf_sk_storage *prev_sk_storage, *sk_storage;
+	int err;
+
+	err = omem_charge(sk, sizeof(*sk_storage));
+	if (err)
+		return err;
+
+	sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN);
+	if (!sk_storage) {
+		err = -ENOMEM;
+		goto uncharge;
+	}
+	INIT_HLIST_HEAD(&sk_storage->list);
+	raw_spin_lock_init(&sk_storage->lock);
+	sk_storage->sk = sk;
+
+	__selem_link_sk(sk_storage, first_selem);
+	selem_link_map(smap, first_selem);
+	/* Publish sk_storage to sk.  sk->sk_lock cannot be acquired.
+	 * Hence, an atomic op is used to set sk->sk_bpf_storage
+	 * from NULL to the newly allocated sk_storage ptr.
+	 *
+	 * From now on, the sk->sk_bpf_storage pointer is protected
+	 * by the sk_storage->lock.  Hence, when freeing
+	 * the sk->sk_bpf_storage, the sk_storage->lock must
+	 * be held before setting sk->sk_bpf_storage to NULL.
+	 */
+	prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage,
+				  NULL, sk_storage);
+	if (unlikely(prev_sk_storage)) {
+		selem_unlink_map(first_selem);
+		err = -EAGAIN;
+		goto uncharge;
+
+		/* Note that even though first_selem was linked to smap's
+		 * bucket->list, first_selem can be freed immediately
+		 * (instead of kfree_rcu) because
+		 * bpf_sk_storage_map_free() does a
+		 * synchronize_rcu() before walking the bucket->list.
+		 * Hence, no one is accessing selem from the
+		 * bucket->list under rcu_read_lock().
+		 */
+	}
+
+	return 0;
+
+uncharge:
+	kfree(sk_storage);
+	atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc);
+	return err;
+}
+
+/* sk cannot be going away because it is linking a new elem
+ * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
+ * Otherwise, it will become a leak (and other memory issues
+ * during map destruction).
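+ *
+ * (Both callers arrange for this: bpf_fd_sk_storage_update_elem()
+ * holds the socket via sockfd_lookup(), and bpf_sk_storage_get()
+ * only calls sk_storage_update() after a successful
+ * refcount_inc_not_zero(&sk->sk_refcnt).)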
+ */ +static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk, + struct bpf_map *map, + void *value, + u64 map_flags) +{ + struct bpf_sk_storage_data *old_sdata = NULL; + struct bpf_sk_storage_elem *selem; + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_map *smap; + int err; + + /* BPF_EXIST and BPF_NOEXIST cannot be both set */ + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || + /* BPF_F_LOCK can only be used in a value with spin_lock */ + unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + return ERR_PTR(-EINVAL); + + smap = (struct bpf_sk_storage_map *)map; + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + /* Very first elem for this sk */ + err = check_flags(NULL, map_flags); + if (err) + return ERR_PTR(err); + + selem = selem_alloc(smap, sk, value, true); + if (!selem) + return ERR_PTR(-ENOMEM); + + err = sk_storage_alloc(sk, smap, selem); + if (err) { + kfree(selem); + atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + return ERR_PTR(err); + } + + return SDATA(selem); + } + + if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { + /* Hoping to find an old_sdata to do inline update + * such that it can avoid taking the sk_storage->lock + * and changing the lists. + */ + old_sdata = __sk_storage_lookup(sk_storage, smap, false); + err = check_flags(old_sdata, map_flags); + if (err) + return ERR_PTR(err); + if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) { + copy_map_value_locked(map, old_sdata->data, + value, false); + return old_sdata; + } + } + + raw_spin_lock_bh(&sk_storage->lock); + + /* Recheck sk_storage->list under sk_storage->lock */ + if (unlikely(hlist_empty(&sk_storage->list))) { + /* A parallel del is happening and sk_storage is going + * away. It has just been checked before, so very + * unlikely. Return instead of retry to keep things + * simple. + */ + err = -EAGAIN; + goto unlock_err; + } + + old_sdata = __sk_storage_lookup(sk_storage, smap, false); + err = check_flags(old_sdata, map_flags); + if (err) + goto unlock_err; + + if (old_sdata && (map_flags & BPF_F_LOCK)) { + copy_map_value_locked(map, old_sdata->data, value, false); + selem = SELEM(old_sdata); + goto unlock; + } + + /* sk_storage->lock is held. Hence, we are sure + * we can unlink and uncharge the old_sdata successfully + * later. Hence, instead of charging the new selem now + * and then uncharge the old selem later (which may cause + * a potential but unnecessary charge failure), avoid taking + * a charge at all here (the "!old_sdata" check) and the + * old_sdata will not be uncharged later during __selem_unlink_sk(). 
+ */
+	selem = selem_alloc(smap, sk, value, !old_sdata);
+	if (!selem) {
+		err = -ENOMEM;
+		goto unlock_err;
+	}
+
+	/* First, link the new selem to the map */
+	selem_link_map(smap, selem);
+
+	/* Second, link (and publish) the new selem to sk_storage */
+	__selem_link_sk(sk_storage, selem);
+
+	/* Third, remove old selem, SELEM(old_sdata) */
+	if (old_sdata) {
+		selem_unlink_map(SELEM(old_sdata));
+		__selem_unlink_sk(sk_storage, SELEM(old_sdata), false);
+	}
+
+unlock:
+	raw_spin_unlock_bh(&sk_storage->lock);
+	return SDATA(selem);
+
+unlock_err:
+	raw_spin_unlock_bh(&sk_storage->lock);
+	return ERR_PTR(err);
+}
+
+static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
+{
+	struct bpf_sk_storage_data *sdata;
+
+	sdata = sk_storage_lookup(sk, map, false);
+	if (!sdata)
+		return -ENOENT;
+
+	selem_unlink(SELEM(sdata));
+
+	return 0;
+}
+
+/* Called by __sk_destruct() */
+void bpf_sk_storage_free(struct sock *sk)
+{
+	struct bpf_sk_storage_elem *selem;
+	struct bpf_sk_storage *sk_storage;
+	bool free_sk_storage = false;
+	struct hlist_node *n;
+
+	rcu_read_lock();
+	sk_storage = rcu_dereference(sk->sk_bpf_storage);
+	if (!sk_storage) {
+		rcu_read_unlock();
+		return;
+	}
+
+	/* Neither the bpf_prog nor the bpf-map's syscall
+	 * could be modifying the sk_storage->list now.
+	 * Thus, no elem can be added-to or deleted-from the
+	 * sk_storage->list by the bpf_prog or by the bpf-map's syscall.
+	 *
+	 * It is racing with bpf_sk_storage_map_free() alone
+	 * when unlinking elem from the sk_storage->list and
+	 * the map's bucket->list.
+	 */
+	raw_spin_lock_bh(&sk_storage->lock);
+	hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) {
+		/* Always unlink from map before unlinking from
+		 * sk_storage.
+		 */
+		selem_unlink_map(selem);
+		free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
+	}
+	raw_spin_unlock_bh(&sk_storage->lock);
+	rcu_read_unlock();
+
+	if (free_sk_storage)
+		kfree_rcu(sk_storage, rcu);
+}
+
+static void bpf_sk_storage_map_free(struct bpf_map *map)
+{
+	struct bpf_sk_storage_elem *selem;
+	struct bpf_sk_storage_map *smap;
+	struct bucket *b;
+	unsigned int i;
+
+	smap = (struct bpf_sk_storage_map *)map;
+
+	synchronize_rcu();
+
+	/* bpf prog and the userspace can no longer access this map
+	 * now.  No new selem (of this map) can be added
+	 * to the sk->sk_bpf_storage or to the map bucket's list.
+	 *
+	 * The elem of this map can be cleaned up here
+	 * or
+	 * by bpf_sk_storage_free() during __sk_destruct().
+	 */
+	for (i = 0; i < (1U << smap->bucket_log); i++) {
+		b = &smap->buckets[i];
+
+		rcu_read_lock();
+		/* No one is adding to b->list now */
+		while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)),
+						 struct bpf_sk_storage_elem,
+						 map_node))) {
+			selem_unlink(selem);
+			cond_resched_rcu();
+		}
+		rcu_read_unlock();
+	}
+
+	/* bpf_sk_storage_free() may still need to access the map.
+	 * e.g. bpf_sk_storage_free() has unlinked selem from the map
+	 * which then made the above while ((selem = ...)) loop
+	 * exit immediately.
+	 *
+	 * However, the bpf_sk_storage_free() still needs to access
+	 * the smap->elem_size to do the uncharging in
+	 * __selem_unlink_sk().
+	 *
+	 * Hence, wait another rcu grace period for the
+	 * bpf_sk_storage_free() to finish.
+ */ + synchronize_rcu(); + + kvfree(smap->buckets); + kfree(map); +} + +static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) +{ + if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries || + attr->key_size != sizeof(int) || !attr->value_size || + /* Enforce BTF for userspace sk dumping */ + !attr->btf_key_type_id || !attr->btf_value_type_id) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->value_size >= KMALLOC_MAX_SIZE - + MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) || + /* U16_MAX is much more than enough for sk local storage + * considering a tcp_sock is ~2k. + */ + attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem)) + return -E2BIG; + + return 0; +} + +static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_sk_storage_map *smap; + unsigned int i; + u32 nbuckets; + u64 cost; + + smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); + if (!smap) + return ERR_PTR(-ENOMEM); + bpf_map_init_from_attr(&smap->map, attr); + + smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus())); + nbuckets = 1U << smap->bucket_log; + smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, + GFP_USER | __GFP_NOWARN); + if (!smap->buckets) { + kfree(smap); + return ERR_PTR(-ENOMEM); + } + cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); + + for (i = 0; i < nbuckets; i++) { + INIT_HLIST_HEAD(&smap->buckets[i].list); + raw_spin_lock_init(&smap->buckets[i].lock); + } + + smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; + smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % + BPF_SK_STORAGE_CACHE_SIZE; + smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + return &smap->map; +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static int bpf_sk_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + u32 int_data; + + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; + + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + return -EINVAL; + + return 0; +} + +static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_sk_storage_data *sdata; + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + sdata = sk_storage_lookup(sock->sk, map, true); + sockfd_put(sock); + return sdata ? sdata->data : NULL; + } + + return ERR_PTR(err); +} + +static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_sk_storage_data *sdata; + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + sdata = sk_storage_update(sock->sk, map, value, map_flags); + sockfd_put(sock); + return IS_ERR(sdata) ? 
PTR_ERR(sdata) : 0; + } + + return err; +} + +static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + err = sk_storage_delete(sock->sk, map); + sockfd_put(sock); + return err; + } + + return err; +} + +BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags) +{ + struct bpf_sk_storage_data *sdata; + + if (flags > BPF_SK_STORAGE_GET_F_CREATE) + return (unsigned long)NULL; + + sdata = sk_storage_lookup(sk, map, true); + if (sdata) + return (unsigned long)sdata->data; + + if (flags == BPF_SK_STORAGE_GET_F_CREATE && + /* Cannot add new elem to a going away sk. + * Otherwise, the new elem may become a leak + * (and also other memory issues during map + * destruction). + */ + refcount_inc_not_zero(&sk->sk_refcnt)) { + sdata = sk_storage_update(sk, map, value, BPF_NOEXIST); + /* sk must be a fullsock (guaranteed by verifier), + * so sock_gen_put() is unnecessary. + */ + sock_put(sk); + return IS_ERR(sdata) ? + (unsigned long)NULL : (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) +{ + if (refcount_inc_not_zero(&sk->sk_refcnt)) { + int err; + + err = sk_storage_delete(sk, map); + sock_put(sk); + return err; + } + + return -ENOENT; +} + +const struct bpf_map_ops sk_storage_map_ops = { + .map_alloc_check = bpf_sk_storage_map_alloc_check, + .map_alloc = bpf_sk_storage_map_alloc, + .map_free = bpf_sk_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, + .map_update_elem = bpf_fd_sk_storage_update_elem, + .map_delete_elem = bpf_fd_sk_storage_delete_elem, + .map_check_btf = bpf_sk_storage_map_check_btf, +}; + +const struct bpf_func_proto bpf_sk_storage_get_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_SOCKET, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_proto = { + .func = bpf_sk_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_SOCKET, +}; diff --git a/net/core/filter.c b/net/core/filter.c index 22770ef5bf0b..eae293d6ee2d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -73,6 +73,8 @@ #include #include #include +#include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -5405,6 +5407,9 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +const struct bpf_func_proto bpf_sk_storage_get_proto __weak; +const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; + static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5413,6 +5418,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; @@ -5496,6 +5505,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_fib_lookup_proto; case BPF_FUNC_sk_fullsock: return 
&bpf_sk_fullsock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; diff --git a/net/core/sock.c b/net/core/sock.c index be7e26cc67b4..0ed2d253aee7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -136,6 +136,7 @@ #include #include +#include #include @@ -1744,6 +1745,10 @@ static void __sk_destruct(struct rcu_head *head) sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); +#ifdef CONFIG_BPF_SYSCALL + bpf_sk_storage_free(sk); +#endif + if (atomic_read(&sk->sk_omem_alloc)) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); From b303b0d6924492d625ce9c1ec285119d2ef4baaf Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:34 -0700 Subject: [PATCH 1120/1640] BACKPORT: x86/mm/cpa: Add set_direct_map_*() functions Add two new functions set_direct_map_default_noflush() and set_direct_map_invalid_noflush() for setting the direct map alias for the page to its default valid permissions and to an invalid state that cannot be cached in a TLB, respectively. These functions do not flush the TLB. Note, __kernel_map_pages() does something similar but flushes the TLB and doesn't reset the permission bits to default on all architectures. Also add an ARCH config ARCH_HAS_SET_DIRECT_MAP for specifying whether these have an actual implementation or a default empty one. Change-Id: I0f5c4b3cac1c937ccab68c3278ce8ea3ca833032 Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nadav Amit Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-15-namit@vmware.com Signed-off-by: Ingo Molnar --- arch/Kconfig | 4 ++++ arch/x86/Kconfig | 1 + arch/x86/include/asm/set_memory.h | 3 +++ arch/x86/mm/pageattr.c | 14 +++++++++++--- include/linux/set_memory.h | 11 +++++++++++ 5 files changed, 30 insertions(+), 3 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 17ea90fa6808..bb31eb5d863a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -243,6 +243,10 @@ config ARCH_HAS_SET_MEMORY config ARCH_HAS_CPU_FINALIZE_INIT bool +# Select if arch has all set_direct_map_invalid/default() functions +config ARCH_HAS_SET_DIRECT_MAP + bool + # Select if arch init_task initializer is different to init/init_task.c config ARCH_INIT_TASK bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c2507a716059..35b39fc581d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,7 @@ config X86 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN + select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index bd090367236c..9abd49698129 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -84,6 +84,9 @@ int set_pages_nx(struct page *page, int numpages); int set_pages_ro(struct page *page, int numpages); int set_pages_rw(struct page *page, int numpages); +int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_default_noflush(struct page *page); + extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); diff 
--git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index eaee1a7ed0b5..da9b4c91b04b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1976,8 +1976,6 @@ int set_pages_rw(struct page *page, int numpages) return set_memory_rw(addr, numpages); } -#ifdef CONFIG_DEBUG_PAGEALLOC - static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); @@ -2016,6 +2014,17 @@ static int __set_pages_np(struct page *page, int numpages) return __change_page_attr_set_clr(&cpa, 0); } +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_pages_np(page, 1); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_pages_p(page, 1); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) @@ -2049,7 +2058,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } #ifdef CONFIG_HIBERNATION - bool kernel_page_present(struct page *page) { unsigned int level; diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index e5140648f638..99ebd122a23b 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -17,4 +17,15 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif +#ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP +static inline int set_direct_map_invalid_noflush(struct page *page) +{ + return 0; +} +static inline int set_direct_map_default_noflush(struct page *page) +{ + return 0; +} +#endif + #endif /* _LINUX_SET_MEMORY_H_ */ From 9d0eabd147a39d0eb550aebe8c4eb8b98065c564 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Apr 2019 14:43:44 -0400 Subject: [PATCH 1121/1640] BACKPORT: new inode method: ->free_inode() A lot of ->destroy_inode() instances end with call_rcu() of a callback that does RCU-delayed part of freeing. Introduce a new method for doing just that, with saner signature. Rules: ->destroy_inode ->free_inode f g immediate call of f(), RCU-delayed call of g() f NULL immediate call of f(), no RCU-delayed calls NULL g RCU-delayed call of g() NULL NULL RCU-delayed default freeing IOW, NULL ->free_inode gives the same behaviour as now. Note that NULL, NULL is equivalent to NULL, free_inode_nonrcu; we could mandate the latter form, but that would have very little benefit beyond making rules a bit more symmetric. It would break backwards compatibility, require extra boilerplate and expected semantics for (NULL, NULL) pair would have no use whatsoever... 
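To make the table concrete, the dispatch it describes ends up looking
roughly like the following sketch (a paraphrase of the fs/inode.c change
below, with the BUG_ON/list checks elided):

	static void destroy_inode(struct inode *inode)
	{
		const struct super_operations *ops = inode->i_sb->s_op;

		__destroy_inode(inode);
		if (ops->destroy_inode) {
			ops->destroy_inode(inode);  /* immediate call of f() */
			if (!ops->free_inode)
				return;  /* (f, NULL): nothing RCU-delayed */
		}
		/* (f, g), (NULL, g), (NULL, NULL): i_callback() runs
		 * ->free_inode() if set, free_inode_nonrcu() otherwise.
		 */
		inode->free_inode = ops->free_inode;
		call_rcu(&inode->i_rcu, i_callback);
	}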
Change-Id: I828dd519b381a1f25013ec3d1eae0b6835eb9c26 Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 ++ Documentation/filesystems/porting | 39 ++++++++++++++++++--- fs/inode.c | 56 +++++++++++++++++++------------ include/linux/fs.h | 6 +++- 4 files changed, 75 insertions(+), 28 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index a39e9311d5ae..ce4bcfd12b8e 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -119,6 +119,7 @@ set: yes --------------------------- super_operations --------------------------- prototypes: struct inode *(*alloc_inode)(struct super_block *sb); + void (*free_inode)(struct inode *); void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *, int flags); int (*write_inode) (struct inode *, struct writeback_control *wbc); @@ -140,6 +141,7 @@ locking rules: All may block [not true, see below] s_umount alloc_inode: +free_inode: called from RCU callback destroy_inode: dirty_inode: write_inode: diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index c757c1c3cb81..bc34d7dd253e 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -607,9 +607,38 @@ in your dentry operations instead. to specify the fields and sync type requested by statx. Filesystems not supporting any statx-specific features may ignore the new arguments. -- -[mandatory] +[recommended] + ->lookup() instances doing an equivalent of + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_splice_alias(inode, dentry); + don't need to bother with the check - d_splice_alias() will do the + right thing when given ERR_PTR(...) as inode. Moreover, passing NULL + inode to d_splice_alias() will also do the right thing (equivalent of + d_add(dentry, NULL); return NULL;), so that kind of special cases + also doesn't need a separate treatment. +-- +[strongly recommended] + take the RCU-delayed parts of ->destroy_inode() into a new method - + ->free_inode(). If ->destroy_inode() becomes empty - all the better, + just get rid of it. Synchronous work (e.g. the stuff that can't + be done from an RCU callback, or any WARN_ON() where we want the + stack trace) *might* be movable to ->evict_inode(); however, + that goes only for the things that are not needed to balance something + done by ->alloc_inode(). IOW, if it's cleaning up the stuff that + might have accumulated over the life of in-core inode, ->evict_inode() + might be a fit. - [should've been added in 2016] stale comment in finish_open() - nonwithstanding, failure exits in ->atomic_open() instances should - *NOT* fput() the file, no matter what. Everything is handled by the - caller. + Rules for inode destruction: + * if ->destroy_inode() is non-NULL, it gets called + * if ->free_inode() is non-NULL, it gets scheduled by call_rcu() + * combination of NULL ->destroy_inode and NULL ->free_inode is + treated as NULL/free_inode_nonrcu, to preserve the compatibility. + + Note that the callback (be it via ->free_inode() or explicit call_rcu() + in ->destroy_inode()) is *NOT* ordered wrt superblock destruction; + as the matter of fact, the superblock and all associated structures + might be already gone. The filesystem driver is guaranteed to be still + there, but that's it. Freeing memory in the callback is fine; doing + more than that is possible, but requires a lot of care and is best + avoided. 
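As a worked example of the conversion these rules ask for, a hypothetical
filesystem (the foofs_* names and foofs_inode_cachep cache are placeholders,
not part of this patch) would go from:

	/* Before: RCU-delayed freeing open-coded in ->destroy_inode() */
	static void foofs_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
	}

	static void foofs_destroy_inode(struct inode *inode)
	{
		call_rcu(&inode->i_rcu, foofs_i_callback);
	}

to just:

	/* After: only the RCU-delayed part remains; the core issues the
	 * call_rcu() itself (the NULL/g row of the table above).
	 */
	static void foofs_free_inode(struct inode *inode)
	{
		kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
	}

wired up via .free_inode in its super_operations.  The bpffs conversion
later in this series follows exactly this pattern.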
diff --git a/fs/inode.c b/fs/inode.c index d27472d69ced..7aabf51462ea 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -205,12 +205,28 @@ int inode_init_always(struct super_block *sb, struct inode *inode) } EXPORT_SYMBOL(inode_init_always); +void free_inode_nonrcu(struct inode *inode) +{ + kmem_cache_free(inode_cachep, inode); +} +EXPORT_SYMBOL(free_inode_nonrcu); + +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + if (inode->free_inode) + inode->free_inode(inode); + else + free_inode_nonrcu(inode); +} + static struct inode *alloc_inode(struct super_block *sb) { + const struct super_operations *ops = sb->s_op; struct inode *inode; - if (sb->s_op->alloc_inode) - inode = sb->s_op->alloc_inode(sb); + if (ops->alloc_inode) + inode = ops->alloc_inode(sb); else inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); @@ -218,22 +234,19 @@ static struct inode *alloc_inode(struct super_block *sb) return NULL; if (unlikely(inode_init_always(sb, inode))) { - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - kmem_cache_free(inode_cachep, inode); + if (ops->destroy_inode) { + ops->destroy_inode(inode); + if (!ops->free_inode) + return NULL; + } + inode->free_inode = ops->free_inode; + i_callback(&inode->i_rcu); return NULL; } return inode; } -void free_inode_nonrcu(struct inode *inode) -{ - kmem_cache_free(inode_cachep, inode); -} -EXPORT_SYMBOL(free_inode_nonrcu); - void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); @@ -256,20 +269,19 @@ void __destroy_inode(struct inode *inode) } EXPORT_SYMBOL(__destroy_inode); -static void i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(inode_cachep, inode); -} - static void destroy_inode(struct inode *inode) { + const struct super_operations *ops = inode->i_sb->s_op; + BUG_ON(!list_empty(&inode->i_lru)); __destroy_inode(inode); - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - call_rcu(&inode->i_rcu, i_callback); + if (ops->destroy_inode) { + ops->destroy_inode(inode); + if (!ops->free_inode) + return; + } + inode->free_inode = ops->free_inode; + call_rcu(&inode->i_rcu, i_callback); } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index c2a0c49790c2..ae353137c767 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -665,7 +665,10 @@ struct inode { #ifdef CONFIG_IMA atomic_t i_readcount; /* struct files open RO */ #endif - const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ + union { + const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ + void (*free_inode)(struct inode *); + }; struct file_lock_context *i_flctx; struct address_space i_data; struct list_head i_devices; @@ -1867,6 +1870,7 @@ extern int vfs_dedupe_file_range(struct file *file, struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); + void (*free_inode)(struct inode *); void (*dirty_inode) (struct inode *, int flags); int (*write_inode) (struct inode *, struct writeback_control *wbc); From 69605f36d9a7d5fcc01e84dacbdde58cc8a24e02 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:36 -0700 Subject: [PATCH 1122/1640] UPSTREAM: mm/vmalloc: Add flag for freeing of special permsissions Add a new flag VM_FLUSH_RESET_PERMS, for enabling vfree operations to immediately clear executable TLB entries before freeing pages, and 
handle resetting permissions on the directmap. This flag is useful for any kind of memory with elevated permissions, or where there can be related permissions changes on the directmap. Today this is RO+X and RO memory.

Although this enables directly vfreeing non-writeable memory now, non-writable memory cannot be freed in an interrupt because the allocation itself is used as a node on deferred free list. So when RO memory needs to be freed in an interrupt the code doing the vfree needs to have its own work queue, as was the case before the deferred vfree list was added to vmalloc.

For architectures with set_direct_map_ implementations this whole operation can be done with one TLB flush when centralized like this. For others with directmap permissions, currently only arm64, a backup method using set_memory functions is used to reset the directmap. When arm64 adds set_direct_map_ functions, this backup can be removed.

When the TLB is flushed to both remove TLB entries for the vmalloc range mapping and the direct map permissions, the lazy purge operation could be done to try to save a TLB flush later. However today vm_unmap_aliases could flush a TLB range that does not include the directmap. So a helper is added with extra parameters that can allow both the vmalloc address and the direct mapping to be flushed during this operation. The behavior of the normal vm_unmap_aliases function is unchanged.

Suggested-by: Dave Hansen
Suggested-by: Andy Lutomirski
Suggested-by: Will Deacon
Change-Id: I914f605f3601bde21f194069dd31e658c9a22fcd
Signed-off-by: Rick Edgecombe
Signed-off-by: Peter Zijlstra (Intel)
Cc: Borislav Petkov
Cc: H. Peter Anvin
Cc: Linus Torvalds
Cc: Nadav Amit
Cc: Rik van Riel
Cc: Thomas Gleixner
Link: https://lkml.kernel.org/r/20190426001143.4983-17-namit@vmware.com
Signed-off-by: Ingo Molnar
---
 include/linux/vmalloc.h | 15 ++++
 mm/vmalloc.c | 113 +++++++++++++++++++++++++++++++++-------
 2 files changed, 109 insertions(+), 19 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index e40bf8b2a83a..53cad7dca9bd 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -23,6 +23,11 @@ struct notifier_block; /* in notifier.h */
 #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
 #define VM_LOWMEM 0x00000100 /* Tracking of direct mapped lowmem */
+/*
+ * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with
+ * vfree_atomic().
+ */
+#define VM_FLUSH_RESET_PERMS 0x00000200 /* Reset direct map and flush TLB on unmap */
 /* bits [20..32] reserved for arch specific ioremap internals */

 /*
@@ -147,6 +152,13 @@ extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
 				    pgprot_t prot, struct page **pages);
 extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
 extern void unmap_kernel_range(unsigned long addr, unsigned long size);
+static inline void set_vm_flush_reset_perms(void *addr)
+{
+	struct vm_struct *vm = find_vm_area(addr);
+
+	if (vm)
+		vm->flags |= VM_FLUSH_RESET_PERMS;
+}
 #else
 static inline int
 map_kernel_range_noflush(unsigned long start, unsigned long size,
@@ -162,6 +174,9 @@ static inline void unmap_kernel_range(unsigned long addr, unsigned long size)
 {
 }
+static inline void set_vm_flush_reset_perms(void *addr)
+{
+}
 #endif

 /* Allocate/destroy a 'vmalloc' VM area.
 */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b013740c8743..19fb83b1f6b4 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1130,24 +1131,9 @@ static void vb_free(const void *addr, unsigned long size)
 	spin_unlock(&vb->lock);
 }

-/**
- * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
- *
- * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
- * to amortize TLB flushing overheads. What this means is that any page you
- * have now, may, in a former life, have been mapped into kernel virtual
- * address by the vmap layer and so there might be some CPUs with TLB entries
- * still referencing that page (additional to the regular 1:1 kernel mapping).
- *
- * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
- * be sure that none of the pages we have control over will have any aliases
- * from the vmap layer.
- */
-void vm_unmap_aliases(void)
+static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
 {
-	unsigned long start = ULONG_MAX, end = 0;
 	int cpu;
-	int flush = 0;

 	if (unlikely(!vmap_initialized))
 		return;
@@ -1184,6 +1170,27 @@ void vm_unmap_aliases(void)
 		flush_tlb_kernel_range(start, end);
 	mutex_unlock(&vmap_purge_lock);
 }
+
+/**
+ * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
+ *
+ * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
+ * to amortize TLB flushing overheads. What this means is that any page you
+ * have now, may, in a former life, have been mapped into kernel virtual
+ * address by the vmap layer and so there might be some CPUs with TLB entries
+ * still referencing that page (additional to the regular 1:1 kernel mapping).
+ *
+ * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
+ * be sure that none of the pages we have control over will have any aliases
+ * from the vmap layer.
+ */
+void vm_unmap_aliases(void)
+{
+	unsigned long start = ULONG_MAX, end = 0;
+	int flush = 0;
+
+	_vm_unmap_aliases(start, end, flush);
+}
 EXPORT_SYMBOL_GPL(vm_unmap_aliases);

 /**
@@ -1608,6 +1615,72 @@ struct vm_struct *remove_vm_area(const void *addr)
 	return NULL;
 }

+static inline void set_area_direct_map(const struct vm_struct *area,
+				       int (*set_direct_map)(struct page *page))
+{
+	int i;
+
+	for (i = 0; i < area->nr_pages; i++)
+		if (page_address(area->pages[i]))
+			set_direct_map(area->pages[i]);
+}
+
+/* Handle removing and resetting vm mappings related to the vm_struct. */
+static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
+{
+	unsigned long addr = (unsigned long)area->addr;
+	unsigned long start = ULONG_MAX, end = 0;
+	int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
+	int i;
+
+	/*
+	 * The below block can be removed when all architectures that have
+	 * direct map permissions also have set_direct_map_() implementations.
+	 * This is concerned with resetting the direct map of any vm alias with
+	 * execute permissions, without leaving a RW+X window.
+	 */
+	if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
+		set_memory_nx(addr, area->nr_pages);
+		set_memory_rw(addr, area->nr_pages);
+	}
+
+	remove_vm_area(area->addr);
+
+	/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
+	if (!flush_reset)
+		return;
+
+	/*
+	 * If not deallocating pages, just do the flush of the VM area and
+	 * return.
+ */ + if (!deallocate_pages) { + vm_unmap_aliases(); + return; + } + + /* + * If execution gets here, flush the vm mapping and reset the direct + * map. Find the start and end range of the direct mappings to make sure + * the vm_unmap_aliases() flush includes the direct map. + */ + for (i = 0; i < area->nr_pages; i++) { + if (page_address(area->pages[i])) { + start = min(addr, start); + end = max(addr, end); + } + } + + /* + * Set direct map to something invalid so that it won't be cached if + * there are any accesses after the TLB flush, then flush the TLB and + * reset the direct map permissions to the default. + */ + set_area_direct_map(area, set_direct_map_invalid_noflush); + _vm_unmap_aliases(start, end, 1); + set_area_direct_map(area, set_direct_map_default_noflush); +} + static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; @@ -1629,7 +1702,8 @@ static void __vunmap(const void *addr, int deallocate_pages) debug_check_no_locks_freed(addr, get_vm_area_size(area)); debug_check_no_obj_freed(addr, get_vm_area_size(area)); - remove_vm_area(addr); + vm_remove_mappings(area, deallocate_pages); + if (deallocate_pages) { int i; @@ -2050,8 +2124,9 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, - NUMA_NO_NODE, __builtin_return_address(0)); + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) From 22a952959e79b30236855acaa1c534182776a9a8 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:38 -0700 Subject: [PATCH 1123/1640] UPSTREAM: bpf: Use vmalloc special flag Use new flag VM_FLUSH_RESET_PERMS for handling freeing of special permissioned memory in vmalloc and remove places where memory was set RW before freeing which is no longer needed. Don't track if the memory is RO anymore because it is now tracked in vmalloc. Change-Id: I8629e70ea37c67bd56beca35ef9bc2f65846e4ff Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Daniel Borkmann Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nadav Amit Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-19-namit@vmware.com Signed-off-by: Ingo Molnar --- include/linux/filter.h | 17 +++-------------- kernel/bpf/core.c | 1 - 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 52221169b1a0..3d67f3a850f1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -538,7 +539,6 @@ struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ jit_requested:1,/* archs need to JIT the prog */ - undo_set_mem:1, /* Passed set_memory_ro() checkpoint */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? 
*/ @@ -823,26 +823,16 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { - fp->undo_set_mem = 1; + set_vm_flush_reset_perms(fp); set_memory_ro((unsigned long)fp, fp->pages); } -static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) -{ - if (fp->undo_set_mem) - set_memory_rw((unsigned long)fp, fp->pages); -} - static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { + set_vm_flush_reset_perms(hdr); set_memory_ro((unsigned long)hdr, hdr->pages); } -static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) -{ - set_memory_rw((unsigned long)hdr, hdr->pages); -} - static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) { @@ -878,7 +868,6 @@ void __bpf_prog_free(struct bpf_prog *fp); static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { - bpf_prog_unlock_ro(fp); __bpf_prog_free(fp); } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d8537a5d45bf..7dc25281b39a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -872,7 +872,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - bpf_jit_binary_unlock_ro(hdr); bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); From a775f89110ee96b9f8178450a3a112ac8e2d24fe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 15 Apr 2019 22:31:29 -0400 Subject: [PATCH 1124/1640] UPSTREAM: bpf: switch to ->free_inode() Acked-by: Alexei Starovoitov Acked-by: Song Liu Change-Id: I4247c4f883c9221b576d4449127c2f86581dd9b3 Signed-off-by: Al Viro --- kernel/bpf/inode.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 11fade89c1f3..d2d305a29b30 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -567,9 +567,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void bpf_destroy_inode_deferred(struct rcu_head *head) +static void bpf_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); enum bpf_type type; if (S_ISLNK(inode->i_mode)) @@ -579,16 +578,11 @@ static void bpf_destroy_inode_deferred(struct rcu_head *head) free_inode_nonrcu(inode); } -static void bpf_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred); -} - static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, - .destroy_inode = bpf_destroy_inode, + .free_inode = bpf_free_inode, }; enum { From a9ae1c3e72562ffdff3fb0c89b0c2d9d0f68e43b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 11 May 2019 03:03:09 +0200 Subject: [PATCH 1125/1640] UPSTREAM: bpf: fix out of bounds backwards jmps due to dead code removal systemtap folks reported the following splat recently: [ 7790.862212] WARNING: CPU: 3 PID: 26759 at arch/x86/kernel/kprobes/core.c:1022 kprobe_fault_handler+0xec/0xf0 [...] [ 7790.864113] CPU: 3 PID: 26759 Comm: sshd Not tainted 5.1.0-0.rc7.git1.1.fc31.x86_64 #1 [ 7790.864198] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS[...] [ 7790.864314] RIP: 0010:kprobe_fault_handler+0xec/0xf0 [ 7790.864375] Code: 48 8b 50 [...] 
[ 7790.864714] RSP: 0018:ffffc06800bdbb48 EFLAGS: 00010082 [ 7790.864812] RAX: ffff9e2b75a16320 RBX: 0000000000000000 RCX: 0000000000000000 [ 7790.865306] RDX: ffffffffffffffff RSI: 000000000000000e RDI: ffffc06800bdbbf8 [ 7790.865514] RBP: ffffc06800bdbbf8 R08: 0000000000000000 R09: 0000000000000000 [ 7790.865960] R10: 0000000000000000 R11: 0000000000000000 R12: ffffc06800bdbbf8 [ 7790.866037] R13: ffff9e2ab56a0418 R14: ffff9e2b6d0bb400 R15: ffff9e2b6d268000 [ 7790.866114] FS: 00007fde49937d80(0000) GS:ffff9e2b75a00000(0000) knlGS:0000000000000000 [ 7790.866193] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 7790.866318] CR2: 0000000000000000 CR3: 000000012f312000 CR4: 00000000000006e0 [ 7790.866419] Call Trace: [ 7790.866677] do_user_addr_fault+0x64/0x480 [ 7790.867513] do_page_fault+0x33/0x210 [ 7790.868002] async_page_fault+0x1e/0x30 [ 7790.868071] RIP: 0010: (null) [ 7790.868144] Code: Bad RIP value. [ 7790.868229] RSP: 0018:ffffc06800bdbca8 EFLAGS: 00010282 [ 7790.868362] RAX: ffff9e2b598b60f8 RBX: ffffc06800bdbe48 RCX: 0000000000000004 [ 7790.868629] RDX: 0000000000000004 RSI: ffffc06800bdbc6c RDI: ffff9e2b598b60f0 [ 7790.868834] RBP: ffffc06800bdbcf8 R08: 0000000000000000 R09: 0000000000000004 [ 7790.870432] R10: 00000000ff6f7a03 R11: 0000000000000000 R12: 0000000000000001 [ 7790.871859] R13: ffffc06800bdbcb8 R14: 0000000000000000 R15: ffff9e2acd0a5310 [ 7790.873455] ? vfs_read+0x5/0x170 [ 7790.874639] ? vfs_read+0x1/0x170 [ 7790.875834] ? trace_call_bpf+0xf6/0x260 [ 7790.877044] ? vfs_read+0x1/0x170 [ 7790.878208] ? vfs_read+0x5/0x170 [ 7790.879345] ? kprobe_perf_func+0x233/0x260 [ 7790.880503] ? vfs_read+0x1/0x170 [ 7790.881632] ? vfs_read+0x5/0x170 [ 7790.882751] ? kprobe_ftrace_handler+0x92/0xf0 [ 7790.883926] ? __vfs_read+0x30/0x30 [ 7790.885050] ? ftrace_ops_assist_func+0x94/0x100 [ 7790.886183] ? vfs_read+0x1/0x170 [ 7790.887283] ? vfs_read+0x5/0x170 [ 7790.888348] ? ksys_read+0x5a/0xe0 [ 7790.889389] ? do_syscall_64+0x5c/0xa0 [ 7790.890401] ? entry_SYSCALL_64_after_hwframe+0x49/0xbe After some debugging, turns out that the logic in 2cbd95a5c4fb ("bpf: change parameters of call/branch offset adjustment") has a bug that is exposed after 52875a04f4b2 ("bpf: verifier: remove dead code") in that we miss some of the jump offset adjustments after code patching when we remove dead code, more concretely, upon backward jump spanning over the area that is being removed. BPF insns of a case that was hit pre 52875a04f4b2: [...] 676: (85) call bpf_perf_event_output#-47616 677: (05) goto pc-636 678: (62) *(u32 *)(r10 -64) = 0 679: (bf) r7 = r10 680: (07) r7 += -64 681: (05) goto pc-44 682: (05) goto pc-1 683: (05) goto pc-1 BPF insns afterwards: [...] 618: (85) call bpf_perf_event_output#-47616 619: (05) goto pc-638 620: (62) *(u32 *)(r10 -64) = 0 621: (bf) r7 = r10 622: (07) r7 += -64 623: (05) goto pc-44 To illustrate the bug, situation looks as follows: ____ 0 | | <-- foo: [...] 1 |____| 2 |____| <-- pos / end_new ^ 3 | | | 4 | | | len 5 |____| | (remove region) 6 | | <-- end_old v 7 | | 8 | | <-- curr (jmp foo) 9 |____| The condition curr >= end_new && curr + off + 1 < end_new in the branch delta adjustments is never hit because curr + off + 1 < end_new is compared as unsigned and therefore curr + off + 1 > end_new in unsigned realm as curr + off + 1 becomes negative since the insns are memmove()'d before the offset adjustments. Correct BPF insns after this fix: [...] 
618: (85) call bpf_perf_event_output#-47216 619: (05) goto pc-578 620: (62) *(u32 *)(r10 -64) = 0 621: (bf) r7 = r10 622: (07) r7 += -64 623: (05) goto pc-44 Note that unprivileged case is not affected from this. Fixes: 52875a04f4b2 ("bpf: verifier: remove dead code") Fixes: 2cbd95a5c4fb ("bpf: change parameters of call/branch offset adjustment") Reported-by: Frank Ch. Eigler Change-Id: Ib9be41cace8eb8ab441606e8706af98221dac3fe Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7dc25281b39a..12b94a459f60 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -346,7 +346,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) } static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, - s32 end_new, u32 curr, const bool probe_pass) + s32 end_new, s32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; s32 delta = end_new - end_old; @@ -364,7 +364,7 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, } static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, - s32 end_new, u32 curr, const bool probe_pass) + s32 end_new, s32 curr, const bool probe_pass) { const s32 off_min = S16_MIN, off_max = S16_MAX; s32 delta = end_new - end_old; From f945ebb8608d66a44d0f26ed1f41a997019dd04f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:14:19 -0700 Subject: [PATCH 1126/1640] UPSTREAM: bpf: bump jmp sequence limit The limit of 1024 subsequent jumps was causing otherwise valid programs to be rejected. Bump it to 8192 and make the error more verbose. Change-Id: I700c94e2234d328747e2bcba498147c02252c740 Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 65044ce53926..f58d7e76dd07 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -176,7 +176,7 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 #define BPF_COMPLEXITY_LIMIT_STATES 64 #define BPF_MAP_PTR_UNPRIV 1UL @@ -781,8 +781,9 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, if (err) goto err; elem->st.speculative |= speculative; - if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose(env, "BPF program is too complex\n"); + if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { + verbose(env, "The sequence of %d jumps is too complex.\n", + env->stack_size); goto err; } return &elem->st; From b36e30c0800fb691307afea839e0c0502322f7e1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:05 -0700 Subject: [PATCH 1127/1640] UPSTREAM: bpf: cleanup explored_states clean up explored_states to prep for introduction of hashtable No functional changes. 
Change-Id: Ieb3453277e7aa4efa22483be5781462ca33ab648 Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f58d7e76dd07..deb43c7b8274 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5660,6 +5660,17 @@ enum { }; #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) +static struct bpf_verifier_state_list **explored_state( + struct bpf_verifier_env *env, + int idx) +{ + return &env->explored_states[idx]; +} + +static void init_explored_state(struct bpf_verifier_env *env, int idx) +{ + env->explored_states[idx] = STATE_LIST_MARK; +} /* t, w, e - match pseudo-code above: * t - index of current instruction @@ -5685,7 +5696,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) if (e == BRANCH) /* mark branch target for state pruning */ - env->explored_states[w] = STATE_LIST_MARK; + init_explored_state(env, w); if (insn_state[w] == 0) { /* tree-edge */ @@ -5753,9 +5764,9 @@ peek_stack: else if (ret < 0) goto err_free; if (t + 1 < insn_cnt) - env->explored_states[t + 1] = STATE_LIST_MARK; + init_explored_state(env, t + 1); if (insns[t].src_reg == BPF_PSEUDO_CALL) { - env->explored_states[t] = STATE_LIST_MARK; + init_explored_state(env, t); ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); if (ret == 1) goto peek_stack; @@ -5778,10 +5789,10 @@ peek_stack: * after every call and jump */ if (t + 1 < insn_cnt) - env->explored_states[t + 1] = STATE_LIST_MARK; + init_explored_state(env, t + 1); } else { /* conditional jump with two edges */ - env->explored_states[t] = STATE_LIST_MARK; + init_explored_state(env, t); ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret == 1) goto peek_stack; @@ -6222,7 +6233,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state_list *sl; int i; - sl = env->explored_states[insn]; + sl = *explored_state(env, insn); if (!sl) return; @@ -6573,7 +6584,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - pprev = &env->explored_states[insn_idx]; + pprev = explored_state(env, insn_idx); sl = *pprev; if (!sl) @@ -6660,8 +6671,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) kfree(new_sl); return err; } - new_sl->next = env->explored_states[insn_idx]; - env->explored_states[insn_idx] = new_sl; + new_sl->next = *explored_state(env, insn_idx); + *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all * registers connected. Only r6 - r9 of the callers are alive (pushed * to the stack implicitly by JITs) so in callers' frames connect just From 4854bb718c1952229c9a87a5178df199959da8cc Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:06 -0700 Subject: [PATCH 1128/1640] UPSTREAM: bpf: split explored_states split explored_states into prune_point boolean mark and link list of explored states. This removes STATE_LIST_MARK hack and allows marks to be separate from states. 
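As an illustrative sketch (using the names from the diff below), the
representation changes like this:

	/* before: one array doubled as prune mark and state list, with a
	 * sentinel standing in for "prune point, but no states yet":
	 */
	env->explored_states[idx] = STATE_LIST_MARK;

	/* after: the mark lives in the per-insn aux data and the list is
	 * a plain NULL-terminated chain:
	 */
	env->insn_aux_data[idx].prune_point = true;
	sl = *explored_state(env, idx);	/* NULL when nothing stored yet */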
Change-Id: I974649a944b309ff4459b3b40f6485e18b69128e Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 31 +++++++++++++------------------ 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1bedaea66284..868197bda363 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -241,6 +241,7 @@ struct bpf_insn_aux_data { bool seen; /* this insn was processed by the verifier */ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ u8 alu_state; /* used in combination with alu_limit */ + bool prune_point; unsigned int orig_idx; /* original instruction index */ }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index deb43c7b8274..2c8940bbb4bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5659,7 +5659,6 @@ enum { BRANCH = 2, }; -#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) static struct bpf_verifier_state_list **explored_state( struct bpf_verifier_env *env, int idx) @@ -5669,7 +5668,7 @@ static struct bpf_verifier_state_list **explored_state( static void init_explored_state(struct bpf_verifier_env *env, int idx) { - env->explored_states[idx] = STATE_LIST_MARK; + env->insn_aux_data[idx].prune_point = true; } /* t, w, e - match pseudo-code above: @@ -6234,10 +6233,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, int i; sl = *explored_state(env, insn); - if (!sl) - return; - - while (sl != STATE_LIST_MARK) { + while (sl) { if (sl->state.curframe != cur->curframe) goto next; for (i = 0; i <= cur->curframe; i++) @@ -6584,18 +6580,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - pprev = explored_state(env, insn_idx); - sl = *pprev; - - if (!sl) + if (!env->insn_aux_data[insn_idx].prune_point) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here */ return 0; + pprev = explored_state(env, insn_idx); + sl = *pprev; + clean_live_states(env, insn_idx, cur); - while (sl != STATE_LIST_MARK) { + while (sl) { if (states_equal(env, &sl->state, cur)) { sl->hit_cnt++; /* reached equivalent register/stack state, @@ -8359,13 +8355,12 @@ static void free_states(struct bpf_verifier_env *env) for (i = 0; i < env->prog->len; i++) { sl = env->explored_states[i]; - if (sl) - while (sl != STATE_LIST_MARK) { - sln = sl->next; - free_verifier_state(&sl->state, false); - kfree(sl); - sl = sln; - } + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } } kvfree(env->explored_states); From fd77cbbab1cb4ce0d64103f0ddf670a1c4075b6e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:07 -0700 Subject: [PATCH 1129/1640] UPSTREAM: bpf: convert explored_states to hash table All prune points inside a callee bpf function most likely will have different callsites. For example, if function foo() is called from two callsites the half of explored states in all prune points in foo() will be useless for subsequent walking of one of those callsites. Fortunately explored_states pruning heuristics keeps the number of states per prune point small, but walking these states is still a waste of cpu time when the callsite of the current state is different from the callsite of the explored state. 
To improve pruning logic convert explored_states into a hash table and
use simple insn_idx ^ callsite hash to select hash bucket.
This optimization has no effect on programs without bpf2bpf calls
and drastically improves programs with calls.
In the latter case it reduces total memory consumption in 1M scale tests
by almost 3 times (peak_states drops from 5752 to 2016).

Care should be taken when comparing the states for equivalency.
Since the same hash bucket can now contain states with different indices
the insn_idx has to be part of verifier_state and compared.

Different hash table sizes and different hash functions were explored,
but the results were not significantly better vs this patch.
They can be improved in the future.

Hit/miss heuristic is not counting index miscompare as a miss.
Otherwise verifier stats become unstable when experimenting
with different hash functions.

Change-Id: I5dc90d4d1239b246593fcbbcaa94c9a79b6378cb
Signed-off-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/linux/bpf_verifier.h | 1 +
 kernel/bpf/verifier.c | 23 ++++++++++++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 868197bda363..9c90c2395445 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -194,6 +194,7 @@ struct bpf_id_pair {
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
+	u32 insn_idx;
 	u32 curframe;
 	u32 active_spin_lock;
 	bool speculative;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2c8940bbb4bf..dc7d8e3ce8bd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5659,11 +5659,19 @@ enum {
 	BRANCH = 2,
 };

+static u32 state_htab_size(struct bpf_verifier_env *env)
+{
+	return env->prog->len;
+}
+
 static struct bpf_verifier_state_list **explored_state(
 					struct bpf_verifier_env *env,
 					int idx)
 {
-	return &env->explored_states[idx];
+	struct bpf_verifier_state *cur = env->cur_state;
+	struct bpf_func_state *state = cur->frame[cur->curframe];
+
+	return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
 }

 static void init_explored_state(struct bpf_verifier_env *env, int idx)
@@ -6234,7 +6242,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
 	sl = *explored_state(env, insn);
 	while (sl) {
-		if (sl->state.curframe != cur->curframe)
+		if (sl->state.insn_idx != insn ||
+		    sl->state.curframe != cur->curframe)
 			goto next;
 		for (i = 0; i <= cur->curframe; i++)
 			if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
@@ -6592,6 +6601,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	clean_live_states(env, insn_idx, cur);

 	while (sl) {
+		states_cnt++;
+		if (sl->state.insn_idx != insn_idx)
+			goto next;
 		if (states_equal(env, &sl->state, cur)) {
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -6609,7 +6621,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				return err;
 			return 1;
 		}
-		states_cnt++;
 		sl->miss_cnt++;
 		/* heuristic to determine whether this state is beneficial
 		 * to keep checking from state equivalence point of view.
@@ -6636,6 +6647,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) sl = *pprev; continue; } +next: pprev = &sl->next; sl = *pprev; } @@ -6667,6 +6679,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) kfree(new_sl); return err; } + new->insn_idx = insn_idx; new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -8352,7 +8365,7 @@ static void free_states(struct bpf_verifier_env *env) if (!env->explored_states) return; - for (i = 0; i < env->prog->len; i++) { + for (i = 0; i < state_htab_size(env); i++) { sl = env->explored_states[i]; while (sl) { @@ -8461,7 +8474,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - env->explored_states = kvcalloc(env->prog->len, + env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; From aac495c27873f6ee6d996cd9ecfa7f670f71e7a8 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:12 +0100 Subject: [PATCH 1130/1640] BACKPORT: bpf: verifier: mark verified-insn with sub-register zext flag eBPF ISA specification requires high 32-bit cleared when low 32-bit sub-register is written. This applies to destination register of ALU32 etc. JIT back-ends must guarantee this semantic when doing code-gen. x86_64 and AArch64 ISA has the same semantics, so the corresponding JIT back-end doesn't need to do extra work. However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches (PowerPC, SPARC etc) need to do explicit zero extension to meet this requirement, otherwise code like the following will fail. u64_value = (u64) u32_value ... other uses of u64_value This is because compiler could exploit the semantic described above and save those zero extensions for extending u32_value to u64_value, these JIT back-ends are expected to guarantee this through inserting extra zero extensions which however could be a significant increase on the code size. Some benchmarks show there could be ~40% sub-register writes out of total insns, meaning at least ~40% extra code-gen. One observation is these extra zero extensions are not always necessary. Take above code snippet for example, it is possible u32_value will never be casted into a u64, the value of high 32-bit of u32_value then could be ignored and extra zero extension could be eliminated. This patch implements this idea, insns defining sub-registers will be marked when the high 32-bit of the defined sub-register matters. For those unmarked insns, it is safe to eliminate high 32-bit clearnace for them. Algo: - Split read flags into READ32 and READ64. - Record index of insn that does sub-register write. Keep the index inside reg state and update it during verifier insn walking. - A full register read on a sub-register marks its definition insn as needing zero extension on dst register. A new sub-register write overrides the old one. - When propagating read64 during path pruning, also mark any insn defining a sub-register that is read in the pruned path as full-register. 
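A condensed userspace sketch of that scheme (illustrative types and names, not the kernel structures):

  #include <stdbool.h>

  #define DEF_NOT_SUBREG 0

  struct reg {
          int subreg_def;         /* 1-based idx of defining insn, or 0 */
  };

  static bool zext_dst[65536];    /* per-insn: dst needs zero extension */

  static void on_write(struct reg *r, int insn_idx, bool subreg_write)
  {
          /* a new write overrides any earlier sub-register def */
          r->subreg_def = subreg_write ? insn_idx + 1 : DEF_NOT_SUBREG;
  }

  static void on_read(struct reg *r, bool read_64bit)
  {
          /* a full 64-bit read makes the high 32 bits of the defining
           * insn's result visible, so that insn must zero extend
           */
          if (read_64bit && r->subreg_def != DEF_NOT_SUBREG)
                  zext_dst[r->subreg_def - 1] = true;
  }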
Reviewed-by: Jakub Kicinski Change-Id: I620b88a97ad0432c0009aef177a69378078cf565 Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 14 ++- kernel/bpf/verifier.c | 173 ++++++++++++++++++++++++++++++++--- 2 files changed, 170 insertions(+), 17 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9c90c2395445..e7ea144d4f79 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -36,9 +36,11 @@ */ enum bpf_reg_liveness { REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ - REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */ - REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */ - REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */ + REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ + REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ + REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, + REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ + REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ }; struct bpf_reg_state { @@ -131,6 +133,11 @@ struct bpf_reg_state { * pointing to bpf_func_state. */ u32 frameno; + /* Tracks subreg definition. The stored value is the insn_idx of the + * writing insn. This is safe because subreg_def is used before any insn + * patching which only happens after main verification finished. + */ + s32 subreg_def; enum bpf_reg_liveness live; }; @@ -241,6 +248,7 @@ struct bpf_insn_aux_data { int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ bool seen; /* this insn was processed by the verifier */ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ + bool zext_dst; /* this insn zero extends dst reg */ u8 alu_state; /* used in combination with alu_limit */ bool prune_point; unsigned int orig_idx; /* original instruction index */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dc7d8e3ce8bd..7c917e33d452 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -981,6 +981,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(regs + regno); } +#define DEF_NOT_SUBREG (0) static void init_reg_state(struct bpf_verifier_env *env, struct bpf_func_state *state) { @@ -991,6 +992,7 @@ static void init_reg_state(struct bpf_verifier_env *env, mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; regs[i].parent = NULL; + regs[i].subreg_def = DEF_NOT_SUBREG; } /* frame pointer */ @@ -1136,7 +1138,7 @@ next: */ static int mark_reg_read(struct bpf_verifier_env *env, const struct bpf_reg_state *state, - struct bpf_reg_state *parent) + struct bpf_reg_state *parent, u8 flag) { bool writes = parent == state->parent; /* Observe write marks */ int cnt = 0; @@ -1151,17 +1153,26 @@ static int mark_reg_read(struct bpf_verifier_env *env, parent->var_off.value, parent->off); return -EFAULT; } - if (parent->live & REG_LIVE_READ) + /* The first condition is more likely to be true than the + * second, checked it first. + */ + if ((parent->live & REG_LIVE_READ) == flag || + parent->live & REG_LIVE_READ64) /* The parentage chain never changes and * this parent was already marked as LIVE_READ. * There is no need to keep walking the chain again and * keep re-marking all parents as LIVE_READ. * This case happens when the same register is read * multiple times without writes into it in-between. 
+ * Also, if parent has the stronger REG_LIVE_READ64 set, + * then no need to set the weak REG_LIVE_READ32. */ break; /* ... then we depend on parent's value */ - parent->live |= REG_LIVE_READ; + parent->live |= flag; + /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ + if (flag == REG_LIVE_READ64) + parent->live &= ~REG_LIVE_READ32; state = parent; parent = state->parent; writes = true; @@ -1173,12 +1184,111 @@ static int mark_reg_read(struct bpf_verifier_env *env, return 0; } +/* This function is supposed to be used by the following 32-bit optimization + * code only. It returns TRUE if the source or destination register operates + * on 64-bit, otherwise return FALSE. + */ +static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, + u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) +{ + u8 code, class, op; + + code = insn->code; + class = BPF_CLASS(code); + op = BPF_OP(code); + if (class == BPF_JMP) { + /* BPF_EXIT for "main" will reach here. Return TRUE + * conservatively. + */ + if (op == BPF_EXIT) + return true; + if (op == BPF_CALL) { + /* BPF to BPF call will reach here because of marking + * caller saved clobber with DST_OP_NO_MARK for which we + * don't care the register def because they are anyway + * marked as NOT_INIT already. + */ + if (insn->src_reg == BPF_PSEUDO_CALL) + return false; + /* Helper call will reach here because of arg type + * check, conservatively return TRUE. + */ + if (t == SRC_OP) + return true; + + return false; + } + } + + if (class == BPF_ALU64 || class == BPF_JMP || + /* BPF_END always use BPF_ALU class. */ + (class == BPF_ALU && op == BPF_END && insn->imm == 64)) + return true; + + if (class == BPF_ALU || class == BPF_JMP32) + return false; + + if (class == BPF_LDX) { + if (t != SRC_OP) + return BPF_SIZE(code) == BPF_DW; + /* LDX source must be ptr. */ + return true; + } + + if (class == BPF_STX) { + if (reg->type != SCALAR_VALUE) + return true; + return BPF_SIZE(code) == BPF_DW; + } + + if (class == BPF_LD) { + u8 mode = BPF_MODE(code); + + /* LD_IMM64 */ + if (mode == BPF_IMM) + return true; + + /* Both LD_IND and LD_ABS return 32-bit data. */ + if (t != SRC_OP) + return false; + + /* Implicit ctx ptr. */ + if (regno == BPF_REG_6) + return true; + + /* Explicit source could be any width. */ + return true; + } + + if (class == BPF_ST) + /* The only source register for BPF_ST is a ptr. */ + return true; + + /* Conservatively return true at default. */ + return true; +} + +static void mark_insn_zext(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) +{ + s32 def_idx = reg->subreg_def; + + if (def_idx == DEF_NOT_SUBREG) + return; + + env->insn_aux_data[def_idx - 1].zext_dst = true; + /* The dst will be zero extended, so won't be sub-register anymore. 
*/ + reg->subreg_def = DEF_NOT_SUBREG; +} + static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; struct bpf_reg_state *reg, *regs = state->regs; + bool rw64; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -1186,6 +1296,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, } reg = ®s[regno]; + rw64 = is_reg64(env, insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (reg->type == NOT_INIT) { @@ -1196,7 +1307,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, if (regno == BPF_REG_FP) return 0; - return mark_reg_read(env, reg, reg->parent); + if (rw64) + mark_insn_zext(env, reg); + + return mark_reg_read(env, reg, reg->parent, + rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -1204,6 +1319,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return -EACCES; } reg->live |= REG_LIVE_WRITTEN; + reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) mark_reg_unknown(env, regs, regno); } @@ -1376,7 +1492,7 @@ static int check_stack_read(struct bpf_verifier_env *env, mark_reg_unknown(env, state->regs, value_regno); state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - mark_reg_read(env, reg, reg->parent); + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); return 0; } for (i = 1; i < BPF_REG_SIZE; i++) { @@ -1395,7 +1511,7 @@ static int check_stack_read(struct bpf_verifier_env *env, */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - mark_reg_read(env, reg, reg->parent); + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { int zeros = 0; @@ -1410,7 +1526,7 @@ static int check_stack_read(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_reg_read(env, reg, reg->parent); + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, @@ -2119,6 +2235,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); if (reg_type_may_be_null(reg_type)) regs[value_regno].id = ++env->id_gen; + /* A load of ctx field could have different + * actual load size with the one encoded in the + * insn. When the dst is PTR, it is for sure not + * a sub-register. + */ + regs[value_regno].subreg_def = DEF_NOT_SUBREG; } regs[value_regno].type = reg_type; } @@ -2386,7 +2508,8 @@ mark: * the whole slot to be marked as 'read' */ mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent); + state->stack[spi].spilled_ptr.parent, + REG_LIVE_READ64); } return update_stack_depth(env, state, min_off); } @@ -3372,6 +3495,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } + /* helper call returns 64-bit value. 
*/ + regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ @@ -4460,6 +4586,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { @@ -4470,6 +4597,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (src_reg->type == SCALAR_VALUE) { *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = env->insn_idx + 1; } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -5572,6 +5700,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * Already marked as written above. */ mark_reg_unknown(env, regs, BPF_REG_0); + /* ld_abs load up to 32-bit skb data. */ + regs[BPF_REG_0].subreg_def = env->insn_idx + 1; return 0; } @@ -6517,20 +6647,33 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } +/* Return 0 if no propagation happened. Return negative error code if error + * happened. Otherwise, return the propagated bit. + */ static int propagate_liveness_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct bpf_reg_state *parent_reg) { + u8 parent_flag = parent_reg->live & REG_LIVE_READ; + u8 flag = reg->live & REG_LIVE_READ; int err; - if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) + /* When comes here, read flags of PARENT_REG or REG could be any of + * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need + * of propagation if PARENT_REG has strongest REG_LIVE_READ64. + */ + if (parent_flag == REG_LIVE_READ64 || + /* Or if there is no read flag from REG. */ + !flag || + /* Or if the read flag from REG is the same as PARENT_REG. */ + parent_flag == flag) return 0; - err = mark_reg_read(env, reg, parent_reg); + err = mark_reg_read(env, reg, parent_reg, flag); if (err) return err; - return 0; + return flag; } /* A write screens off any subsequent reads; but write marks come from the @@ -6564,8 +6707,10 @@ static int propagate_liveness(struct bpf_verifier_env *env, for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { err = propagate_liveness_reg(env, &state_reg[i], &parent_reg[i]); - if (err) + if (err < 0) return err; + if (err == REG_LIVE_READ64) + mark_insn_zext(env, &parent_reg[i]); } /* Propagate stack slots. */ @@ -6575,11 +6720,11 @@ static int propagate_liveness(struct bpf_verifier_env *env, state_reg = &state->stack[i].spilled_ptr; err = propagate_liveness_reg(env, state_reg, parent_reg); - if (err) + if (err < 0) return err; } } - return err; + return 0; } static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) From b627a98851b84ab217dea28ba5c1d77dd81e794d Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:13 +0100 Subject: [PATCH 1131/1640] BACKPORT: bpf: verifier: mark patched-insn with sub-register zext flag Patched insns do not go through generic verification, therefore doesn't has zero extension information collected during insn walking. We don't bother analyze them at the moment, for any sub-register def comes from them, just conservatively mark it as needing zero extension. 
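The rule, condensed (a sketch only; insn_has_def32() is the helper added below): every insn in a patched range that explicitly defines a 32-bit sub-register gets marked, whether or not its high 32 bits are ever observed.

  for (i = off; i < off + cnt; i++)
          aux[i].zext_dst = insn_has_def32(env, insns + i);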
Change-Id: I4fb757fd4a883eebab7262732e395239a32c1c25 Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7c917e33d452..1cb0cd0399da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1268,6 +1268,24 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, return true; } +/* Return TRUE if INSN doesn't have explicit value define. */ +static bool insn_no_def(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + return (class == BPF_JMP || class == BPF_JMP32 || + class == BPF_STX || class == BPF_ST); +} + +/* Return TRUE if INSN has defined any 32-bit value explicitly. */ +static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + if (insn_no_def(insn)) + return false; + + return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP); +} + static void mark_insn_zext(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { @@ -7504,15 +7522,24 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying * [0, off) and [off, end) to new locations, so the patched range stays zero */ -static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, - u32 off, u32 cnt) +static int adjust_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_prog *new_prog, u32 off, u32 cnt) { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + struct bpf_insn *insn = new_prog->insnsi; bool old_seen = old_data[off].seen; + u32 prog_len; int i; + /* aux info at OFF always needs adjustment, no matter fast path + * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the + * original insn at old prog. + */ + old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); + if (cnt == 1) return 0; + prog_len = new_prog->len; new_data = vzalloc(array_size(prog_len, sizeof(struct bpf_insn_aux_data))); if (!new_data) @@ -7523,6 +7550,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, for (i = off; i < off + cnt - 1; i++) { /* Expand insni[off]'s seen count to the patched range. */ new_data[i].seen = old_seen; + new_data[i].zext_dst = insn_has_def32(env, insn + i); } env->insn_aux_data = new_data; vfree(old_data); @@ -7556,7 +7584,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of env->insn_aux_data[off].orig_idx); return NULL; } - if (adjust_insn_aux_data(env, new_prog->len, off, len)) + if (adjust_insn_aux_data(env, new_prog, off, len)) return NULL; adjust_subprog_starts(env, off, len); return new_prog; From 4562c80ca558537d4bdf2881581a689857162907 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:14 +0100 Subject: [PATCH 1132/1640] UPSTREAM: bpf: introduce new mov32 variant for doing explicit zero extension The encoding for this new variant is based on BPF_X format. "imm" field was 0 only, now it could be 1 which means doing zero extension unconditionally .code = BPF_ALU | BPF_MOV | BPF_X .dst_reg = DST .src_reg = SRC .imm = 1 We use this new form for doing zero extension for which verifier will guarantee SRC == DST. Implications on JIT back-ends when doing code-gen for BPF_ALU | BPF_MOV | BPF_X: 1. No change if hardware already does zero extension unconditionally for sub-register write. 2. 
Otherwise, when seeing imm == 1, just generate insns to clear high 32-bit. No need to generate insns for the move because when imm == 1, dst_reg is the same as src_reg at the moment. Interpreter doesn't need change as well. It is doing unconditionally zero extension for mov32 already. One helper macro BPF_ZEXT_REG is added to help creating zero extension insn using this new mov32 variant. One helper function insn_is_zext is added for checking one insn is an zero extension on dst. This will be widely used by a few JIT back-ends in later patches in this set. Change-Id: I8b04c1010e1415779076185eb634ae6ca4704b81 Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/filter.h b/include/linux/filter.h index 3d67f3a850f1..6c7bf1a44ce5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -189,6 +189,20 @@ struct ctl_table_header; .off = (insn).off, \ .imm = (insn).imm }) +/* Special form of mov32, used for doing explicit zero extension on dst. */ +#define BPF_ZEXT_REG(DST) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = DST, \ + .off = 0, \ + .imm = 1 }) + +static inline bool insn_is_zext(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1; +} + /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ #define BPF_LD_IMM64(DST, IMM) \ BPF_LD_IMM64_RAW(DST, 0, IMM) From 3028365514007163d4fa4e5e8c3fcd6bd77fe765 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:15 +0100 Subject: [PATCH 1133/1640] UPSTREAM: bpf: verifier: insert zero extension according to analysis result After previous patches, verifier will mark a insn if it really needs zero extension on dst_reg. It is then for back-ends to decide how to use such information to eliminate unnecessary zero extension code-gen during JIT compilation. One approach is verifier insert explicit zero extension for those insns that need zero extension in a generic way, JIT back-ends then do not generate zero extension for sub-register write at default. However, only those back-ends which do not have hardware zero extension want this optimization. Back-ends like x86_64 and AArch64 have hardware zero extension support that the insertion should be disabled. This patch introduces new target hook "bpf_jit_needs_zext" which returns false at default, meaning verifier zero extension insertion is disabled at default. A back-end could override this hook to return true if it doesn't have hardware support and want verifier insert zero extension explicitly. Offload targets do not use this native target hook, instead, they could get the optimization results using bpf_prog_offload_ops.finalize. NOTE: arches could have diversified features, it is possible for one arch to have hardware zero extension support for some sub-register write insns but not for all. For example, PowerPC, SPARC have zero extended loads, but not for alu32. So when verifier zero extension insertion enabled, these JIT back-ends need to peephole insns to remove those zero extension inserted for insn that actually has hardware zero extension support. The peephole could be as simple as looking the next insn, if it is a special zero extension insn then it is safe to eliminate it if the current insn has hardware zero extension support. 
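Sketched, such a peephole might look like the following inside a JIT's main loop (hw_zext_already() stands in for an arch-specific predicate; insn_is_zext() is the helper from the previous patch):

  /* if the hardware already zero extends the destination of this insn,
   * the verifier-inserted zext that follows is redundant
   */
  if (hw_zext_already(&insns[i]) &&
      i + 1 < len && insn_is_zext(&insns[i + 1]))
          i++;            /* consume the special mov32, emit nothing */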
Reviewed-by: Jakub Kicinski Change-Id: I29f26c1899fb2fc546148ea14b5f4115b19ade9b Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/filter.h | 1 + kernel/bpf/core.c | 9 +++++++++ kernel/bpf/verifier.c | 41 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4c177b2af7b7..1377b53657e8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -374,6 +374,7 @@ struct bpf_prog_aux { u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ + bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 6c7bf1a44ce5..b6fac88c5200 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -912,6 +912,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); +bool bpf_jit_needs_zext(void); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(const struct cred *cred) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 12b94a459f60..b3f57861f564 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2124,6 +2124,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func) return false; } +/* Return TRUE if the JIT backend wants verifier to enable sub-register usage + * analysis code and wants explicit zero extension inserted by verifier. + * Otherwise, return FALSE. + */ +bool __weak bpf_jit_needs_zext(void) +{ + return false; +} + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1cb0cd0399da..8b0da3f698e9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7848,6 +7848,38 @@ static int opt_remove_nops(struct bpf_verifier_env *env) return 0; } +static int opt_subreg_zext_lo32(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + int i, delta = 0, len = env->prog->len; + struct bpf_insn zext_patch[2]; + struct bpf_prog *new_prog; + + zext_patch[1] = BPF_ZEXT_REG(0); + for (i = 0; i < len; i++) { + int adj_idx = i + delta; + struct bpf_insn insn; + + if (!aux[adj_idx].zext_dst) + continue; + + insn = insns[adj_idx]; + zext_patch[0] = insn; + zext_patch[1].dst_reg = insn.dst_reg; + zext_patch[1].src_reg = insn.dst_reg; + new_prog = bpf_patch_insn_data(env, adj_idx, zext_patch, 2); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + insns = new_prog->insnsi; + aux = env->insn_aux_data; + delta += 2; + } + + return 0; +} + /* convert load instructions that access fields of a context type into a * sequence of instructions that access fields of the underlying structure: * struct __sk_buff -> struct sk_buff @@ -8702,6 +8734,15 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); + /* do 32-bit optimization after insn patching has done so those patched + * insns could be handled correctly. 
+ */ + if (ret == 0 && bpf_jit_needs_zext() && + !bpf_prog_is_dev_bound(env->prog->aux)) { + ret = opt_subreg_zext_lo32(env); + env->prog->aux->verifier_zext = !ret; + } + if (ret == 0) ret = fixup_call_args(env); From fe9ac073e698d1872cf82552d5e58363e47ae3ef Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:16 +0100 Subject: [PATCH 1134/1640] UPSTREAM: bpf: introduce new bpf prog load flags "BPF_F_TEST_RND_HI32" x86_64 and AArch64 perhaps are two arches that running bpf testsuite frequently, however the zero extension insertion pass is not enabled for them because of their hardware support. It is critical to guarantee the pass correction as it is supposed to be enabled at default for a couple of other arches, for example PowerPC, SPARC, arm, NFP etc. Therefore, it would be very useful if there is a way to test this pass on for example x86_64. The test methodology employed by this set is "poisoning" useless bits. High 32-bit of a definition is randomized if it is identified as not used by any later insn. Such randomization is only enabled under testing mode which is gated by the new bpf prog load flags "BPF_F_TEST_RND_HI32". Suggested-by: Alexei Starovoitov Change-Id: I0cd1f1ea81ff68b4941dddd59583573645f4cd1e Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 18 ++++++++++++++++++ kernel/bpf/syscall.c | 4 +++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 64673fcc2c1e..a18b81f8421b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -262,6 +262,24 @@ enum bpf_attach_type { */ #define BPF_F_ANY_ALIGNMENT (1U << 1) +/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose. + * Verifier does sub-register def/use analysis and identifies instructions whose + * def only matters for low 32-bit, high 32-bit is never referenced later + * through implicit zero extension. Therefore verifier notifies JIT back-ends + * that it is safe to ignore clearing high 32-bit for these instructions. This + * saves some back-ends a lot of code-gen. However such optimization is not + * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends + * hence hasn't used verifier's analysis result. But, we really want to have a + * way to be able to verify the correctness of the described optimization on + * x86_64 on which testsuites are frequently exercised. + * + * So, this flag is introduced. Once it is set, verifier will randomize high + * 32-bit for those instructions who has been identified as safe to ignore them. + * Then, if verifier is not doing correct analysis, such randomization will + * regress tests to expose bugs. 
+ */ +#define BPF_F_TEST_RND_HI32 (1U << 2) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5e013bb61ef2..3d026ada0d80 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1607,7 +1607,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; - if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT)) + if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | + BPF_F_ANY_ALIGNMENT | + BPF_F_TEST_RND_HI32)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && From 0cf0668b1e36bc919f79f726aebcb540d1ccfe98 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:18 +0100 Subject: [PATCH 1135/1640] UPSTREAM: bpf: verifier: randomize high 32-bit when BPF_F_TEST_RND_HI32 is set This patch randomizes high 32-bit of a definition when BPF_F_TEST_RND_HI32 is set. Suggested-by: Alexei Starovoitov Change-Id: Ieca58255736ae47ec31a0cd67705ad444fc0e35a Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 68 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8b0da3f698e9..b0b7887d40a7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7848,33 +7848,79 @@ static int opt_remove_nops(struct bpf_verifier_env *env) return 0; } -static int opt_subreg_zext_lo32(struct bpf_verifier_env *env) +static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, + const union bpf_attr *attr) { + struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4]; struct bpf_insn_aux_data *aux = env->insn_aux_data; + int i, patch_len, delta = 0, len = env->prog->len; struct bpf_insn *insns = env->prog->insnsi; - int i, delta = 0, len = env->prog->len; - struct bpf_insn zext_patch[2]; struct bpf_prog *new_prog; + bool rnd_hi32; + rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; zext_patch[1] = BPF_ZEXT_REG(0); + rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0); + rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); + rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX); for (i = 0; i < len; i++) { int adj_idx = i + delta; struct bpf_insn insn; - if (!aux[adj_idx].zext_dst) + insn = insns[adj_idx]; + if (!aux[adj_idx].zext_dst) { + u8 code, class; + u32 imm_rnd; + + if (!rnd_hi32) + continue; + + code = insn.code; + class = BPF_CLASS(code); + if (insn_no_def(&insn)) + continue; + + /* NOTE: arg "reg" (the fourth one) is only used for + * BPF_STX which has been ruled out in above + * check, it is safe to pass NULL here. + */ + if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) { + if (class == BPF_LD && + BPF_MODE(code) == BPF_IMM) + i++; + continue; + } + + /* ctx load could be transformed into wider load. 
*/ + if (class == BPF_LDX && + aux[adj_idx].ptr_type == PTR_TO_CTX) + continue; + + imm_rnd = get_random_int(); + rnd_hi32_patch[0] = insn; + rnd_hi32_patch[1].imm = imm_rnd; + rnd_hi32_patch[3].dst_reg = insn.dst_reg; + patch = rnd_hi32_patch; + patch_len = 4; + goto apply_patch_buffer; + } + + if (!bpf_jit_needs_zext()) continue; - insn = insns[adj_idx]; zext_patch[0] = insn; zext_patch[1].dst_reg = insn.dst_reg; zext_patch[1].src_reg = insn.dst_reg; - new_prog = bpf_patch_insn_data(env, adj_idx, zext_patch, 2); + patch = zext_patch; + patch_len = 2; +apply_patch_buffer: + new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len); if (!new_prog) return -ENOMEM; env->prog = new_prog; insns = new_prog->insnsi; aux = env->insn_aux_data; - delta += 2; + delta += patch_len - 1; } return 0; @@ -8737,10 +8783,10 @@ skip_full_check: /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. */ - if (ret == 0 && bpf_jit_needs_zext() && - !bpf_prog_is_dev_bound(env->prog->aux)) { - ret = opt_subreg_zext_lo32(env); - env->prog->aux->verifier_zext = !ret; + if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) { + ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); + env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret + : false; } if (ret == 0) From 3eda2d7205404e987adb4f47315fb7dc10f33f65 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 May 2019 14:14:41 -0700 Subject: [PATCH 1136/1640] UPSTREAM: bpf: remove __rcu annotations from bpf_prog_array Drop __rcu annotations and rcu read sections from bpf_prog_array helper functions. They are not needed since all existing callers call those helpers from the rcu update side while holding a mutex. This guarantees that use-after-free could not happen. In the next patches I'll fix the callers with missing rcu_dereference_protected to make sparse/lockdep happy, the proper way to use these helpers is: struct bpf_prog_array __rcu *progs = ...; struct bpf_prog_array *p; mutex_lock(&mtx); p = rcu_dereference_protected(progs, lockdep_is_held(&mtx)); bpf_prog_array_length(p); bpf_prog_array_copy_to_user(p, ...); bpf_prog_array_delete_safe(p, ...); bpf_prog_array_copy_info(p, ...); bpf_prog_array_copy(p, ...); bpf_prog_array_free(p); mutex_unlock(&mtx); No functional changes! rcu_dereference_protected with lockdep_is_held should catch any cases where we update prog array without a mutex (I've looked at existing call sites and I think we hold a mutex everywhere). 
Motivation is to fix sparse warnings: kernel/bpf/core.c:1803:9: warning: incorrect type in argument 1 (different address spaces) kernel/bpf/core.c:1803:9: expected struct callback_head *head kernel/bpf/core.c:1803:9: got struct callback_head [noderef] * kernel/bpf/core.c:1877:44: warning: incorrect type in initializer (different address spaces) kernel/bpf/core.c:1877:44: expected struct bpf_prog_array_item *item kernel/bpf/core.c:1877:44: got struct bpf_prog_array_item [noderef] * kernel/bpf/core.c:1901:26: warning: incorrect type in assignment (different address spaces) kernel/bpf/core.c:1901:26: expected struct bpf_prog_array_item *existing kernel/bpf/core.c:1901:26: got struct bpf_prog_array_item [noderef] * kernel/bpf/core.c:1935:26: warning: incorrect type in assignment (different address spaces) kernel/bpf/core.c:1935:26: expected struct bpf_prog_array_item *[assigned] existing kernel/bpf/core.c:1935:26: got struct bpf_prog_array_item [noderef] * v2: * remove comment about potential race; that can't happen because all callers are in rcu-update section Cc: Roman Gushchin Acked-by: Roman Gushchin Change-Id: I7f5dd95acc62e190914093ed972630a7f4f61d01 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 12 ++++++------ kernel/bpf/core.c | 37 +++++++++++++------------------------ 2 files changed, 19 insertions(+), 30 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1377b53657e8..eaa08ca41863 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -523,17 +523,17 @@ struct bpf_prog_array { }; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); -void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); -int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, +void bpf_prog_array_free(struct bpf_prog_array *progs); +int bpf_prog_array_length(struct bpf_prog_array *progs); +int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, __u32 __user *prog_ids, u32 cnt); -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, +void bpf_prog_array_delete_safe(struct bpf_prog_array *progs, struct bpf_prog *old_prog); -int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt); -int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, +int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b3f57861f564..a15a38492ec9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1828,38 +1828,33 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) return &empty_prog_array.hdr; } -void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) +void bpf_prog_array_free(struct bpf_prog_array *progs) { - if (!progs || - progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) + if (!progs || progs == &empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } -int bpf_prog_array_length(struct bpf_prog_array __rcu *array) +int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; u32 cnt = 0; - rcu_read_lock(); - item = rcu_dereference(array)->items; - for (; item->prog; item++) + for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) cnt++; - rcu_read_unlock(); return cnt; 
} -static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, +static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt) { struct bpf_prog_array_item *item; int i = 0; - item = rcu_dereference_check(array, 1)->items; - for (; item->prog; item++) { + for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; prog_ids[i] = item->prog->aux->id; @@ -1872,7 +1867,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, return !!(item->prog); } -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, __u32 __user *prog_ids, u32 cnt) { unsigned long err = 0; @@ -1883,18 +1878,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, * cnt = bpf_prog_array_length(); * if (cnt > 0) * bpf_prog_array_copy_to_user(..., cnt); - * so below kcalloc doesn't need extra cnt > 0 check, but - * bpf_prog_array_length() releases rcu lock and - * prog array could have been swapped with empty or larger array, - * so always copy 'cnt' prog_ids to the user. - * In a rare race the user will see zero prog_ids + * so below kcalloc doesn't need extra cnt > 0 check. */ ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; - rcu_read_lock(); nospc = bpf_prog_array_copy_core(array, ids, cnt); - rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); if (err) @@ -1904,19 +1893,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, return 0; } -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, +void bpf_prog_array_delete_safe(struct bpf_prog_array *array, struct bpf_prog *old_prog) { - struct bpf_prog_array_item *item = array->items; + struct bpf_prog_array_item *item; - for (; item->prog; item++) + for (item = array->items; item->prog; item++) if (item->prog == old_prog) { WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } -int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, +int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, struct bpf_prog_array **new_array) @@ -1980,7 +1969,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, return 0; } -int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { From 7d7a8f50b85da5720963d348ab3d41ab878de306 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 9 Mar 2020 22:16:05 -0700 Subject: [PATCH 1137/1640] UPSTREAM: cgroup: memcg: net: do not associate sock with unrelated cgroup [ Upstream commit e876ecc67db80dfdb8e237f71e5b43bb88ae549c ] We are testing network memory accounting in our setup and noticed inconsistent network memory usage and often unrelated cgroups network usage correlates with testing workload. On further inspection, it seems like mem_cgroup_sk_alloc() and cgroup_sk_alloc() are broken in irq context specially for cgroup v1. mem_cgroup_sk_alloc() and cgroup_sk_alloc() can be called in irq context and kind of assumes that this can only happen from sk_clone_lock() and the source sock object has already associated cgroup. However in cgroup v1, where network memory accounting is opt-in, the source sock can be unassociated with any cgroup and the new cloned sock can get associated with unrelated interrupted cgroup. 
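The core of the problem, condensed from mem_cgroup_sk_alloc() (a fragment, not the full function): in irq context "current" is simply whichever task happened to be interrupted, so the lookup below can pick an unrelated memcg.

  rcu_read_lock();
  memcg = mem_cgroup_from_task(current);  /* in irq: unrelated task */
  /* ... the sock is then associated with that memcg ... */
  rcu_read_unlock();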
Cgroup v2 can also suffer if the source sock object was created by process in the root cgroup or if sk_alloc() is called in irq context. The fix is to just do nothing in interrupt. WARNING: Please note that about half of the TCP sockets are allocated from the IRQ context, so, memory used by such sockets will not be accouted by the memcg. The stack trace of mem_cgroup_sk_alloc() from IRQ-context: CPU: 70 PID: 12720 Comm: ssh Tainted: 5.6.0-smp-DEV #1 Hardware name: ... Call Trace: dump_stack+0x57/0x75 mem_cgroup_sk_alloc+0xe9/0xf0 sk_clone_lock+0x2a7/0x420 inet_csk_clone_lock+0x1b/0x110 tcp_create_openreq_child+0x23/0x3b0 tcp_v6_syn_recv_sock+0x88/0x730 tcp_check_req+0x429/0x560 tcp_v6_rcv+0x72d/0xa40 ip6_protocol_deliver_rcu+0xc9/0x400 ip6_input+0x44/0xd0 ? ip6_protocol_deliver_rcu+0x400/0x400 ip6_rcv_finish+0x71/0x80 ipv6_rcv+0x5b/0xe0 ? ip6_sublist_rcv+0x2e0/0x2e0 process_backlog+0x108/0x1e0 net_rx_action+0x26b/0x460 __do_softirq+0x104/0x2a6 do_softirq_own_stack+0x2a/0x40 do_softirq.part.19+0x40/0x50 __local_bh_enable_ip+0x51/0x60 ip6_finish_output2+0x23d/0x520 ? ip6table_mangle_hook+0x55/0x160 __ip6_finish_output+0xa1/0x100 ip6_finish_output+0x30/0xd0 ip6_output+0x73/0x120 ? __ip6_finish_output+0x100/0x100 ip6_xmit+0x2e3/0x600 ? ipv6_anycast_cleanup+0x50/0x50 ? inet6_csk_route_socket+0x136/0x1e0 ? skb_free_head+0x1e/0x30 inet6_csk_xmit+0x95/0xf0 __tcp_transmit_skb+0x5b4/0xb20 __tcp_send_ack.part.60+0xa3/0x110 tcp_send_ack+0x1d/0x20 tcp_rcv_state_process+0xe64/0xe80 ? tcp_v6_connect+0x5d1/0x5f0 tcp_v6_do_rcv+0x1b1/0x3f0 ? tcp_v6_do_rcv+0x1b1/0x3f0 __release_sock+0x7f/0xd0 release_sock+0x30/0xa0 __inet_stream_connect+0x1c3/0x3b0 ? prepare_to_wait+0xb0/0xb0 inet_stream_connect+0x3b/0x60 __sys_connect+0x101/0x120 ? __sys_getsockopt+0x11b/0x140 __x64_sys_connect+0x1a/0x20 do_syscall_64+0x51/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The stack trace of mem_cgroup_sk_alloc() from IRQ-context: Fixes: 2d7580738345 ("mm: memcontrol: consolidate cgroup socket tracking") Fixes: d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") Change-Id: I8872313e1e69d0809cbffdf96549f0dc3edfacda Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9c595fd53766..b4c92d212df0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5974,6 +5974,10 @@ void mem_cgroup_sk_alloc(struct sock *sk) if (in_interrupt()) return; + /* Do not associate the sock with unrelated interrupted task's memcg. */ + if (in_interrupt()) + return; + rcu_read_lock(); memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup) From c290cf907463ceaf5006d76bee312e4bf7eb93a1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 25 May 2019 09:37:39 -0700 Subject: [PATCH 1138/1640] BACKPORT: bpf: decouple the lifetime of cgroup_bpf from cgroup itself Currently the lifetime of bpf programs attached to a cgroup is bound to the lifetime of the cgroup itself. It means that if a user forgets (or intentionally avoids) to detach a bpf program before removing the cgroup, it will stay attached up to the release of the cgroup. Since the cgroup can stay in the dying state (the state between being rmdir()'ed and being released) for a very long time, it leads to a waste of memory. Also, it blocks a possibility to implement the memcg-based memory accounting for bpf objects, because a circular reference dependency will occur. 
Charged memory pages are pinning the corresponding memory cgroup, and if the memory cgroup is pinning the attached bpf program, nothing will be ever released. A dying cgroup can not contain any processes, so the only chance for an attached bpf program to be executed is a live socket associated with the cgroup. So in order to release all bpf data early, let's count associated sockets using a new percpu refcounter. On cgroup removal the counter is transitioned to the atomic mode, and as soon as it reaches 0, all bpf programs are detached. Because cgroup_bpf_release() can block, it can't be called from the percpu ref counter callback directly, so instead an asynchronous work is scheduled. The reference counter is not socket specific, and can be used for any other types of programs, which can be executed from a cgroup-bpf hook outside of the process context, had such a need arise in the future. Change-Id: I522bed27d3800bf9276272348898ed5b393fa5f2 Signed-off-by: Roman Gushchin Cc: jolsa@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 11 ++++++++-- include/linux/cgroup.h | 18 +++++++++++++++ kernel/bpf/cgroup.c | 45 +++++++++++++++++++++++++++++++++----- kernel/cgroup/cgroup.c | 11 ++++++---- 4 files changed, 73 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 665893e0f07b..f0705ad9f82c 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -72,10 +73,16 @@ struct cgroup_bpf { /* temp storage for effective prog array used by prog_attach/detach */ struct bpf_prog_array __rcu *inactive; + + /* reference counter used to detach bpf programs after cgroup removal */ + struct percpu_ref refcnt; + + /* cgroup_bpf is released using a work queue */ + struct work_struct release_work; }; -void cgroup_bpf_put(struct cgroup *cgrp); int cgroup_bpf_inherit(struct cgroup *cgrp); +void cgroup_bpf_offline(struct cgroup *cgrp); int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); @@ -289,8 +296,8 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, struct bpf_prog; struct cgroup_bpf {}; -static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 49ad1e6f2e21..3afd96d06ed9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -843,4 +843,22 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns) free_cgroup_ns(ns); } +#ifdef CONFIG_CGROUP_BPF +static inline void cgroup_bpf_get(struct cgroup *cgrp) +{ + percpu_ref_get(&cgrp->bpf.refcnt); +} + +static inline void cgroup_bpf_put(struct cgroup *cgrp) +{ + percpu_ref_put(&cgrp->bpf.refcnt); +} + +#else /* CONFIG_CGROUP_BPF */ + +static inline void cgroup_bpf_get(struct cgroup *cgrp) {} +static inline void cgroup_bpf_put(struct cgroup *cgrp) {} + +#endif /* CONFIG_CGROUP_BPF */ + #endif /* _LINUX_CGROUP_H */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index fcde0f7b2585..d995edbe816d 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -22,12 +22,21 @@ DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); EXPORT_SYMBOL(cgroup_bpf_enabled_key); -/** - * cgroup_bpf_put() - put references of all bpf programs - * 
@cgrp: the cgroup to modify - */ -void cgroup_bpf_put(struct cgroup *cgrp) +void cgroup_bpf_offline(struct cgroup *cgrp) { + cgroup_get(cgrp); + percpu_ref_kill(&cgrp->bpf.refcnt); +} + +/** + * cgroup_bpf_release() - put references of all bpf programs and + * release all cgroup bpf data + * @work: work structure embedded into the cgroup to modify + */ +static void cgroup_bpf_release(struct work_struct *work) +{ + struct cgroup *cgrp = container_of(work, struct cgroup, + bpf.release_work); enum bpf_cgroup_storage_type stype; unsigned int type; @@ -47,6 +56,22 @@ void cgroup_bpf_put(struct cgroup *cgrp) } bpf_prog_array_free(cgrp->bpf.effective[type]); } + + percpu_ref_exit(&cgrp->bpf.refcnt); + cgroup_put(cgrp); +} + +/** + * cgroup_bpf_release_fn() - callback used to schedule releasing + * of bpf cgroup data + * @ref: percpu ref counter structure + */ +static void cgroup_bpf_release_fn(struct percpu_ref *ref) +{ + struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); + + INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); + queue_work(system_wq, &cgrp->bpf.release_work); } /* count number of elements in the list. @@ -167,7 +192,12 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) */ #define NR ARRAY_SIZE(cgrp->bpf.effective) struct bpf_prog_array __rcu *arrays[NR] = {}; - int i; + int ret, i; + + ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, + GFP_KERNEL); + if (ret) + return ret; for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); @@ -183,6 +213,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); + + percpu_ref_exit(&cgrp->bpf.refcnt); + return -ENOMEM; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 97d6ee5b2f65..d9368aec5fa2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4859,8 +4859,6 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); - - cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); @@ -5348,6 +5346,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup1_check_for_release(parent); + cgroup_bpf_offline(cgrp); + /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -6071,6 +6071,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { skcd->val = (unsigned long)cset->dfl_cgrp; + cgroup_bpf_get(cset->dfl_cgrp); break; } cpu_relax(); @@ -6096,10 +6097,12 @@ void cgroup_sk_clone(struct sock_cgroup_data *skcd) void cgroup_sk_free(struct sock_cgroup_data *skcd) { + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + if (skcd->no_refcnt) return; - - cgroup_put(sock_cgroup_ptr(skcd)); + cgroup_bpf_put(cgrp); + cgroup_put(cgrp); } #endif /* CONFIG_SOCK_CGROUP_DATA */ From b7598fadbc02932dd4b7af43ed55df7c962fc03b Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 May 2019 14:14:43 -0700 Subject: [PATCH 1139/1640] UPSTREAM: bpf: cgroup: properly use bpf_prog_array api Now that we don't have __rcu markers on the bpf_prog_array helpers, let's use proper rcu_dereference_protected to obtain array pointer under mutex. We also don't need __rcu annotations on cgroup_bpf.inactive since it's not read/updated concurrently. 
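Condensed, the update-side pattern this series converts to (cgroup_mutex is the lock protecting these arrays):

  mutex_lock(&cgroup_mutex);
  effective = rcu_dereference_protected(cgrp->bpf.effective[type],
                                        lockdep_is_held(&cgroup_mutex));
  /* 'effective' is usable as a plain pointer while the mutex is held */
  mutex_unlock(&cgroup_mutex);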
v4: * drop cgroup_rcu_xyz wrappers and use rcu APIs directly; presumably should be more clear to understand which mutex/refcount protects each particular place v3: * amend cgroup_rcu_dereference to include percpu_ref_is_dying; cgroup_bpf is now reference counted and we don't hold cgroup_mutex anymore in cgroup_bpf_release v2: * replace xchg with rcu_swap_protected Cc: Roman Gushchin Change-Id: Ib5845f2f4860c65342d124c32a02ffa2d7f486ee Signed-off-by: Stanislav Fomichev Acked-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 2 +- kernel/bpf/cgroup.c | 28 +++++++++++++++++----------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index f0705ad9f82c..c60113c46b72 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -72,7 +72,7 @@ struct cgroup_bpf { u32 flags[MAX_BPF_ATTACH_TYPE]; /* temp storage for effective prog array used by prog_attach/detach */ - struct bpf_prog_array __rcu *inactive; + struct bpf_prog_array *inactive; /* reference counter used to detach bpf programs after cgroup removal */ struct percpu_ref refcnt; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index d995edbe816d..ff594eb86fd7 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -38,6 +38,7 @@ static void cgroup_bpf_release(struct work_struct *work) struct cgroup *cgrp = container_of(work, struct cgroup, bpf.release_work); enum bpf_cgroup_storage_type stype; + struct bpf_prog_array *old_array; unsigned int type; for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { @@ -54,7 +55,10 @@ static void cgroup_bpf_release(struct work_struct *work) kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } - bpf_prog_array_free(cgrp->bpf.effective[type]); + old_array = rcu_dereference_protected( + cgrp->bpf.effective[type], + percpu_ref_is_dying(&cgrp->bpf.refcnt)); + bpf_prog_array_free(old_array); } percpu_ref_exit(&cgrp->bpf.refcnt); @@ -126,7 +130,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, */ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, - struct bpf_prog_array __rcu **array) + struct bpf_prog_array **array) { enum bpf_cgroup_storage_type stype; struct bpf_prog_array *progs; @@ -164,17 +168,16 @@ static int compute_effective_progs(struct cgroup *cgrp, } } while ((p = cgroup_parent(p))); - rcu_assign_pointer(*array, progs); + *array = progs; return 0; } static void activate_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, - struct bpf_prog_array __rcu *array) + struct bpf_prog_array *old_array) { - struct bpf_prog_array __rcu *old_array; - - old_array = xchg(&cgrp->bpf.effective[type], array); + rcu_swap_protected(cgrp->bpf.effective[type], old_array, + lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array */ @@ -191,7 +194,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) * that array below is variable length */ #define NR ARRAY_SIZE(cgrp->bpf.effective) - struct bpf_prog_array __rcu *arrays[NR] = {}; + struct bpf_prog_array *arrays[NR] = {}; int ret, i; ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, @@ -477,10 +480,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, enum bpf_attach_type type = attr->query.attach_type; struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog_array *effective; int cnt, ret = 0, i; + effective = 
rcu_dereference_protected(cgrp->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) - cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + cnt = bpf_prog_array_length(effective); else cnt = prog_list_length(progs); @@ -497,8 +504,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { - return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], - prog_ids, cnt); + return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); } else { struct bpf_prog_list *pl; u32 id; From 076c2e7f7fd69f461a52391ee79cb5d44a4161cb Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:36 -0700 Subject: [PATCH 1140/1640] UPSTREAM: bpf: cgroup inet skb programs can return 0 to 3 Allows cgroup inet skb programs to return values in the range [0, 3]. The second bit is used to deterine if congestion occurred and higher level protocol should decrease rate. E.g. TCP would call tcp_enter_cwr() The bpf_prog must set expected_attach_type to BPF_CGROUP_INET_EGRESS at load time if it uses the new return values (i.e. 2 or 3). The expected_attach_type is currently not enforced for BPF_PROG_TYPE_CGROUP_SKB. e.g Meaning the current bpf_prog with expected_attach_type setting to BPF_CGROUP_INET_EGRESS can attach to BPF_CGROUP_INET_INGRESS. Blindly enforcing expected_attach_type will break backward compatibility. This patch adds a enforce_expected_attach_type bit to only enforce the expected_attach_type when it uses the new return value. Change-Id: I5647151d48513d98c39fcf1d0381c31b5474e399 Signed-off-by: Lawrence Brakmo Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 ++- kernel/bpf/syscall.c | 12 ++++++++++++ kernel/bpf/verifier.c | 11 ++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index b6fac88c5200..1d760b867650 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -559,7 +559,8 @@ struct bpf_prog { blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ - has_callchain_buf:1; /* callchain buffer allocated? */ + has_callchain_buf:1, /* callchain buffer allocated? */ + enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3d026ada0d80..b6793c0d7601 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1588,6 +1588,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_CGROUP_SKB: + switch (expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + return 0; + default: + return -EINVAL; + } default: return 0; } @@ -1844,6 +1852,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + case BPF_PROG_TYPE_CGROUP_SKB: + return prog->enforce_expected_attach_type && + prog->expected_attach_type != attach_type ? 
+ -EINVAL : 0; default: return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b0b7887d40a7..a83bc8caeb0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5725,6 +5725,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_return_code(struct bpf_verifier_env *env) { + struct tnum enforce_attach_type_range = tnum_unknown; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); @@ -5734,6 +5735,10 @@ static int check_return_code(struct bpf_verifier_env *env) env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) range = tnum_range(1, 1); case BPF_PROG_TYPE_CGROUP_SKB: + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { + range = tnum_range(0, 3); + enforce_attach_type_range = tnum_range(2, 3); + } case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: @@ -5761,9 +5766,13 @@ static int check_return_code(struct bpf_verifier_env *env) verbose(env, "has unknown scalar value"); } tnum_strn(tn_buf, sizeof(tn_buf), range); - verbose(env, " should have been %s\n", tn_buf); + verbose(env, " should have been in %s\n", tn_buf); return -EINVAL; } + + if (!tnum_is_unknown(enforce_attach_type_range) && + tnum_in(enforce_attach_type_range, reg->var_off)) + env->prog->enforce_expected_attach_type = 1; return 0; } From 9ef4bbe985b5193493bd72008e0f0a87c0be93f5 Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:35 -0700 Subject: [PATCH 1141/1640] UPSTREAM: bpf: Create BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY Create new macro BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY() to be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs so BPF programs can request cwr for TCP packets. Current cgroup skb programs can only return 0 or 1 (0 to drop the packet). This macro changes the behavior so the low order bit indicates whether the packet should be dropped (0) or not (1) and the next bit is used for congestion notification (cn). Hence, the new allowed return values of CGROUP EGRESS BPF programs are: 0: drop packet 1: keep packet 2: drop packet and call cwr 3: keep packet and call cwr This macro then converts it to one of the NET_XMIT values or -EPERM, which has the effect of dropping the packet with no cn. 0: NET_XMIT_SUCCESS skb should be transmitted (no cn) 1: NET_XMIT_DROP skb should be dropped and cwr called 2: NET_XMIT_CN skb should be transmitted and cwr called 3: -EPERM skb should be dropped (no cn) Note that when more than one BPF program is called, the packet is dropped if at least one of the programs requests it be dropped, and there is cn if at least one program returns cn. Change-Id: I78fe91e8682dcdc4a8dddc6354563a5a7a93aaea Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 50 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eaa08ca41863..183ddcaa5009 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -562,6 +562,56 @@ _out: \ _ret; \ }) +/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs + * so BPF programs can request cwr for TCP packets. + * + * Current cgroup skb programs can only return 0 or 1 (0 to drop the + * packet). This macro changes the behavior so the low order bit + * indicates whether the packet should be dropped (0) or not (1) + * and the next bit is a congestion notification bit.
This could be + * used by TCP to call tcp_enter_cwr(). + * + * Hence, new allowed return values of CGROUP EGRESS BPF programs are: + * 0: drop packet + * 1: keep packet + * 2: drop packet and cn + * 3: keep packet and cn + * + * This macro then converts it to one of the NET_XMIT values or an error + * code that is then interpreted as drop packet (and no cn): + * 0: NET_XMIT_SUCCESS skb should be transmitted + * 1: NET_XMIT_DROP skb should be dropped and cn + * 2: NET_XMIT_CN skb should be transmitted and cn + * 3: -EPERM skb should be dropped + */ +#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ + ({ \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ + struct bpf_prog_array *_array; \ + u32 ret; \ + u32 _ret = 1; \ + u32 _cn = 0; \ + preempt_disable(); \ + rcu_read_lock(); \ + _array = rcu_dereference(array); \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + ret = func(_prog, ctx); \ + _ret &= (ret & 1); \ + _cn |= (ret & 2); \ + _item++; \ + } \ + rcu_read_unlock(); \ + preempt_enable(); \ + if (_ret) \ + _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ + else \ + _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ + _ret; \ + }) + #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true) From e6e353ebf3cd4c671f7a0f6cd8674792bb4de849 Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:37 -0700 Subject: [PATCH 1142/1640] UPSTREAM: bpf: Update __cgroup_bpf_run_filter_skb with cn For egress packets, __cgroup_bpf_run_filter_skb() will now call BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY() instead of BPF_PROG_RUN_ARRAY() in order to propagate congestion notification (cn) requests to TCP callers. For egress packets, this function can return: NET_XMIT_SUCCESS (0) - continue with packet output NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr NET_XMIT_CN (2) - continue with packet output and notify TCP to call cwr -EPERM - drop packet For ingress packets, this function will return -EPERM if any attached program was found and if it returned != 1 during execution. Otherwise 0 is returned. Change-Id: I5d4a74dce4fa05f7f4851ef40c45d470f041ae7e Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ff594eb86fd7..1b65ab0df457 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -587,8 +587,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * The program type passed in via @type must be suitable for network * filtering. No further check is performed to assert that. * - * This function will return %-EPERM if any if an attached program was found - * and if it returned != 1 during execution. In all other cases, 0 is returned. + * For egress packets, this function can return: + * NET_XMIT_SUCCESS (0) - continue with packet output + * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr + * NET_XMIT_CN (2) - continue with packet output and notify TCP + * to call cwr + * -EPERM - drop packet + * + * For ingress packets, this function will return -EPERM if any + * attached program was found and if it returned != 1 during execution. + * Otherwise 0 is returned.
*/ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -614,12 +622,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - __bpf_prog_run_save_cb); + if (type == BPF_CGROUP_INET_EGRESS) { + ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( + cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + } else { + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + __bpf_prog_run_save_cb); + ret = (ret == 1 ? 0 : -EPERM); + } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; - return ret == 1 ? 0 : -EPERM; + + return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); From 00a100cea6a59e8285b2b9f8b1e2457d2fecf150 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:55 -0700 Subject: [PATCH 1143/1640] UPSTREAM: bpf: add memlock precharge check for cgroup_local_storage Cgroup local storage maps lack the memlock precharge check, which is performed before the memory allocation for most other bpf map types. Let's add it in order to unify all map types. Change-Id: I0fac7d3e47cecaf24250eb81c902948802a883a7 Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/local_storage.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 980e8f1f6cb5..e48302ecb389 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -272,6 +272,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; + u32 pages; + int ret; if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); @@ -290,13 +292,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); + pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >> + PAGE_SHIFT; + ret = bpf_map_precharge_memlock(pages); + if (ret < 0) + return ERR_PTR(ret); + map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), __GFP_ZERO | GFP_USER, numa_node); if (!map) return ERR_PTR(-ENOMEM); - map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), - PAGE_SIZE) >> PAGE_SHIFT; + map->map.pages = pages; /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); From e6f631f9ae0c512193979e19d006f522920617be Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:56 -0700 Subject: [PATCH 1144/1640] UPSTREAM: bpf: add memlock precharge for socket local storage Socket local storage maps lack the memlock precharge check, which is performed before the memory allocation for most other bpf map types. Let's add it in order to unify all map types. 
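For illustration only (not part of this patch): the precharge pattern that both storage map types now follow looks roughly like the sketch below, with a hypothetical example_map type standing in for the concrete map; the cost computation itself is map-specific.

	struct example_map {
		struct bpf_map map;
		/* map-specific fields would follow */
	};

	static struct bpf_map *example_map_alloc(union bpf_attr *attr)
	{
		struct example_map *map;
		u32 pages;
		u64 cost;
		int ret;

		/* compute the full allocation cost before allocating anything */
		cost = sizeof(*map) + (u64)attr->max_entries * attr->value_size;
		pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

		/* fail early if this would exceed the RLIMIT_MEMLOCK budget */
		ret = bpf_map_precharge_memlock(pages);
		if (ret < 0)
			return ERR_PTR(ret);

		map = kzalloc(sizeof(*map), GFP_USER | __GFP_NOWARN);
		if (!map)
			return ERR_PTR(-ENOMEM);

		map->map.pages = pages; /* charged for real later in map_create() */
		bpf_map_init_from_attr(&map->map, attr);
		return &map->map;
	}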
Change-Id: If844b9f761a9b7a0fad3ec3399a81093fb69b9ae Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- net/core/bpf_sk_storage.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index a8e9ac71b22d..377f3b5e53e5 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -626,7 +626,9 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) struct bpf_sk_storage_map *smap; unsigned int i; u32 nbuckets; + u32 pages; u64 cost; + int ret; smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); if (!smap) @@ -635,13 +637,19 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus())); nbuckets = 1U << smap->bucket_log; + cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); + pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(pages); + if (ret < 0) + return ERR_PTR(ret); + smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN); if (!smap->buckets) { kfree(smap); return ERR_PTR(-ENOMEM); } - cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); @@ -651,7 +659,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % BPF_SK_STORAGE_CACHE_SIZE; - smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + smap->map.pages = pages; return &smap->map; } From 7032f895461e7feb92419b083bcb9542ecca50d1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:57 -0700 Subject: [PATCH 1145/1640] UPSTREAM: bpf: group memory related fields in struct bpf_map_memory Group "user" and "pages" fields of bpf_map into the bpf_map_memory structure. Later it can be extended with "memcg" and other related information. The main reason for such a change (besides cosmetics) is to pass the bpf_map_memory structure to charging functions before the actual allocation of bpf_map. Change-Id: I04e4edf805bfe4c26fce45f7166317fe00dd0dfa Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 +++++++--- kernel/bpf/arraymap.c | 2 +- kernel/bpf/cpumap.c | 4 ++-- kernel/bpf/devmap.c | 4 ++-- kernel/bpf/hashtab.c | 4 ++-- kernel/bpf/local_storage.c | 2 +- kernel/bpf/lpm_trie.c | 4 ++-- kernel/bpf/queue_stack_maps.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/stackmap.c | 4 ++-- kernel/bpf/syscall.c | 19 ++++++++++--------- kernel/bpf/xskmap.c | 4 ++-- net/core/bpf_sk_storage.c | 2 +- net/core/sock_map.c | 4 ++-- 14 files changed, 36 insertions(+), 31 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 183ddcaa5009..902b304d5767 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -70,6 +70,11 @@ struct bpf_map_ops { u64 imm, u32 *off); }; +struct bpf_map_memory { + u32 pages; + struct user_struct *user; +}; + struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries).
@@ -90,7 +95,7 @@ struct bpf_map { u32 btf_key_type_id; u32 btf_value_type_id; struct btf *btf; - u32 pages; + struct bpf_map_memory memory; bool unpriv_array; bool frozen; /* write-once */ /* 48 bytes hole */ @@ -98,8 +103,7 @@ struct bpf_map { /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ - struct user_struct *user ____cacheline_aligned; - atomic_t refcnt; + atomic_t refcnt ____cacheline_aligned; atomic_t usercnt; struct work_struct work; char name[BPF_OBJ_NAME_LEN]; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3ed5784f476a..37f47fd85514 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -141,7 +141,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.pages = cost; + array->map.memory.pages = cost; array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 62f2c295034d..e8e529bde067 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -108,10 +108,10 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); if (cost >= U32_MAX - PAGE_SIZE) goto free_cmap; - cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + cmap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_precharge_memlock(cmap->map.pages); + ret = bpf_map_precharge_memlock(cmap->map.memory.pages); if (ret) { err = ret; goto free_cmap; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1defea4b2755..3f7b6b403d78 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -111,10 +111,10 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_dtab; - dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + dtab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(dtab->map.pages); + err = bpf_map_precharge_memlock(dtab->map.memory.pages); if (err) goto free_dtab; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 681c9ec8d44e..dc2b406a52f7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -371,10 +371,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + htab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(htab->map.pages); + err = bpf_map_precharge_memlock(htab->map.memory.pages); if (err) goto free_htab; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index e48302ecb389..574325276650 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -303,7 +303,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (!map) return ERR_PTR(-ENOMEM); - map->map.pages = pages; + map->map.memory.pages = pages; /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 612b8e126012..26cd7a0dbc3c 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -581,9 +581,9 @@ static struct bpf_map *trie_alloc(union 
bpf_attr *attr) goto out_err; } - trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + trie->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(trie->map.pages); + ret = bpf_map_precharge_memlock(trie->map.memory.pages); if (ret) goto out_err; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 0b140d236889..8a510e71d486 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -89,7 +89,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&qs->map, attr); - qs->map.pages = cost; + qs->map.memory.pages = cost; qs->size = size; raw_spin_lock_init(&qs->lock); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 18e225de80ff..819515242739 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -176,7 +176,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.pages = cost; + array->map.memory.pages = cost; return &array->map; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index d97c080567a6..beb23d9b8fcf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -137,9 +137,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; smap->n_buckets = n_buckets; - smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + smap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(smap->map.pages); + err = bpf_map_precharge_memlock(smap->map.memory.pages); if (err) goto free_smap; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b6793c0d7601..43c6fd098cee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -223,19 +223,20 @@ static int bpf_map_init_memlock(struct bpf_map *map) struct user_struct *user = get_current_user(); int ret; - ret = bpf_charge_memlock(user, map->pages); + ret = bpf_charge_memlock(user, map->memory.pages); if (ret) { free_uid(user); return ret; } - map->user = user; + map->memory.user = user; return ret; } static void bpf_map_release_memlock(struct bpf_map *map) { - struct user_struct *user = map->user; - bpf_uncharge_memlock(user, map->pages); + struct user_struct *user = map->memory.user; + + bpf_uncharge_memlock(user, map->memory.pages); free_uid(user); } @@ -243,17 +244,17 @@ int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) { int ret; - ret = bpf_charge_memlock(map->user, pages); + ret = bpf_charge_memlock(map->memory.user, pages); if (ret) return ret; - map->pages += pages; + map->memory.pages += pages; return ret; } void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) { - bpf_uncharge_memlock(map->user, pages); - map->pages -= pages; + bpf_uncharge_memlock(map->memory.user, pages); + map->memory.pages -= pages; } static int bpf_map_alloc_id(struct bpf_map *map) @@ -396,7 +397,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->value_size, map->max_entries, map->map_flags, - map->pages * 1ULL << PAGE_SHIFT, + map->memory.pages * 1ULL << PAGE_SHIFT, map->id, READ_ONCE(map->frozen)); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 686d244e798d..f816ee1a0fa0 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -40,10 +40,10 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_m; - 
m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + m->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_precharge_memlock(m->map.pages); + err = bpf_map_precharge_memlock(m->map.memory.pages); if (err) goto free_m; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 377f3b5e53e5..448b12d0f94e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -659,7 +659,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % BPF_SK_STORAGE_CACHE_SIZE; - smap->map.pages = pages; + smap->map.memory.pages = pages; return &smap->map; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 3c0e44cb811a..d48c09dcc900 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -49,8 +49,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) goto free_stab; } - stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(stab->map.pages); + stab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(stab->map.memory.pages); if (err) goto free_stab; From 632d849a6d29f3512ec75f8d7b4d185992f0b738 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:58 -0700 Subject: [PATCH 1146/1640] UPSTREAM: bpf: rework memlock-based memory accounting for maps In order to unify the existing memlock charging code with the memcg-based memory accounting, which will be added later, let's rework the current scheme. Currently the following design is used: 1) .alloc() callback optionally checks if the allocation will likely succeed using bpf_map_precharge_memlock() 2) .alloc() performs actual allocations 3) .alloc() callback calculates map cost and sets map.memory.pages 4) map_create() calls bpf_map_init_memlock() which sets map.memory.user and performs actual charging; in case of failure the map is destroyed. On the release side: 1) bpf_map_free_deferred() calls bpf_map_release_memlock(), which performs uncharge and releases the user 2) .map_free() callback releases the memory The scheme can be simplified and made more robust: 1) .alloc() calculates map cost and calls bpf_map_charge_init() 2) bpf_map_charge_init() sets map.memory.user and performs actual charge 3) .alloc() performs actual allocations. On the release side: 1) .map_free() callback releases the memory 2) bpf_map_charge_finish() performs uncharge and releases the user The new scheme also allows reusing the bpf_map_charge_init()/finish() functions for memcg-based accounting. Because charges are performed before actual allocations and uncharges after freeing the memory, no bogus memory pressure can be created. In cases where the map structure is not available (e.g. it's not created yet, or is already destroyed), an on-stack bpf_map_memory structure is used. The charge can be transferred with the bpf_map_charge_move() function.
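As a sketch of the new flow (illustrative only; example_map is hypothetical, the real conversions are in the hunks below), an ->alloc() callback now charges first, allocates second, and transfers the charge into the map at the end:

	static struct bpf_map *example_map_alloc(union bpf_attr *attr)
	{
		struct bpf_map_memory mem;
		struct example_map *map;
		u64 cost;
		int ret;

		cost = sizeof(*map) + (u64)attr->max_entries * attr->value_size;

		/* charge the current user before any allocation happens */
		ret = bpf_map_charge_init(&mem,
					  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);
		if (ret < 0)
			return ERR_PTR(ret);

		map = bpf_map_area_alloc(sizeof(*map), bpf_map_attr_numa_node(attr));
		if (!map) {
			bpf_map_charge_finish(&mem); /* undo the charge on failure */
			return ERR_PTR(-ENOMEM);
		}

		bpf_map_init_from_attr(&map->map, attr);
		/* transfer the charge into the map; mem is zeroed by the move */
		bpf_map_charge_move(&map->map.memory, &mem);
		return &map->map;
	}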
Change-Id: I299bfa9d3e74f366861b6de3bf17951a1374824b Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 ++- kernel/bpf/arraymap.c | 10 +++-- kernel/bpf/cpumap.c | 8 ++-- kernel/bpf/devmap.c | 13 ++++--- kernel/bpf/hashtab.c | 11 +++--- kernel/bpf/local_storage.c | 9 +++-- kernel/bpf/lpm_trie.c | 5 +-- kernel/bpf/queue_stack_maps.c | 9 +++-- kernel/bpf/reuseport_array.c | 9 +++-- kernel/bpf/stackmap.c | 32 +++++++++------- kernel/bpf/syscall.c | 69 +++++++++++++++++------------------ kernel/bpf/xskmap.c | 9 +++-- net/core/bpf_sk_storage.c | 8 ++-- net/core/sock_map.c | 5 ++- 14 files changed, 113 insertions(+), 89 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 902b304d5767..e5e71b47175f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -660,9 +660,12 @@ struct bpf_map *__bpf_map_get(struct fd f); struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); -int bpf_map_precharge_memlock(u32 pages); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); +int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages); +void bpf_map_charge_finish(struct bpf_map_memory *mem); +void bpf_map_charge_move(struct bpf_map_memory *dst, + struct bpf_map_memory *src); void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 37f47fd85514..e63a0355caeb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -86,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) u32 elem_size, index_mask, max_entries; bool unpriv = !capable(CAP_SYS_ADMIN); u64 cost, array_size, mask64; + struct bpf_map_memory mem; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); @@ -128,23 +129,26 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(cost); + ret = bpf_map_charge_init(&mem, cost); if (ret < 0) return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) + if (!array) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } array->index_mask = index_mask; array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.memory.pages = cost; + bpf_map_charge_move(&array->map.memory, &mem); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { + bpf_map_charge_finish(&array->map.memory); bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e8e529bde067..82117594f698 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -108,10 +108,10 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); if (cost >= U32_MAX - PAGE_SIZE) goto free_cmap; - cmap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_precharge_memlock(cmap->map.memory.pages); + ret = bpf_map_charge_init(&cmap->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (ret) { err = ret; goto 
free_cmap; @@ -121,7 +121,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), __alignof__(unsigned long)); if (!cmap->flush_needed) - goto free_cmap; + goto free_charge; /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * @@ -133,6 +133,8 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_percpu: free_percpu(cmap->flush_needed); +free_charge: + bpf_map_charge_finish(&cmap->map.memory); free_cmap: kfree(cmap); return ERR_PTR(err); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3f7b6b403d78..db674bd1a032 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -111,10 +111,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_dtab; - dtab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(dtab->map.memory.pages); + /* if map size is larger than memlock limit, reject it */ + err = bpf_map_charge_init(&dtab->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (err) goto free_dtab; @@ -125,19 +124,21 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) __alignof__(unsigned long), GFP_KERNEL | __GFP_NOWARN); if (!dtab->flush_needed) - goto free_dtab; + goto free_charge; dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_dtab; + goto free_charge; spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; +free_charge: + bpf_map_charge_finish(&dtab->map.memory); free_dtab: free_percpu(dtab->flush_needed); kfree(dtab); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index dc2b406a52f7..96399b280852 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -371,10 +371,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(htab->map.memory.pages); + /* if map size is larger than memlock limit, reject it */ + err = bpf_map_charge_init(&htab->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (err) goto free_htab; @@ -383,7 +382,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) sizeof(struct bucket), htab->map.numa_node); if (!htab->buckets) - goto free_htab; + goto free_charge; if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; @@ -416,6 +415,8 @@ free_prealloc: prealloc_destroy(htab); free_buckets: bpf_map_area_free(htab->buckets); +free_charge: + bpf_map_charge_finish(&htab->map.memory); free_htab: kfree(htab); return ERR_PTR(err); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 574325276650..e49bfd4f4f6d 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -272,6 +272,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; + struct bpf_map_memory mem; u32 pages; int ret; @@ -294,16 +295,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >> 
PAGE_SHIFT; - ret = bpf_map_precharge_memlock(pages); + ret = bpf_map_charge_init(&mem, pages); if (ret < 0) return ERR_PTR(ret); map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), __GFP_ZERO | GFP_USER, numa_node); - if (!map) + if (!map) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } - map->map.memory.pages = pages; + bpf_map_charge_move(&map->map.memory, &mem); /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 26cd7a0dbc3c..a0211be6f572 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -581,9 +581,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) goto out_err; } - trie->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - ret = bpf_map_precharge_memlock(trie->map.memory.pages); + ret = bpf_map_charge_init(&trie->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (ret) goto out_err; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8a510e71d486..224cb0fd8f03 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -67,6 +67,7 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int ret, numa_node = bpf_map_attr_numa_node(attr); + struct bpf_map_memory mem = {0}; struct bpf_queue_stack *qs; u64 size, queue_size, cost; @@ -77,19 +78,21 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(cost); + ret = bpf_map_charge_init(&mem, cost); if (ret < 0) return ERR_PTR(ret); qs = bpf_map_area_alloc(queue_size, numa_node); - if (!qs) + if (!qs) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } memset(qs, 0, sizeof(*qs)); bpf_map_init_from_attr(&qs->map, attr); - qs->map.memory.pages = cost; + bpf_map_charge_move(&qs->map.memory, &mem); qs->size = size; raw_spin_lock_init(&qs->lock); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 819515242739..5c6e25b1b9b1 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -151,6 +151,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int err, numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; + struct bpf_map_memory mem; u64 cost, array_size; if (!capable(CAP_SYS_ADMIN)) @@ -165,18 +166,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(cost); + err = bpf_map_charge_init(&mem, cost); if (err) return ERR_PTR(err); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) + if (!array) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.memory.pages = cost; + bpf_map_charge_move(&array->map.memory, &mem); return &array->map; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index beb23d9b8fcf..7cd12fb393ad 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -90,6 +90,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_stack_map *smap; + struct bpf_map_memory mem; u64 cost, n_buckets; int err; @@ -124,38 +125,41 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) cost = 
n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); - - smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); - if (!smap) - return ERR_PTR(-ENOMEM); - - err = -E2BIG; cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); if (cost >= U32_MAX - PAGE_SIZE) - goto free_smap; + return ERR_PTR(-E2BIG); + + err = bpf_map_charge_init(&mem, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + if (err) + return ERR_PTR(err); + + smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); + if (!smap) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; smap->n_buckets = n_buckets; - smap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - err = bpf_map_precharge_memlock(smap->map.memory.pages); - if (err) - goto free_smap; err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) - goto free_smap; + goto free_charge; err = prealloc_elems_and_freelist(smap); if (err) goto put_buffers; + bpf_map_charge_move(&smap->map.memory, &mem); + return &smap->map; put_buffers: put_callchain_buffers(); -free_smap: +free_charge: + bpf_map_charge_finish(&mem); bpf_map_area_free(smap); return ERR_PTR(err); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 43c6fd098cee..2937f26ae9fb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -189,19 +189,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) map->numa_node = bpf_map_attr_numa_node(attr); } -int bpf_map_precharge_memlock(u32 pages) -{ - struct user_struct *user = get_current_user(); - unsigned long memlock_limit, cur; - - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - cur = atomic_long_read(&user->locked_vm); - free_uid(user); - if (cur + pages > memlock_limit) - return -EPERM; - return 0; -} - static int bpf_charge_memlock(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -215,29 +202,40 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages) static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) { - atomic_long_sub(pages, &user->locked_vm); + if (user) + atomic_long_sub(pages, &user->locked_vm); } -static int bpf_map_init_memlock(struct bpf_map *map) +int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages) { struct user_struct *user = get_current_user(); int ret; - ret = bpf_charge_memlock(user, map->memory.pages); + ret = bpf_charge_memlock(user, pages); if (ret) { free_uid(user); return ret; } - map->memory.user = user; - return ret; + + mem->pages = pages; + mem->user = user; + + return 0; } -static void bpf_map_release_memlock(struct bpf_map *map) +void bpf_map_charge_finish(struct bpf_map_memory *mem) { - struct user_struct *user = map->memory.user; + bpf_uncharge_memlock(mem->user, mem->pages); + free_uid(mem->user); +} - bpf_uncharge_memlock(user, map->memory.pages); - free_uid(user); +void bpf_map_charge_move(struct bpf_map_memory *dst, + struct bpf_map_memory *src) +{ + *dst = *src; + + /* Make sure src will not be used for the redundant uncharging. 
*/ + memset(src, 0, sizeof(struct bpf_map_memory)); } int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) @@ -305,11 +303,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); + struct bpf_map_memory mem; - bpf_map_release_memlock(map); + bpf_map_charge_move(&mem, &map->memory); security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); + bpf_map_charge_finish(&mem); } static void bpf_map_put_uref(struct bpf_map *map) @@ -551,6 +551,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_map_memory mem; struct bpf_map *map; int f_flags; int err; @@ -575,7 +576,7 @@ static int map_create(union bpf_attr *attr) err = bpf_obj_name_cpy(map->name, attr->map_name); if (err) - goto free_map_nouncharge; + goto free_map; atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); @@ -585,20 +586,20 @@ static int map_create(union bpf_attr *attr) if (!attr->btf_value_type_id) { err = -EINVAL; - goto free_map_nouncharge; + goto free_map; } btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); - goto free_map_nouncharge; + goto free_map; } err = map_check_btf(map, btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) { btf_put(btf); - goto free_map_nouncharge; + goto free_map; } map->btf = btf; @@ -610,15 +611,11 @@ static int map_create(union bpf_attr *attr) err = security_bpf_map_alloc(map); if (err) - goto free_map_nouncharge; - - err = bpf_map_init_memlock(map); - if (err) - goto free_map_sec; + goto free_map; err = bpf_map_alloc_id(map); if (err) - goto free_map; + goto free_map_sec; err = bpf_map_new_fd(map, f_flags); if (err < 0) { @@ -634,13 +631,13 @@ static int map_create(union bpf_attr *attr) return err; -free_map: - bpf_map_release_memlock(map); free_map_sec: security_bpf_map_free(map); -free_map_nouncharge: +free_map: btf_put(map->btf); + bpf_map_charge_move(&mem, &map->memory); map->ops->map_free(map); + bpf_map_charge_finish(&mem); return err; } diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index f816ee1a0fa0..a329dab7c7a4 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -40,10 +40,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_m; - m->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_precharge_memlock(m->map.memory.pages); + err = bpf_map_charge_init(&m->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (err) goto free_m; @@ -51,7 +50,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) m->flush_list = alloc_percpu(struct list_head); if (!m->flush_list) - goto free_m; + goto free_charge; for_each_possible_cpu(cpu) INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); @@ -65,6 +64,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) free_percpu: free_percpu(m->flush_list); +free_charge: + bpf_map_charge_finish(&m->map.memory); free_m: kfree(m); return ERR_PTR(err); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 448b12d0f94e..e8c2bf973ce8 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -640,13 +640,16 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) cost = sizeof(*smap->buckets) * 
nbuckets + sizeof(*smap); pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(pages); - if (ret < 0) + ret = bpf_map_charge_init(&smap->map.memory, pages); + if (ret < 0) { + kfree(smap); return ERR_PTR(ret); + } smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN); if (!smap->buckets) { + bpf_map_charge_finish(&smap->map.memory); kfree(smap); return ERR_PTR(-ENOMEM); } @@ -659,7 +662,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % BPF_SK_STORAGE_CACHE_SIZE; - smap->map.memory.pages = pages; return &smap->map; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index d48c09dcc900..3563be4cf6aa 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -49,8 +49,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) goto free_stab; } - stab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(stab->map.memory.pages); + err = bpf_map_charge_init(&stab->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (err) goto free_stab; @@ -60,6 +60,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (stab->sks) return &stab->map; err = -ENOMEM; + bpf_map_charge_finish(&stab->map.memory); free_stab: kfree(stab); return ERR_PTR(err); From 1a371f6225ee86dfc92e07df70239845b1f4865a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:59 -0700 Subject: [PATCH 1147/1640] UPSTREAM: bpf: move memory size checks to bpf_map_charge_init() Most bpf map types do similar checks and the same bytes-to-pages conversion during memory allocation and charging. Let's unify these checks by moving them into bpf_map_charge_init().
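The caller side then shrinks to a single call taking the raw byte cost; the overflow check and the bytes-to-pages conversion now live in one place. Illustrative fragment (hypothetical map type, mirroring the hunks below):

	/* before: every ->alloc() open-coded the same checks */
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_map;
	err = bpf_map_charge_init(&map->map.memory,
				  round_up(cost, PAGE_SIZE) >> PAGE_SHIFT);

	/* after: pass the byte cost directly, the checks live in the helper */
	err = bpf_map_charge_init(&map->map.memory, cost);
	if (err)
		goto free_map;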
Change-Id: I55ceded2303102feba9e485042e8f5169f490609 Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/arraymap.c | 8 +------- kernel/bpf/cpumap.c | 5 +---- kernel/bpf/devmap.c | 5 +---- kernel/bpf/hashtab.c | 7 +------ kernel/bpf/local_storage.c | 5 +---- kernel/bpf/lpm_trie.c | 7 +------ kernel/bpf/queue_stack_maps.c | 4 ---- kernel/bpf/reuseport_array.c | 10 ++-------- kernel/bpf/stackmap.c | 8 +------- kernel/bpf/syscall.c | 9 +++++++-- kernel/bpf/xskmap.c | 5 +---- net/core/bpf_sk_storage.c | 4 +--- net/core/sock_map.c | 8 +------- 14 files changed, 20 insertions(+), 67 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e5e71b47175f..80252adb522d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -662,7 +662,7 @@ void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); -int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages); +int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size); void bpf_map_charge_finish(struct bpf_map_memory *mem); void bpf_map_charge_move(struct bpf_map_memory *dst, struct bpf_map_memory *src); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e63a0355caeb..9695ff02e22d 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -120,14 +120,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* make sure there is no u32 overflow later in round_up() */ cost = array_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - if (percpu) { + if (percpu) cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - } - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; ret = bpf_map_charge_init(&mem, cost); if (ret < 0) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 82117594f698..30e2373315d5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -106,12 +106,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_cmap; /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_charge_init(&cmap->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + ret = bpf_map_charge_init(&cmap->map.memory, cost); if (ret) { err = ret; goto free_cmap; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index db674bd1a032..e6e1adbce038 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -108,12 +108,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); cost += dev_map_bitmap_size(attr) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_dtab; /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&dtab->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) goto free_dtab; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 96399b280852..ee867e86e2b8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -367,13 +367,8 @@ static struct bpf_map 
*htab_map_alloc(union bpf_attr *attr) else cost += (u64) htab->elem_size * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - /* make sure page count doesn't overflow */ - goto free_htab; - /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&htab->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&htab->map.memory, cost); if (err) goto free_htab; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index e49bfd4f4f6d..addd6fdceec8 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -273,7 +273,6 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; struct bpf_map_memory mem; - u32 pages; int ret; if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) @@ -293,9 +292,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); - pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >> - PAGE_SHIFT; - ret = bpf_map_charge_init(&mem, pages); + ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); if (ret < 0) return ERR_PTR(ret); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index a0211be6f572..709bd91161ba 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -576,13 +576,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) cost_per_node = sizeof(struct lpm_trie_node) + attr->value_size + trie->data_size; cost += (u64) attr->max_entries * cost_per_node; - if (cost >= U32_MAX - PAGE_SIZE) { - ret = -E2BIG; - goto out_err; - } - ret = bpf_map_charge_init(&trie->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + ret = bpf_map_charge_init(&trie->map.memory, cost); if (ret) goto out_err; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 224cb0fd8f03..f697647ceb54 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -73,10 +73,6 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) size = (u64) attr->max_entries + 1; cost = queue_size = sizeof(*qs) + size * attr->value_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); - - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; ret = bpf_map_charge_init(&mem, cost); if (ret < 0) diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 5c6e25b1b9b1..50c083ba978c 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -152,7 +152,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) int err, numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; struct bpf_map_memory mem; - u64 cost, array_size; + u64 array_size; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -160,13 +160,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) array_size = sizeof(*array); array_size += (u64)attr->max_entries * sizeof(struct sock *); - /* make sure there is no u32 overflow later in round_up() */ - cost = array_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - err = bpf_map_charge_init(&mem, cost); + err = bpf_map_charge_init(&mem, array_size); if (err) return ERR_PTR(err); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 7cd12fb393ad..1e7667ad3712 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ 
-123,14 +123,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) n_buckets = roundup_pow_of_two(attr->max_entries); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); - - err = bpf_map_charge_init(&mem, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&mem, cost); if (err) return ERR_PTR(err); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2937f26ae9fb..d9f55b907608 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -206,11 +206,16 @@ static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) atomic_long_sub(pages, &user->locked_vm); } -int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages) +int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size) { - struct user_struct *user = get_current_user(); + u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; + struct user_struct *user; int ret; + if (size >= U32_MAX - PAGE_SIZE) + return -E2BIG; + + user = get_current_user(); ret = bpf_charge_memlock(user, pages); if (ret) { free_uid(user); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index a329dab7c7a4..22066c28ba61 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -37,12 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); cost += sizeof(struct list_head) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_m; /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_charge_init(&m->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&m->map.memory, cost); if (err) goto free_m; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index e8c2bf973ce8..70da3de6e2ab 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -626,7 +626,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) struct bpf_sk_storage_map *smap; unsigned int i; u32 nbuckets; - u32 pages; u64 cost; int ret; @@ -638,9 +637,8 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus())); nbuckets = 1U << smap->bucket_log; cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); - pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_charge_init(&smap->map.memory, pages); + ret = bpf_map_charge_init(&smap->map.memory, cost); if (ret < 0) { kfree(smap); return ERR_PTR(ret); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 3563be4cf6aa..b476d119de7e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -44,13 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* Make sure page count doesn't overflow. 
*/ cost = (u64) stab->map.max_entries * sizeof(struct sock *); - if (cost >= U32_MAX - PAGE_SIZE) { - err = -EINVAL; - goto free_stab; - } - - err = bpf_map_charge_init(&stab->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&stab->map.memory, cost); if (err) goto free_stab; From cf13e160ccc0eddf3464c819a326e1854f8f3e65 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 4 Jun 2019 09:21:46 +0100 Subject: [PATCH 1148/1640] UPSTREAM: bpf: remove redundant assignment to err The variable err is assigned the value -EINVAL, which is never read; err is re-assigned a new value later on. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Change-Id: Id4523d352feaedeae5be03c7303137e2221e03a7 Signed-off-by: Colin Ian King Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 2 +- kernel/bpf/xskmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e6e1adbce038..d50907193c09 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -88,8 +88,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr) static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; - int err = -EINVAL; u64 cost; + int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 22066c28ba61..413d75f4fc72 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -17,8 +17,8 @@ struct xsk_map { static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { - int cpu, err = -EINVAL; struct xsk_map *m; + int cpu, err; u64 cost; if (!capable(CAP_NET_ADMIN)) From 3deb30fb748435fa2e6dc6c63f548f77ab24e161 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 May 2019 07:18:09 -0700 Subject: [PATCH 1149/1640] BACKPORT: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 295 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of version 2 of the gnu general public license as published by the free software foundation this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 64 file(s).
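The transformation is mechanical: the full license notice is dropped and a single SPDX tag becomes the first line of each file. As the hunks below show, C sources use the C++ comment style while headers keep a block comment (illustrative):

	/* first line of a .c file: */
	// SPDX-License-Identifier: GPL-2.0-only

	/* first line of a .h file: */
	/* SPDX-License-Identifier: GPL-2.0-only */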
Change-Id: Ic7cca08bbba3c38e0d53d3374c43ee8bf1e24172 Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190529141901.894819585@linutronix.de Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/pti.c | 10 +--------- drivers/acpi/nfit/core.c | 10 +--------- drivers/acpi/nfit/mce.c | 10 +--------- drivers/acpi/nfit/nfit.h | 10 +--------- drivers/dax/dax-private.h | 10 +--------- drivers/dax/super.c | 10 +--------- drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.h | 10 +--------- .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192c.c | 10 +--------- .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c | 10 +--------- .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723a.c | 10 +--------- .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c | 10 +--------- .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c | 10 +--------- .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_regs.h | 10 +--------- drivers/nvdimm/btt_devs.c | 10 +--------- drivers/nvdimm/bus.c | 10 +--------- drivers/nvdimm/claim.c | 10 +--------- drivers/nvdimm/core.c | 10 +--------- drivers/nvdimm/dax_devs.c | 10 +--------- drivers/nvdimm/dimm.c | 10 +--------- drivers/nvdimm/dimm_devs.c | 10 +--------- drivers/nvdimm/label.c | 10 +--------- drivers/nvdimm/label.h | 10 +--------- drivers/nvdimm/namespace_devs.c | 10 +--------- drivers/nvdimm/nd-core.h | 10 +--------- drivers/nvdimm/nd.h | 10 +--------- drivers/nvdimm/pfn_devs.c | 10 +--------- drivers/nvdimm/region.c | 10 +--------- drivers/nvdimm/region_devs.c | 10 +--------- drivers/nvmem/meson-efuse.c | 10 +--------- drivers/nvmem/rockchip-efuse.c | 10 +--------- include/linux/libnvdimm.h | 10 +--------- include/linux/nd.h | 10 +--------- include/net/mpls.h | 10 +--------- include/net/mpls_iptunnel.h | 10 +--------- kernel/bpf/arraymap.c | 10 +--------- kernel/bpf/devmap.c | 10 +--------- kernel/bpf/disasm.c | 10 +--------- kernel/bpf/disasm.h | 10 +--------- kernel/bpf/hashtab.c | 10 +--------- kernel/bpf/helpers.c | 10 +--------- kernel/bpf/syscall.c | 10 +--------- kernel/bpf/verifier.c | 10 +--------- lib/test_bpf.c | 10 +--------- net/core/lwt_bpf.c | 10 +--------- net/core/ptp_classifier.c | 10 +--------- net/openvswitch/conntrack.c | 10 +--------- net/openvswitch/conntrack.h | 10 +--------- samples/bpf/xdp_redirect_map_user.c | 10 +--------- samples/bpf/xdp_redirect_user.c | 10 +--------- security/selinux/ibpkey.c | 12 +----------- security/selinux/include/ibpkey.h | 12 +----------- security/selinux/include/netnode.h | 12 +----------- security/selinux/include/netport.h | 12 +----------- security/selinux/netnode.c | 12 +----------- security/selinux/netport.c | 12 +----------- sound/pci/asihpi/hpioctl.c | 9 +-------- tools/testing/nvdimm/test/iomap.c | 10 +--------- tools/testing/nvdimm/test/nfit.c | 10 +--------- tools/testing/nvdimm/test/nfit_test.h | 10 +--------- tools/testing/selftests/timers/freq-step.c | 10 +--------- 60 files changed, 60 insertions(+), 551 deletions(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 639f56dc626a..73faaf6e6852 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2017 Intel Corporation. All rights reserved. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * * This code is based in part on work published here: * * https://github.com/IAIK/KAISER diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 41e1b135cf97..ab368bdee05a 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index feeb95d574fa..1516f357a18d 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -1,16 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * NFIT - Machine Check Handler * * Copyright(c) 2013-2016 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index 54292db61262..9deeb15efd91 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -1,16 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * NVDIMM Firmware Interface Table - NFIT * * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef __NFIT_H__ #define __NFIT_H__ diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index b6fc4f04636d..ea569dc0ba94 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright(c) 2016 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
*/ #ifndef __DAX_PRIVATE_H__ #define __DAX_PRIVATE_H__ diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 6c179c2a9ff9..1baa8ed6bb7b 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2017 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.h b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.h index 1913b51c1e80..880110b3df09 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.h +++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu.h @@ -1,15 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2014 - 2017 Jes Sorensen * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * * Register definitions taken from original Realtek rtl8723au driver */ diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192c.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192c.c index a41a29612582..27c4cb688be4 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192c.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192c.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RTL8XXXU mac80211 USB driver - 8188c/8188r/8192c specific subdriver * @@ -10,15 +11,6 @@ * rtl8723au driver. As the Realtek 8xxx chips are very similar in * their programming interface, I have started adding support for * additional 8xxx chips like the 8192cu, 8188cus, etc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c index 413f0ced960a..f6af1d401a31 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RTL8XXXU mac80211 USB driver - 8192e specific subdriver * @@ -10,15 +11,6 @@ * rtl8723au driver. As the Realtek 8xxx chips are very similar in * their programming interface, I have started adding support for * additional 8xxx chips like the 8192cu, 8188cus, etc. 
- * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723a.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723a.c index 174631132b96..4f93f88716a9 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723a.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723a.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RTL8XXXU mac80211 USB driver - 8723a specific subdriver * @@ -10,15 +11,6 @@ * rtl8723au driver. As the Realtek 8xxx chips are very similar in * their programming interface, I have started adding support for * additional 8xxx chips like the 8192cu, 8188cus, etc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c index 27e97df996c7..22a76910b785 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RTL8XXXU mac80211 USB driver - 8723b specific subdriver * @@ -10,15 +11,6 @@ * rtl8723au driver. As the Realtek 8xxx chips are very similar in * their programming interface, I have started adding support for * additional 8xxx chips like the 8192cu, 8188cus, etc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c index 9263a6a64788..305f9934b451 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RTL8XXXU mac80211 USB driver * @@ -10,15 +11,6 @@ * rtl8723au driver. As the Realtek 8xxx chips are very similar in * their programming interface, I have started adding support for * additional 8xxx chips like the 8192cu, 8188cus, etc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_regs.h b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_regs.h index 3d3e2e1ada6f..a2a31f374a82 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_regs.h +++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_regs.h @@ -1,15 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2014 - 2017 Jes Sorensen * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * * Register definitions taken from original Realtek rtl8723au driver */ diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index 76a74e292fd7..c2d17c957cf8 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 17d54cabcbc6..291ccf717a64 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 32f2aaf62f27..85380f9d6efa 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
*/ #include #include diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index bb71f0cf8f5d..2244b4c478f1 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c index 0e9e37410c58..16ed08d76fff 100644 --- a/drivers/nvdimm/dax_devs.c +++ b/drivers/nvdimm/dax_devs.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2016 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c index 0939f064054d..a6ba9e65e2f6 100644 --- a/drivers/nvdimm/dimm.c +++ b/drivers/nvdimm/dimm.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 58e0dcfeb724..1674e9280c74 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 84eb510456fd..43194f1a7e2d 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h index 9ed772db6900..2d374af0429d 100644 --- a/drivers/nvdimm/label.h +++ b/drivers/nvdimm/label.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef __LABEL_H__ #define __LABEL_H__ diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 6ed3b4ed27dd..0182947d7c89 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index f7b0c39ac339..6a71b083eaf7 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef __ND_CORE_H__ #define __ND_CORE_H__ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index cf30b49481a1..024ace14ea39 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. */ #ifndef __ND_H__ #define __ND_H__ diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index e2af91a91c22..cd1d0f88329c 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2016 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 034f0a07d627..d29084736f5a 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index c0e6a6d235de..57711462fa23 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/drivers/nvmem/meson-efuse.c b/drivers/nvmem/meson-efuse.c index 70bfc9839bb2..e4d821ba964d 100644 --- a/drivers/nvmem/meson-efuse.c +++ b/drivers/nvmem/meson-efuse.c @@ -1,17 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Amlogic eFuse Driver * * Copyright (c) 2016 Endless Computers, Inc. * Author: Carlo Caione - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/drivers/nvmem/rockchip-efuse.c b/drivers/nvmem/rockchip-efuse.c index 63e3eb55f3ac..c7e5e253860f 100644 --- a/drivers/nvmem/rockchip-efuse.c +++ b/drivers/nvmem/rockchip-efuse.c @@ -1,17 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Rockchip eFuse Driver * * Copyright (c) 2015 Rockchip Electronics Co. Ltd. 
* Author: Caesar Wang - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 84284e3353ed..27570fa47b4d 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -1,16 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * libnvdimm - Non-volatile-memory Devices Subsystem * * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ diff --git a/include/linux/nd.h b/include/linux/nd.h index 5dc6b695437d..0bdaae2f525a 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef __LINUX_ND_H__ #define __LINUX_ND_H__ diff --git a/include/net/mpls.h b/include/net/mpls.h index 1dbc669b770e..ccaf238e8ea7 100644 --- a/include/net/mpls.h +++ b/include/net/mpls.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2014 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef _NET_MPLS_H diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h index 9d22bf67ac86..6b4759eae158 100644 --- a/include/net/mpls_iptunnel.h +++ b/include/net/mpls_iptunnel.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2015 Cumulus Networks, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. */ #ifndef _NET_MPLS_IPTUNNEL_H diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 9695ff02e22d..af90c4498e80 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016,2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d50907193c09..40e86a7e0ef0 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ /* Devmaps primary use is as a backend map for XDP BPF helper call diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 6929928be5d7..ff1dd7d45b58 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e1324a834a24..e546b18d27da 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
*/ #ifndef __BPF_DISASM_H__ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ee867e86e2b8..87579912f656 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8bf07a9596b3..d15fc6dcb6d6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d9f55b907608..72a3de6cf3ba 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a83bc8caeb0e..469990884f5d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
*/ #include #include diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 724674c421ca..a811848b5db4 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1,16 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Testsuite for BPF interpreter and BPF JIT compiler * * Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 680782c53225..3c29c02db4ef 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Thomas Graf - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index 703cf76aa7c2..4083e90e3ac0 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* PTP classifier - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ /* The below program is the bpf_asm (tools/net/) representation of diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 3d74e33bf829..3117b0c143d0 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index bc7efd1867ab..e2ba460ec91e 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef OVS_CONNTRACK_H diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index d4d86a273fba..98c270013353 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index bd9fa7a55a30..3ce2a63254e9 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 John Fastabend - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/security/selinux/ibpkey.c b/security/selinux/ibpkey.c index cb05ae28ce00..5887bff50560 100644 --- a/security/selinux/ibpkey.c +++ b/security/selinux/ibpkey.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Pkey table * @@ -11,21 +12,10 @@ * Paul Moore * (see security/selinux/netif.c and security/selinux/netport.c for more * information) - * */ /* * (c) Mellanox Technologies, 2016 - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include diff --git a/security/selinux/include/ibpkey.h b/security/selinux/include/ibpkey.h index e3c2b12ac22a..a684bf86933d 100644 --- a/security/selinux/include/ibpkey.h +++ b/security/selinux/include/ibpkey.h @@ -1,24 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * pkey table * * SELinux must keep a mapping of pkeys to labels/SIDs. This * mapping is maintained as part of the normal policy but a fast cache is * needed to reduce the lookup overhead. - * */ /* * (c) Mellanox Technologies, 2016 - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #ifndef _SELINUX_IB_PKEY_H diff --git a/security/selinux/include/netnode.h b/security/selinux/include/netnode.h index 937668dd3024..e3f784a85840 100644 --- a/security/selinux/include/netnode.h +++ b/security/selinux/include/netnode.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Network node table * @@ -7,21 +8,10 @@ * a per-packet basis. * * Author: Paul Moore - * */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #ifndef _SELINUX_NETNODE_H diff --git a/security/selinux/include/netport.h b/security/selinux/include/netport.h index d1ce896b2cb0..31bc16e29cd1 100644 --- a/security/selinux/include/netport.h +++ b/security/selinux/include/netport.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Network port table * @@ -6,21 +7,10 @@ * needed to reduce the lookup overhead. * * Author: Paul Moore - * */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2008 - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #ifndef _SELINUX_NETPORT_H diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c index 08ee81bfcc1c..8335595f7050 100644 --- a/security/selinux/netnode.c +++ b/security/selinux/netnode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Network node table * @@ -11,21 +12,10 @@ * This code is heavily based on the "netif" concept originally developed by * James Morris * (see security/selinux/netif.c for more information) - * */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * */ #include diff --git a/security/selinux/netport.c b/security/selinux/netport.c index 4e8e4458b55f..24c2bce74248 100644 --- a/security/selinux/netport.c +++ b/security/selinux/netport.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Network port table * @@ -10,21 +11,10 @@ * This code is heavily based on the "netif" concept originally developed by * James Morris * (see security/selinux/netif.c for more information) - * */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2008 - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include diff --git a/sound/pci/asihpi/hpioctl.c b/sound/pci/asihpi/hpioctl.c index 81f240472943..3aa075460f8b 100644 --- a/sound/pci/asihpi/hpioctl.c +++ b/sound/pci/asihpi/hpioctl.c @@ -1,17 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /******************************************************************************* AudioScience HPI driver Common Linux HPI ioctl and module probe/remove functions Copyright (C) 1997-2014 AudioScience Inc. - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation; - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. *******************************************************************************/ #define SOURCEFILE_NAME "hpioctl.c" diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index f2a00b0698a3..a77aa7b1ca8c 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 3ad0b3a3317b..fe88ad7fa82d 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h index d3d63dd5ed38..d1d488a652b6 100644 --- a/tools/testing/nvdimm/test/nfit_test.h +++ b/tools/testing/nvdimm/test/nfit_test.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef __NFIT_TEST_H__ #define __NFIT_TEST_H__ diff --git a/tools/testing/selftests/timers/freq-step.c b/tools/testing/selftests/timers/freq-step.c index 14a2b77fd012..8cd10662ffba 100644 --- a/tools/testing/selftests/timers/freq-step.c +++ b/tools/testing/selftests/timers/freq-step.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * This test checks the response of the system clock to frequency * steps made with adjtimex(). The frequency error and stability of @@ -6,15 +7,6 @@ * values from the second interval exceed specified limits. * * Copyright (C) Miroslav Lichvar 2017 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include From 039080665ca2c5d4374264d188bcdb63e9e511eb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 31 May 2019 01:09:24 -0700 Subject: [PATCH 1150/1640] UPSTREAM: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 363 Based on 1 normalized pattern(s): released under terms in gpl version 2 see copying extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 5 file(s). Change-Id: I7bf284e48b904ef798c2b5d2e94ef27bc58472df Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531081035.689962394@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/sunrpc/cache.h | 4 +--- include/net/xdp.h | 2 +- kernel/bpf/cpumap.c | 2 +- net/core/xdp.c | 2 +- net/sunrpc/cache.c | 4 +--- 5 files changed, 5 insertions(+), 9 deletions(-) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 270bad0e1bed..8903f42f6e3e 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * include/linux/sunrpc/cache.h * @@ -5,9 +6,6 @@ * used by sunrpc clients and servers. * * Copyright (C) 2002 Neil Brown - * - * Released under terms in GPL version 2. See COPYING. 
- * */ #ifndef _LINUX_SUNRPC_CACHE_H_ diff --git a/include/net/xdp.h b/include/net/xdp.h index 5e49e060d550..826ea013bea0 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -1,7 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* include/net/xdp.h * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. - * Released under terms in GPL version 2. See COPYING. */ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 30e2373315d5..5a614e62a9f1 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* bpf/cpumap.c * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. - * Released under terms in GPL version 2. See COPYING. */ /* The 'cpumap' is primarily used as a backend map for XDP BPF helper diff --git a/net/core/xdp.c b/net/core/xdp.c index 12c16e6a1d6c..31715ddf4c49 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* net/core/xdp.c * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. - * Released under terms in GPL version 2. See COPYING. */ #include #include diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 556989b0b5fc..09c9b4ca84bc 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * net/sunrpc/cache.c * @@ -5,9 +6,6 @@ * used by sunrpc clients and servers. * * Copyright (C) 2002 Neil Brown - * - * Released under terms in GPL version 2. See COPYING. - * */ #include From 3c6ca42c559b8f0ee3d00dfd886ccc2e2841aa41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:22 +0200 Subject: [PATCH 1151/1640] BACKPORT: net: initial AF_XDP skeleton MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Buildable skeleton of AF_XDP without any functionality. Just what it takes to register a new address family. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 8 ++++++++ include/linux/socket.h | 5 ++++- net/Kconfig | 1 + net/core/sock.c | 12 ++++++++---- net/xdp/Kconfig | 7 +++++++ security/selinux/hooks.c | 4 +++- security/selinux/include/classmap.h | 4 +++- 7 files changed, 34 insertions(+), 7 deletions(-) create mode 100644 net/xdp/Kconfig diff --git a/MAINTAINERS b/MAINTAINERS index 324dc3cc5c1b..a17554af111f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14706,6 +14706,14 @@ T: git git://linuxtv.org/media_tree.git S: Maintained F: drivers/media/tuners/tuner-xc2028.* +XDP SOCKETS (AF_XDP) +M: Björn Töpel +M: Magnus Karlsson +L: netdev@vger.kernel.org +S: Maintained +F: kernel/bpf/xskmap.c +F: net/xdp/ + XEN BLOCK SUBSYSTEM M: Konrad Rzeszutek Wilk M: Roger Pau Monné diff --git a/include/linux/socket.h b/include/linux/socket.h index add9360cec7f..a668c219e814 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -207,8 +207,9 @@ struct ucred { * PF_SMC protocol family that * reuses AF_INET address family */ +#define AF_XDP 44 /* XDP sockets */ -#define AF_MAX 44 /* For now.. */ +#define AF_MAX 45 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC @@ -257,6 +258,7 @@ struct ucred { #define PF_KCM AF_KCM #define PF_QIPCRTR AF_QIPCRTR #define PF_SMC AF_SMC +#define PF_XDP AF_XDP #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. 
*/ @@ -338,6 +340,7 @@ struct ucred { #define SOL_NFC 280 #define SOL_KCM 281 #define SOL_TLS 282 +#define SOL_XDP 283 /* IPX options */ #define IPX_TYPE 1 diff --git a/net/Kconfig b/net/Kconfig index cb530725373c..cb26f0ddc435 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -60,6 +60,7 @@ source "net/xfrm/Kconfig" source "net/iucv/Kconfig" source "net/ncm/Kconfig" source "net/smc/Kconfig" +source "net/xdp/Kconfig" config INET bool "TCP/IP networking" diff --git a/net/core/sock.c b/net/core/sock.c index 0ed2d253aee7..5981b4443e3b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -231,7 +231,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ - x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX" + x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ + x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { _sock_locks("sk_lock-") @@ -267,7 +268,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = { "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , - "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX" + "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" , + "rlock-AF_MAX" }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , @@ -284,7 +286,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = { "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , - "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX" + "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" , + "wlock-AF_MAX" }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , @@ -301,7 +304,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = { "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , - "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX" + "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" , + "elock-AF_MAX" }; /* diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig new file mode 100644 index 000000000000..90e4a7152854 --- /dev/null +++ b/net/xdp/Kconfig @@ -0,0 +1,7 @@ +config XDP_SOCKETS + bool "XDP sockets" + depends on BPF_SYSCALL + default n + help + XDP sockets allows a channel between XDP programs and + userspace applications. diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 25e49e232f49..c526194d920e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1618,7 +1618,9 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc return SECCLASS_QIPCRTR_SOCKET; case PF_SMC: return SECCLASS_SMC_SOCKET; -#if PF_MAX > 44 + case PF_XDP: + return SECCLASS_XDP_SOCKET; +#if PF_MAX > 45 #error New address family defined, please update this function. 
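The skeleton deliberately stops at reserving identifiers: AF_XDP/PF_XDP = 44, SOL_XDP = 283, the lock-class name strings in sock.c, and the bumped PF_MAX > 45 build-time tripwires enforced by the SELinux #error checks. For orientation, the registration those identifiers exist for looks roughly like the sketch below; xsk_create() and xsk_family_ops are assumed names (the real code lands in net/xdp/xsk.c in follow-up patches), and the stub refuses to do anything, matching the "without any functionality" intent:

	// SPDX-License-Identifier: GPL-2.0-only
	/* Sketch only: minimal AF_XDP family registration; this skeleton
	 * patch itself adds no such code yet.
	 */
	#include <linux/errno.h>
	#include <linux/module.h>
	#include <linux/socket.h>
	#include <net/sock.h>

	static int xsk_create(struct net *net, struct socket *sock,
			      int protocol, int kern)
	{
		return -EOPNOTSUPP;	/* no functionality yet, by design */
	}

	static const struct net_proto_family xsk_family_ops = {
		.family	= PF_XDP,
		.create	= xsk_create,
		.owner	= THIS_MODULE,
	};

	static int __init xsk_init(void)
	{
		/* after this, socket(AF_XDP, ...) resolves to xsk_create() */
		return sock_register(&xsk_family_ops);
	}
	fs_initcall(xsk_init);

Note how the series also has to teach SELinux about the new family: both socket_type_to_security_class() and secclass_map carry the PF_MAX > 45 guard, so the next family bump cannot silently skip either table.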
#endif } diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index b52c7e1d7a84..2eea88e95b29 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -242,11 +242,13 @@ struct security_class_mapping secclass_map[] = { { "manage_subnet", NULL } }, { "bpf", {"map_create", "map_read", "map_write", "prog_load", "prog_run"} }, + { "xdp_socket", + { COMMON_SOCK_PERMS, NULL } }, { "perf_event", {"open", "cpu", "kernel", "tracepoint", "read", "write"} }, { NULL } }; -#if PF_MAX > 44 +#if PF_MAX > 45 #error New address family defined, please update secclass_map. #endif From 0b1e0d00e16c707cb766690bfacb4b287caf7124 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:31 +0200 Subject: [PATCH 1152/1640] UPSTREAM: xsk: add umem completion queue support and mmap Here, we add another setsockopt for registered user memory (umem) called XDP_UMEM_COMPLETION_QUEUE. Using this socket option, the process can ask the kernel to allocate a queue (ring buffer) and also mmap it (XDP_UMEM_PGOFF_COMPLETION_QUEUE) into the process. The queue is used to explicitly pass ownership of umem frames from the kernel to user process. This will be used by the TX path to tell user space that a certain frame has been transmitted and user space can use it for something else, if it wishes. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 2 ++ net/xdp/xdp_umem.c | 7 ++++++- net/xdp/xdp_umem.h | 1 + net/xdp/xsk.c | 7 ++++++- 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 2c2d12750bed..f10cf4ab1706 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -27,6 +27,7 @@ struct sockaddr_xdp { #define XDP_RX_RING 1 #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 +#define XDP_UMEM_COMPLETION_RING 5 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -38,6 +39,7 @@ struct xdp_umem_reg { /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 struct xdp_desc { __u32 idx; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 1a25690f9721..5c6e6f5024c0 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -61,6 +61,11 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->fq = NULL; } + if (umem->cq) { + xskq_destroy(umem->cq); + umem->cq = NULL; + } + if (umem->pgs) { xdp_umem_unpin_pages(umem); @@ -242,5 +247,5 @@ out: bool xdp_umem_validate_queues(struct xdp_umem *umem) { - return umem->fq; + return (umem->fq && umem->cq); } diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index c205d53ca479..70fe225baa51 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -15,6 +15,7 @@ struct xdp_umem { struct xsk_queue *fq; + struct xsk_queue *cq; struct page **pgs; struct xdp_umem_props props; u32 npgs; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 344a67d86e9a..621d8a24b8a7 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -246,6 +246,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } else { /* This xsk has its own umem. */ xskq_set_umem(xs->umem->fq, &xs->umem->props); + xskq_set_umem(xs->umem->cq, &xs->umem->props); } /* Rebind? 
*/ @@ -325,6 +326,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return 0; } case XDP_UMEM_FILL_RING: + case XDP_UMEM_COMPLETION_RING: { struct xsk_queue **q; int entries; @@ -336,7 +338,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); - q = &xs->umem->fq; + q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : + &xs->umem->cq; err = xsk_init_queue(entries, q, true); mutex_unlock(&xs->mutex); return err; @@ -366,6 +369,8 @@ static int xsk_mmap(struct file *file, struct socket *sock, if (offset == XDP_UMEM_PGOFF_FILL_RING) q = xs->umem->fq; + else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) + q = xs->umem->cq; else return -EINVAL; } From 483a84fc2ccebc7013afbb2792595c2cd4cb9683 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:32 +0200 Subject: [PATCH 1153/1640] UPSTREAM: xsk: add Tx queue setup and mmap support Another setsockopt (XDP_TX_QUEUE) is added to let the process allocate a queue, where the user process can pass frames to be transmitted by the kernel. The mmapping of the queue is done using the XDP_PGOFF_TX_QUEUE offset. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 1 + include/uapi/linux/if_xdp.h | 2 ++ net/xdp/xsk.c | 8 ++++++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index ec369686c88f..7a647c56ec15 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -21,6 +21,7 @@ struct xdp_sock { struct xdp_umem *umem; struct list_head flush_node; u16 queue_id; + struct xsk_queue *tx ____cacheline_aligned_in_smp; /* Protects multiple processes in the control path */ struct mutex mutex; u64 rx_dropped; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index f10cf4ab1706..01ae7b72bfa4 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -25,6 +25,7 @@ struct sockaddr_xdp { /* XDP socket options */ #define XDP_RX_RING 1 +#define XDP_TX_RING 2 #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 #define XDP_UMEM_COMPLETION_RING 5 @@ -38,6 +39,7 @@ struct xdp_umem_reg { /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 +#define XDP_PGOFF_TX_RING 0x80000000 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 621d8a24b8a7..683853dcf51a 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -197,7 +197,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_release; } - if (!xs->rx) { + if (!xs->rx && !xs->tx) { err = -EINVAL; goto out_unlock; } @@ -282,6 +282,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case XDP_RX_RING: + case XDP_TX_RING: { struct xsk_queue **q; int entries; @@ -292,7 +293,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); - q = &xs->rx; + q = (optname == XDP_TX_RING) ? 
&xs->tx : &xs->rx; err = xsk_init_queue(entries, q, false); mutex_unlock(&xs->mutex); return err; @@ -363,6 +364,8 @@ static int xsk_mmap(struct file *file, struct socket *sock, if (offset == XDP_PGOFF_RX_RING) { q = xs->rx; + } else if (offset == XDP_PGOFF_TX_RING) { + q = xs->tx; } else { if (!xs->umem) return -EINVAL; @@ -422,6 +425,7 @@ static void xsk_destruct(struct sock *sk) return; xskq_destroy(xs->rx); + xskq_destroy(xs->tx); xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); From bc6db7d7c40e5826fe1ed38f9dab1175ba4a0b8e Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:34 +0200 Subject: [PATCH 1154/1640] UPSTREAM: xsk: support for Tx Here, Tx support is added. The user fills the Tx queue with frames to be sent by the kernel, and lets the kernel know using the sendmsg syscall. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 111 ++++++++++++++++++++++++++++++++++++++++++-- net/xdp/xsk_queue.h | 93 ++++++++++++++++++++++++++++++++++++- 2 files changed, 200 insertions(+), 4 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 683853dcf51a..7c66599ae496 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -27,6 +27,8 @@ #include "xsk_queue.h" #include "xdp_umem.h" +#define TX_BATCH_SIZE 16 + static struct xdp_sock *xdp_sk(struct sock *sk) { return (struct xdp_sock *)sk; @@ -92,6 +94,108 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return err; } +static void xsk_destruct_skb(struct sk_buff *skb) +{ + u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg; + struct xdp_sock *xs = xdp_sk(skb->sk); + + WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id)); + + sock_wfree(skb); +} + +static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, + size_t total_len) +{ + bool need_wait = !(m->msg_flags & MSG_DONTWAIT); + u32 max_batch = TX_BATCH_SIZE; + struct xdp_sock *xs = xdp_sk(sk); + bool sent_frame = false; + struct xdp_desc desc; + struct sk_buff *skb; + int err = 0; + + if (unlikely(!xs->tx)) + return -ENOBUFS; + if (need_wait) + return -EOPNOTSUPP; + + mutex_lock(&xs->mutex); + + while (xskq_peek_desc(xs->tx, &desc)) { + char *buffer; + u32 id, len; + + if (max_batch-- == 0) { + err = -EAGAIN; + goto out; + } + + if (xskq_reserve_id(xs->umem->cq)) { + err = -EAGAIN; + goto out; + } + + len = desc.len; + if (unlikely(len > xs->dev->mtu)) { + err = -EMSGSIZE; + goto out; + } + + skb = sock_alloc_send_skb(sk, len, !need_wait, &err); + if (unlikely(!skb)) { + err = -EAGAIN; + goto out; + } + + skb_put(skb, len); + id = desc.idx; + buffer = xdp_umem_get_data(xs->umem, id) + desc.offset; + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) { + kfree_skb(skb); + goto out; + } + + skb->dev = xs->dev; + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + skb_shinfo(skb)->destructor_arg = (void *)(long)id; + skb->destructor = xsk_destruct_skb; + + err = dev_direct_xmit(skb, xs->queue_id); + /* Ignore NET_XMIT_CN as packet might have been sent */ + if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { + err = -EAGAIN; + /* SKB consumed by dev_direct_xmit() */ + goto out; + } + + sent_frame = true; + xskq_discard_desc(xs->tx); + } + +out: + if (sent_frame) + sk->sk_write_space(sk); + + mutex_unlock(&xs->mutex); + return err; +} + +static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + + if (unlikely(!xs->dev)) + return -ENXIO; + if (unlikely(!(xs->dev->flags & IFF_UP))) + return -ENETDOWN; + + return
xsk_generic_xmit(sk, m, total_len); +} + static unsigned int xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { @@ -101,6 +205,8 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock, if (xs->rx && !xskq_empty_desc(xs->rx)) mask |= POLLIN | POLLRDNORM; + if (xs->tx && !xskq_full_desc(xs->tx)) + mask |= POLLOUT | POLLWRNORM; return mask; } @@ -261,6 +367,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->queue_id = sxdp->sxdp_queue_id; xskq_set_umem(xs->rx, &xs->umem->props); + xskq_set_umem(xs->tx, &xs->umem->props); out_unlock: if (err) @@ -374,8 +481,6 @@ static int xsk_mmap(struct file *file, struct socket *sock, q = xs->umem->fq; else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) q = xs->umem->cq; - else - return -EINVAL; } if (!q) @@ -411,7 +516,7 @@ static const struct proto_ops xsk_proto_ops = { .shutdown = sock_no_shutdown, .setsockopt = xsk_setsockopt, .getsockopt = sock_no_getsockopt, - .sendmsg = sock_no_sendmsg, + .sendmsg = xsk_sendmsg, .recvmsg = sock_no_recvmsg, .mmap = xsk_mmap, .sendpage = sock_no_sendpage, diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 7ca9ce45c342..5e0ac867b2ad 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -102,7 +102,93 @@ static inline void xskq_discard_id(struct xsk_queue *q) (void)xskq_validate_id(q); } -/* Rx queue */ +static inline int xskq_produce_id(struct xsk_queue *q, u32 id) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + ring->desc[q->prod_tail++ & q->ring_mask] = id; + + /* Order producer and data */ + smp_wmb(); + + WRITE_ONCE(q->ring->producer, q->prod_tail); + return 0; +} + +static inline int xskq_reserve_id(struct xsk_queue *q) +{ + if (xskq_nb_free(q, q->prod_head, 1) == 0) + return -ENOSPC; + + q->prod_head++; + return 0; +} + +/* Rx/Tx queue */ + +static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) +{ + u32 buff_len; + + if (unlikely(d->idx >= q->umem_props.nframes)) { + q->invalid_descs++; + return false; + } + + buff_len = q->umem_props.frame_size; + if (unlikely(d->len > buff_len || d->len == 0 || + d->offset > buff_len || d->offset + d->len > buff_len)) { + q->invalid_descs++; + return false; + } + + return true; +} + +static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, + struct xdp_desc *desc) +{ + while (q->cons_tail != q->cons_head) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + unsigned int idx = q->cons_tail & q->ring_mask; + + if (xskq_is_valid_desc(q, &ring->desc[idx])) { + if (desc) + *desc = ring->desc[idx]; + return desc; + } + + q->cons_tail++; + } + + return NULL; +} + +static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, + struct xdp_desc *desc) +{ + struct xdp_rxtx_ring *ring; + + if (q->cons_tail == q->cons_head) { + WRITE_ONCE(q->ring->consumer, q->cons_tail); + q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + + /* Order consumer and data */ + smp_rmb(); + + return xskq_validate_desc(q, desc); + } + + ring = (struct xdp_rxtx_ring *)q->ring; + *desc = ring->desc[q->cons_tail & q->ring_mask]; + return desc; +} + +static inline void xskq_discard_desc(struct xsk_queue *q) +{ + q->cons_tail++; + (void)xskq_validate_desc(q, NULL); +} static inline int xskq_produce_batch_desc(struct xsk_queue *q, u32 id, u32 len, u16 offset) @@ -130,6 +216,11 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) WRITE_ONCE(q->ring->producer, q->prod_tail); } +static inline bool 
xskq_full_desc(struct xsk_queue *q) +{ + return (xskq_nb_avail(q, q->nentries) == q->nentries); +} + static inline bool xskq_empty_desc(struct xsk_queue *q) { return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); From edbe0a56f4780dcd3e216cf29fa1abec911cea57 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:35 +0200 Subject: [PATCH 1155/1640] UPSTREAM: xsk: statistics support In this commit, a new getsockopt is added: XDP_STATISTICS. This is used to obtain stats from the sockets. v2: getsockopt now returns size of stats structure. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 7 ++++++ net/xdp/xsk.c | 45 ++++++++++++++++++++++++++++++++++++- net/xdp/xsk_queue.h | 5 +++++ 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 01ae7b72bfa4..56db977221d2 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -29,6 +29,7 @@ struct sockaddr_xdp { #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 #define XDP_UMEM_COMPLETION_RING 5 +#define XDP_STATISTICS 6 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -37,6 +38,12 @@ struct xdp_umem_reg { __u32 frame_headroom; /* Frame head room */ }; +struct xdp_statistics { + __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ +}; + /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_PGOFF_TX_RING 0x80000000 diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 7c66599ae496..29c4de3a120c 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -459,6 +459,49 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -ENOPROTOOPT; } +static int xsk_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + int len; + + if (level != SOL_XDP) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + case XDP_STATISTICS: + { + struct xdp_statistics stats; + + if (len < sizeof(stats)) + return -EINVAL; + + mutex_lock(&xs->mutex); + stats.rx_dropped = xs->rx_dropped; + stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); + stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); + mutex_unlock(&xs->mutex); + + if (copy_to_user(optval, &stats, sizeof(stats))) + return -EFAULT; + if (put_user(sizeof(stats), optlen)) + return -EFAULT; + + return 0; + } + default: + break; + } + + return -EOPNOTSUPP; +} + static int xsk_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { @@ -515,7 +558,7 @@ static const struct proto_ops xsk_proto_ops = { .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = xsk_setsockopt, - .getsockopt = sock_no_getsockopt, + .getsockopt = xsk_getsockopt, .sendmsg = xsk_sendmsg, .recvmsg = sock_no_recvmsg, .mmap = xsk_mmap, diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5e0ac867b2ad..928d464e57b9 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -27,6 +27,11 @@ struct xsk_queue { /* Common functions operating for both RXTX and umem queues */ +static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) +{ + return q ? 
q->invalid_descs : 0; +} + static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) { u32 entries = q->prod_tail - q->cons_tail; From 2a6fdb944c7bc1eeddd07b89dfe3311907af214f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 7 May 2018 19:43:50 +0200 Subject: [PATCH 1156/1640] UPSTREAM: xsk: fix 64-bit division MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit i386 builds report: net/xdp/xdp_umem.o: In function `xdp_umem_reg': xdp_umem.c:(.text+0x47e): undefined reference to `__udivdi3' This fix uses div_u64 instead of the GCC built-in. Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Signed-off-by: Björn Töpel Reported-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 5c6e6f5024c0..df4ea97c433b 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -200,7 +200,7 @@ int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if ((addr + size) < addr) return -EINVAL; - nframes = size / frame_size; + nframes = (unsigned int)div_u64(size, frame_size); if (nframes == 0 || nframes > UINT_MAX) return -EINVAL; From 5bb2b034e8dcceceb1df5b18d9185df5ecc74c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 18 May 2018 14:00:22 +0200 Subject: [PATCH 1157/1640] UPSTREAM: xsk: remove newline at end of file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minor cleanup, remove newline at end of Makefile. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 074fb2b2d51c..04f073146256 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,2 +1 @@ obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o - From b0ba506991ad1f9b46c8b494920fbe8905e4c998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 18 May 2018 14:00:23 +0200 Subject: [PATCH 1158/1640] UPSTREAM: xsk: fixed some cases of unnecessary parentheses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removed some cases of unnecessary parentheses. 
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 4 ++-- net/xdp/xsk_queue.c | 3 +-- net/xdp/xsk_queue.h | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index df4ea97c433b..c47909c74899 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -20,7 +20,7 @@ int xdp_umem_create(struct xdp_umem **umem) { *umem = kzalloc(sizeof(**umem), GFP_KERNEL); - if (!(*umem)) + if (!*umem) return -ENOMEM; return 0; @@ -247,5 +247,5 @@ out: bool xdp_umem_validate_queues(struct xdp_umem *umem) { - return (umem->fq && umem->cq); + return umem->fq && umem->cq; } diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 9f605d22dad4..ebe85e59507e 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -22,8 +22,7 @@ static u32 xskq_umem_get_ring_size(struct xsk_queue *q) static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) { - return (sizeof(struct xdp_ring) + - q->nentries * sizeof(struct xdp_desc)); + return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc); } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 928d464e57b9..62e43be407d8 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -223,12 +223,12 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) static inline bool xskq_full_desc(struct xsk_queue *q) { - return (xskq_nb_avail(q, q->nentries) == q->nentries); + return xskq_nb_avail(q, q->nentries) == q->nentries; } static inline bool xskq_empty_desc(struct xsk_queue *q) { - return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); + return xskq_nb_free(q, q->prod_tail, 1) == q->nentries; } void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); From b42772222463cbdbd7dffa2588bbb2eb5e13744b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 18 May 2018 14:00:24 +0200 Subject: [PATCH 1159/1640] UPSTREAM: xsk: proper '=' alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Properly align xsk_proto_ops initialization. 
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 29c4de3a120c..43f5a9331ebe 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -545,24 +545,24 @@ static struct proto xsk_proto = { }; static const struct proto_ops xsk_proto_ops = { - .family = PF_XDP, - .owner = THIS_MODULE, - .release = xsk_release, - .bind = xsk_bind, - .connect = sock_no_connect, - .socketpair = sock_no_socketpair, - .accept = sock_no_accept, - .getname = sock_no_getname, - .poll = xsk_poll, - .ioctl = sock_no_ioctl, - .listen = sock_no_listen, - .shutdown = sock_no_shutdown, - .setsockopt = xsk_setsockopt, - .getsockopt = xsk_getsockopt, - .sendmsg = xsk_sendmsg, - .recvmsg = sock_no_recvmsg, - .mmap = xsk_mmap, - .sendpage = sock_no_sendpage, + .family = PF_XDP, + .owner = THIS_MODULE, + .release = xsk_release, + .bind = xsk_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = xsk_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = xsk_setsockopt, + .getsockopt = xsk_getsockopt, + .sendmsg = xsk_sendmsg, + .recvmsg = sock_no_recvmsg, + .mmap = xsk_mmap, + .sendpage = sock_no_sendpage, }; static void xsk_destruct(struct sock *sk) From d20e3c7c2cf475624ebe104184c9b859462afebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 22 May 2018 09:34:56 +0200 Subject: [PATCH 1160/1640] UPSTREAM: xsk: remove rebind support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Supporting rebind, i.e. after a successful bind the process can call bind again without closing the socket, makes the AF_XDP setup state machine more complex. Constrain the state space, by not supporting rebind. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 43f5a9331ebe..734389869d47 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -227,14 +227,6 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, return 0; } -static void __xsk_release(struct xdp_sock *xs) -{ - /* Wait for driver to stop using the xdp socket. */ - synchronize_net(); - - dev_put(xs->dev); -} - static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -251,7 +243,9 @@ static int xsk_release(struct socket *sock) local_bh_enable(); if (xs->dev) { - __xsk_release(xs); + /* Wait for driver to stop using the xdp socket. 
*/ + synchronize_net(); + dev_put(xs->dev); xs->dev = NULL; } @@ -285,9 +279,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; struct sock *sk = sock->sk; - struct net_device *dev, *dev_curr; struct xdp_sock *xs = xdp_sk(sk); - struct xdp_umem *old_umem = NULL; + struct net_device *dev; int err = 0; if (addr_len < sizeof(struct sockaddr_xdp)) @@ -296,7 +289,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return -EINVAL; mutex_lock(&xs->mutex); - dev_curr = xs->dev; + if (xs->dev) { + err = -EBUSY; + goto out_release; + } + dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); if (!dev) { err = -ENODEV; @@ -343,7 +340,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } xdp_get_umem(umem_xs->umem); - old_umem = xs->umem; xs->umem = umem_xs->umem; sockfd_put(sock); } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { @@ -355,14 +351,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xskq_set_umem(xs->umem->cq, &xs->umem->props); } - /* Rebind? */ - if (dev_curr && (dev_curr != dev || - xs->queue_id != sxdp->sxdp_queue_id)) { - __xsk_release(xs); - if (old_umem) - xdp_put_umem(old_umem); - } - xs->dev = dev; xs->queue_id = sxdp->sxdp_queue_id; From 29620f6da4f90fabdc2efa42a448f1aff7eb5b14 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 22 May 2018 09:34:58 +0200 Subject: [PATCH 1161/1640] UPSTREAM: xsk: proper queue id check at bind Validate the queue id against both Rx and Tx on the netdev. Also, make sure that the queue exists at xmit time. Reported-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 734389869d47..41e6eb5b16ca 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -142,6 +142,11 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } + if (xs->queue_id >= xs->dev->real_num_tx_queues) { + err = -ENXIO; + goto out; + } + skb = sock_alloc_send_skb(sk, len, !need_wait, &err); if (unlikely(!skb)) { err = -EAGAIN; @@ -305,7 +310,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } - if (sxdp->sxdp_queue_id >= dev->num_rx_queues) { + if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) || + (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) { err = -EINVAL; goto out_unlock; } From de6c3813a2d6a924ff87b2a6799193e7020a5211 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 22 May 2018 09:34:59 +0200 Subject: [PATCH 1162/1640] UPSTREAM: xsk: remove explicit ring structure from uapi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this commit we remove the explicit ring structure from the the uapi. It is tricky for an uapi to depend on a certain L1 cache line size, since it can differ for variants of the same architecture. Now, we let the user application determine the offsets of the producer, consumer and descriptors by asking the socket via getsockopt. A typical flow would be (Rx ring): struct xdp_mmap_offsets off; struct xdp_desc *ring; u32 *prod, *cons; void *map; ... 
getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen); map = mmap(NULL, off.rx.desc + NUM_DESCS * sizeof(struct xdp_desc), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, sfd, XDP_PGOFF_RX_RING); prod = map + off.rx.producer; cons = map + off.rx.consumer; ring = map + off.rx.desc; Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 44 ++++++++++++++++++------------------- net/xdp/xsk.c | 29 ++++++++++++++++++++++++ net/xdp/xsk_queue.h | 17 ++++++++++++++ 3 files changed, 68 insertions(+), 22 deletions(-) diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 56db977221d2..16cc76b4a4d9 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -23,13 +23,27 @@ struct sockaddr_xdp { __u16 sxdp_flags; }; +struct xdp_ring_offset { + __u64 producer; + __u64 consumer; + __u64 desc; +}; + +struct xdp_mmap_offsets { + struct xdp_ring_offset rx; + struct xdp_ring_offset tx; + struct xdp_ring_offset fr; /* Fill */ + struct xdp_ring_offset cr; /* Completion */ +}; + /* XDP socket options */ -#define XDP_RX_RING 1 -#define XDP_TX_RING 2 -#define XDP_UMEM_REG 3 -#define XDP_UMEM_FILL_RING 4 -#define XDP_UMEM_COMPLETION_RING 5 -#define XDP_STATISTICS 6 +#define XDP_MMAP_OFFSETS 1 +#define XDP_RX_RING 2 +#define XDP_TX_RING 3 +#define XDP_UMEM_REG 4 +#define XDP_UMEM_FILL_RING 5 +#define XDP_UMEM_COMPLETION_RING 6 +#define XDP_STATISTICS 7 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -50,6 +64,7 @@ struct xdp_statistics { #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 +/* Rx/Tx descriptor */ struct xdp_desc { __u32 idx; __u32 len; @@ -58,21 +73,6 @@ struct xdp_desc { __u8 padding[5]; }; -struct xdp_ring { - __u32 producer __attribute__((aligned(64))); - __u32 consumer __attribute__((aligned(64))); -}; - -/* Used for the RX and TX queues for packets */ -struct xdp_rxtx_ring { - struct xdp_ring ptrs; - struct xdp_desc desc[0] __attribute__((aligned(64))); -}; - -/* Used for the fill and completion queues for buffers */ -struct xdp_umem_ring { - struct xdp_ring ptrs; - __u32 desc[0] __attribute__((aligned(64))); -}; +/* UMEM descriptor is __u32 */ #endif /* _LINUX_IF_XDP_H */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 41e6eb5b16ca..cd3713f5bf07 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -489,6 +489,35 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, return 0; } + case XDP_MMAP_OFFSETS: + { + struct xdp_mmap_offsets off; + + if (len < sizeof(off)) + return -EINVAL; + + off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); + off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); + off.rx.desc = offsetof(struct xdp_rxtx_ring, desc); + off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); + off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); + off.tx.desc = offsetof(struct xdp_rxtx_ring, desc); + + off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); + off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); + off.fr.desc = offsetof(struct xdp_umem_ring, desc); + off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer); + off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); + off.cr.desc = offsetof(struct xdp_umem_ring, desc); + + len = sizeof(off); + if (copy_to_user(optval, &off, len)) + return -EFAULT; + if (put_user(len, optlen)) + return -EFAULT; + + return 0; + } default: break; } diff --git a/net/xdp/xsk_queue.h 
b/net/xdp/xsk_queue.h index 62e43be407d8..cb8e5be35110 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -13,6 +13,23 @@ #define RX_BATCH_SIZE 16 +struct xdp_ring { + u32 producer ____cacheline_aligned_in_smp; + u32 consumer ____cacheline_aligned_in_smp; +}; + +/* Used for the RX and TX queues for packets */ +struct xdp_rxtx_ring { + struct xdp_ring ptrs; + struct xdp_desc desc[0] ____cacheline_aligned_in_smp; +}; + +/* Used for the fill and completion queues for buffers */ +struct xdp_umem_ring { + struct xdp_ring ptrs; + u32 desc[0] ____cacheline_aligned_in_smp; +}; + struct xsk_queue { struct xdp_umem_props umem_props; u32 ring_mask; From 12f65ad623aabdeda2109e0469dc4628a43d3ce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 22 May 2018 09:35:01 +0200 Subject: [PATCH 1163/1640] UPSTREAM: xsk: add missing write- and data-dependency barrier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Here, we add a missing write-barrier, and use READ_ONCE for the data-dependency barrier. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cd3713f5bf07..0396837459f4 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -228,6 +228,8 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, if (!q) return -ENOMEM; + /* Make sure queue is ready before it can be seen by others */ + smp_wmb(); *queue = q; return 0; } @@ -532,21 +534,23 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); struct xsk_queue *q = NULL; + struct xdp_umem *umem; unsigned long pfn; struct page *qpg; if (offset == XDP_PGOFF_RX_RING) { - q = xs->rx; + q = READ_ONCE(xs->rx); } else if (offset == XDP_PGOFF_TX_RING) { - q = xs->tx; + q = READ_ONCE(xs->tx); } else { - if (!xs->umem) + umem = READ_ONCE(xs->umem); + if (!umem) return -EINVAL; if (offset == XDP_UMEM_PGOFF_FILL_RING) - q = xs->umem->fq; + q = READ_ONCE(umem->fq); else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) - q = xs->umem->cq; + q = READ_ONCE(umem->cq); } if (!q) From 036fd2d68a4697b2dcad22cf6b39a60e8d0d3b88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 22 May 2018 09:35:02 +0200 Subject: [PATCH 1164/1640] UPSTREAM: xsk: simplified umem setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As suggested by Daniel Borkmann, the umem setup code was too defensive and complex. Here, we reduce the number of checks. Also, the memory pinning is now folded into the umem creation, and we do correct locking.
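For reference, the register-umem path in xsk_setsockopt() now roughly reduces to the following (condensed from the diff below; the error path unlocks before returning): mutex_lock(&xs->mutex); ... umem = xdp_umem_create(&mr); if (IS_ERR(umem)) { mutex_unlock(&xs->mutex); return PTR_ERR(umem); } /* Make sure umem is ready before it can be seen by others */ smp_wmb(); xs->umem = umem; mutex_unlock(&xs->mutex);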
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 79 ++++++++++++++++++++++------------------------ net/xdp/xdp_umem.h | 3 +- net/xdp/xsk.c | 26 +++++++-------- 3 files changed, 52 insertions(+), 56 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index c47909c74899..faa6ffbaf6ab 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -16,39 +16,25 @@ #define XDP_UMEM_MIN_FRAME_SIZE 2048 -int xdp_umem_create(struct xdp_umem **umem) -{ - *umem = kzalloc(sizeof(**umem), GFP_KERNEL); - - if (!*umem) - return -ENOMEM; - - return 0; -} - static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unsigned int i; - if (umem->pgs) { - for (i = 0; i < umem->npgs; i++) { - struct page *page = umem->pgs[i]; + for (i = 0; i < umem->npgs; i++) { + struct page *page = umem->pgs[i]; - set_page_dirty_lock(page); - put_page(page); - } - - kfree(umem->pgs); - umem->pgs = NULL; + set_page_dirty_lock(page); + put_page(page); } + + kfree(umem->pgs); + umem->pgs = NULL; } static void xdp_umem_unaccount_pages(struct xdp_umem *umem) { - if (umem->user) { - atomic_long_sub(umem->npgs, &umem->user->locked_vm); - free_uid(umem->user); - } + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); } static void xdp_umem_release(struct xdp_umem *umem) @@ -66,22 +52,18 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } - if (umem->pgs) { - xdp_umem_unpin_pages(umem); + xdp_umem_unpin_pages(umem); - task = get_pid_task(umem->pid, PIDTYPE_PID); - put_pid(umem->pid); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); - if (!mm) - goto out; - - mmput(mm); - umem->pgs = NULL; - } + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + mmput(mm); xdp_umem_unaccount_pages(umem); out: kfree(umem); @@ -167,16 +149,13 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) return 0; } -int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) +static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; u64 addr = mr->addr, size = mr->len; unsigned int nframes, nfpp; int size_chk, err; - if (!umem) - return -EINVAL; - if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: * - huge pages, or* @@ -245,6 +224,24 @@ out: return err; } +struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) +{ + struct xdp_umem *umem; + int err; + + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + err = xdp_umem_reg(umem, mr); + if (err) { + kfree(umem); + return ERR_PTR(err); + } + + return umem; +} + bool xdp_umem_validate_queues(struct xdp_umem *umem) { return umem->fq && umem->cq; diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 70fe225baa51..9802287ff19d 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -50,9 +50,8 @@ static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem, } bool xdp_umem_validate_queues(struct xdp_umem *umem); -int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); -int xdp_umem_create(struct xdp_umem **umem); +struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr); #endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 0396837459f4..a54146d105ce 
100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -406,25 +406,23 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xdp_umem_reg mr; struct xdp_umem *umem; - if (xs->umem) - return -EBUSY; - if (copy_from_user(&mr, optval, sizeof(mr))) return -EFAULT; mutex_lock(&xs->mutex); - err = xdp_umem_create(&umem); - - err = xdp_umem_reg(umem, &mr); - if (err) { - kfree(umem); + if (xs->umem) { mutex_unlock(&xs->mutex); - return err; + return -EBUSY; + } + + umem = xdp_umem_create(&mr); + if (IS_ERR(umem)) { + mutex_unlock(&xs->mutex); + return PTR_ERR(umem); } /* Make sure umem is ready before it can be seen by others */ smp_wmb(); - xs->umem = umem; mutex_unlock(&xs->mutex); return 0; @@ -435,13 +433,15 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xsk_queue **q; int entries; - if (!xs->umem) - return -EINVAL; - if (copy_from_user(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); + if (!xs->umem) { + mutex_unlock(&xs->mutex); + return -EINVAL; + } + q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : &xs->umem->cq; err = xsk_init_queue(entries, q, true); From d247aca5452910ff48ac025b87ba60e10df5d3d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 22 May 2018 09:35:03 +0200 Subject: [PATCH 1165/1640] UPSTREAM: xsk: convert atomic_t to refcount_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce refcount_t, in favor of atomic_t. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 6 +++--- net/xdp/xdp_umem.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index faa6ffbaf6ab..87998818116f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -78,7 +78,7 @@ static void xdp_umem_release_deferred(struct work_struct *work) void xdp_get_umem(struct xdp_umem *umem) { - atomic_inc(&umem->users); + refcount_inc(&umem->users); } void xdp_put_umem(struct xdp_umem *umem) @@ -86,7 +86,7 @@ void xdp_put_umem(struct xdp_umem *umem) if (!umem) return; - if (atomic_dec_and_test(&umem->users)) { + if (refcount_dec_and_test(&umem->users)) { INIT_WORK(&umem->work, xdp_umem_release_deferred); schedule_work(&umem->work); } @@ -206,7 +206,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->frame_size_log2 = ilog2(frame_size); umem->nfpp_mask = nfpp - 1; umem->nfpplog2 = ilog2(nfpp); - atomic_set(&umem->users, 1); + refcount_set(&umem->users, 1); err = xdp_umem_account_pages(umem); if (err) diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 9802287ff19d..0881cf456230 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -27,7 +27,7 @@ struct xdp_umem { struct pid *pid; unsigned long address; size_t size; - atomic_t users; + refcount_t users; struct work_struct work; }; From 4ee6026e6910aa71a62695ff8b19d9e8b6b21a43 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 31 May 2018 11:00:13 +0200 Subject: [PATCH 1166/1640] UPSTREAM: xdp: done implementing ndo_xdp_xmit flush flag for all drivers Removing XDP_XMIT_FLAGS_NONE as all drivers now implement a flush operation in their ndo_xdp_xmit call. The compiler will catch if any users of XDP_XMIT_FLAGS_NONE remain.
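The driver-side pattern this relies on can be sketched as follows; dummy_xdp_xmit(), dummy_enqueue_frame() and dummy_ring_doorbell() are placeholders for driver-specific code, not functions from this series: static int dummy_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { int i, err; for (i = 0; i < n; i++) { err = dummy_enqueue_frame(dev, frames[i]); /* post to HW Tx ring */ if (err) return err; } if (flags & XDP_XMIT_FLUSH) dummy_ring_doorbell(dev); /* one doorbell per batch, not per frame */ return 0; }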
Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 826ea013bea0..e79e3b938448 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -41,7 +41,6 @@ enum xdp_mem_type { }; /* XDP flags for ndo_xdp_xmit */ -#define XDP_XMIT_FLAGS_NONE 0U #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH From fa89a8d77c2b082118d96f77d398733b76c08c3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 13:57:11 +0200 Subject: [PATCH 1167/1640] UPSTREAM: xsk: proper fill queue descriptor validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the fill queue descriptor was not copied to kernel space prior validating it, making it possible for userland to change the descriptor post-kernel-validation. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 11 +++++------ net/xdp/xsk_queue.h | 32 +++++++++----------------------- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a54146d105ce..bb76a0398879 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,20 +41,19 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 *id, len = xdp->data_end - xdp->data; + u32 id, len = xdp->data_end - xdp->data; void *buffer; - int err = 0; + int err; if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; - id = xskq_peek_id(xs->umem->fq); - if (!id) + if (!xskq_peek_id(xs->umem->fq, &id)) return -ENOSPC; - buffer = xdp_umem_get_data_with_headroom(xs->umem, *id); + buffer = xdp_umem_get_data_with_headroom(xs->umem, id); memcpy(buffer, xdp->data, len); - err = xskq_produce_batch_desc(xs->rx, *id, len, + err = xskq_produce_batch_desc(xs->rx, id, len, xs->umem->frame_headroom); if (!err) xskq_discard_id(xs->umem->fq); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index cb8e5be35110..b5924e7aeb2b 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -85,14 +85,15 @@ static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx) return true; } -static inline u32 *xskq_validate_id(struct xsk_queue *q) +static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id) { while (q->cons_tail != q->cons_head) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; - if (xskq_is_valid_id(q, ring->desc[idx])) - return &ring->desc[idx]; + *id = READ_ONCE(ring->desc[idx]); + if (xskq_is_valid_id(q, *id)) + return id; q->cons_tail++; } @@ -100,28 +101,22 @@ static inline u32 *xskq_validate_id(struct xsk_queue *q) return NULL; } -static inline u32 *xskq_peek_id(struct xsk_queue *q) +static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id) { - struct xdp_umem_ring *ring; - if (q->cons_tail == q->cons_head) { WRITE_ONCE(q->ring->consumer, q->cons_tail); q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); /* Order consumer and data */ smp_rmb(); - - return xskq_validate_id(q); } - ring = (struct xdp_umem_ring *)q->ring; - return &ring->desc[q->cons_tail & q->ring_mask]; + return xskq_validate_id(q, id); } static inline void xskq_discard_id(struct xsk_queue *q) { q->cons_tail++; - (void)xskq_validate_id(q); } static inline int xskq_produce_id(struct xsk_queue *q, u32 id) @@ -174,11 
+169,9 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; - if (xskq_is_valid_desc(q, &ring->desc[idx])) { - if (desc) - *desc = ring->desc[idx]; + *desc = READ_ONCE(ring->desc[idx]); + if (xskq_is_valid_desc(q, desc)) return desc; - } q->cons_tail++; } @@ -189,27 +182,20 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, struct xdp_desc *desc) { - struct xdp_rxtx_ring *ring; - if (q->cons_tail == q->cons_head) { WRITE_ONCE(q->ring->consumer, q->cons_tail); q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); /* Order consumer and data */ smp_rmb(); - - return xskq_validate_desc(q, desc); } - ring = (struct xdp_rxtx_ring *)q->ring; - *desc = ring->desc[q->cons_tail & q->ring_mask]; - return desc; + return xskq_validate_desc(q, desc); } static inline void xskq_discard_desc(struct xsk_queue *q) { q->cons_tail++; - (void)xskq_validate_desc(q, NULL); } static inline int xskq_produce_batch_desc(struct xsk_queue *q, From b716f6740c05432a3bf9994e6274e3f705ced16f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 13:57:12 +0200 Subject: [PATCH 1168/1640] UPSTREAM: xsk: proper Rx drop statistics update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, rx_dropped could be updated incorrectly, e.g. if the XDP program redirected the frame to a socket bound to a different queue than where the XDP program was executing. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index bb76a0398879..cb257ec955fb 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -48,8 +48,10 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; - if (!xskq_peek_id(xs->umem->fq, &id)) + if (!xskq_peek_id(xs->umem->fq, &id)) { + xs->rx_dropped++; return -ENOSPC; + } buffer = xdp_umem_get_data_with_headroom(xs->umem, id); memcpy(buffer, xdp->data, len); @@ -57,6 +59,8 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) xs->umem->frame_headroom); if (!err) xskq_discard_id(xs->umem->fq); + else + xs->rx_dropped++; return err; } @@ -68,8 +72,6 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) err = __xsk_rcv(xs, xdp); if (likely(!err)) xdp_return_buff(xdp); - else - xs->rx_dropped++; return err; } @@ -87,8 +89,6 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) err = __xsk_rcv(xs, xdp); if (!err) xsk_flush(xs); - else - xs->rx_dropped++; return err; } From 5fd6ba9adda3f9d4bdd3986556955d9348555ff0 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:36 +0200 Subject: [PATCH 1169/1640] BACKPORT: samples/bpf: sample application and documentation for AF_XDP sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a sample application for AF_XDP sockets. The application supports three different modes of operation: rxdrop, txonly and l2fwd. To show-case a simple round-robin load-balancing between a set of sockets in an xskmap, set the RR_LB compile time define option to 1 in "xdpsock.h". v2: The entries variable was calculated twice in {umem,xq}_nb_avail. 
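For illustration, assuming interface p3p2 and queue 16 (placeholders), the three modes can be exercised with: samples/bpf/xdpsock -i p3p2 -q 16 -r -N (rxdrop), samples/bpf/xdpsock -i p3p2 -q 16 -t -N (txonly) and samples/bpf/xdpsock -i p3p2 -q 16 -l -N (l2fwd).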
Co-authored-by: Björn Töpel Signed-off-by: Björn Töpel Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- Documentation/networking/af_xdp.rst | 297 +++++++++ Documentation/networking/index.rst | 1 + samples/bpf/Makefile | 4 + samples/bpf/xdpsock.h | 11 + samples/bpf/xdpsock_kern.c | 56 ++ samples/bpf/xdpsock_user.c | 948 ++++++++++++++++++++++++++++ 6 files changed, 1317 insertions(+) create mode 100644 Documentation/networking/af_xdp.rst create mode 100644 samples/bpf/xdpsock.h create mode 100644 samples/bpf/xdpsock_kern.c create mode 100644 samples/bpf/xdpsock_user.c diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst new file mode 100644 index 000000000000..91928d9ee4bf --- /dev/null +++ b/Documentation/networking/af_xdp.rst @@ -0,0 +1,297 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====== +AF_XDP +====== + +Overview +======== + +AF_XDP is an address family that is optimized for high performance +packet processing. + +This document assumes that the reader is familiar with BPF and XDP. If +not, the Cilium project has an excellent reference guide at +http://cilium.readthedocs.io/en/doc-1.0/bpf/. + +Using the XDP_REDIRECT action from an XDP program, the program can +redirect ingress frames to other XDP enabled netdevs, using the +bpf_redirect_map() function. AF_XDP sockets enable +XDP programs to redirect frames to a memory buffer in a user-space +application. + +An AF_XDP socket (XSK) is created with the normal socket() +syscall. Associated with each XSK are two rings: the RX ring and the +TX ring. A socket can receive packets on the RX ring and it can send +packets on the TX ring. These rings are registered and sized with the +setsockopts XDP_RX_RING and XDP_TX_RING, respectively. It is mandatory +to have at least one of these rings for each socket. An RX or TX +descriptor ring points to a data buffer in a memory area called a +UMEM. RX and TX can share the same UMEM so that a packet does not have +to be copied between RX and TX. Moreover, if a packet needs to be kept +for a while due to a possible retransmit, the descriptor that points +to that packet can be changed to point to another and reused right +away. This again avoids copying data. + +The UMEM consists of a number of equally sized frames and each frame +has a unique frame id. A descriptor in one of the rings references a +frame by referencing its frame id. The user space allocates memory for +this UMEM using whatever means it feels is most appropriate (malloc, +mmap, huge pages, etc). This memory area is then registered with the +kernel using the new setsockopt XDP_UMEM_REG. The UMEM also has two +rings: the FILL ring and the COMPLETION ring. The fill ring is used by +the application to send down frame ids for the kernel to fill in with +RX packet data. References to these frames will then appear in the RX +ring once each packet has been received. The completion ring, on the +other hand, contains frame ids that the kernel has transmitted +completely and can now be used again by user space, for either TX or +RX. Thus, the frame ids appearing in the completion ring are ids that +were previously transmitted using the TX ring. In summary, the RX and +FILL rings are used for the RX path and the TX and COMPLETION rings +are used for the TX path. + +The socket is then finally bound with a bind() call to a device and a +specific queue id on that device, and it is not until bind is +completed that traffic starts to flow.
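+ +As an illustration, a minimal setup sketch could look like the +following, with error handling omitted and NUM_FRAMES, FRAME_SIZE, +NUM_DESCS and the buffer area bufs being application-chosen +placeholders:: + + int sfd = socket(AF_XDP, SOCK_RAW, 0); + + struct xdp_umem_reg mr = { + .addr = (__u64)(unsigned long)bufs, /* start of the UMEM area */ + .len = NUM_FRAMES * FRAME_SIZE, + .frame_size = FRAME_SIZE, + .frame_headroom = 0, + }; + setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)); + + int entries = NUM_DESCS; /* ring sizes, power of two */ + setsockopt(sfd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries)); + setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries)); + + struct sockaddr_xdp sxdp = { + .sxdp_family = AF_XDP, + .sxdp_ifindex = if_nametoindex("p3p2"), + .sxdp_queue_id = 16, + }; + bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp));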
+ +The UMEM can be shared between processes, if desired. If a process +wants to do this, it simply skips the registration of the UMEM and its +corresponding two rings, sets the XDP_SHARED_UMEM flag in the bind +call and submits the XSK of the process it would like to share UMEM +with as well as its own newly created XSK socket. The new process will +then receive frame id references in its own RX ring that point to this +shared UMEM. Note that since the ring structures are single-consumer / +single-producer (for performance reasons), the new process has to +create its own socket with associated RX and TX rings, since it cannot +share this with the other process. This is also the reason that there +is only one set of FILL and COMPLETION rings per UMEM. It is the +responsibility of a single process to handle the UMEM. + +How are packets then distributed from an XDP program to the XSKs? There +is a BPF map called XSKMAP (or BPF_MAP_TYPE_XSKMAP in full). The +user-space application can place an XSK at an arbitrary place in this +map. The XDP program can then redirect a packet to a specific index in +this map and at this point XDP validates that the XSK in that map was +indeed bound to that device and ring number. If not, the packet is +dropped. If the map is empty at that index, the packet is also +dropped. This also means that it is currently mandatory to have an XDP +program loaded (and one XSK in the XSKMAP) to be able to get any +traffic to user space through the XSK. + +AF_XDP can operate in two different modes: XDP_SKB and XDP_DRV. If the +driver does not have support for XDP, or XDP_SKB is explicitly chosen +when loading the XDP program, XDP_SKB mode is employed that uses SKBs +together with the generic XDP support and copies out the data to user +space. This is a fallback mode that works for any network device. On the other +hand, if the driver has support for XDP, it will be used by the AF_XDP +code to provide better performance, but there is still a copy of the +data into user space. + +Concepts +======== + +In order to use an AF_XDP socket, a number of associated objects need +to be set up. + +Jonathan Corbet has also written an excellent article on LWN, +"Accelerating networking with AF_XDP". It can be found at +https://lwn.net/Articles/750845/. + +UMEM +---- + +UMEM is a region of virtually contiguous memory, divided into +equal-sized frames. A UMEM is associated with a netdev and a specific +queue id of that netdev. It is created and configured (frame size, +frame headroom, start address and size) by using the XDP_UMEM_REG +setsockopt system call. A UMEM is bound to a netdev and queue id via +the bind() system call. + +An AF_XDP socket is linked to a single UMEM, but one UMEM can have +multiple AF_XDP sockets. To share a UMEM created via socket A, +a second socket B sets the XDP_SHARED_UMEM flag in +struct sockaddr_xdp member sxdp_flags, and passes the file descriptor +of A in struct sockaddr_xdp member sxdp_shared_umem_fd. + +The UMEM has two single-producer/single-consumer rings that are used +to transfer ownership of UMEM frames between the kernel and the +user-space application. + +Rings +----- + +There are four different kinds of rings: Fill, Completion, RX and +TX. All rings are single-producer/single-consumer, so the user-space +application needs explicit synchronization if multiple +processes/threads are reading/writing to them. + +The UMEM uses two rings: Fill and Completion. Each socket associated +with the UMEM must have an RX queue, TX queue or both.
Say that there +is a setup with four sockets (all doing TX and RX). Then there will be +one Fill ring, one Completion ring, four TX rings and four RX rings. + +The rings are head(producer)/tail(consumer) based rings. A producer +writes the data ring at the index pointed out by the struct xdp_ring +producer member, and then increases the producer index. A consumer reads +the data ring at the index pointed out by the struct xdp_ring consumer +member, and then increases the consumer index. + +The rings are configured and created via the _RING setsockopt system +calls and mmapped to user-space using the appropriate offset to mmap() +(XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING, XDP_UMEM_PGOFF_FILL_RING and +XDP_UMEM_PGOFF_COMPLETION_RING). + +The size of the rings needs to be a power of two. + +UMEM Fill Ring +~~~~~~~~~~~~~~ + +The Fill ring is used to transfer ownership of UMEM frames from +user-space to kernel-space. The UMEM indices are passed in the +ring. As an example, if the UMEM is 64k and each frame is 4k, then the +UMEM has 16 frames and can pass indices between 0 and 15. + +Frames passed to the kernel are used for the ingress path (RX rings). + +The user application produces UMEM indices to this ring. + +UMEM Completion Ring +~~~~~~~~~~~~~~~~~~~~ + +The Completion Ring is used to transfer ownership of UMEM frames from +kernel-space to user-space. Just like the Fill ring, UMEM indices are +used. + +Frames passed from the kernel to user-space are frames that have been +sent (TX ring) and can be used by user-space again. + +The user application consumes UMEM indices from this ring. + + +RX Ring +~~~~~~~ + +The RX ring is the receiving side of a socket. Each entry in the ring +is a struct xdp_desc descriptor. The descriptor contains the UMEM index +(idx), the length of the data (len), and the offset into the frame +(offset). + +If no frames have been passed to the kernel via the Fill ring, no +descriptors will (or can) appear on the RX ring. + +The user application consumes struct xdp_desc descriptors from this +ring. + +TX Ring +~~~~~~~ + +The TX ring is used to send frames. The struct xdp_desc descriptor is +filled (index, length and offset) and passed into the ring. + +To start the transfer a sendmsg() system call is required. This might +be relaxed in the future. + +The user application produces struct xdp_desc descriptors to this +ring. + +XSKMAP / BPF_MAP_TYPE_XSKMAP +---------------------------- + +On the XDP side there is a BPF map type BPF_MAP_TYPE_XSKMAP (XSKMAP) that +is used in conjunction with bpf_redirect_map() to pass the ingress +frame to a socket. + +The user application inserts the socket into the map via the bpf() +system call. + +Note that if an XDP program tries to redirect to a socket that does +not match the queue configuration and netdev, the frame will be +dropped. E.g., if an AF_XDP socket is bound to netdev eth0 and +queue 17, only the XDP program executing for eth0 and queue 17 will +successfully pass data to the socket. Please refer to the sample +application (samples/bpf/) for an example. + +Usage +===== + +In order to use AF_XDP sockets, two parts are needed: the +user-space application and the XDP program. For a complete setup and +usage example, please refer to the sample application. The user-space +side is xdpsock_user.c and the XDP side xdpsock_kern.c.
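+ +The user-space application must also insert its socket into the +XSKMAP so that the XDP program has an entry to redirect to. A sketch +using the bpf_map_update_elem() wrapper from tools/lib/bpf, where +xsks_map_fd is the file descriptor of the map obtained when loading +the XDP program and sfd is the AF_XDP socket:: + + int key = 0; /* the index the XDP program redirects to */ + + bpf_map_update_elem(xsks_map_fd, &key, &sfd, 0);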
+
+Naive ring dequeue and enqueue could look like this::
+
+    // typedef struct xdp_rxtx_ring RING;
+    // typedef struct xdp_umem_ring RING;
+
+    // typedef struct xdp_desc RING_TYPE;
+    // typedef __u32 RING_TYPE;
+
+    int dequeue_one(RING *ring, RING_TYPE *item)
+    {
+        __u32 entries = ring->ptrs.producer - ring->ptrs.consumer;
+
+        if (entries == 0)
+            return -1;
+
+        // read-barrier!
+
+        *item = ring->desc[ring->ptrs.consumer & (RING_SIZE - 1)];
+        ring->ptrs.consumer++;
+        return 0;
+    }
+
+    int enqueue_one(RING *ring, const RING_TYPE *item)
+    {
+        __u32 free_entries = RING_SIZE - (ring->ptrs.producer - ring->ptrs.consumer);
+
+        if (free_entries == 0)
+            return -1;
+
+        ring->desc[ring->ptrs.producer & (RING_SIZE - 1)] = *item;
+
+        // write-barrier!
+
+        ring->ptrs.producer++;
+        return 0;
+    }
+
+
+For a more optimized version, please refer to the sample application.
+
+Sample application
+==================
+
+There is an xdpsock benchmarking/test application included that
+demonstrates how to use AF_XDP sockets with both private and shared
+UMEMs. Say that you would like your UDP traffic from port 4242 to end
+up in queue 16, which we will enable AF_XDP on. Here, we use ethtool
+for this::
+
+      ethtool -N p3p2 rx-flow-hash udp4 fn
+      ethtool -N p3p2 flow-type udp4 src-port 4242 dst-port 4242 \
+          action 16
+
+Running the rxdrop benchmark in XDP_DRV mode can then be done
+using::
+
+      samples/bpf/xdpsock -i p3p2 -q 16 -r -N
+
+For XDP_SKB mode, use the switch "-S" instead of "-N" and all options
+can be displayed with "-h", as usual.
+
+Credits
+=======
+
+- Björn Töpel (AF_XDP core)
+- Magnus Karlsson (AF_XDP core)
+- Alexander Duyck
+- Alexei Starovoitov
+- Daniel Borkmann
+- Jesper Dangaard Brouer
+- John Fastabend
+- Jonathan Corbet (LWN coverage)
+- Michael S. Tsirkin
+- Qi Z Zhang
+- Willem de Bruijn
+
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 66e620866245..94c43ffffaef 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -6,6 +6,7 @@ Contents:

 .. 
toctree:: :maxdepth: 2 + af_xdp batman-adv kapi z8530book diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 3460036621e4..efb677f88b33 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -42,6 +42,7 @@ hostprogs-y += xdp_redirect hostprogs-y += xdp_redirect_map hostprogs-y += xdp_monitor hostprogs-y += syscall_tp +hostprogs-y += xdpsock # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o @@ -87,6 +88,7 @@ xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o +xdpsock-objs := bpf_load.o $(LIBBPF) xdpsock_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -132,6 +134,7 @@ always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o always += xdp_monitor_kern.o always += syscall_tp_kern.o +always += xdpsock_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -172,6 +175,7 @@ HOSTLOADLIBES_xdp_redirect += -lelf HOSTLOADLIBES_xdp_redirect_map += -lelf HOSTLOADLIBES_xdp_monitor += -lelf HOSTLOADLIBES_syscall_tp += -lelf +HOSTLOADLIBES_xdpsock += -lelf -pthread # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h new file mode 100644 index 000000000000..533ab81adfa1 --- /dev/null +++ b/samples/bpf/xdpsock.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef XDPSOCK_H_ +#define XDPSOCK_H_ + +/* Power-of-2 number of sockets */ +#define MAX_SOCKS 4 + +/* Round-robin receive */ +#define RR_LB 0 + +#endif /* XDPSOCK_H_ */ diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c new file mode 100644 index 000000000000..d8806c41362e --- /dev/null +++ b/samples/bpf/xdpsock_kern.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +#define KBUILD_MODNAME "foo" +#include +#include "bpf_helpers.h" + +#include "xdpsock.h" + +struct bpf_map_def SEC("maps") qidconf_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") xsks_map = { + .type = BPF_MAP_TYPE_XSKMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 4, +}; + +struct bpf_map_def SEC("maps") rr_map = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(unsigned int), + .max_entries = 1, +}; + +SEC("xdp_sock") +int xdp_sock_prog(struct xdp_md *ctx) +{ + int *qidconf, key = 0, idx; + unsigned int *rr; + + qidconf = bpf_map_lookup_elem(&qidconf_map, &key); + if (!qidconf) + return XDP_ABORTED; + + if (*qidconf != ctx->rx_queue_index) + return XDP_PASS; + +#if RR_LB /* NB! RR_LB is configured in xdpsock.h */ + rr = bpf_map_lookup_elem(&rr_map, &key); + if (!rr) + return XDP_ABORTED; + + *rr = (*rr + 1) & (MAX_SOCKS - 1); + idx = *rr; +#else + idx = 0; +#endif + + return bpf_redirect_map(&xsks_map, idx, 0); +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c new file mode 100644 index 000000000000..4b8a7cf3e63b --- /dev/null +++ b/samples/bpf/xdpsock_user.c @@ -0,0 +1,948 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2017 - 2018 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_load.h" +#include "bpf_util.h" +#include "libbpf.h" + +#include "xdpsock.h" + +#ifndef SOL_XDP +#define SOL_XDP 283 +#endif + +#ifndef AF_XDP +#define AF_XDP 44 +#endif + +#ifndef PF_XDP +#define PF_XDP AF_XDP +#endif + +#define NUM_FRAMES 131072 +#define FRAME_HEADROOM 0 +#define FRAME_SIZE 2048 +#define NUM_DESCS 1024 +#define BATCH_SIZE 16 + +#define FQ_NUM_DESCS 1024 +#define CQ_NUM_DESCS 1024 + +#define DEBUG_HEXDUMP 0 + +typedef __u32 u32; + +static unsigned long prev_time; + +enum benchmark_type { + BENCH_RXDROP = 0, + BENCH_TXONLY = 1, + BENCH_L2FWD = 2, +}; + +static enum benchmark_type opt_bench = BENCH_RXDROP; +static u32 opt_xdp_flags; +static const char *opt_if = ""; +static int opt_ifindex; +static int opt_queue; +static int opt_poll; +static int opt_shared_packet_buffer; +static int opt_interval = 1; + +struct xdp_umem_uqueue { + u32 cached_prod; + u32 cached_cons; + u32 mask; + u32 size; + struct xdp_umem_ring *ring; +}; + +struct xdp_umem { + char (*frames)[FRAME_SIZE]; + struct xdp_umem_uqueue fq; + struct xdp_umem_uqueue cq; + int fd; +}; + +struct xdp_uqueue { + u32 cached_prod; + u32 cached_cons; + u32 mask; + u32 size; + struct xdp_rxtx_ring *ring; +}; + +struct xdpsock { + struct xdp_uqueue rx; + struct xdp_uqueue tx; + int sfd; + struct xdp_umem *umem; + u32 outstanding_tx; + unsigned long rx_npkts; + unsigned long tx_npkts; + unsigned long prev_rx_npkts; + unsigned long prev_tx_npkts; +}; + +#define MAX_SOCKS 4 +static int num_socks; +struct xdpsock *xsks[MAX_SOCKS]; + +static unsigned long get_nsecs(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000UL + ts.tv_nsec; +} + +static void dump_stats(void); + +#define lassert(expr) \ + do { \ + if (!(expr)) { \ + fprintf(stderr, "%s:%s:%i: Assertion failed: " \ + #expr ": errno: %d/\"%s\"\n", \ + __FILE__, __func__, __LINE__, \ + errno, strerror(errno)); \ + dump_stats(); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define barrier() __asm__ __volatile__("": : :"memory") +#define u_smp_rmb() barrier() +#define u_smp_wmb() barrier() +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +static const char pkt_data[] = + "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00" + "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14" + "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b" + "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa"; + +static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb) +{ + u32 free_entries = q->size - (q->cached_prod - q->cached_cons); + + if (free_entries >= nb) + return free_entries; + + /* Refresh the local tail pointer */ + q->cached_cons = q->ring->ptrs.consumer; + + return q->size - (q->cached_prod - q->cached_cons); +} + +static inline 
u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs) +{ + u32 free_entries = q->cached_cons - q->cached_prod; + + if (free_entries >= ndescs) + return free_entries; + + /* Refresh the local tail pointer */ + q->cached_cons = q->ring->ptrs.consumer + q->size; + return q->cached_cons - q->cached_prod; +} + +static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb) +{ + u32 entries = q->cached_prod - q->cached_cons; + + if (entries == 0) { + q->cached_prod = q->ring->ptrs.producer; + entries = q->cached_prod - q->cached_cons; + } + + return (entries > nb) ? nb : entries; +} + +static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs) +{ + u32 entries = q->cached_prod - q->cached_cons; + + if (entries == 0) { + q->cached_prod = q->ring->ptrs.producer; + entries = q->cached_prod - q->cached_cons; + } + + return (entries > ndescs) ? ndescs : entries; +} + +static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq, + struct xdp_desc *d, + size_t nb) +{ + u32 i; + + if (umem_nb_free(fq, nb) < nb) + return -ENOSPC; + + for (i = 0; i < nb; i++) { + u32 idx = fq->cached_prod++ & fq->mask; + + fq->ring->desc[idx] = d[i].idx; + } + + u_smp_wmb(); + + fq->ring->ptrs.producer = fq->cached_prod; + + return 0; +} + +static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u32 *d, + size_t nb) +{ + u32 i; + + if (umem_nb_free(fq, nb) < nb) + return -ENOSPC; + + for (i = 0; i < nb; i++) { + u32 idx = fq->cached_prod++ & fq->mask; + + fq->ring->desc[idx] = d[i]; + } + + u_smp_wmb(); + + fq->ring->ptrs.producer = fq->cached_prod; + + return 0; +} + +static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq, + u32 *d, size_t nb) +{ + u32 idx, i, entries = umem_nb_avail(cq, nb); + + u_smp_rmb(); + + for (i = 0; i < entries; i++) { + idx = cq->cached_cons++ & cq->mask; + d[i] = cq->ring->desc[idx]; + } + + if (entries > 0) { + u_smp_wmb(); + + cq->ring->ptrs.consumer = cq->cached_cons; + } + + return entries; +} + +static inline void *xq_get_data(struct xdpsock *xsk, __u32 idx, __u32 off) +{ + lassert(idx < NUM_FRAMES); + return &xsk->umem->frames[idx][off]; +} + +static inline int xq_enq(struct xdp_uqueue *uq, + const struct xdp_desc *descs, + unsigned int ndescs) +{ + struct xdp_rxtx_ring *r = uq->ring; + unsigned int i; + + if (xq_nb_free(uq, ndescs) < ndescs) + return -ENOSPC; + + for (i = 0; i < ndescs; i++) { + u32 idx = uq->cached_prod++ & uq->mask; + + r->desc[idx].idx = descs[i].idx; + r->desc[idx].len = descs[i].len; + r->desc[idx].offset = descs[i].offset; + } + + u_smp_wmb(); + + r->ptrs.producer = uq->cached_prod; + return 0; +} + +static inline int xq_enq_tx_only(struct xdp_uqueue *uq, + __u32 idx, unsigned int ndescs) +{ + struct xdp_rxtx_ring *q = uq->ring; + unsigned int i; + + if (xq_nb_free(uq, ndescs) < ndescs) + return -ENOSPC; + + for (i = 0; i < ndescs; i++) { + u32 idx = uq->cached_prod++ & uq->mask; + + q->desc[idx].idx = idx + i; + q->desc[idx].len = sizeof(pkt_data) - 1; + q->desc[idx].offset = 0; + } + + u_smp_wmb(); + + q->ptrs.producer = uq->cached_prod; + return 0; +} + +static inline int xq_deq(struct xdp_uqueue *uq, + struct xdp_desc *descs, + int ndescs) +{ + struct xdp_rxtx_ring *r = uq->ring; + unsigned int idx; + int i, entries; + + entries = xq_nb_avail(uq, ndescs); + + u_smp_rmb(); + + for (i = 0; i < entries; i++) { + idx = uq->cached_cons++ & uq->mask; + descs[i] = r->desc[idx]; + } + + if (entries > 0) { + u_smp_wmb(); + + r->ptrs.consumer = uq->cached_cons; + } + + return entries; +} + +static void swap_mac_addresses(void 
*data) +{ + struct ether_header *eth = (struct ether_header *)data; + struct ether_addr *src_addr = (struct ether_addr *)ð->ether_shost; + struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; + struct ether_addr tmp; + + tmp = *src_addr; + *src_addr = *dst_addr; + *dst_addr = tmp; +} + +#if DEBUG_HEXDUMP +static void hex_dump(void *pkt, size_t length, const char *prefix) +{ + int i = 0; + const unsigned char *address = (unsigned char *)pkt; + const unsigned char *line = address; + size_t line_size = 32; + unsigned char c; + + printf("length = %zu\n", length); + printf("%s | ", prefix); + while (length-- > 0) { + printf("%02X ", *address++); + if (!(++i % line_size) || (length == 0 && i % line_size)) { + if (length == 0) { + while (i++ % line_size) + printf("__ "); + } + printf(" | "); /* right close */ + while (line < address) { + c = *line++; + printf("%c", (c < 33 || c == 255) ? 0x2E : c); + } + printf("\n"); + if (length > 0) + printf("%s | ", prefix); + } + } + printf("\n"); +} +#endif + +static size_t gen_eth_frame(char *frame) +{ + memcpy(frame, pkt_data, sizeof(pkt_data) - 1); + return sizeof(pkt_data) - 1; +} + +static struct xdp_umem *xdp_umem_configure(int sfd) +{ + int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS; + struct xdp_umem_reg mr; + struct xdp_umem *umem; + void *bufs; + + umem = calloc(1, sizeof(*umem)); + lassert(umem); + + lassert(posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */ + NUM_FRAMES * FRAME_SIZE) == 0); + + mr.addr = (__u64)bufs; + mr.len = NUM_FRAMES * FRAME_SIZE; + mr.frame_size = FRAME_SIZE; + mr.frame_headroom = FRAME_HEADROOM; + + lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0); + lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size, + sizeof(int)) == 0); + lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size, + sizeof(int)) == 0); + + umem->fq.ring = mmap(0, sizeof(struct xdp_umem_ring) + + FQ_NUM_DESCS * sizeof(u32), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_UMEM_PGOFF_FILL_RING); + lassert(umem->fq.ring != MAP_FAILED); + + umem->fq.mask = FQ_NUM_DESCS - 1; + umem->fq.size = FQ_NUM_DESCS; + + umem->cq.ring = mmap(0, sizeof(struct xdp_umem_ring) + + CQ_NUM_DESCS * sizeof(u32), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_UMEM_PGOFF_COMPLETION_RING); + lassert(umem->cq.ring != MAP_FAILED); + + umem->cq.mask = CQ_NUM_DESCS - 1; + umem->cq.size = CQ_NUM_DESCS; + + umem->frames = (char (*)[FRAME_SIZE])bufs; + umem->fd = sfd; + + if (opt_bench == BENCH_TXONLY) { + int i; + + for (i = 0; i < NUM_FRAMES; i++) + (void)gen_eth_frame(&umem->frames[i][0]); + } + + return umem; +} + +static struct xdpsock *xsk_configure(struct xdp_umem *umem) +{ + struct sockaddr_xdp sxdp = {}; + int sfd, ndescs = NUM_DESCS; + struct xdpsock *xsk; + bool shared = true; + u32 i; + + sfd = socket(PF_XDP, SOCK_RAW, 0); + lassert(sfd >= 0); + + xsk = calloc(1, sizeof(*xsk)); + lassert(xsk); + + xsk->sfd = sfd; + xsk->outstanding_tx = 0; + + if (!umem) { + shared = false; + xsk->umem = xdp_umem_configure(sfd); + } else { + xsk->umem = umem; + } + + lassert(setsockopt(sfd, SOL_XDP, XDP_RX_RING, + &ndescs, sizeof(int)) == 0); + lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING, + &ndescs, sizeof(int)) == 0); + + /* Rx */ + xsk->rx.ring = mmap(NULL, + sizeof(struct xdp_ring) + + NUM_DESCS * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_PGOFF_RX_RING); + lassert(xsk->rx.ring != MAP_FAILED); + + if (!shared) { + for (i = 0; i < 
NUM_DESCS / 2; i++) + lassert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1) + == 0); + } + + /* Tx */ + xsk->tx.ring = mmap(NULL, + sizeof(struct xdp_ring) + + NUM_DESCS * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, sfd, + XDP_PGOFF_TX_RING); + lassert(xsk->tx.ring != MAP_FAILED); + + xsk->rx.mask = NUM_DESCS - 1; + xsk->rx.size = NUM_DESCS; + + xsk->tx.mask = NUM_DESCS - 1; + xsk->tx.size = NUM_DESCS; + + sxdp.sxdp_family = PF_XDP; + sxdp.sxdp_ifindex = opt_ifindex; + sxdp.sxdp_queue_id = opt_queue; + if (shared) { + sxdp.sxdp_flags = XDP_SHARED_UMEM; + sxdp.sxdp_shared_umem_fd = umem->fd; + } + + lassert(bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0); + + return xsk; +} + +static void print_benchmark(bool running) +{ + const char *bench_str = "INVALID"; + + if (opt_bench == BENCH_RXDROP) + bench_str = "rxdrop"; + else if (opt_bench == BENCH_TXONLY) + bench_str = "txonly"; + else if (opt_bench == BENCH_L2FWD) + bench_str = "l2fwd"; + + printf("%s:%d %s ", opt_if, opt_queue, bench_str); + if (opt_xdp_flags & XDP_FLAGS_SKB_MODE) + printf("xdp-skb "); + else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE) + printf("xdp-drv "); + else + printf(" "); + + if (opt_poll) + printf("poll() "); + + if (running) { + printf("running..."); + fflush(stdout); + } +} + +static void dump_stats(void) +{ + unsigned long now = get_nsecs(); + long dt = now - prev_time; + int i; + + prev_time = now; + + for (i = 0; i < num_socks; i++) { + char *fmt = "%-15s %'-11.0f %'-11lu\n"; + double rx_pps, tx_pps; + + rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) * + 1000000000. / dt; + tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) * + 1000000000. / dt; + + printf("\n sock%d@", i); + print_benchmark(false); + printf("\n"); + + printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", + dt / 1000000000.); + printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts); + printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts); + + xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts; + xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts; + } +} + +static void *poller(void *arg) +{ + (void)arg; + for (;;) { + sleep(opt_interval); + dump_stats(); + } + + return NULL; +} + +static void int_exit(int sig) +{ + (void)sig; + dump_stats(); + bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags); + exit(EXIT_SUCCESS); +} + +static struct option long_options[] = { + {"rxdrop", no_argument, 0, 'r'}, + {"txonly", no_argument, 0, 't'}, + {"l2fwd", no_argument, 0, 'l'}, + {"interface", required_argument, 0, 'i'}, + {"queue", required_argument, 0, 'q'}, + {"poll", no_argument, 0, 'p'}, + {"shared-buffer", no_argument, 0, 's'}, + {"xdp-skb", no_argument, 0, 'S'}, + {"xdp-native", no_argument, 0, 'N'}, + {"interval", required_argument, 0, 'n'}, + {0, 0, 0, 0} +}; + +static void usage(const char *prog) +{ + const char *str = + " Usage: %s [OPTIONS]\n" + " Options:\n" + " -r, --rxdrop Discard all incoming packets (default)\n" + " -t, --txonly Only send packets\n" + " -l, --l2fwd MAC swap L2 forwarding\n" + " -i, --interface=n Run on interface n\n" + " -q, --queue=n Use queue n (default 0)\n" + " -p, --poll Use poll syscall\n" + " -s, --shared-buffer Use shared packet buffer\n" + " -S, --xdp-skb=n Use XDP skb-mod\n" + " -N, --xdp-native=n Enfore XDP native mode\n" + " -n, --interval=n Specify statistics update interval (default 1 sec).\n" + "\n"; + fprintf(stderr, str, prog); + exit(EXIT_FAILURE); +} + +static void parse_command_line(int argc, char **argv) +{ + int option_index, c; + + opterr = 0; + + for (;;) { + c = getopt_long(argc, argv, 
"rtli:q:psSNn:", long_options, + &option_index); + if (c == -1) + break; + + switch (c) { + case 'r': + opt_bench = BENCH_RXDROP; + break; + case 't': + opt_bench = BENCH_TXONLY; + break; + case 'l': + opt_bench = BENCH_L2FWD; + break; + case 'i': + opt_if = optarg; + break; + case 'q': + opt_queue = atoi(optarg); + break; + case 's': + opt_shared_packet_buffer = 1; + break; + case 'p': + opt_poll = 1; + break; + case 'S': + opt_xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'N': + opt_xdp_flags |= XDP_FLAGS_DRV_MODE; + break; + case 'n': + opt_interval = atoi(optarg); + break; + default: + usage(basename(argv[0])); + } + } + + opt_ifindex = if_nametoindex(opt_if); + if (!opt_ifindex) { + fprintf(stderr, "ERROR: interface \"%s\" does not exist\n", + opt_if); + usage(basename(argv[0])); + } +} + +static void kick_tx(int fd) +{ + int ret; + + ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0); + if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN) + return; + lassert(0); +} + +static inline void complete_tx_l2fwd(struct xdpsock *xsk) +{ + u32 descs[BATCH_SIZE]; + unsigned int rcvd; + size_t ndescs; + + if (!xsk->outstanding_tx) + return; + + kick_tx(xsk->sfd); + ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE : + xsk->outstanding_tx; + + /* re-add completed Tx buffers */ + rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, ndescs); + if (rcvd > 0) { + umem_fill_to_kernel(&xsk->umem->fq, descs, rcvd); + xsk->outstanding_tx -= rcvd; + xsk->tx_npkts += rcvd; + } +} + +static inline void complete_tx_only(struct xdpsock *xsk) +{ + u32 descs[BATCH_SIZE]; + unsigned int rcvd; + + if (!xsk->outstanding_tx) + return; + + kick_tx(xsk->sfd); + + rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, BATCH_SIZE); + if (rcvd > 0) { + xsk->outstanding_tx -= rcvd; + xsk->tx_npkts += rcvd; + } +} + +static void rx_drop(struct xdpsock *xsk) +{ + struct xdp_desc descs[BATCH_SIZE]; + unsigned int rcvd, i; + + rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE); + if (!rcvd) + return; + + for (i = 0; i < rcvd; i++) { + u32 idx = descs[i].idx; + + lassert(idx < NUM_FRAMES); +#if DEBUG_HEXDUMP + char *pkt; + char buf[32]; + + pkt = xq_get_data(xsk, idx, descs[i].offset); + sprintf(buf, "idx=%d", idx); + hex_dump(pkt, descs[i].len, buf); +#endif + } + + xsk->rx_npkts += rcvd; + + umem_fill_to_kernel_ex(&xsk->umem->fq, descs, rcvd); +} + +static void rx_drop_all(void) +{ + struct pollfd fds[MAX_SOCKS + 1]; + int i, ret, timeout, nfds = 1; + + memset(fds, 0, sizeof(fds)); + + for (i = 0; i < num_socks; i++) { + fds[i].fd = xsks[i]->sfd; + fds[i].events = POLLIN; + timeout = 1000; /* 1sn */ + } + + for (;;) { + if (opt_poll) { + ret = poll(fds, nfds, timeout); + if (ret <= 0) + continue; + } + + for (i = 0; i < num_socks; i++) + rx_drop(xsks[i]); + } +} + +static void tx_only(struct xdpsock *xsk) +{ + int timeout, ret, nfds = 1; + struct pollfd fds[nfds + 1]; + unsigned int idx = 0; + + memset(fds, 0, sizeof(fds)); + fds[0].fd = xsk->sfd; + fds[0].events = POLLOUT; + timeout = 1000; /* 1sn */ + + for (;;) { + if (opt_poll) { + ret = poll(fds, nfds, timeout); + if (ret <= 0) + continue; + + if (fds[0].fd != xsk->sfd || + !(fds[0].revents & POLLOUT)) + continue; + } + + if (xq_nb_free(&xsk->tx, BATCH_SIZE) >= BATCH_SIZE) { + lassert(xq_enq_tx_only(&xsk->tx, idx, BATCH_SIZE) == 0); + + xsk->outstanding_tx += BATCH_SIZE; + idx += BATCH_SIZE; + idx %= NUM_FRAMES; + } + + complete_tx_only(xsk); + } +} + +static void l2fwd(struct xdpsock *xsk) +{ + for (;;) { + struct xdp_desc descs[BATCH_SIZE]; + unsigned int rcvd, 
i; + int ret; + + for (;;) { + complete_tx_l2fwd(xsk); + + rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE); + if (rcvd > 0) + break; + } + + for (i = 0; i < rcvd; i++) { + char *pkt = xq_get_data(xsk, descs[i].idx, + descs[i].offset); + + swap_mac_addresses(pkt); +#if DEBUG_HEXDUMP + char buf[32]; + u32 idx = descs[i].idx; + + sprintf(buf, "idx=%d", idx); + hex_dump(pkt, descs[i].len, buf); +#endif + } + + xsk->rx_npkts += rcvd; + + ret = xq_enq(&xsk->tx, descs, rcvd); + lassert(ret == 0); + xsk->outstanding_tx += rcvd; + } +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char xdp_filename[256]; + int i, ret, key = 0; + pthread_t pt; + + parse_command_line(argc, argv); + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(xdp_filename)) { + fprintf(stderr, "ERROR: load_bpf_file %s\n", bpf_log_buf); + exit(EXIT_FAILURE); + } + + if (!prog_fd[0]) { + fprintf(stderr, "ERROR: load_bpf_file: \"%s\"\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd[0], opt_xdp_flags) < 0) { + fprintf(stderr, "ERROR: link set xdp fd failed\n"); + exit(EXIT_FAILURE); + } + + ret = bpf_map_update_elem(map_fd[0], &key, &opt_queue, 0); + if (ret) { + fprintf(stderr, "ERROR: bpf_map_update_elem qidconf\n"); + exit(EXIT_FAILURE); + } + + /* Create sockets... */ + xsks[num_socks++] = xsk_configure(NULL); + +#if RR_LB + for (i = 0; i < MAX_SOCKS - 1; i++) + xsks[num_socks++] = xsk_configure(xsks[0]->umem); +#endif + + /* ...and insert them into the map. */ + for (i = 0; i < num_socks; i++) { + key = i; + ret = bpf_map_update_elem(map_fd[1], &key, &xsks[i]->sfd, 0); + if (ret) { + fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i); + exit(EXIT_FAILURE); + } + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + signal(SIGABRT, int_exit); + + setlocale(LC_ALL, ""); + + ret = pthread_create(&pt, NULL, poller, NULL); + lassert(ret == 0); + + prev_time = get_nsecs(); + + if (opt_bench == BENCH_RXDROP) + rx_drop_all(); + else if (opt_bench == BENCH_TXONLY) + tx_only(xsks[0]); + else + l2fwd(xsks[0]); + + return 0; +} From e232edd832a502040006f4b0e3ac431c4daf8fb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 13:57:13 +0200 Subject: [PATCH 1170/1640] UPSTREAM: xsk: new descriptor addressing scheme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, AF_XDP only supports a fixed frame-size memory scheme where each frame is referenced via an index (idx). A user passes the frame index to the kernel, and the kernel acts upon the data. Some NICs, however, do not have a fixed frame-size model, instead they have a model where a memory window is passed to the hardware and multiple frames are filled into that window (referred to as the "type-writer" model). By changing the descriptor format from the current frame index addressing scheme, AF_XDP can in the future be extended to support these kinds of NICs. In the index-based model, an idx refers to a frame of size frame_size. Addressing a frame in the UMEM is done by offseting the UMEM starting address by a global offset, idx * frame_size + offset. Communicating via the fill- and completion-rings are done by means of idx. 
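
In other words, resolving a frame's data pointer under the index-based
scheme amounts to the following (an illustrative sketch, not a function
from the kernel sources):

  static inline char *frame_data(char *umem_base, __u32 idx,
                                 __u32 frame_size, __u16 offset)
  {
          /* global offset: idx selects the frame, offset points
           * into it
           */
          return umem_base + (__u64)idx * frame_size + offset;
  }
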
In this commit, the idx is removed in favor of an address (addr), which is a relative address ranging over the UMEM. To convert an idx-based address to the new addr is simply: addr = idx * frame_size + offset. We also stop referring to the UMEM "frame" as a frame. Instead it is simply called a chunk. To transfer ownership of a chunk to the kernel, the addr of the chunk is passed in the fill-ring. Note, that the kernel will mask addr to make it chunk aligned, so there is no need for userspace to do that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or 3000 to the fill-ring will refer to the same chunk. On the completion-ring, the addr will match that of the Tx descriptor, passed to the kernel. Changing the descriptor format to use chunks/addr will allow for future changes to move to a type-writer based model, where multiple frames can reside in one chunk. In this model passing one single chunk into the fill-ring, would potentially result in multiple Rx descriptors. This commit changes the uapi of AF_XDP sockets, and updates the documentation. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- Documentation/networking/af_xdp.rst | 101 ++++++++++++++++------------ include/uapi/linux/if_xdp.h | 12 ++-- net/xdp/xdp_umem.c | 33 +++++---- net/xdp/xdp_umem.h | 27 ++------ net/xdp/xdp_umem_props.h | 4 +- net/xdp/xsk.c | 30 +++++---- net/xdp/xsk_queue.c | 2 +- net/xdp/xsk_queue.h | 43 ++++++------ 8 files changed, 123 insertions(+), 129 deletions(-) diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index 91928d9ee4bf..ff929cfab4f4 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -12,7 +12,7 @@ packet processing. This document assumes that the reader is familiar with BPF and XDP. If not, the Cilium project has an excellent reference guide at -http://cilium.readthedocs.io/en/doc-1.0/bpf/. +http://cilium.readthedocs.io/en/latest/bpf/. Using the XDP_REDIRECT action from an XDP program, the program can redirect ingress frames to other XDP enabled netdevs, using the @@ -33,22 +33,22 @@ for a while due to a possible retransmit, the descriptor that points to that packet can be changed to point to another and reused right away. This again avoids copying data. -The UMEM consists of a number of equally size frames and each frame -has a unique frame id. A descriptor in one of the rings references a -frame by referencing its frame id. The user space allocates memory for -this UMEM using whatever means it feels is most appropriate (malloc, -mmap, huge pages, etc). This memory area is then registered with the -kernel using the new setsockopt XDP_UMEM_REG. The UMEM also has two -rings: the FILL ring and the COMPLETION ring. The fill ring is used by -the application to send down frame ids for the kernel to fill in with -RX packet data. References to these frames will then appear in the RX -ring once each packet has been received. The completion ring, on the -other hand, contains frame ids that the kernel has transmitted -completely and can now be used again by user space, for either TX or -RX. Thus, the frame ids appearing in the completion ring are ids that -were previously transmitted using the TX ring. In summary, the RX and -FILL rings are used for the RX path and the TX and COMPLETION rings -are used for the TX path. +The UMEM consists of a number of equally sized chunks. A descriptor in +one of the rings references a frame by referencing its addr. 
The addr
+is simply an offset within the entire UMEM region. The user space
+allocates memory for this UMEM using whatever means it feels is most
+appropriate (malloc, mmap, huge pages, etc). This memory area is then
+registered with the kernel using the new setsockopt XDP_UMEM_REG. The
+UMEM also has two rings: the FILL ring and the COMPLETION ring. The
+fill ring is used by the application to send down addr for the kernel
+to fill in with RX packet data. References to these frames will then
+appear in the RX ring once each packet has been received. The
+completion ring, on the other hand, contains frame addr that the
+kernel has transmitted completely and can now be used again by user
+space, for either TX or RX. Thus, the frame addrs appearing in the
+completion ring are addrs that were previously transmitted using the
+TX ring. In summary, the RX and FILL rings are used for the RX path
+and the TX and COMPLETION rings are used for the TX path.

 The socket is then finally bound with a bind() call to a device and a
 specific queue id on that device, and it is not until bind is
@@ -59,13 +59,13 @@ wants to do this, it simply skips the registration of the UMEM and its
 corresponding two rings, sets the XDP_SHARED_UMEM flag in the bind
 call and submits the XSK of the process it would like to share UMEM
 with as well as its own newly created XSK socket. The new process will
-then receive frame id references in its own RX ring that point to this
-shared UMEM. Note that since the ring structures are single-consumer /
-single-producer (for performance reasons), the new process has to
-create its own socket with associated RX and TX rings, since it cannot
-share this with the other process. This is also the reason that there
-is only one set of FILL and COMPLETION rings per UMEM. It is the
-responsibility of a single process to handle the UMEM.
+then receive frame addr references in its own RX ring that point to
+this shared UMEM. Note that since the ring structures are
+single-consumer / single-producer (for performance reasons), the new
+process has to create its own socket with associated RX and TX rings,
+since it cannot share this with the other process. This is also the
+reason that there is only one set of FILL and COMPLETION rings per
+UMEM. It is the responsibility of a single process to handle the UMEM.

 How are packets then distributed from an XDP program to the XSKs? There
 is a BPF map called XSKMAP (or BPF_MAP_TYPE_XSKMAP in full). The
@@ -102,10 +102,10 @@ UMEM

 UMEM is a region of virtually contiguous memory, divided into
 equal-sized frames. An UMEM is associated with a netdev and a specific
-queue id of that netdev. It is created and configured (frame size,
-frame headroom, start address and size) by using the XDP_UMEM_REG
-setsockopt system call. A UMEM is bound to a netdev and queue id, via
-the bind() system call.
+queue id of that netdev. It is created and configured (chunk size,
+headroom, start address and size) by using the XDP_UMEM_REG setsockopt
+system call. A UMEM is bound to a netdev and queue id, via the bind()
+system call.

 An AF_XDP socket is linked to a single UMEM, but one UMEM can have
 multiple AF_XDP sockets. To share an UMEM created via one socket A,
@@ -147,13 +147,17 @@ UMEM Fill Ring
 ~~~~~~~~~~~~~~

 The Fill ring is used to transfer ownership of UMEM frames from
-user-space to kernel-space. The UMEM indices are passed in the
-ring. As an example, if the UMEM is 64k and each frame is 4k, then the
-UMEM has 16 frames and can pass indices between 0 and 15. 
+user-space to kernel-space. The UMEM addrs are passed in the ring. As
+an example, if the UMEM is 64k and each chunk is 4k, then the UMEM has
+16 chunks and can pass addrs between 0 and 64k.

 Frames passed to the kernel are used for the ingress path (RX rings).

-The user application produces UMEM indices to this ring.
+The user application produces UMEM addrs to this ring. Note that the
+kernel will mask the incoming addr. E.g. for a chunk size of 2k, the
+log2(2048) LSB of the addr will be masked off, meaning that 2048, 2050
+and 3000 refer to the same chunk.
+

 UMEM Completion Ring
 ~~~~~~~~~~~~~~~~~~~~
@@ -165,16 +169,15 @@ used.

 Frames passed from the kernel to user-space are frames that have been
 sent (TX ring) and can be used by user-space again.

-The user application consumes UMEM indices from this ring.
+The user application consumes UMEM addrs from this ring.


 RX Ring
 ~~~~~~~

 The RX ring is the receiving side of a socket. Each entry in the ring
-is a struct xdp_desc descriptor. The descriptor contains UMEM index
-(idx), the length of the data (len), and the offset into the frame
-(offset).
+is a struct xdp_desc descriptor. The descriptor contains UMEM offset
+(addr) and the length of the data (len).

 If no frames have been passed to the kernel via the Fill ring, no
 descriptors will (or can) appear on the RX ring.
@@ -221,38 +224,50 @@ side is xdpsock_user.c and the XDP side xdpsock_kern.c.

 Naive ring dequeue and enqueue could look like this::

+    // struct xdp_rxtx_ring {
+    //     __u32 *producer;
+    //     __u32 *consumer;
+    //     struct xdp_desc *desc;
+    // };
+
+    // struct xdp_umem_ring {
+    //     __u32 *producer;
+    //     __u32 *consumer;
+    //     __u64 *desc;
+    // };
+
     // typedef struct xdp_rxtx_ring RING;
     // typedef struct xdp_umem_ring RING;

     // typedef struct xdp_desc RING_TYPE;
-    // typedef __u32 RING_TYPE;
+    // typedef __u64 RING_TYPE;

     int dequeue_one(RING *ring, RING_TYPE *item)
     {
-        __u32 entries = ring->ptrs.producer - ring->ptrs.consumer;
+        __u32 entries = *ring->producer - *ring->consumer;

         if (entries == 0)
             return -1;

         // read-barrier!

-        *item = ring->desc[ring->ptrs.consumer & (RING_SIZE - 1)];
-        ring->ptrs.consumer++;
+        *item = ring->desc[*ring->consumer & (RING_SIZE - 1)];
+        (*ring->consumer)++;
         return 0;
     }

     int enqueue_one(RING *ring, const RING_TYPE *item)
     {
-        __u32 free_entries = RING_SIZE - (ring->ptrs.producer - ring->ptrs.consumer);
+        __u32 free_entries = RING_SIZE - (*ring->producer - *ring->consumer);

         if (free_entries == 0)
             return -1;

-        ring->desc[ring->ptrs.producer & (RING_SIZE - 1)] = *item;
+        ring->desc[*ring->producer & (RING_SIZE - 1)] = *item;

         // write-barrier!
 
- ring->ptrs.producer++; + (*ring->producer)++; return 0; } diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 16cc76b4a4d9..b884362edbd0 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -48,8 +48,8 @@ struct xdp_mmap_offsets { struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ __u64 len; /* Length of packet data area */ - __u32 frame_size; /* Frame size */ - __u32 frame_headroom; /* Frame head room */ + __u32 chunk_size; + __u32 headroom; }; struct xdp_statistics { @@ -66,13 +66,11 @@ struct xdp_statistics { /* Rx/Tx descriptor */ struct xdp_desc { - __u32 idx; + __u64 addr; __u32 len; - __u16 offset; - __u8 flags; - __u8 padding[5]; + __u32 options; }; -/* UMEM descriptor is __u32 */ +/* UMEM descriptor is __u64 */ #endif /* _LINUX_IF_XDP_H */ diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 87998818116f..9ad791ff4739 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -14,7 +14,7 @@ #include "xdp_umem.h" -#define XDP_UMEM_MIN_FRAME_SIZE 2048 +#define XDP_UMEM_MIN_CHUNK_SIZE 2048 static void xdp_umem_unpin_pages(struct xdp_umem *umem) { @@ -151,12 +151,12 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { - u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; + u32 chunk_size = mr->chunk_size, headroom = mr->headroom; + unsigned int chunks, chunks_per_page; u64 addr = mr->addr, size = mr->len; - unsigned int nframes, nfpp; int size_chk, err; - if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { + if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: * - huge pages, or* * - using an IOMMU, or @@ -166,7 +166,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - if (!is_power_of_2(frame_size)) + if (!is_power_of_2(chunk_size)) return -EINVAL; if (!PAGE_ALIGNED(addr)) { @@ -179,33 +179,30 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if ((addr + size) < addr) return -EINVAL; - nframes = (unsigned int)div_u64(size, frame_size); - if (nframes == 0 || nframes > UINT_MAX) + chunks = (unsigned int)div_u64(size, chunk_size); + if (chunks == 0) return -EINVAL; - nfpp = PAGE_SIZE / frame_size; - if (nframes < nfpp || nframes % nfpp) + chunks_per_page = PAGE_SIZE / chunk_size; + if (chunks < chunks_per_page || chunks % chunks_per_page) return -EINVAL; - frame_headroom = ALIGN(frame_headroom, 64); + headroom = ALIGN(headroom, 64); - size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; + size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM; if (size_chk < 0) return -EINVAL; umem->pid = get_task_pid(current, PIDTYPE_PID); - umem->size = (size_t)size; umem->address = (unsigned long)addr; - umem->props.frame_size = frame_size; - umem->props.nframes = nframes; - umem->frame_headroom = frame_headroom; + umem->props.chunk_mask = ~((u64)chunk_size - 1); + umem->props.size = size; + umem->headroom = headroom; + umem->chunk_size_nohr = chunk_size - headroom; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; - umem->frame_size_log2 = ilog2(frame_size); - umem->nfpp_mask = nfpp - 1; - umem->nfpplog2 = ilog2(nfpp); refcount_set(&umem->users, 1); err = xdp_umem_account_pages(umem); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 0881cf456230..aeadd1bcb72d 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -18,35 +18,20 @@ struct 
xdp_umem { struct xsk_queue *cq; struct page **pgs; struct xdp_umem_props props; - u32 npgs; - u32 frame_headroom; - u32 nfpp_mask; - u32 nfpplog2; - u32 frame_size_log2; + u32 headroom; + u32 chunk_size_nohr; struct user_struct *user; struct pid *pid; unsigned long address; - size_t size; refcount_t users; struct work_struct work; + u32 npgs; }; -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx) +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { - u64 pg, off; - char *data; - - pg = idx >> umem->nfpplog2; - off = (idx & umem->nfpp_mask) << umem->frame_size_log2; - - data = page_address(umem->pgs[pg]); - return data + off; -} - -static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem, - u32 idx) -{ - return xdp_umem_get_data(umem, idx) + umem->frame_headroom; + return page_address(umem->pgs[addr >> PAGE_SHIFT]) + + (addr & (PAGE_SIZE - 1)); } bool xdp_umem_validate_queues(struct xdp_umem *umem); diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h index 2cf8ec485fd2..40eab10dfc49 100644 --- a/net/xdp/xdp_umem_props.h +++ b/net/xdp/xdp_umem_props.h @@ -7,8 +7,8 @@ #define XDP_UMEM_PROPS_H_ struct xdp_umem_props { - u32 frame_size; - u32 nframes; + u64 chunk_mask; + u64 size; }; #endif /* XDP_UMEM_PROPS_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cb257ec955fb..a0ca48f642ca 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,24 +41,27 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 id, len = xdp->data_end - xdp->data; + u32 len = xdp->data_end - xdp->data; void *buffer; + u64 addr; int err; if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; - if (!xskq_peek_id(xs->umem->fq, &id)) { + if (!xskq_peek_addr(xs->umem->fq, &addr) || + len > xs->umem->chunk_size_nohr) { xs->rx_dropped++; return -ENOSPC; } - buffer = xdp_umem_get_data_with_headroom(xs->umem, id); + addr += xs->umem->headroom; + + buffer = xdp_umem_get_data(xs->umem, addr); memcpy(buffer, xdp->data, len); - err = xskq_produce_batch_desc(xs->rx, id, len, - xs->umem->frame_headroom); + err = xskq_produce_batch_desc(xs->rx, addr, len); if (!err) - xskq_discard_id(xs->umem->fq); + xskq_discard_addr(xs->umem->fq); else xs->rx_dropped++; @@ -95,10 +98,10 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static void xsk_destruct_skb(struct sk_buff *skb) { - u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg; + u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; struct xdp_sock *xs = xdp_sk(skb->sk); - WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id)); + WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); sock_wfree(skb); } @@ -123,14 +126,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, while (xskq_peek_desc(xs->tx, &desc)) { char *buffer; - u32 id, len; + u64 addr; + u32 len; if (max_batch-- == 0) { err = -EAGAIN; goto out; } - if (xskq_reserve_id(xs->umem->cq)) { + if (xskq_reserve_addr(xs->umem->cq)) { err = -EAGAIN; goto out; } @@ -153,8 +157,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, } skb_put(skb, len); - id = desc.idx; - buffer = xdp_umem_get_data(xs->umem, id) + desc.offset; + addr = desc.addr; + buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) { kfree_skb(skb); @@ -164,7 +168,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, skb->dev = xs->dev; skb->priority = sk->sk_priority; skb->mark = 
sk->sk_mark; - skb_shinfo(skb)->destructor_arg = (void *)(long)id; + skb_shinfo(skb)->destructor_arg = (void *)(long)addr; skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index ebe85e59507e..6c32e92e98fc 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -17,7 +17,7 @@ void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props) static u32 xskq_umem_get_ring_size(struct xsk_queue *q) { - return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32); + return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64); } static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index b5924e7aeb2b..337e5ad3b10e 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -27,7 +27,7 @@ struct xdp_rxtx_ring { /* Used for the fill and completion queues for buffers */ struct xdp_umem_ring { struct xdp_ring ptrs; - u32 desc[0] ____cacheline_aligned_in_smp; + u64 desc[0] ____cacheline_aligned_in_smp; }; struct xsk_queue { @@ -76,24 +76,25 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) /* UMEM queue */ -static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx) +static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) { - if (unlikely(idx >= q->umem_props.nframes)) { + if (addr >= q->umem_props.size) { q->invalid_descs++; return false; } + return true; } -static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id) +static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr) { while (q->cons_tail != q->cons_head) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; - *id = READ_ONCE(ring->desc[idx]); - if (xskq_is_valid_id(q, *id)) - return id; + *addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask; + if (xskq_is_valid_addr(q, *addr)) + return addr; q->cons_tail++; } @@ -101,7 +102,7 @@ static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id) return NULL; } -static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id) +static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr) { if (q->cons_tail == q->cons_head) { WRITE_ONCE(q->ring->consumer, q->cons_tail); @@ -111,19 +112,19 @@ static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id) smp_rmb(); } - return xskq_validate_id(q, id); + return xskq_validate_addr(q, addr); } -static inline void xskq_discard_id(struct xsk_queue *q) +static inline void xskq_discard_addr(struct xsk_queue *q) { q->cons_tail++; } -static inline int xskq_produce_id(struct xsk_queue *q, u32 id) +static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - ring->desc[q->prod_tail++ & q->ring_mask] = id; + ring->desc[q->prod_tail++ & q->ring_mask] = addr; /* Order producer and data */ smp_wmb(); @@ -132,7 +133,7 @@ static inline int xskq_produce_id(struct xsk_queue *q, u32 id) return 0; } -static inline int xskq_reserve_id(struct xsk_queue *q) +static inline int xskq_reserve_addr(struct xsk_queue *q) { if (xskq_nb_free(q, q->prod_head, 1) == 0) return -ENOSPC; @@ -145,16 +146,11 @@ static inline int xskq_reserve_id(struct xsk_queue *q) static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) { - u32 buff_len; - - if (unlikely(d->idx >= q->umem_props.nframes)) { - q->invalid_descs++; + if (!xskq_is_valid_addr(q, d->addr)) return false; - } - buff_len = 
q->umem_props.frame_size; - if (unlikely(d->len > buff_len || d->len == 0 || - d->offset > buff_len || d->offset + d->len > buff_len)) { + if (((d->addr + d->len) & q->umem_props.chunk_mask) != + (d->addr & q->umem_props.chunk_mask)) { q->invalid_descs++; return false; } @@ -199,7 +195,7 @@ static inline void xskq_discard_desc(struct xsk_queue *q) } static inline int xskq_produce_batch_desc(struct xsk_queue *q, - u32 id, u32 len, u16 offset) + u64 addr, u32 len) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; unsigned int idx; @@ -208,9 +204,8 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q, return -ENOSPC; idx = (q->prod_head++) & q->ring_mask; - ring->desc[idx].idx = id; + ring->desc[idx].addr = addr; ring->desc[idx].len = len; - ring->desc[idx].offset = offset; return 0; } From 5e2acd81dd0b2e2cbd3f8849fd5dc19236741d50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:51 +0200 Subject: [PATCH 1171/1640] UPSTREAM: xsk: moved struct xdp_umem definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moved struct xdp_umem to xdp_sock.h, in order to prepare for zero-copy support. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 24 +++++++++++++++++++++++- net/xdp/xdp_umem.c | 1 + net/xdp/xdp_umem.h | 22 +--------------------- net/xdp/xsk_queue.h | 3 +-- 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 7a647c56ec15..3a6cd88f179d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -6,12 +6,34 @@ #ifndef _LINUX_XDP_SOCK_H #define _LINUX_XDP_SOCK_H +#include +#include #include +#include #include struct net_device; struct xsk_queue; -struct xdp_umem; + +struct xdp_umem_props { + u64 chunk_mask; + u64 size; +}; + +struct xdp_umem { + struct xsk_queue *fq; + struct xsk_queue *cq; + struct page **pgs; + struct xdp_umem_props props; + u32 headroom; + u32 chunk_size_nohr; + struct user_struct *user; + struct pid *pid; + unsigned long address; + refcount_t users; + struct work_struct work; + u32 npgs; +}; struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 9ad791ff4739..2793a503223e 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -13,6 +13,7 @@ #include #include "xdp_umem.h" +#include "xsk_queue.h" #define XDP_UMEM_MIN_CHUNK_SIZE 2048 diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index aeadd1bcb72d..9433e8af650a 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -6,27 +6,7 @@ #ifndef XDP_UMEM_H_ #define XDP_UMEM_H_ -#include -#include -#include - -#include "xsk_queue.h" -#include "xdp_umem_props.h" - -struct xdp_umem { - struct xsk_queue *fq; - struct xsk_queue *cq; - struct page **pgs; - struct xdp_umem_props props; - u32 headroom; - u32 chunk_size_nohr; - struct user_struct *user; - struct pid *pid; - unsigned long address; - refcount_t users; - struct work_struct work; - u32 npgs; -}; +#include static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 337e5ad3b10e..5246ed420a16 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -8,8 +8,7 @@ #include #include - -#include "xdp_umem_props.h" +#include #define RX_BATCH_SIZE 16 From d9d6e24438af378cec8778883c29e2a4068213e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 
14:05:52 +0200 Subject: [PATCH 1172/1640] UPSTREAM: xsk: introduce xdp_umem_page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xdp_umem_page holds the address for a page. Trade memory for faster lookup. Later, we'll add DMA address here as well. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 7 ++++++- net/xdp/xdp_umem.c | 15 ++++++++++++++- net/xdp/xdp_umem.h | 3 +-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 3a6cd88f179d..caf343a7e224 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -20,10 +20,14 @@ struct xdp_umem_props { u64 size; }; +struct xdp_umem_page { + void *addr; +}; + struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; - struct page **pgs; + struct xdp_umem_page *pages; struct xdp_umem_props props; u32 headroom; u32 chunk_size_nohr; @@ -32,6 +36,7 @@ struct xdp_umem { unsigned long address; refcount_t users; struct work_struct work; + struct page **pgs; u32 npgs; }; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 2793a503223e..aca826011f6c 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -65,6 +65,9 @@ static void xdp_umem_release(struct xdp_umem *umem) goto out; mmput(mm); + kfree(umem->pages); + umem->pages = NULL; + xdp_umem_unaccount_pages(umem); out: kfree(umem); @@ -155,7 +158,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) u32 chunk_size = mr->chunk_size, headroom = mr->headroom; unsigned int chunks, chunks_per_page; u64 addr = mr->addr, size = mr->len; - int size_chk, err; + int size_chk, err, i; if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: @@ -213,6 +216,16 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) err = xdp_umem_pin_pages(umem); if (err) goto out_account; + + umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL); + if (!umem->pages) { + err = -ENOMEM; + goto out_account; + } + + for (i = 0; i < umem->npgs; i++) + umem->pages[i].addr = page_address(umem->pgs[i]); + return 0; out_account: diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 9433e8af650a..40e8fa4a92af 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -10,8 +10,7 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { - return page_address(umem->pgs[addr >> PAGE_SHIFT]) + - (addr & (PAGE_SIZE - 1)); + return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); } bool xdp_umem_validate_queues(struct xdp_umem *umem); From 475cd915032c0f2a3a0403ab879cfb71838bf29c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:54 +0200 Subject: [PATCH 1173/1640] UPSTREAM: xdp: add MEM_TYPE_ZERO_COPY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Here, a new type of allocator support is added to the XDP return API. A zero-copy allocated xdp_buff cannot be converted to an xdp_frame. Instead is the buff has to be copied. This is not supported at all in this commit. Also, an opaque "handle" is added to xdp_buff. This can be used as a context for the zero-copy allocator implementation. 
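
As an illustrative sketch (hypothetical driver code, not part of this
patch), a zero-copy capable driver would embed the allocator and
recycle the buffer behind the opaque handle from its free callback:

  struct my_zca {
          struct zero_copy_allocator zca; /* hook added by this patch */
          struct my_rx_ring *rxr;         /* hypothetical driver state */
  };

  static void my_zca_free(struct zero_copy_allocator *zca,
                          unsigned long handle)
  {
          struct my_zca *m = container_of(zca, struct my_zca, zca);

          /* hand the buffer identified by the opaque handle back to
           * the driver's RX ring (my_rx_recycle is hypothetical)
           */
          my_rx_recycle(m->rxr, handle);
  }
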
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp.h | 10 ++++++++++ net/core/xdp.c | 19 ++++++++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index e79e3b938448..be8ceb836b19 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -37,6 +37,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, + MEM_TYPE_ZERO_COPY, MEM_TYPE_MAX, }; @@ -51,6 +52,10 @@ struct xdp_mem_info { struct page_pool; +struct zero_copy_allocator { + void (*free)(struct zero_copy_allocator *zca, unsigned long handle); +}; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; @@ -63,6 +68,7 @@ struct xdp_buff { void *data_end; void *data_meta; void *data_hard_start; + unsigned long handle; struct xdp_rxq_info *rxq; }; @@ -93,6 +99,10 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; + /* TODO: implement clone, copy, use "native" MEM_TYPE */ + if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) + return NULL; + /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; diff --git a/net/core/xdp.c b/net/core/xdp.c index 31715ddf4c49..1a19e2b69aba 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -34,6 +34,7 @@ struct xdp_mem_allocator { union { void *allocator; struct page_pool *page_pool; + struct zero_copy_allocator *zc_alloc; }; struct rhash_head node; struct rcu_head rcu; @@ -264,7 +265,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, xdp_rxq->mem.type = type; if (!allocator) { - if (type == MEM_TYPE_PAGE_POOL) + if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY) return -EINVAL; /* Setup time check page_pool req */ return 0; } @@ -317,7 +318,8 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, + unsigned long handle) { struct xdp_mem_allocator *xa; struct page *page; @@ -343,6 +345,13 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; + case MEM_TYPE_ZERO_COPY: + /* NB! Only valid from an xdp_buff! 
*/ + rcu_read_lock(); + /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + xa->zc_alloc->free(xa->zc_alloc, handle); + rcu_read_unlock(); default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ break; @@ -351,19 +360,19 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false); + __xdp_return(xdpf->data, &xdpf->mem, false, 0); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true); + __xdp_return(xdpf->data, &xdpf->mem, true, 0); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); } EXPORT_SYMBOL_GPL(xdp_return_buff); From ccce653ae429aa1fff8760f1c55e7c3f0a06ac17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:55 +0200 Subject: [PATCH 1174/1640] UPSTREAM: xsk: add zero-copy support for Rx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the xsk_rcv to support the new MEM_TYPE_ZERO_COPY memory, and wireup ndo_bpf call in bind. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 6 +++ include/uapi/linux/if_xdp.h | 4 +- net/xdp/xdp_umem.c | 77 +++++++++++++++++++++++++++++ net/xdp/xdp_umem.h | 3 ++ net/xdp/xsk.c | 98 +++++++++++++++++++++++++++++-------- 5 files changed, 166 insertions(+), 22 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index caf343a7e224..d93d3aac3fc9 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -22,6 +22,7 @@ struct xdp_umem_props { struct xdp_umem_page { void *addr; + dma_addr_t dma; }; struct xdp_umem { @@ -38,6 +39,9 @@ struct xdp_umem { struct work_struct work; struct page **pgs; u32 npgs; + struct net_device *dev; + u16 queue_id; + bool zc; }; struct xdp_sock { @@ -60,6 +64,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +void xsk_umem_discard_addr(struct xdp_umem *umem); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index b884362edbd0..730b692c04a5 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -13,7 +13,9 @@ #include /* Options for the sxdp_flags field */ -#define XDP_SHARED_UMEM 1 +#define XDP_SHARED_UMEM (1 << 0) +#define XDP_COPY (1 << 1) /* Force copy-mode */ +#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ struct sockaddr_xdp { __u16 sxdp_family; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index aca826011f6c..f729d79b8d91 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -17,6 +17,81 @@ #define XDP_UMEM_MIN_CHUNK_SIZE 2048 +int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, + u32 queue_id, u16 flags) +{ + bool force_zc, force_copy; + struct netdev_bpf bpf; + int err; + + force_zc = flags & XDP_ZEROCOPY; + force_copy = flags & XDP_COPY; + + if (force_zc && force_copy) + return -EINVAL; + + if (force_copy) + return 0; + + dev_hold(dev); 
+ + if (dev->netdev_ops->ndo_bpf) { + bpf.command = XDP_QUERY_XSK_UMEM; + + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); + + if (err) { + dev_put(dev); + return force_zc ? -ENOTSUPP : 0; + } + + bpf.command = XDP_SETUP_XSK_UMEM; + bpf.xsk.umem = umem; + bpf.xsk.queue_id = queue_id; + + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); + + if (err) { + dev_put(dev); + return force_zc ? err : 0; /* fail or fallback */ + } + + umem->dev = dev; + umem->queue_id = queue_id; + umem->zc = true; + return 0; + } + + dev_put(dev); + return force_zc ? -ENOTSUPP : 0; /* fail or fallback */ +} + +void xdp_umem_clear_dev(struct xdp_umem *umem) +{ + struct netdev_bpf bpf; + int err; + + if (umem->dev) { + bpf.command = XDP_SETUP_XSK_UMEM; + bpf.xsk.umem = NULL; + bpf.xsk.queue_id = umem->queue_id; + + rtnl_lock(); + err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf); + rtnl_unlock(); + + if (err) + WARN(1, "failed to disable umem!\n"); + + dev_put(umem->dev); + umem->dev = NULL; + } +} + static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unsigned int i; @@ -43,6 +118,8 @@ static void xdp_umem_release(struct xdp_umem *umem) struct task_struct *task; struct mm_struct *mm; + xdp_umem_clear_dev(umem); + if (umem->fq) { xskq_destroy(umem->fq); umem->fq = NULL; diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 40e8fa4a92af..674508a32a4d 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -13,6 +13,9 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); } +int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, + u32 queue_id, u16 flags); +void xdp_umem_clear_dev(struct xdp_umem *umem); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a0ca48f642ca..7ebc2db9af02 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,19 +36,28 @@ static struct xdp_sock *xdp_sk(struct sock *sk) bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { - return !!xs->rx; + return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && + READ_ONCE(xs->umem->fq); } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +{ + return xskq_peek_addr(umem->fq, addr); +} +EXPORT_SYMBOL(xsk_umem_peek_addr); + +void xsk_umem_discard_addr(struct xdp_umem *umem) +{ + xskq_discard_addr(umem->fq); +} +EXPORT_SYMBOL(xsk_umem_discard_addr); + +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - u32 len = xdp->data_end - xdp->data; void *buffer; u64 addr; int err; - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) - return -EINVAL; - if (!xskq_peek_addr(xs->umem->fq, &addr) || len > xs->umem->chunk_size_nohr) { xs->rx_dropped++; @@ -60,23 +69,39 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) buffer = xdp_umem_get_data(xs->umem, addr); memcpy(buffer, xdp->data, len); err = xskq_produce_batch_desc(xs->rx, addr, len); - if (!err) + if (!err) { xskq_discard_addr(xs->umem->fq); - else + xdp_return_buff(xdp); + return 0; + } + + xs->rx_dropped++; + return err; +} + +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); + + if (err) { + xdp_return_buff(xdp); xs->rx_dropped++; + } return err; } int xsk_rcv(struct xdp_sock *xs, 
struct xdp_buff *xdp) { - int err; + u32 len; - err = __xsk_rcv(xs, xdp); - if (likely(!err)) - xdp_return_buff(xdp); + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; - return err; + len = xdp->data_end - xdp->data; + + return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? + __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); } void xsk_flush(struct xdp_sock *xs) @@ -87,12 +112,29 @@ void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp->data_end - xdp->data; + void *buffer; + u64 addr; int err; - err = __xsk_rcv(xs, xdp); - if (!err) - xsk_flush(xs); + if (!xskq_peek_addr(xs->umem->fq, &addr) || + len > xs->umem->chunk_size_nohr) { + xs->rx_dropped++; + return -ENOSPC; + } + addr += xs->umem->headroom; + + buffer = xdp_umem_get_data(xs->umem, addr); + memcpy(buffer, xdp->data, len); + err = xskq_produce_batch_desc(xs->rx, addr, len); + if (!err) { + xskq_discard_addr(xs->umem->fq); + xsk_flush(xs); + return 0; + } + + xs->rx_dropped++; return err; } @@ -291,6 +333,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; + u32 flags, qid; int err = 0; if (addr_len < sizeof(struct sockaddr_xdp)) @@ -315,16 +358,26 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } - if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) || - (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) { + qid = sxdp->sxdp_queue_id; + + if ((xs->rx && qid >= dev->real_num_rx_queues) || + (xs->tx && qid >= dev->real_num_tx_queues)) { err = -EINVAL; goto out_unlock; } - if (sxdp->sxdp_flags & XDP_SHARED_UMEM) { + flags = sxdp->sxdp_flags; + + if (flags & XDP_SHARED_UMEM) { struct xdp_sock *umem_xs; struct socket *sock; + if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) { + /* Cannot specify flags for shared sockets. */ + err = -EINVAL; + goto out_unlock; + } + if (xs->umem) { /* We have already our own. */ err = -EINVAL; @@ -343,8 +396,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) err = -EBADF; sockfd_put(sock); goto out_unlock; - } else if (umem_xs->dev != dev || - umem_xs->queue_id != sxdp->sxdp_queue_id) { + } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) { err = -EINVAL; sockfd_put(sock); goto out_unlock; @@ -360,6 +412,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) /* This xsk has its own umem. */ xskq_set_umem(xs->umem->fq, &xs->umem->props); xskq_set_umem(xs->umem->cq, &xs->umem->props); + + err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); + if (err) + goto out_unlock; } xs->dev = dev; From 59b0ea7fcc10a209ea6c3bf8d106387fb8b40ec4 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 Jun 2018 14:05:57 +0200 Subject: [PATCH 1175/1640] UPSTREAM: xsk: wire upp Tx zero-copy functions Here we add the functionality required to support zero-copy Tx, and also expose various zero-copy related functions for the netdevs.
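As a rough sketch of the intended driver flow (the my_* helpers and ring structure below are illustrative placeholders, not part of this series), a zero-copy capable driver pulls Tx descriptors with xsk_umem_consume_tx(), posts them to hardware, and on completion reports the frames back via xsk_umem_complete_tx() and wakes writers with xsk_umem_consume_tx_done():

    #include <net/xdp_sock.h>

    /* Illustrative zero-copy Tx path in a netdev driver. */
    static void my_xsk_tx(struct my_tx_ring *ring, struct xdp_umem *umem)
    {
            dma_addr_t dma;
            u32 len;

            while (my_ring_has_room(ring) &&
                   xsk_umem_consume_tx(umem, &dma, &len))
                    my_post_frame(ring, dma, len);  /* hand frame to HW */
    }

    /* Illustrative Tx completion handler (interrupt/NAPI context). */
    static void my_xsk_tx_done(struct xdp_umem *umem, u32 completed)
    {
            xsk_umem_complete_tx(umem, completed);  /* fill completion ring */
            xsk_umem_consume_tx_done(umem);         /* wake sk_write_space */
    }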
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 9 ++++++ net/xdp/xdp_umem.c | 29 +++++++++++++++-- net/xdp/xdp_umem.h | 8 ++++- net/xdp/xsk.c | 70 +++++++++++++++++++++++++++++++++++++----- net/xdp/xsk_queue.h | 32 ++++++++++++++++++- 5 files changed, 137 insertions(+), 11 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index d93d3aac3fc9..9fe472f2ac95 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,8 @@ struct xdp_umem { struct net_device *dev; u16 queue_id; bool zc; + spinlock_t xsk_list_lock; + struct list_head xsk_list; }; struct xdp_sock { @@ -53,6 +56,8 @@ struct xdp_sock { struct list_head flush_node; u16 queue_id; struct xsk_queue *tx ____cacheline_aligned_in_smp; + struct list_head list; + bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; u64 rx_dropped; @@ -64,8 +69,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +/* Used from netdev driver */ u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); void xsk_umem_discard_addr(struct xdp_umem *umem); +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); +bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len); +void xsk_umem_consume_tx_done(struct xdp_umem *umem); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index f729d79b8d91..7eb4948a38d2 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -17,6 +17,29 @@ #define XDP_UMEM_MIN_CHUNK_SIZE 2048 +void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) +{ + unsigned long flags; + + spin_lock_irqsave(&umem->xsk_list_lock, flags); + list_add_rcu(&xs->list, &umem->xsk_list); + spin_unlock_irqrestore(&umem->xsk_list_lock, flags); +} + +void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) +{ + unsigned long flags; + + if (xs->dev) { + spin_lock_irqsave(&umem->xsk_list_lock, flags); + list_del_rcu(&xs->list); + spin_unlock_irqrestore(&umem->xsk_list_lock, flags); + + if (umem->zc) + synchronize_net(); + } +} + int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u32 queue_id, u16 flags) { @@ -35,7 +58,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, dev_hold(dev); - if (dev->netdev_ops->ndo_bpf) { + if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) { bpf.command = XDP_QUERY_XSK_UMEM; rtnl_lock(); @@ -70,7 +93,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, return force_zc ? 
-ENOTSUPP : 0; /* fail or fallback */ } -void xdp_umem_clear_dev(struct xdp_umem *umem) +static void xdp_umem_clear_dev(struct xdp_umem *umem) { struct netdev_bpf bpf; int err; @@ -283,6 +306,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; + INIT_LIST_HEAD(&umem->xsk_list); + spin_lock_init(&umem->xsk_list_lock); refcount_set(&umem->users, 1); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 674508a32a4d..f11560334f88 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -13,12 +13,18 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); } +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1)); +} + int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u32 queue_id, u16 flags); -void xdp_umem_clear_dev(struct xdp_umem *umem); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); +void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs); +void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs); struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr); #endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 7ebc2db9af02..7ab4adc84a20 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -138,6 +139,59 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return err; } +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +{ + xskq_produce_flush_addr_n(umem->cq, nb_entries); +} +EXPORT_SYMBOL(xsk_umem_complete_tx); + +void xsk_umem_consume_tx_done(struct xdp_umem *umem) +{ + struct xdp_sock *xs; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + xs->sk.sk_write_space(&xs->sk); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(xsk_umem_consume_tx_done); + +bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len) +{ + struct xdp_desc desc; + struct xdp_sock *xs; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + if (!xskq_peek_desc(xs->tx, &desc)) + continue; + + if (xskq_produce_addr_lazy(umem->cq, desc.addr)) + goto out; + + *dma = xdp_umem_get_dma(umem, desc.addr); + *len = desc.len; + + xskq_discard_desc(xs->tx); + rcu_read_unlock(); + return true; + } + +out: + rcu_read_unlock(); + return false; +} +EXPORT_SYMBOL(xsk_umem_consume_tx); + +static int xsk_zc_xmit(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + struct net_device *dev = xs->dev; + + return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id); +} + static void xsk_destruct_skb(struct sk_buff *skb) { u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; @@ -151,7 +205,6 @@ static void xsk_destruct_skb(struct sk_buff *skb) static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, size_t total_len) { - bool need_wait = !(m->msg_flags & MSG_DONTWAIT); u32 max_batch = TX_BATCH_SIZE; struct xdp_sock *xs = xdp_sk(sk); bool sent_frame = false; @@ -161,8 +214,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, if (unlikely(!xs->tx)) return -ENOBUFS; - if (need_wait) - return -EOPNOTSUPP; mutex_lock(&xs->mutex); @@ -192,7 +243,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } - skb = 
sock_alloc_send_skb(sk, len, !need_wait, &err); + skb = sock_alloc_send_skb(sk, len, 1, &err); if (unlikely(!skb)) { err = -EAGAIN; goto out; @@ -235,6 +286,7 @@ out: static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { + bool need_wait = !(m->msg_flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); @@ -242,8 +294,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return -ENXIO; if (unlikely(!(xs->dev->flags & IFF_UP))) return -ENETDOWN; + if (need_wait) + return -EOPNOTSUPP; - return xsk_generic_xmit(sk, m, total_len); + return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); } static unsigned int xsk_poll(struct file *file, struct socket *sock, @@ -419,10 +473,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } xs->dev = dev; - xs->queue_id = sxdp->sxdp_queue_id; - + xs->zc = xs->umem->zc; + xs->queue_id = qid; xskq_set_umem(xs->rx, &xs->umem->props); xskq_set_umem(xs->tx, &xs->umem->props); + xdp_add_sk_umem(xs->umem, xs); out_unlock: if (err) @@ -660,6 +715,7 @@ static void xsk_destruct(struct sock *sk) xskq_destroy(xs->rx); xskq_destroy(xs->tx); + xdp_del_sk_umem(xs->umem, xs); xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5246ed420a16..ef6a6f0ec949 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -11,6 +11,7 @@ #include #define RX_BATCH_SIZE 16 +#define LAZY_UPDATE_THRESHOLD 128 struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; @@ -61,9 +62,14 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) return (entries > dcnt) ? dcnt : entries; } +static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer) +{ + return q->nentries - (producer - q->cons_tail); +} + static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) { - u32 free_entries = q->nentries - (producer - q->cons_tail); + u32 free_entries = xskq_nb_free_lazy(q, producer); if (free_entries >= dcnt) return free_entries; @@ -123,6 +129,9 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0) + return -ENOSPC; + ring->desc[q->prod_tail++ & q->ring_mask] = addr; /* Order producer and data */ @@ -132,6 +141,27 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0) + return -ENOSPC; + + ring->desc[q->prod_head++ & q->ring_mask] = addr; + return 0; +} + +static inline void xskq_produce_flush_addr_n(struct xsk_queue *q, + u32 nb_entries) +{ + /* Order producer and data */ + smp_wmb(); + + q->prod_tail += nb_entries; + WRITE_ONCE(q->ring->producer, q->prod_tail); +} + static inline int xskq_reserve_addr(struct xsk_queue *q) { if (xskq_nb_free(q, q->prod_head, 1) == 0) From 26bfb639e3fa6e8e45ff4b081eab023e097e5ab5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 7 Jun 2018 15:37:34 +0200 Subject: [PATCH 1176/1640] UPSTREAM: xsk: Fix umem fill/completion queue mmap on 32-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc-4.1.2 on 32-bit: net/xdp/xsk.c:663: warning: integer constant is too large for ‘long’ type net/xdp/xsk.c:665: 
warning: integer constant is too large for ‘long’ type Add the missing "ULL" suffixes to the large XDP_UMEM_PGOFF_*_RING values to fix this. net/xdp/xsk.c:663: warning: comparison is always false due to limited range of data type net/xdp/xsk.c:665: warning: comparison is always false due to limited range of data type "unsigned long" is 32-bit on 32-bit systems, hence the offset is truncated, and can never be equal to any of the XDP_UMEM_PGOFF_*_RING values. Use loff_t (and the required cast) to fix this. Fixes: 423f38329d267969 ("xsk: add umem fill queue support and mmap") Fixes: fe2308328cd2f26e ("xsk: add umem completion queue support and mmap") Signed-off-by: Geert Uytterhoeven Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 4 ++-- net/xdp/xsk.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 730b692c04a5..c3b61f4481f3 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -63,8 +63,8 @@ struct xdp_statistics { /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_PGOFF_TX_RING 0x80000000 -#define XDP_UMEM_PGOFF_FILL_RING 0x100000000 -#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL /* Rx/Tx descriptor */ struct xdp_desc { diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 7ab4adc84a20..c2af7a867a0a 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -644,7 +644,7 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, static int xsk_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); struct xsk_queue *q = NULL; From 8b45aebb47f7e3714419869a07b154448fb96f3e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 8 Jun 2018 00:06:01 +0200 Subject: [PATCH 1177/1640] UPSTREAM: bpf, xdp: fix crash in xdp_umem_unaccount_pages syzkaller was able to trigger the following panic for AF_XDP: BUG: KASAN: null-ptr-deref in atomic64_sub include/asm-generic/atomic-instrumented.h:144 [inline] BUG: KASAN: null-ptr-deref in atomic_long_sub include/asm-generic/atomic-long.h:199 [inline] BUG: KASAN: null-ptr-deref in xdp_umem_unaccount_pages.isra.4+0x3d/0x80 net/xdp/xdp_umem.c:135 Write of size 8 at addr 0000000000000060 by task syz-executor246/4527 CPU: 1 PID: 4527 Comm: syz-executor246 Not tainted 4.17.0+ #89 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 kasan_report_error mm/kasan/report.c:352 [inline] kasan_report.cold.7+0x6d/0x2fe mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 kasan_check_write+0x14/0x20 mm/kasan/kasan.c:278 atomic64_sub include/asm-generic/atomic-instrumented.h:144 [inline] atomic_long_sub include/asm-generic/atomic-long.h:199 [inline] xdp_umem_unaccount_pages.isra.4+0x3d/0x80 net/xdp/xdp_umem.c:135 xdp_umem_reg net/xdp/xdp_umem.c:334 [inline] xdp_umem_create+0xd6c/0x10f0 net/xdp/xdp_umem.c:349 xsk_setsockopt+0x443/0x550 net/xdp/xsk.c:531 __sys_setsockopt+0x1bd/0x390 net/socket.c:1935 __do_sys_setsockopt net/socket.c:1946 [inline] __se_sys_setsockopt 
net/socket.c:1943 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1943 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe In xdp_umem_reg() the call to xdp_umem_account_pages() passed with CAP_IPC_LOCK where we didn't need to end up charging rlimit on memlock for the current user and therefore umem->user continues to be NULL. Later on through fault injection syzkaller triggered a failure in either umem->pgs or umem->pages allocation such that we bail out and undo accounting in xdp_umem_unaccount_pages() where we eventually hit the panic since it tries to deref the umem->user. The code is pretty close to mm_account_pinned_pages() and mm_unaccount_pinned_pages() pair and potentially could reuse it even in a later cleanup, and it appears that the initial commit c0c77d8fb787 ("xsk: add user memory registration support sockopt") got this right while later follow-up introduced the bug via a49049ea2576 ("xsk: simplified umem setup"). Fixes: a49049ea2576 ("xsk: simplified umem setup") Reported-by: syzbot+979217770b09ebf5c407@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- net/xdp/xdp_umem.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 7eb4948a38d2..b9ef487c4618 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -132,8 +132,10 @@ static void xdp_umem_unpin_pages(struct xdp_umem *umem) static void xdp_umem_unaccount_pages(struct xdp_umem *umem) { - atomic_long_sub(umem->npgs, &umem->user->locked_vm); - free_uid(umem->user); + if (umem->user) { + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); + } } static void xdp_umem_release(struct xdp_umem *umem) From 1ebbfddfa562aa73608ba380c29a8f2a156c2fb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 11 Jun 2018 13:57:12 +0200 Subject: [PATCH 1178/1640] UPSTREAM: xsk: silence warning on memory allocation failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzkaller reported a warning from xdp_umem_pin_pages(): WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996 ... __do_kmalloc mm/slab.c:3713 [inline] __kmalloc+0x25/0x760 mm/slab.c:3727 kmalloc_array include/linux/slab.h:634 [inline] kcalloc include/linux/slab.h:645 [inline] xdp_umem_pin_pages net/xdp/xdp_umem.c:205 [inline] xdp_umem_reg net/xdp/xdp_umem.c:318 [inline] xdp_umem_create+0x5c9/0x10f0 net/xdp/xdp_umem.c:349 xsk_setsockopt+0x443/0x550 net/xdp/xsk.c:531 __sys_setsockopt+0x1bd/0x390 net/socket.c:1935 __do_sys_setsockopt net/socket.c:1946 [inline] __se_sys_setsockopt net/socket.c:1943 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1943 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe This is a warning about attempting to allocate more than KMALLOC_MAX_SIZE memory. The request originates from userspace, and if the request is too big, the kernel is free to deny its allocation. In this patch, the failed allocation attempt is silenced with __GFP_NOWARN. 
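The pattern is general: when an allocation size comes straight from user space, an over-large request is an expected, recoverable failure rather than a kernel bug, so the warning is suppressed and the error returned to the caller instead. A minimal illustration (not taken from this patch):

    struct foo **entries;

    /* nentries is user-controlled; a huge value must fail cleanly
     * instead of triggering the allocation-failure splat.
     */
    entries = kcalloc(nentries, sizeof(*entries),
                      GFP_KERNEL | __GFP_NOWARN);
    if (!entries)
            return -ENOMEM; /* reported to the caller, not to dmesg */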
Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Reported-by: syzbot+4abadc5d69117b346506@syzkaller.appspotmail.com Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index b9ef487c4618..f47abb46c587 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -204,7 +204,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) long npgs; int err; - umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL); + umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), + GFP_KERNEL | __GFP_NOWARN); if (!umem->pgs) return -ENOMEM; From 8ac8325dbd442e995cde8c53d978b654563d3bcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 12 Jun 2018 12:02:56 +0200 Subject: [PATCH 1179/1640] UPSTREAM: xsk: re-add queue id check for XDP_SKB path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 173d3adb6f43 ("xsk: add zero-copy support for Rx") introduced a regression on the XDP_SKB receive path, when the queue id checks were removed. Now, they are back again. Fixes: 173d3adb6f43 ("xsk: add zero-copy support for Rx") Reported-by: Qi Zhang Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c2af7a867a0a..9374c94a2c95 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -118,6 +118,9 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) u64 addr; int err; + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + if (!xskq_peek_addr(xs->umem->fq, &addr) || len > xs->umem->chunk_size_nohr) { xs->rx_dropped++; From 0bf5bf2385b548046a9d85e7a941acf9474b7ba9 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 29 Jun 2018 09:48:17 +0200 Subject: [PATCH 1180/1640] UPSTREAM: xsk: fix potential lost completion message in SKB path The code in xskq_produce_addr erroneously checked if there was up to LAZY_UPDATE_THRESHOLD amount of space in the completion queue. It only needs to check if there is one slot left in the queue. This bug could under some circumstances lead to a WARN_ON_ONCE being triggered and the completion message to user space being lost. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Reported-by: Pavel Odintsov Signed-off-by: Alexei Starovoitov --- net/xdp/xsk_queue.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index ef6a6f0ec949..52ecaf770642 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -62,14 +62,9 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) return (entries > dcnt) ? 
dcnt : entries; } -static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer) -{ - return q->nentries - (producer - q->cons_tail); -} - static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) { - u32 free_entries = xskq_nb_free_lazy(q, producer); + u32 free_entries = q->nentries - (producer - q->cons_tail); if (free_entries >= dcnt) return free_entries; @@ -129,7 +124,7 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0) + if (xskq_nb_free(q, q->prod_tail, 1) == 0) return -ENOSPC; ring->desc[q->prod_tail++ & q->ring_mask] = addr; From aa5ab4e2abeb77a9b12d22a893c8a632e37ef75d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 29 Jun 2018 09:48:18 +0200 Subject: [PATCH 1181/1640] UPSTREAM: xsk: frame could be completed more than once in SKB path Fixed a bug in which a frame could be completed more than once when an error was returned from dev_direct_xmit(). The code erroneously retried sending the message, leading to multiple calls to the SKB destructor and therefore multiple completions of the same buffer to user space. The error code in this case has been changed from EAGAIN to EBUSY in order to tell user space that the sending of the packet failed and the buffer has been returned to user space through the completion queue. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Reported-by: Pavel Odintsov Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9374c94a2c95..47e55bdd22b9 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -268,15 +268,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); + xskq_discard_desc(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { - err = -EAGAIN; - /* SKB consumed by dev_direct_xmit() */ + /* SKB completed but not sent */ + err = -EBUSY; goto out; } sent_frame = true; - xskq_discard_desc(xs->tx); } out: From c3566667cb58badf928747546f6d878b77b8f2ca Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 29 Jun 2018 09:48:20 +0200 Subject: [PATCH 1182/1640] UPSTREAM: xsk: fix potential race in SKB TX completion code There is a potential race in the TX completion code for the SKB case. One process enters the sendmsg code of an AF_XDP socket in order to send a frame. The execution eventually trickles down to the driver that is told to send the packet. However, it decides to drop the packet due to some error condition (e.g., rings full) and frees the SKB. This will trigger the SKB destructor and a completion will be sent to the AF_XDP user space through its single-producer/single-consumer queues. At the same time a TX interrupt has fired on another core and it dispatches the TX completion code in the driver. It does its HW specific things and ends up freeing the SKB associated with the transmitted packet. This will trigger the SKB destructor and a completion will be sent to the AF_XDP user space through its single-producer/single-consumer queues. With a pseudo call stack, it would look like this:

Core 1:
  sendmsg() being called in the application
  netdev_start_xmit()
  Driver entered through ndo_start_xmit
  Driver decides to free the SKB for some reason (e.g., rings full)
  Destructor of SKB called
  xskq_produce_addr() is called to signal completion to user space

Core 2:
  TX completion irq
  NAPI loop
  Driver irq handler for TX completions
  Frees the SKB
  Destructor of SKB called
  xskq_produce_addr() is called to signal completion to user space

We now have a violation of the single-producer/single-consumer principle for our queues as there are two threads trying to produce at the same time on the same queue. Fixed by introducing a spin_lock in the destructor. Regarding performance, I get around 1.74 Mpps for txonly before and after the introduction of the spinlock. There is of course some impact due to the spin lock, but it is in the less significant digits, which are too noisy for me to measure. But let us say that the version without the spin lock got 1.745 Mpps in the best case and the version with 1.735 Mpps in the worst case, then that would mean a maximum drop in performance of 0.5%. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 4 ++++ net/xdp/xsk.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 9fe472f2ac95..7161856bcf9c 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -60,6 +60,10 @@ struct xdp_sock { bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; + /* Mutual exclusion of NAPI TX thread and sendmsg error paths + * in the SKB destructor callback. + */ + spinlock_t tx_completion_lock; u64 rx_dropped; }; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 47e55bdd22b9..affc05d0fcc3 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -199,8 +199,11 @@ static void xsk_destruct_skb(struct sk_buff *skb) { u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; struct xdp_sock *xs = xdp_sk(skb->sk); + unsigned long flags; + spin_lock_irqsave(&xs->tx_completion_lock, flags); WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); + spin_unlock_irqrestore(&xs->tx_completion_lock, flags); sock_wfree(skb); } @@ -757,6 +760,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs = xdp_sk(sk); mutex_init(&xs->mutex); + spin_lock_init(&xs->tx_completion_lock); local_bh_disable(); sock_prot_inuse_add(net, &xsk_proto, 1); From afe478cb755fc49a4e7e807ee31a742c8b70fa10 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 11 Jul 2018 10:12:49 +0200 Subject: [PATCH 1183/1640] UPSTREAM: xsk: do not return ENXIO from TX copy mode This patch removes the ENXIO return code from TX copy-mode when someone has forcefully changed the number of queues on the device so that the queue bound to the socket is no longer available. Just silently stop sending anything, as in zero-copy mode, so the error reporting becomes consistent between the two modes.
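Condensed, the resulting check in xsk_generic_xmit() is simply (a sketch of the logic in the hunk below):

    /* Queue was resized away under us: stop quietly, as zero-copy does. */
    if (xs->queue_id >= xs->dev->real_num_tx_queues)
            goto out;       /* err stays 0 instead of -ENXIO */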
Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index affc05d0fcc3..aa81b4701669 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -244,10 +244,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } - if (xs->queue_id >= xs->dev->real_num_tx_queues) { - err = -ENXIO; + if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; - } skb = sock_alloc_send_skb(sk, len, 1, &err); if (unlikely(!skb)) { From 146f122fe15083e1468122f37d5a2a3c40d2f5c8 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 11 Jul 2018 10:12:50 +0200 Subject: [PATCH 1184/1640] UPSTREAM: xsk: do not return EAGAIN from sendmsg when completion queue is full This patch stops returning EAGAIN in TX copy mode when the completion queue is full as zero-copy does not do this. Instead this situation can be detected by comparing the head and tail pointers of the completion queue in both modes. In any case, EAGAIN was not the correct error code here since no amount of calling sendmsg will solve the problem. Only consuming one or more messages on the completion queue will fix this. With this patch, the error reporting becomes consistent between copy mode and zero-copy mode. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index aa81b4701669..6a020017ce7e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -233,10 +233,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } - if (xskq_reserve_addr(xs->umem->cq)) { - err = -EAGAIN; + if (xskq_reserve_addr(xs->umem->cq)) goto out; - } len = desc.len; if (unlikely(len > xs->dev->mtu)) { From cf1583acad2011fd8fad988724840e6a7ed46c74 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 11 Jul 2018 10:12:51 +0200 Subject: [PATCH 1185/1640] UPSTREAM: xsk: always return ENOBUFS from sendmsg if there is no TX queue This patch makes sure ENOBUFS is always returned from sendmsg if there is no TX queue configured. This was not the case for zero-copy mode. With this patch this error reporting is consistent between copy mode and zero-copy mode. Fixes: ac98d8aab61b ("xsk: wire upp Tx zero-copy functions") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 6a020017ce7e..b3a334577b74 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -218,9 +218,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, struct sk_buff *skb; int err = 0; - if (unlikely(!xs->tx)) - return -ENOBUFS; - mutex_lock(&xs->mutex); while (xskq_peek_desc(xs->tx, &desc)) { @@ -296,6 +293,8 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return -ENXIO; if (unlikely(!(xs->dev->flags & IFF_UP))) return -ENETDOWN; + if (unlikely(!xs->tx)) + return -ENOBUFS; if (need_wait) return -EOPNOTSUPP; From 6afd42a1aebca8cf0ca554b9932fee94fd2b1c90 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 11 Jul 2018 10:12:52 +0200 Subject: [PATCH 1186/1640] UPSTREAM: xsk: do not return EMSGSIZE in copy mode for packets larger than MTU This patch stops returning EMSGSIZE from sendmsg in copy mode when the size of the packet is larger than the MTU. 
Just send it to the device so that it will drop it as in zero-copy mode. This makes the error reporting consistent between copy mode and zero-copy mode. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b3a334577b74..a12b34056911 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -233,15 +233,10 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, if (xskq_reserve_addr(xs->umem->cq)) goto out; - len = desc.len; - if (unlikely(len > xs->dev->mtu)) { - err = -EMSGSIZE; - goto out; - } - if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; + len = desc.len; skb = sock_alloc_send_skb(sk, len, 1, &err); if (unlikely(!skb)) { err = -EAGAIN; From 5591a2592a7905234b9fefa11d0ba035b89ef091 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 23 Jul 2018 11:43:03 +0200 Subject: [PATCH 1187/1640] UPSTREAM: xsk: fix poll/POLLIN premature returns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Polling for the ingress queues relies on reading the producer/consumer pointers of the Rx queue. Prior to this commit, a cached consumer pointer could be used instead of the actual consumer pointer, and POLLIN could therefore be reported prematurely. This patch makes sure that the non-cached consumer pointer is used instead. Reported-by: Qi Zhang Tested-by: Qi Zhang Fixes: c497176cb2e4 ("xsk: add Rx receive functions and poll support") Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk_queue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 52ecaf770642..8a64b150be54 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -250,7 +250,7 @@ static inline bool xskq_full_desc(struct xsk_queue *q) static inline bool xskq_empty_desc(struct xsk_queue *q) { - return xskq_nb_free(q, q->prod_tail, 1) == q->nentries; + return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries; } void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); From 960807f77735967e1d3cdcab9cba1afe71cdc0b4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 27 Jul 2018 20:20:08 -0700 Subject: [PATCH 1188/1640] UPSTREAM: net: xsk: don't return frames via the allocator on error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xdp_return_buff() is used when a frame has been successfully handled (transmitted) or if an error occurred during delayed processing and there is no way to report it back to xdp_do_redirect(). In the case of __xsk_rcv_zc(), the error is propagated all the way back to the driver, so there is no need to call xdp_return_buff(). The driver will recycle the frame anyway after seeing that an error happened.
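From the driver's perspective the contract looks roughly like this (the helper name and ring variable are placeholders):

    /* Zero-copy Rx: an error from the XSK path comes back through
     * xdp_do_redirect(), and the buffer is still owned by the driver.
     */
    err = xdp_do_redirect(netdev, &xdp, xdp_prog);
    if (err)
            my_reuse_rx_buffer(rx_ring, buf);   /* recycle in place */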
Fixes: 173d3adb6f43 ("xsk: add zero-copy support for Rx") Signed-off-by: Jakub Kicinski Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a12b34056911..661504042d30 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -84,10 +84,8 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); - if (err) { - xdp_return_buff(xdp); + if (err) xs->rx_dropped++; - } return err; } From fe8d10a87da1f386f94ff1ff680826cf8c17026f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 30 Jul 2018 20:43:52 -0700 Subject: [PATCH 1189/1640] UPSTREAM: xsk: refactor xdp_umem_assign_dev() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return early and only take the ref on dev once there is no possibility of failing. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Björn Töpel Signed-off-by: David S. Miller --- net/xdp/xdp_umem.c | 61 ++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index f47abb46c587..c199d66b5f3f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -56,41 +56,34 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, if (force_copy) return 0; + if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit) + return force_zc ? -ENOTSUPP : 0; /* fail or fallback */ + + bpf.command = XDP_QUERY_XSK_UMEM; + + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); + + if (err) + return force_zc ? -ENOTSUPP : 0; + + bpf.command = XDP_SETUP_XSK_UMEM; + bpf.xsk.umem = umem; + bpf.xsk.queue_id = queue_id; + + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); + + if (err) + return force_zc ? err : 0; /* fail or fallback */ + dev_hold(dev); - - if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) { - bpf.command = XDP_QUERY_XSK_UMEM; - - rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); - - if (err) { - dev_put(dev); - return force_zc ? -ENOTSUPP : 0; - } - - bpf.command = XDP_SETUP_XSK_UMEM; - bpf.xsk.umem = umem; - bpf.xsk.queue_id = queue_id; - - rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); - - if (err) { - dev_put(dev); - return force_zc ? err : 0; /* fail or fallback */ - } - - umem->dev = dev; - umem->queue_id = queue_id; - umem->zc = true; - return 0; - } - - dev_put(dev); - return force_zc ? -ENOTSUPP : 0; /* fail or fallback */ + umem->dev = dev; + umem->queue_id = queue_id; + umem->zc = true; + return 0; } static void xdp_umem_clear_dev(struct xdp_umem *umem) From a4a14a17e14f78bb63d733d1be972f00c2befa22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:53 +0200 Subject: [PATCH 1190/1640] UPSTREAM: net: xdp: added bpf_netdev_command XDP_{QUERY, SETUP}_XSK_UMEM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend ndo_bpf with two new commands, used to query zero-copy support and to register a UMEM with a queue_id of a netdev.
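A driver opts in by handling the two commands in its ndo_bpf() callback, along these lines (a sketch; the my_* names are placeholders and error handling is elided):

    static int my_ndo_bpf(struct net_device *dev, struct netdev_bpf *bpf)
    {
            struct my_priv *priv = netdev_priv(dev);

            switch (bpf->command) {
            case XDP_QUERY_XSK_UMEM:
                    return 0;       /* zero-copy umems are supported */
            case XDP_SETUP_XSK_UMEM:
                    /* bpf->xsk.umem is NULL when the queue is unbound */
                    return my_setup_umem(priv, bpf->xsk.umem,
                                         bpf->xsk.queue_id);
            default:
                    return -EINVAL;
            }
    }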
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d9c5b6f4167c..74e94c7563d0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -816,10 +816,13 @@ enum bpf_netdev_command { /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, + XDP_QUERY_XSK_UMEM, + XDP_SETUP_XSK_UMEM, }; struct bpf_prog_offload_ops; struct netlink_ext_ack; +struct xdp_umem; struct netdev_bpf { enum bpf_netdev_command command; @@ -841,6 +844,11 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; + /* XDP_SETUP_XSK_UMEM */ + struct { + struct xdp_umem *umem; + u16 queue_id; + } xsk; }; }; From f0b883949f4e9674288ae10db94ecd37139633ec Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 Dec 2017 15:08:55 -0800 Subject: [PATCH 1191/1640] UPSTREAM: net: xdp: avoid output parameters when querying XDP prog The output parameters will get unwieldy if we want to add more information about the program. Simply pass the entire struct netdev_bpf in. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/netdevice.h | 3 ++- net/core/dev.c | 24 ++++++++++++++---------- net/core/rtnetlink.c | 6 +++++- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 74e94c7563d0..2f0c8dfdf15d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3447,7 +3447,8 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); -u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t xdp_op, u32 *prog_id); +void __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, + struct netdev_bpf *xdp); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index dc3348f185ed..f50905af7c75 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7226,17 +7226,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) +void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, + struct netdev_bpf *xdp) +{ + memset(xdp, 0, sizeof(*xdp)); + xdp->command = XDP_QUERY_PROG; + + /* Query must always succeed. */ + WARN_ON(bpf_op(dev, xdp) < 0); +} + +static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) { struct netdev_bpf xdp; - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; - - /* Query must always succeed. 
*/ - WARN_ON(bpf_op(dev, &xdp) < 0); - if (prog_id) - *prog_id = xdp.prog_id; + __dev_xdp_query(dev, bpf_op, &xdp); return xdp.prog_attached; } @@ -7287,10 +7291,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_chk = generic_xdp_install; if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) + if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op, NULL)) + __dev_xdp_attached(dev, bpf_op)) return -EBUSY; prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ad6a31c485f5..781927f6df5e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1235,6 +1235,7 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) { const struct net_device_ops *ops = dev->netdev_ops; const struct bpf_prog *generic_xdp_prog; + struct netdev_bpf xdp; ASSERT_RTNL(); @@ -1247,7 +1248,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) if (!ops->ndo_bpf) return XDP_ATTACHED_NONE; - return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id); + __dev_xdp_query(dev, ops->ndo_bpf, &xdp); + *prog_id = xdp.prog_id; + + return xdp.prog_attached; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) From bc262310b89d2c3cda1df6cbd14d359b4909b531 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 8 May 2018 09:07:02 -0700 Subject: [PATCH 1192/1640] UPSTREAM: net: Update generic_xdp_needed static key to modern api No changes in refcount semantics -- key init is false; replace:

  static_key_slow_inc|dec  with  static_branch_inc|dec
  static_key_false         with  static_branch_unlikely

Added a '_key' suffix to generic_xdp_needed, for better self-documentation. Signed-off-by: Davidlohr Bueso Signed-off-by: David S. Miller --- net/core/dev.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index f50905af7c75..29f944631a02 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4082,7 +4082,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) } EXPORT_SYMBOL_GPL(generic_xdp_tx); -static struct static_key generic_xdp_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { @@ -4122,7 +4122,7 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); - if (static_key_false(&generic_xdp_needed)) { + if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret; preempt_disable(); @@ -4679,9 +4679,9 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) bpf_prog_put(old); if (old && !new) { - static_key_slow_dec(&generic_xdp_needed); + static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { - static_key_slow_inc(&generic_xdp_needed); + static_branch_inc(&generic_xdp_needed_key); dev_disable_lro(dev); } break; @@ -4708,7 +4708,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; - if (static_key_false(&generic_xdp_needed)) { + if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret; preempt_disable(); From ad10baa64c72a549c55ff76d0e056dd65c9437a5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 Dec 2017 15:08:57 -0800 Subject: [PATCH 1193/1640] BACKPORT: net: xdp: make the stack take care of the tear down Since day one, XDP drivers have had to remember to free the program on the remove path. This leads to code duplication and is error prone. Make the stack query the installed programs on unregister and, if something is installed, remove the program. Freeing of the program attached to XDP generic is moved from free_netdev() as well. Because the remove will now be called before notifiers are invoked, BPF offload state of the program will not get destroyed before uninstall.
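The practical consequence for driver authors is that boilerplate like the following can be deleted from remove paths (as the hunks below do; priv->xdp_prog is a placeholder for the driver's own field), so long as the driver's ndo_bpf handler keeps accepting a NULL program for XDP_SETUP_PROG, which the core now issues automatically during unregister:

    /* No longer needed in a driver's remove path: */
    if (priv->xdp_prog)
            bpf_prog_put(priv->xdp_prog);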
Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 -- .../net/ethernet/mellanox/mlx5/core/en_main.c | 3 -- drivers/net/ethernet/qlogic/qede/qede_main.c | 4 --- drivers/net/tun.c | 4 --- net/core/dev.c | 29 ++++++++++++++----- 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index df6e76e5d414..0f962cb2ca2e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7830,8 +7830,6 @@ static void bnxt_remove_one(struct pci_dev *pdev) bnxt_dcb_free(bp); kfree(bp->edev); bp->edev = NULL; - if (bp->xdp_prog) - bpf_prog_put(bp->xdp_prog); bnxt_cleanup_pci(bp); free_netdev(dev); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 707c87f9987c..adfce014c6a3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4174,9 +4174,6 @@ static void mlx5e_nic_cleanup(struct mlx5e_priv *priv) { mlx5e_ipsec_cleanup(priv); mlx5e_vxlan_cleanup(priv); - - if (priv->channels.params.xdp_prog) - bpf_prog_put(priv->channels.params.xdp_prog); } static int mlx5e_init_nic_rx(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index f6bf8efd9fe9..cf01fdfda5ff 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1089,10 +1089,6 @@ static void __qede_remove(struct pci_dev *pdev, enum qede_remove_mode mode) pci_set_drvdata(pdev, NULL); - /* Release edev's reference to XDP's bpf if such exist */ - if (edev->xdp_prog) - bpf_prog_put(edev->xdp_prog); - /* Use global ops since we've freed edev */ qed_ops->common->slowpath_stop(cdev); if (system_state == SYSTEM_POWER_OFF) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 757dff1c7216..1e64384146f2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -632,7 +632,6 @@ static void tun_detach(struct tun_file *tfile, bool clean) static void tun_detach_all(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); - struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog); struct tun_file *tfile, *tmp; int i, n = tun->numqueues; @@ -665,9 +664,6 @@ static void tun_detach_all(struct net_device *dev) } BUG_ON(tun->numdisabled != 0); - if (xdp_prog) - bpf_prog_put(xdp_prog); - if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } diff --git a/net/core/dev.c b/net/core/dev.c index 29f944631a02..f95ed1fd909b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7263,6 +7263,27 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, return bpf_op(dev, &xdp); } +static void dev_xdp_uninstall(struct net_device *dev) +{ + struct netdev_bpf xdp; + bpf_op_t ndo_bpf; + + /* Remove generic XDP */ + WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL)); + + /* Remove from the driver */ + ndo_bpf = dev->netdev_ops->ndo_bpf; + if (!ndo_bpf) + return; + + __dev_xdp_query(dev, ndo_bpf, &xdp); + if (xdp.prog_attached == XDP_ATTACHED_NONE) + return; + + /* Program removal should always succeed */ + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -7386,6 +7407,7 @@ static void 
rollback_registered_many(struct list_head *head) /* Shutdown queueing discipline. */ dev_shutdown(dev); + dev_xdp_uninstall(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. @@ -8406,7 +8428,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs); void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; - struct bpf_prog *prog; might_sleep(); netif_free_tx_queues(dev); @@ -8423,12 +8444,6 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; - prog = rcu_dereference_protected(dev->xdp_prog, 1); - if (prog) { - bpf_prog_put(prog); - static_key_slow_dec(&generic_xdp_needed); - } - /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); From 23a65838517694f38f02278ca672e99b4a821eaa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:39 -0700 Subject: [PATCH 1194/1640] BACKPORT: xdp: don't make drivers report attachment mode prog_attached of struct netdev_bpf should have been superseded by simply setting prog_id a long time ago, but we kept it around to allow offloading drivers to communicate attachment mode (drv vs hw). Subsequently drivers were also allowed to report back attachment flags (prog_flags), and since nowadays only programs attached with XDP_FLAGS_HW_MODE can get offloaded, we can tell the attachment mode from the flags the driver reports. Remove the prog_attached member. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 1 - drivers/net/ethernet/cavium/thunder/nicvf_main.c | 1 - drivers/net/ethernet/intel/i40e/i40e_main.c | 1 - drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 1 - drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 - drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 3 --- drivers/net/ethernet/qlogic/qede/qede_filter.c | 1 - drivers/net/tun.c | 1 - drivers/net/virtio_net.c | 1 - include/linux/netdevice.h | 5 ----- net/core/dev.c | 7 +++---- net/core/rtnetlink.c | 8 ++++++-- 13 files changed, 9 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 261e5847557a..476fc871c585 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -218,7 +218,6 @@ int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) rc = bnxt_xdp_set(bp, xdp->prog); break; case XDP_QUERY_PROG: - xdp->prog_attached = !!bp->xdp_prog; xdp->prog_id = bp->xdp_prog ? bp->xdp_prog->aux->id : 0; rc = 0; break; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 8f952a92f172..82791bfb3c05 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1788,7 +1788,6 @@ static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: return nicvf_xdp_setup(nic, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = !!nic->xdp_prog; xdp->prog_id = nic->xdp_prog ?
nic->xdp_prog->aux->id : 0; return 0; default: diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 751c931fe184..ca08e13a898a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -9662,7 +9662,6 @@ static int i40e_xdp(struct net_device *dev, case XDP_SETUP_PROG: return i40e_xdp_setup(vsi, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = i40e_enabled_xdp_vsi(vsi); xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0; return 0; default: diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index feda171bf323..47c14dd3bf81 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9894,7 +9894,6 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: return ixgbe_xdp_setup(dev, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = !!(adapter->xdp_prog); xdp->prog_id = adapter->xdp_prog ? adapter->xdp_prog->aux->id : 0; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 1bc7e3497a1a..59cf6493b261 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2937,7 +2937,6 @@ static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp) return mlx4_xdp_set(dev, xdp->prog); case XDP_QUERY_PROG: xdp->prog_id = mlx4_xdp_query(dev); - xdp->prog_attached = !!xdp->prog_id; return 0; default: return -EINVAL; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index adfce014c6a3..307c067cd51d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3742,7 +3742,6 @@ static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) return mlx5e_xdp_set(dev, xdp->prog); case XDP_QUERY_PROG: xdp->prog_id = mlx5e_xdp_query(dev); - xdp->prog_attached = !!xdp->prog_id; return 0; default: return -EINVAL; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 411a29cf84ac..95f39434f304 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3433,9 +3433,6 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags, xdp->extack); case XDP_QUERY_PROG: - xdp->prog_attached = !!nn->xdp_prog; - if (nn->dp.bpf_offload_xdp) - xdp->prog_attached = XDP_ATTACHED_HW; xdp->prog_id = nn->xdp_prog ? nn->xdp_prog->aux->id : 0; xdp->flags = nn->xdp_prog ? nn->xdp_flags : 0; return 0; diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 924cb2ea664d..6a30751a7c84 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1073,7 +1073,6 @@ int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: return qede_xdp_set(edev, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = !!edev->xdp_prog; xdp->prog_id = edev->xdp_prog ? 
edev->xdp_prog->aux->id : 0; return 0; default: diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 1e64384146f2..ece060edc280 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1080,7 +1080,6 @@ static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) return tun_xdp_set(dev, xdp->prog, xdp->extack); case XDP_QUERY_PROG: xdp->prog_id = tun_xdp_query(dev); - xdp->prog_attached = !!xdp->prog_id; return 0; default: return -EINVAL; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 339d6c0b162a..91871e8bfd47 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2104,7 +2104,6 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) return virtnet_xdp_set(dev, xdp->prog, xdp->extack); case XDP_QUERY_PROG: xdp->prog_id = virtnet_xdp_query(dev); - xdp->prog_attached = !!xdp->prog_id; return 0; default: return -EINVAL; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2f0c8dfdf15d..1c65902581ee 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -808,10 +808,6 @@ enum bpf_netdev_command { */ XDP_SETUP_PROG, XDP_SETUP_PROG_HW, - /* Check if a bpf program is set on the device. The callee should - * set @prog_attached to one of XDP_ATTACHED_* values, note that "true" - * is equivalent to XDP_ATTACHED_DRV. - */ XDP_QUERY_PROG, /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_MAP_ALLOC, @@ -835,7 +831,6 @@ struct netdev_bpf { }; /* XDP_QUERY_PROG */ struct { - u8 prog_attached; u32 prog_id; /* flags with which program was installed */ u32 prog_flags; diff --git a/net/core/dev.c b/net/core/dev.c index f95ed1fd909b..0b7e0dd82b9e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4687,7 +4687,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) break; case XDP_QUERY_PROG: - xdp->prog_attached = !!old; xdp->prog_id = old ? 
old->aux->id : 0; break; @@ -7236,13 +7235,13 @@ void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, WARN_ON(bpf_op(dev, xdp) < 0); } -static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) +static bool __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) { struct netdev_bpf xdp; __dev_xdp_query(dev, bpf_op, &xdp); - return xdp.prog_attached; + return xdp.prog_id; } static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, @@ -7277,7 +7276,7 @@ static void dev_xdp_uninstall(struct net_device *dev) return; __dev_xdp_query(dev, ndo_bpf, &xdp); - if (xdp.prog_attached == XDP_ATTACHED_NONE) + if (!xdp.prog_id) return; /* Program removal should always succeed */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 781927f6df5e..29bd9ff6aca1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1249,9 +1249,13 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) return XDP_ATTACHED_NONE; __dev_xdp_query(dev, ops->ndo_bpf, &xdp); - *prog_id = xdp.prog_id; + if (!xdp.prog_id) + return XDP_ATTACHED_NONE; - return xdp.prog_attached; + *prog_id = xdp.prog_id; + if (xdp.prog_flags & XDP_FLAGS_HW_MODE) + return XDP_ATTACHED_HW; + return XDP_ATTACHED_DRV; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) From 858ae0215fa291b04afa08599dda89cd3ce917c8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:38 -0700 Subject: [PATCH 1195/1640] UPSTREAM: xdp: add per mode attributes for attached programs In preparation for supporting simultaneous driver and hardware XDP attachment, add per-mode attributes. The catch-all IFLA_XDP_PROG_ID will still be reported, but user space can now also access the program ID in a new IFLA_XDP_<mode>_PROG_ID attribute. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_link.h | 3 +++ net/core/rtnetlink.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6603c3a867c8..6f008b9bc563 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -923,6 +923,9 @@ enum { IFLA_XDP_ATTACHED, IFLA_XDP_FLAGS, IFLA_XDP_PROG_ID, + IFLA_XDP_DRV_PROG_ID, + IFLA_XDP_SKB_PROG_ID, + IFLA_XDP_HW_PROG_ID, __IFLA_XDP_MAX, }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 29bd9ff6aca1..b783ac9bae8c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -881,7 +881,8 @@ static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ - nla_total_size(4); /* XDP_PROG_ID */ + nla_total_size(4) + /* XDP_PROG_ID */ + nla_total_size(4); /* XDP_<mode>_PROG_ID */ return xdp_size; } @@ -1260,16 +1261,17 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { + u32 prog_attr, prog_id; struct nlattr *xdp; - u32 prog_id; int err; + u8 mode; xdp = nla_nest_start(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; - err = nla_put_u8(skb, IFLA_XDP_ATTACHED, - rtnl_xdp_attached_mode(dev, &prog_id)); + mode = rtnl_xdp_attached_mode(dev, &prog_id); + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; @@ -1277,6 +1279,26 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; + + switch (mode) { + case XDP_ATTACHED_DRV: +
prog_attr = IFLA_XDP_DRV_PROG_ID; + break; + case XDP_ATTACHED_SKB: + prog_attr = IFLA_XDP_SKB_PROG_ID; + break; + case XDP_ATTACHED_HW: + prog_attr = IFLA_XDP_HW_PROG_ID; + break; + case XDP_ATTACHED_NONE: + default: + err = -EINVAL; + goto err_cancel; + } + + err = nla_put_u32(skb, prog_attr, prog_id); + if (err) + goto err_cancel; } nla_nest_end(skb, xdp); From 70c37291583c3491a2264dd24b3e7eb596728f9e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:41 -0700 Subject: [PATCH 1196/1640] BACKPORT: xdp: support simultaneous driver and hw XDP attachment Split the query of the HW-attached program from the software one. Introduce a new .ndo_bpf command to query the HW-attached program. This will allow drivers to install different programs in HW and SW at the same time. Netlink can now also carry multiple programs on dump (in which case mode will be set to XDP_ATTACHED_MULTI and the user has to check the per-attachment-point attributes; IFLA_XDP_PROG_ID will not be present). We reuse IFLA_XDP_PROG_ID skb space for the second mode, so rtnl_xdp_size() doesn't need to be updated. Note that the installation side is still not there, since all drivers currently reject installing more than one program at a time. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/netdevice.h | 7 +-- include/uapi/linux/if_link.h | 1 + net/core/dev.c | 47 ++++++++++-------- net/core/rtnetlink.c | 93 ++++++++++++++++++++---------------- 4 files changed, 85 insertions(+), 63 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1c65902581ee..b9566202d38a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -809,6 +809,7 @@ enum bpf_netdev_command { XDP_SETUP_PROG, XDP_SETUP_PROG_HW, XDP_QUERY_PROG, + XDP_QUERY_PROG_HW, /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, @@ -829,7 +830,7 @@ struct netdev_bpf { struct bpf_prog *prog; struct netlink_ext_ack *extack; }; - /* XDP_QUERY_PROG */ + /* XDP_QUERY_PROG, XDP_QUERY_PROG_HW */ struct { u32 prog_id; /* flags with which program was installed */ @@ -3442,8 +3443,8 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); -void __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, - struct netdev_bpf *xdp); +u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, + enum bpf_netdev_command cmd); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6f008b9bc563..49a04cad14b2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -915,6 +915,7 @@ enum { XDP_ATTACHED_DRV, XDP_ATTACHED_SKB, XDP_ATTACHED_HW, + XDP_ATTACHED_MULTI, }; enum { diff --git a/net/core/dev.c b/net/core/dev.c index 0b7e0dd82b9e..0e58e5b05a9c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7225,21 +7225,19 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, - struct netdev_bpf *xdp) -{ - memset(xdp, 0, sizeof(*xdp)); - xdp->command = XDP_QUERY_PROG; - - /* Query must always succeed.
*/ - WARN_ON(bpf_op(dev, xdp) < 0); } -static bool __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) +u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, + enum bpf_netdev_command cmd) { struct netdev_bpf xdp; - __dev_xdp_query(dev, bpf_op, &xdp); + if (!bpf_op) + return 0; + + memset(&xdp, 0, sizeof(xdp)); + xdp.command = cmd; + + /* Query must always succeed. */ + WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG); return xdp.prog_id; } @@ -7275,12 +7273,19 @@ static void dev_xdp_uninstall(struct net_device *dev) if (!ndo_bpf) return; - __dev_xdp_query(dev, ndo_bpf, &xdp); - if (!xdp.prog_id) - return; + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + WARN_ON(ndo_bpf(dev, &xdp)); + if (xdp.prog_id) + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, + NULL)); - /* Program removal should always succeed */ - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); + /* Remove HW offload */ + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG_HW; + if (!ndo_bpf(dev, &xdp) && xdp.prog_id) + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, + NULL)); } /** @@ -7296,12 +7301,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; + enum bpf_netdev_command query; struct bpf_prog *prog = NULL; bpf_op_t bpf_op, bpf_chk; int err; ASSERT_RTNL(); + query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG; + bpf_op = bpf_chk = ops->ndo_bpf; if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; @@ -7311,10 +7319,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_chk = generic_xdp_install; if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) + if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) || + __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op)) + __dev_xdp_query(dev, bpf_op, query)) return -EBUSY; prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b783ac9bae8c..46c0eae20d96 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -881,7 +881,7 @@ static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ - nla_total_size(4) + /* XDP_PROG_ID */ + nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ nla_total_size(4); /* XDP_<mode>_PROG_ID */ return xdp_size; @@ -1232,37 +1232,57 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) return 0; } -static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) +static u32 rtnl_xdp_prog_skb(struct net_device *dev) { - const struct net_device_ops *ops = dev->netdev_ops; const struct bpf_prog *generic_xdp_prog; - struct netdev_bpf xdp; ASSERT_RTNL(); - *prog_id = 0; generic_xdp_prog = rtnl_dereference(dev->xdp_prog); - if (generic_xdp_prog) { - *prog_id = generic_xdp_prog->aux->id; - return XDP_ATTACHED_SKB; - } - if (!ops->ndo_bpf) - return XDP_ATTACHED_NONE; + if (!generic_xdp_prog) + return 0; + return generic_xdp_prog->aux->id; +} - __dev_xdp_query(dev, ops->ndo_bpf, &xdp); - if (!xdp.prog_id) - return XDP_ATTACHED_NONE; +static u32 rtnl_xdp_prog_drv(struct net_device *dev) +{ + return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG); +} - *prog_id = xdp.prog_id; - if (xdp.prog_flags &
XDP_FLAGS_HW_MODE) - return XDP_ATTACHED_HW; - return XDP_ATTACHED_DRV; +static u32 rtnl_xdp_prog_hw(struct net_device *dev) +{ + return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, + XDP_QUERY_PROG_HW); +} + +static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, + u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, + u32 (*get_prog_id)(struct net_device *dev)) +{ + u32 curr_id; + int err; + + curr_id = get_prog_id(dev); + if (!curr_id) + return 0; + + *prog_id = curr_id; + err = nla_put_u32(skb, attr, curr_id); + if (err) + return err; + + if (*mode != XDP_ATTACHED_NONE) + *mode = XDP_ATTACHED_MULTI; + else + *mode = tgt_mode; + + return 0; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { - u32 prog_attr, prog_id; struct nlattr *xdp; + u32 prog_id; int err; u8 mode; @@ -1270,35 +1290,26 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) if (!xdp) return -EMSGSIZE; - mode = rtnl_xdp_attached_mode(dev, &prog_id); + prog_id = 0; + mode = XDP_ATTACHED_NONE; + if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, + IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb)) + goto err_cancel; + if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, + IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv)) + goto err_cancel; + if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, + IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw)) + goto err_cancel; + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; - if (prog_id) { + if (prog_id && mode != XDP_ATTACHED_MULTI) { err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; - - switch (mode) { - case XDP_ATTACHED_DRV: - prog_attr = IFLA_XDP_DRV_PROG_ID; - break; - case XDP_ATTACHED_SKB: - prog_attr = IFLA_XDP_SKB_PROG_ID; - break; - case XDP_ATTACHED_HW: - prog_attr = IFLA_XDP_HW_PROG_ID; - break; - case XDP_ATTACHED_NONE: - default: - err = -EINVAL; - goto err_cancel; - } - - err = nla_put_u32(skb, prog_attr, prog_id); - if (err) - goto err_cancel; } nla_nest_end(skb, xdp); From 9b5606f1f8b8473b3d7cb55f7b07fc94111ddb71 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 30 Jul 2018 20:43:53 -0700 Subject: [PATCH 1197/1640] UPSTREAM: xsk: don't allow umem replace at stack level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently drivers have to check if they already have a umem installed for a given queue and return an error if so. Make better use of XDP_QUERY_XSK_UMEM and move this functionality to the core. We need to keep rtnl across the calls now. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Björn Töpel Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 7 ++++--- net/xdp/xdp_umem.c | 37 ++++++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b9566202d38a..fa403396f850 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -840,10 +840,10 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; - /* XDP_SETUP_XSK_UMEM */ + /* XDP_QUERY_XSK_UMEM, XDP_SETUP_XSK_UMEM */ struct { - struct xdp_umem *umem; - u16 queue_id; + struct xdp_umem *umem; /* out for query */ + u16 queue_id; /* in for query */ } xsk; }; }; @@ -3445,6 +3445,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, enum bpf_netdev_command cmd); +int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index c199d66b5f3f..911ca6d3cb5a 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include "xdp_umem.h" #include "xsk_queue.h" @@ -40,6 +42,21 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) } } +int xdp_umem_query(struct net_device *dev, u16 queue_id) +{ + struct netdev_bpf bpf; + + ASSERT_RTNL(); + + memset(&bpf, 0, sizeof(bpf)); + bpf.command = XDP_QUERY_XSK_UMEM; + bpf.xsk.queue_id = queue_id; + + if (!dev->netdev_ops->ndo_bpf) + return 0; + return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem; +} + int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u32 queue_id, u16 flags) { @@ -62,28 +79,30 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, bpf.command = XDP_QUERY_XSK_UMEM; rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); - - if (err) - return force_zc ? -ENOTSUPP : 0; + err = xdp_umem_query(dev, queue_id); + if (err) { + err = err < 0 ? -ENOTSUPP : -EBUSY; + goto err_rtnl_unlock; + } bpf.command = XDP_SETUP_XSK_UMEM; bpf.xsk.umem = umem; bpf.xsk.queue_id = queue_id; - rtnl_lock(); err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); - if (err) - return force_zc ? err : 0; /* fail or fallback */ + goto err_rtnl_unlock; + rtnl_unlock(); dev_hold(dev); umem->dev = dev; umem->queue_id = queue_id; umem->zc = true; return 0; + +err_rtnl_unlock: + rtnl_unlock(); + return force_zc ? err : 0; /* fail or fallback */ } static void xdp_umem_clear_dev(struct xdp_umem *umem) From 1ab9498b163deef7337161d6cbf950a34876e2bd Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Mon, 20 Aug 2018 09:54:25 +0900 Subject: [PATCH 1198/1640] UPSTREAM: xsk: fix return value of xdp_umem_assign_dev() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit s/ENOTSUPP/EOPNOTSUPP/ in function xdp_umem_assign_dev(). This function's return value is directly returned by xsk_bind(). EOPNOTSUPP is a valid return value for bind().
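As a quick illustration of why the distinction matters (a stand-alone user-space sketch, not part of the patch): EOPNOTSUPP is a real <errno.h> value that strerror() can describe, while ENOTSUPP (524) is kernel-internal and unknown to user space:

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* EOPNOTSUPP is a documented errno for socket calls. */
	printf("EOPNOTSUPP (%d): %s\n", EOPNOTSUPP, strerror(EOPNOTSUPP));

	/* ENOTSUPP (524) has no user-space definition, so a leaked
	 * value prints as an unknown error.
	 */
	printf("524: %s\n", strerror(524));
	return 0;
}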
Fixes: f734607e819b ("xsk: refactor xdp_umem_assign_dev()") Signed-off-by: Prashant Bhole Acked-by: Song Liu Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 911ca6d3cb5a..bfe2dbea480b 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -74,14 +74,14 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, return 0; if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit) - return force_zc ? -ENOTSUPP : 0; /* fail or fallback */ + return force_zc ? -EOPNOTSUPP : 0; /* fail or fallback */ bpf.command = XDP_QUERY_XSK_UMEM; rtnl_lock(); err = xdp_umem_query(dev, queue_id); if (err) { - err = err < 0 ? -ENOTSUPP : -EBUSY; + err = err < 0 ? -EOPNOTSUPP : -EBUSY; goto err_rtnl_unlock; } From bb78abc2d9df1b7e005fef9f0d5e7d3fa980302d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 13 Mar 2019 15:15:49 +0100 Subject: [PATCH 1199/1640] UPSTREAM: xsk: fix umem memory leak on cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 044175a06706d516aa42874bb44dbbfc3c4d20eb ] When the umem is cleaned up, the task that created it might already be gone. If the task was gone, the xdp_umem_release function did not free the pages member of struct xdp_umem. It turned out that the task lookup was not needed at all; The code was a left-over when we moved from task accounting to user accounting [1]. This patch fixes the memory leak by removing the task lookup logic completely. [1] https://lore.kernel.org/netdev/20180131135356.19134-3-bjorn.topel@gmail.com/ Link: https://lore.kernel.org/netdev/c1cb2ca8-6a14-3980-8672-f3de0bb38dfd@suse.cz/ Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Reported-by: Jiri Slaby Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin (Microsoft) --- include/net/xdp_sock.h | 1 - net/xdp/xdp_umem.c | 19 +------------------ 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 7161856bcf9c..c2c10cc9ffa0 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -34,7 +34,6 @@ struct xdp_umem { u32 headroom; u32 chunk_size_nohr; struct user_struct *user; - struct pid *pid; unsigned long address; refcount_t users; struct work_struct work; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index bfe2dbea480b..a3b037fbfecd 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -152,9 +152,6 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem) static void xdp_umem_release(struct xdp_umem *umem) { - struct task_struct *task; - struct mm_struct *mm; - xdp_umem_clear_dev(umem); if (umem->fq) { @@ -169,21 +166,10 @@ static void xdp_umem_release(struct xdp_umem *umem) xdp_umem_unpin_pages(umem); - task = get_pid_task(umem->pid, PIDTYPE_PID); - put_pid(umem->pid); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); - if (!mm) - goto out; - - mmput(mm); kfree(umem->pages); umem->pages = NULL; xdp_umem_unaccount_pages(umem); -out: kfree(umem); } @@ -312,7 +298,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (size_chk < 0) return -EINVAL; - umem->pid = get_task_pid(current, PIDTYPE_PID); umem->address = (unsigned long)addr; umem->props.chunk_mask = ~((u64)chunk_size - 1); umem->props.size = size; @@ -328,7 +313,7 @@ static int 
xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) err = xdp_umem_account_pages(umem); if (err) - goto out; + return err; err = xdp_umem_pin_pages(umem); if (err) @@ -347,8 +332,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) out_account: xdp_umem_unaccount_pages(umem); -out: - put_pid(umem->pid); return err; } From 9e373c3882fbaa56d5aa0d1e8b668c2592dd8a87 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 25 Jun 2019 11:23:52 -0700 Subject: [PATCH 1200/1640] UPSTREAM: xsk: Properly terminate assignment in xskq_produce_flush_desc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit f7019b7b0ad14bde732b8953161994edfc384953 ] Clang warns: In file included from net/xdp/xsk_queue.c:10: net/xdp/xsk_queue.h:292:2: warning: expression result unused [-Wunused-value] WRITE_ONCE(q->ring->producer, q->prod_tail); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/compiler.h:284:6: note: expanded from macro 'WRITE_ONCE' __u.__val; \ ~~~ ^~~~~ 1 warning generated. The q->prod_tail assignment has a comma at the end, not a semi-colon. Fix that so clang no longer warns and everything works as expected. Fixes: c497176cb2e4 ("xsk: add Rx receive functions and poll support") Link: https://github.com/ClangBuiltLinux/linux/issues/544 Signed-off-by: Nathan Chancellor Acked-by: Nick Desaulniers Acked-by: Jonathan Lemon Acked-by: Björn Töpel Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- net/xdp/xsk_queue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 8a64b150be54..fe96c0d039f2 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -239,7 +239,7 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) /* Order producer and data */ smp_wmb(); - q->prod_tail = q->prod_head, + q->prod_tail = q->prod_head; WRITE_ONCE(q->ring->producer, q->prod_tail); } From 7cd75f2b9081263f04c180618c20900a9316074a Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Thu, 15 Aug 2019 23:56:35 +0300 Subject: [PATCH 1201/1640] UPSTREAM: xdp: unpin xdp umem pages in error path [ Upstream commit fb89c39455e4b49881c5a42761bd71f03d3ef888 ] Fix mem leak caused by missed unpin routine for umem pages. 
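The fix below restores the usual goto-unwind rule: each failure jumps to a label that releases everything acquired so far, in reverse order. A minimal stand-alone sketch of the pattern, with made-up step names rather than the real umem helpers:

#include <stdlib.h>

static int account(void) { return 0; }	/* stand-in for xdp_umem_account_pages() */
static void unaccount(void) { }
static int pin(void) { return 0; }	/* stand-in for xdp_umem_pin_pages() */
static void unpin(void) { }

static int setup(void)
{
	void *pages;
	int err;

	err = account();
	if (err)
		return err;
	err = pin();
	if (err)
		goto out_account;

	pages = calloc(8, sizeof(void *));	/* analogue of the kcalloc() call */
	if (!pages) {
		err = -1;
		goto out_pin;	/* unwind the most recent step first */
	}
	free(pages);
	return 0;

out_pin:
	unpin();
out_account:
	unaccount();
	return err;
}

int main(void) { return setup() ? 1 : 0; }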
Fixes: 8aef7340ae9695 ("xsk: introduce xdp_umem_page") Signed-off-by: Ivan Khoronzhuk Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- net/xdp/xdp_umem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index a3b037fbfecd..8cab91c482ff 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -322,7 +322,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL); if (!umem->pages) { err = -ENOMEM; - goto out_account; + goto out_pin; } for (i = 0; i < umem->npgs; i++) @@ -330,6 +330,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return 0; +out_pin: + xdp_umem_unpin_pages(umem); out_account: xdp_umem_unaccount_pages(umem); return err; From 40216a787547a9784666a3b0409afb9b70ba1883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 5 Oct 2018 13:25:15 +0200 Subject: [PATCH 1202/1640] UPSTREAM: xsk: proper AF_XDP socket teardown ordering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 541d7fdd7694560404c502f64298a90ffe017e6b ] The AF_XDP socket struct can exist in three different, implicit states: setup, bound and released. Setup is before the socket has been bound to a device. Bound is when the socket is active for receive and send. Released is when the process/userspace side of the socket is released, but the sock object is still lingering, e.g. when there is a reference to the socket in an XSKMAP after process termination. The Rx fast-path code uses the "dev" member of struct xdp_sock to check whether a socket is bound or released, and the Tx code uses the struct xdp_umem "xsk_list" member in conjunction with "dev" to determine the state of a socket. However, the transition from bound to released did not tear the socket down in the correct order. On the Rx side "dev" was cleared after synchronize_net(), making the synchronization useless. On the Tx side, the internal queues were destroyed prior to removing them from the "xsk_list". This commit corrects the cleanup order, and by doing so xdp_del_sk_umem() can be simplified and one synchronize_net() can be removed.
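The ordering rule being restored is the generic unpublish/wait/free sequence. A compressed user-space sketch (quiesce() is a stub standing in for synchronize_net(); the struct is invented for illustration):

#include <stddef.h>

struct xsk_sketch { int queues; };

static struct xsk_sketch *published;	/* plays the role of xsk_list / xs->dev */

static void quiesce(void) { /* wait for in-flight readers, as synchronize_net() does */ }

static void release(struct xsk_sketch *xs)
{
	published = NULL;	/* 1. unpublish: list_del_rcu(), xs->dev = NULL */
	quiesce();		/* 2. wait: no reader can still see the socket */
	xs->queues = 0;		/* 3. free: destroy rx/tx queues, dev_put() */
}

int main(void)
{
	struct xsk_sketch xs = { .queues = 2 };

	published = &xs;
	release(&xs);
	return 0;
}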
Fixes: 965a99098443 ("xsk: add support for bind for Rx") Fixes: ac98d8aab61b ("xsk: wire upp Tx zero-copy functions") Reported-by: Jesper Dangaard Brouer Signed-off-by: Björn Töpel Acked-by: Song Liu Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- net/xdp/xdp_umem.c | 11 +++-------- net/xdp/xsk.c | 13 ++++++++----- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 8cab91c482ff..d9117ab035f7 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -32,14 +32,9 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) { unsigned long flags; - if (xs->dev) { - spin_lock_irqsave(&umem->xsk_list_lock, flags); - list_del_rcu(&xs->list); - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); - - if (umem->zc) - synchronize_net(); - } + spin_lock_irqsave(&umem->xsk_list_lock, flags); + list_del_rcu(&xs->list); + spin_unlock_irqrestore(&umem->xsk_list_lock, flags); } int xdp_umem_query(struct net_device *dev, u16 queue_id) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 661504042d30..ff15207036dc 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -343,12 +343,18 @@ static int xsk_release(struct socket *sock) local_bh_enable(); if (xs->dev) { + struct net_device *dev = xs->dev; + /* Wait for driver to stop using the xdp socket. */ - synchronize_net(); - dev_put(xs->dev); + xdp_del_sk_umem(xs->umem, xs); xs->dev = NULL; + synchronize_net(); + dev_put(dev); } + xskq_destroy(xs->rx); + xskq_destroy(xs->tx); + sock_orphan(sk); sock->sk = NULL; @@ -707,9 +713,6 @@ static void xsk_destruct(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) return; - xskq_destroy(xs->rx); - xskq_destroy(xs->tx); - xdp_del_sk_umem(xs->umem, xs); xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); From a35f9181045a20c05770ff7c6c2980c972389709 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 8 Feb 2019 14:13:50 +0100 Subject: [PATCH 1203/1640] UPSTREAM: xsk: add missing smp_rmb() in xsk_mmap [ Upstream commit e6762c8bcf982821935a2b1cb33cf8335d0eefae ] All the setup code in AF_XDP is protected by a mutex with the exception of the mmap code that cannot use it. To make sure that a process banging on the mmap call at the same time as another process is setting up the socket sees fully initialized data, smp_wmb() calls were added in the umem registration code and the queue creation code, so that the structures published for xsk_mmap would be consistent. However, the corresponding smp_rmb() calls were not added to the xsk_mmap code. This patch adds these calls.
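The barrier pairing follows the standard publish/subscribe pattern: initialize, write barrier, publish on one side; load, read barrier, dereference on the other. A user-space C11 sketch of the pairing, with release/acquire fences standing in for smp_wmb()/smp_rmb():

#include <stdatomic.h>
#include <stddef.h>

struct ring { int entries; };

static _Atomic(struct ring *) shared;	/* the published pointer, e.g. umem->fq */

static void setup_side(struct ring *r)
{
	r->entries = 64;				/* initialize first */
	atomic_thread_fence(memory_order_release);	/* smp_wmb() analogue */
	atomic_store_explicit(&shared, r, memory_order_relaxed);
}

static int mmap_side(void)
{
	struct ring *r = atomic_load_explicit(&shared, memory_order_relaxed);

	if (!r)
		return -1;
	atomic_thread_fence(memory_order_acquire);	/* smp_rmb() analogue */
	return r->entries;				/* now safe to dereference */
}

int main(void)
{
	static struct ring r;

	setup_side(&r);
	return mmap_side() == 64 ? 0 : 1;
}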
Fixes: 37b076933a8e3 ("xsk: add missing write- and data-dependency barrier") Fixes: c0c77d8fb787c ("xsk: add user memory registration support sockopt") Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- net/xdp/xsk.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ff15207036dc..547fc4554b22 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -661,6 +661,8 @@ static int xsk_mmap(struct file *file, struct socket *sock, if (!umem) return -EINVAL; + /* Matches the smp_wmb() in XDP_UMEM_REG */ + smp_rmb(); if (offset == XDP_UMEM_PGOFF_FILL_RING) q = READ_ONCE(umem->fq); else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) @@ -670,6 +672,8 @@ static int xsk_mmap(struct file *file, struct socket *sock, if (!q) return -EINVAL; + /* Matches the smp_wmb() in xsk_init_queue */ + smp_rmb(); qpg = virt_to_head_page(q->ring); if (size > (PAGE_SIZE << compound_order(qpg))) return -EINVAL; From e89f8f2fa81e14e0e928be2ab33c04f688dd8650 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 4 Jul 2019 17:25:03 +0300 Subject: [PATCH 1204/1640] UPSTREAM: xdp: fix possible cq entry leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 675716400da6f15b9d3db04ef74ee74ca9a00af3 ] A completion queue address reservation could not be undone. In case of a bad 'queue_id' or an skb allocation failure, the reserved entry would be leaked, reducing the total capacity of the completion queue. Fix that by moving the reservation to the point where failure is not possible. Additionally, the 'queue_id' check is moved out of the loop, since there is no point in checking it there. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Ilya Maximets Acked-by: Björn Töpel Tested-by: William Tu Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- net/xdp/xsk.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 547fc4554b22..c90854bc3048 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -218,6 +218,9 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, mutex_lock(&xs->mutex); + if (xs->queue_id >= xs->dev->real_num_tx_queues) + goto out; + while (xskq_peek_desc(xs->tx, &desc)) { char *buffer; u64 addr; @@ -228,12 +231,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } - if (xskq_reserve_addr(xs->umem->cq)) - goto out; - - if (xs->queue_id >= xs->dev->real_num_tx_queues) - goto out; - len = desc.len; skb = sock_alloc_send_skb(sk, len, 1, &err); if (unlikely(!skb)) { @@ -245,7 +242,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, addr = desc.addr; buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err)) { + if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) { kfree_skb(skb); goto out; } From 2ba0b40aec7eef63df0eb54ba8cdf74b6769a9ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 4 Sep 2019 13:49:10 +0200 Subject: [PATCH 1205/1640] UPSTREAM: xsk: avoid store-tearing when assigning queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 94a997637c5b562fa0ca44fca1d2cd02ec08236f ] Use WRITE_ONCE when doing the store of tx, rx, fq, and cq, to avoid potential store-tearing. These members are read outside of the control mutex in the mmap implementation.
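Store-tearing means the compiler may split a single plain store into several narrower stores, so a lockless reader can observe a half-written value. WRITE_ONCE()/READ_ONCE() force one untorn access; C11 relaxed atomics give the equivalent guarantee in user space, as in this sketch:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uintptr_t queue_ptr;	/* plays the role of xs->tx, xs->rx, ... */

static void control_path(uintptr_t p)
{
	/* analogue of WRITE_ONCE(*queue, q): one single store */
	atomic_store_explicit(&queue_ptr, p, memory_order_relaxed);
}

static uintptr_t mmap_path(void)
{
	/* analogue of READ_ONCE(): one single load, never torn */
	return atomic_load_explicit(&queue_ptr, memory_order_relaxed);
}

int main(void)
{
	control_path(0x1000);
	printf("%#lx\n", (unsigned long)mmap_path());
	return 0;
}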
Acked-by: Jonathan Lemon Fixes: 37b076933a8e ("xsk: add missing write- and data-dependency barrier") Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- net/xdp/xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c90854bc3048..b580078f04d1 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -320,7 +320,7 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, /* Make sure queue is ready before it can be seen by others */ smp_wmb(); - *queue = q; + WRITE_ONCE(*queue, q); return 0; } From 1b364c66b4e3c2a36abcbb54cf01b89c527a355b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 4 Sep 2019 13:49:11 +0200 Subject: [PATCH 1206/1640] UPSTREAM: xsk: avoid store-tearing when assigning umem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 9764f4b301c3e7eb3b75eec85b73cad449cdbb0d ] The umem member of struct xdp_sock is read outside of the control mutex, in the mmap implementation, and needs a WRITE_ONCE to avoid potential store-tearing. Acked-by: Jonathan Lemon Fixes: 423f38329d26 ("xsk: add umem fill queue support and mmap") Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Signed-off-by: Sasha Levin --- net/xdp/xsk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b580078f04d1..72caa4fb13f4 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -454,7 +454,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } xdp_get_umem(umem_xs->umem); - xs->umem = umem_xs->umem; + WRITE_ONCE(xs->umem, umem_xs->umem); sockfd_put(sock); } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { err = -EINVAL; @@ -534,7 +534,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, /* Make sure umem is ready before it can be seen by others */ smp_wmb(); - xs->umem = umem; + WRITE_ONCE(xs->umem, umem); mutex_unlock(&xs->mutex); return 0; } From 386b1b7bb3bc598cc4c7f443868bc1a4764d470b Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 21 Oct 2019 10:16:58 +0200 Subject: [PATCH 1207/1640] UPSTREAM: xsk: Fix registration of Rx-only sockets [ Upstream commit 2afd23f78f39da84937006ecd24aa664a4ab052b ] Having Rx-only AF_XDP sockets can potentially lead to a crash in the system by a NULL pointer dereference in xsk_umem_consume_tx(). This function iterates through a list of all sockets tied to a umem and checks if there are any packets to send on the Tx ring. Rx-only sockets do not have a Tx ring, so this will cause a NULL pointer dereference. This will happen if you have registered one or more Rx-only sockets to a umem and the driver is checking the Tx ring even on Rx, or if the XDP_SHARED_UMEM mode is used and there is a mix of Rx-only and other sockets tied to the same umem. Fixed by only putting sockets with a Tx component on the list that xsk_umem_consume_tx() iterates over. 
Fixes: ac98d8aab61b ("xsk: wire upp Tx zero-copy functions") Reported-by: Kal Cutter Conley Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/1571645818-16244-1-git-send-email-magnus.karlsson@intel.com Signed-off-by: Sasha Levin --- net/xdp/xdp_umem.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index d9117ab035f7..556a649512b6 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -23,6 +23,9 @@ void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) { unsigned long flags; + if (!xs->tx) + return; + spin_lock_irqsave(&umem->xsk_list_lock, flags); list_add_rcu(&xs->list, &umem->xsk_list); spin_unlock_irqrestore(&umem->xsk_list_lock, flags); @@ -32,6 +35,9 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) { unsigned long flags; + if (!xs->tx) + return; + spin_lock_irqsave(&umem->xsk_list_lock, flags); list_del_rcu(&xs->list); spin_unlock_irqrestore(&umem->xsk_list_lock, flags); From dc01e3468fa9634a9ec6531e3b418ae13b749c09 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 14 Apr 2020 09:35:15 +0200 Subject: [PATCH 1208/1640] UPSTREAM: xsk: Add missing check on user supplied headroom size [ Upstream commit 99e3a236dd43d06c65af0a2ef9cb44306aef6e02 ] Add a check that the headroom cannot be larger than the available space in the chunk. In the current code, a malicious user can set the headroom to a value larger than the chunk size minus the fixed XDP headroom. That way packets with a length larger than the supported size in the umem could get accepted and result in an out-of-bounds write. Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Reported-by: Bui Quang Minh Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://bugzilla.kernel.org/show_bug.cgi?id=207225 Link: https://lore.kernel.org/bpf/1586849715-23490-1-git-send-email-magnus.karlsson@intel.com Signed-off-by: Sasha Levin --- net/xdp/xdp_umem.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 556a649512b6..706fad12f22c 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -260,7 +260,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) u32 chunk_size = mr->chunk_size, headroom = mr->headroom; unsigned int chunks, chunks_per_page; u64 addr = mr->addr, size = mr->len; - int size_chk, err, i; + int err, i; if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: @@ -295,8 +295,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) headroom = ALIGN(headroom, 64); - size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM; - if (size_chk < 0) + if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; umem->address = (unsigned long)addr; From 24cc8de2647f346704efcbd71465c0fbe0e4e7db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 25 May 2020 10:03:59 +0200 Subject: [PATCH 1209/1640] UPSTREAM: xsk: Add overflow check for u64 division, stored into u32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b16a87d0aef7a6be766f6618976dc5ff2c689291 upstream. The npgs member of struct xdp_umem is a u32 entity, and stores the number of pages the UMEM consumes. The calculation of npgs, npgs = size / PAGE_SIZE, can overflow.
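Concretely (an illustrative stand-alone computation, not taken from the patch): a 16 TiB request with 4 KiB pages yields exactly 2^32 pages, one more than U32_MAX, so the truncated u32 reads back as zero:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t size = 1ULL << 44;		/* 16 TiB */
	uint64_t npgs64 = size / 4096;		/* 2^32 pages */
	uint32_t npgs32 = (uint32_t)npgs64;	/* silently truncates to 0 */

	printf("npgs64=%llu npgs32=%u overflows=%d\n",
	       (unsigned long long)npgs64, npgs32,
	       npgs64 > UINT32_MAX);		/* the check the patch adds */
	return 0;
}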
To avoid overflow scenarios, the division is now first stored in a u64, and the result is verified to fit into 32b. An alternative would be storing the npgs as a u64; however, this wastes memory and would be an unrealistically large packet area. Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Reported-by: "Minh Bùi Quang" Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/CACtPs=GGvV-_Yj6rbpzTVnopgi5nhMoCcTkSkYrJHGQHJWFZMQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20200525080400.13195-1-bjorn.topel@gmail.com Signed-off-by: Greg Kroah-Hartman --- net/xdp/xdp_umem.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 706fad12f22c..b87e63cb55be 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -258,8 +258,8 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { u32 chunk_size = mr->chunk_size, headroom = mr->headroom; + u64 npgs, addr = mr->addr, size = mr->len; unsigned int chunks, chunks_per_page; - u64 addr = mr->addr, size = mr->len; int err, i; if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { @@ -285,6 +285,10 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if ((addr + size) < addr) return -EINVAL; + npgs = div_u64(size, PAGE_SIZE); + if (npgs > U32_MAX) + return -EINVAL; + chunks = (unsigned int)div_u64(size, chunk_size); if (chunks == 0) return -EINVAL; @@ -303,7 +307,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->props.chunk_mask = ~((u64)chunk_size - 1); umem->props.size = size; umem->headroom = headroom; umem->chunk_size_nohr = chunk_size - headroom; - umem->npgs = size / PAGE_SIZE; + umem->npgs = (u32)npgs; umem->pgs = NULL; umem->user = NULL; INIT_LIST_HEAD(&umem->xsk_list); From 2d323dcc56a220a7a872b61024f189cce2415f03 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 11 Jun 2020 13:11:06 +0800 Subject: [PATCH 1210/1640] UPSTREAM: xdp: Fix xsk_generic_xmit errno MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit aa2cad0600ed2ca6a0ab39948d4db1666b6c962b ] Propagate the sock_alloc_send_skb error code instead of setting it to EAGAIN unconditionally when skb allocation fails, which might cause user space to loop unnecessarily.
Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Li RongQing Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1591852266-24017-1-git-send-email-lirongqing@baidu.com Signed-off-by: Sasha Levin --- net/xdp/xsk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 72caa4fb13f4..9ff2ab63e639 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -233,10 +233,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, len = desc.len; skb = sock_alloc_send_skb(sk, len, 1, &err); - if (unlikely(!skb)) { - err = -EAGAIN; + if (unlikely(!skb)) goto out; - } skb_put(skb, len); addr = desc.addr; From 0eee29fad4927902caa39f927615572867964220 Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Wed, 20 Nov 2019 01:10:42 +0100 Subject: [PATCH 1211/1640] UPSTREAM: xsk: Fix xsk_poll()'s return type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 5d946c5abbaf68083fa6a41824dd79e1f06286d8 ] xsk_poll() is defined as returning 'unsigned int' but the .poll method is declared as returning '__poll_t', a bitwise type. Fix this by using the proper return type and using the EPOLL constants instead of the POLL ones, as required for __poll_t. Signed-off-by: Luc Van Oostenryck Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20191120001042.30830-1-luc.vanoostenryck@gmail.com Signed-off-by: Sasha Levin --- net/xdp/xsk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9ff2ab63e639..6bb0649c028c 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -289,17 +289,17 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); } -static unsigned int xsk_poll(struct file *file, struct socket *sock, +static __poll_t xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { - unsigned int mask = datagram_poll(file, sock, wait); + __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); if (xs->rx && !xskq_empty_desc(xs->rx)) - mask |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; if (xs->tx && !xskq_full_desc(xs->tx)) - mask |= POLLOUT | POLLWRNORM; + mask |= EPOLLOUT | EPOLLWRNORM; return mask; } From 786371214f613a4b8235e867c9dfd09765367b13 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 19 Dec 2019 13:39:21 +0100 Subject: [PATCH 1212/1640] UPSTREAM: xsk: Simplify detection of empty and full rings [ Upstream commit 11cc2d21499cabe7e7964389634ed1de3ee91d33 ] In order to set the correct return flags for poll, the xsk code has to check if the Rx queue is empty and if the Tx queue is full. This code was unnecessarily large and complex as it used the functions that are used to update the local state from the global state (xskq_nb_free and xskq_nb_avail). Since we are not doing this nor updating any data dependent on this state, we can simplify the functions. Another benefit from this is that we can also simplify the xskq_nb_free and xskq_nb_avail functions in a later commit. 
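The simplified checks below lean on the free-running counter idiom: producer and consumer only ever increase, so unsigned subtraction yields the number of outstanding entries even across 32-bit wraparound. A small stand-alone illustration:

#include <stdint.h>
#include <stdio.h>

#define NENTRIES 8u

int main(void)
{
	/* The producer has wrapped past 2^32 while the consumer has not. */
	uint32_t producer = 3;			/* 0xfffffffb + 8 modulo 2^32 */
	uint32_t consumer = 0xfffffffb;
	uint32_t used = producer - consumer;	/* well-defined: 8 */

	printf("used=%u full=%d empty=%d\n", used,
	       used == NENTRIES, producer == consumer);
	return 0;
}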
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1576759171-28550-3-git-send-email-magnus.karlsson@intel.com Signed-off-by: Sasha Levin --- net/xdp/xsk_queue.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index fe96c0d039f2..cf7cbb5dd918 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -245,12 +245,15 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) static inline bool xskq_full_desc(struct xsk_queue *q) { - return xskq_nb_avail(q, q->nentries) == q->nentries; + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == + q->nentries; } static inline bool xskq_empty_desc(struct xsk_queue *q) { - return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries; + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); } void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); From 23ccce651696c8959c536101e97c3cab8bc8ba1a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Apr 2024 20:27:38 +0000 Subject: [PATCH 1213/1640] UPSTREAM: xsk: validate user input for XDP_{UMEM|COMPLETION}_FILL_RING MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 237f3cf13b20db183d3706d997eedc3c49eacd44 upstream. syzbot reported an illegal copy in xsk_setsockopt() [1] Make sure to validate setsockopt() @optlen parameter. [1] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in xsk_setsockopt+0x909/0xa40 net/xdp/xsk.c:1420 Read of size 4 at addr ffff888028c6cde3 by task syz-executor.0/7549 CPU: 0 PID: 7549 Comm: syz-executor.0 Not tainted 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] copy_from_sockptr include/linux/sockptr.h:55 [inline] xsk_setsockopt+0x909/0xa40 net/xdp/xsk.c:1420 do_sock_setsockopt+0x3af/0x720 net/socket.c:2311 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7fb40587de69 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fb40665a0c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00007fb4059abf80 RCX: 00007fb40587de69 RDX: 0000000000000005 RSI: 000000000000011b RDI: 0000000000000006 RBP: 00007fb4058ca47a R08: 0000000000000002 R09: 0000000000000000 R10: 0000000020001980 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007fb4059abf80 R15: 00007fff57ee4d08 Allocated by task 7549: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 
poison_kmalloc_redzone mm/kasan/common.c:370 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:387 kasan_kmalloc include/linux/kasan.h:211 [inline] __do_kmalloc_node mm/slub.c:3966 [inline] __kmalloc+0x233/0x4a0 mm/slub.c:3979 kmalloc include/linux/slab.h:632 [inline] __cgroup_bpf_run_filter_setsockopt+0xd2f/0x1040 kernel/bpf/cgroup.c:1869 do_sock_setsockopt+0x6b4/0x720 net/socket.c:2293 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 The buggy address belongs to the object at ffff888028c6cde0 which belongs to the cache kmalloc-8 of size 8 The buggy address is located 1 bytes to the right of allocated 2-byte region [ffff888028c6cde0, ffff888028c6cde2) The buggy address belongs to the physical page: page:ffffea0000a31b00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888028c6c9c0 pfn:0x28c6c anon flags: 0xfff00000000800(slab|node=0|zone=1|lastcpupid=0x7ff) page_type: 0xffffffff() raw: 00fff00000000800 ffff888014c41280 0000000000000000 dead000000000001 raw: ffff888028c6c9c0 0000000080800057 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x112cc0(GFP_USER|__GFP_NOWARN|__GFP_NORETRY), pid 6648, tgid 6644 (syz-executor.0), ts 133906047828, free_ts 133859922223 set_page_owner include/linux/page_owner.h:31 [inline] post_alloc_hook+0x1ea/0x210 mm/page_alloc.c:1533 prep_new_page mm/page_alloc.c:1540 [inline] get_page_from_freelist+0x33ea/0x3580 mm/page_alloc.c:3311 __alloc_pages+0x256/0x680 mm/page_alloc.c:4569 __alloc_pages_node include/linux/gfp.h:238 [inline] alloc_pages_node include/linux/gfp.h:261 [inline] alloc_slab_page+0x5f/0x160 mm/slub.c:2175 allocate_slab mm/slub.c:2338 [inline] new_slab+0x84/0x2f0 mm/slub.c:2391 ___slab_alloc+0xc73/0x1260 mm/slub.c:3525 __slab_alloc mm/slub.c:3610 [inline] __slab_alloc_node mm/slub.c:3663 [inline] slab_alloc_node mm/slub.c:3835 [inline] __do_kmalloc_node mm/slub.c:3965 [inline] __kmalloc_node+0x2db/0x4e0 mm/slub.c:3973 kmalloc_node include/linux/slab.h:648 [inline] __vmalloc_area_node mm/vmalloc.c:3197 [inline] __vmalloc_node_range+0x5f9/0x14a0 mm/vmalloc.c:3392 __vmalloc_node mm/vmalloc.c:3457 [inline] vzalloc+0x79/0x90 mm/vmalloc.c:3530 bpf_check+0x260/0x19010 kernel/bpf/verifier.c:21162 bpf_prog_load+0x1667/0x20f0 kernel/bpf/syscall.c:2895 __sys_bpf+0x4ee/0x810 kernel/bpf/syscall.c:5631 __do_sys_bpf kernel/bpf/syscall.c:5738 [inline] __se_sys_bpf kernel/bpf/syscall.c:5736 [inline] __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:5736 do_syscall_64+0xfb/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 page last free pid 6650 tgid 6647 stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1140 [inline] free_unref_page_prepare+0x95d/0xa80 mm/page_alloc.c:2346 free_unref_page_list+0x5a3/0x850 mm/page_alloc.c:2532 release_pages+0x2117/0x2400 mm/swap.c:1042 tlb_batch_pages_flush mm/mmu_gather.c:98 [inline] tlb_flush_mmu_free mm/mmu_gather.c:293 [inline] tlb_flush_mmu+0x34d/0x4e0 mm/mmu_gather.c:300 tlb_finish_mmu+0xd4/0x200 mm/mmu_gather.c:392 exit_mmap+0x4b6/0xd40 mm/mmap.c:3300 __mmput+0x115/0x3c0 kernel/fork.c:1345 exit_mm+0x220/0x310 kernel/exit.c:569 do_exit+0x99e/0x27e0 kernel/exit.c:865 do_group_exit+0x207/0x2c0 kernel/exit.c:1027 
get_signal+0x176e/0x1850 kernel/signal.c:2907 arch_do_signal_or_restart+0x96/0x860 arch/x86/kernel/signal.c:310 exit_to_user_mode_loop kernel/entry/common.c:105 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:201 [inline] syscall_exit_to_user_mode+0xc9/0x360 kernel/entry/common.c:212 do_syscall_64+0x10a/0x240 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Memory state around the buggy address: ffff888028c6cc80: fa fc fc fc fa fc fc fc fa fc fc fc fa fc fc fc ffff888028c6cd00: fa fc fc fc fa fc fc fc 00 fc fc fc 06 fc fc fc >ffff888028c6cd80: fa fc fc fc fa fc fc fc fa fc fc fc 02 fc fc fc ^ ffff888028c6ce00: fa fc fc fc fa fc fc fc fa fc fc fc fa fc fc fc ffff888028c6ce80: fa fc fc fc fa fc fc fc fa fc fc fc fa fc fc fc Fixes: 423f38329d26 ("xsk: add umem fill queue support and mmap") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: "Björn Töpel" Cc: Magnus Karlsson Cc: Maciej Fijalkowski Cc: Jonathan Lemon Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20240404202738.3634547-1-edumazet@google.com Signed-off-by: Jakub Kicinski [shung-hsi.yu: two additional changes not present in the original 1. Check optlen in the XDP_UMEM_REG case as well. It was added in commit c05cd36458147 ("xsk: add support to allow unaligned chunk placement") but seems like too big of a change for stable 2. copy_from_sockptr() in the context was replace copy_from_usr() because commit a7b75c5a8c414 ("net: pass a sockptr_t into ->setsockopt") was not present] Signed-off-by: Shung-Hsi Yu Signed-off-by: Greg Kroah-Hartman --- net/xdp/xsk.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 6bb0649c028c..d5a9c43930de 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -515,6 +515,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xdp_umem_reg mr; struct xdp_umem *umem; + if (optlen < sizeof(mr)) + return -EINVAL; if (copy_from_user(&mr, optval, sizeof(mr))) return -EFAULT; @@ -542,6 +544,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xsk_queue **q; int entries; + if (optlen < sizeof(entries)) + return -EINVAL; if (copy_from_user(&entries, optval, sizeof(entries))) return -EFAULT; From 9931c92728ba6938020095cac2fd0cd3778cade5 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Thu, 6 Jun 2019 13:59:40 -0700 Subject: [PATCH 1214/1640] UPSTREAM: bpf: Allow bpf_map_lookup_elem() on an xskmap Currently, the AF_XDP code uses a separate map in order to determine if an xsk is bound to a queue. Instead of doing this, have bpf_map_lookup_elem() return a xdp_sock. Rearrange some xdp_sock members to eliminate structure holes. Remove selftest - will be added back in later patch. 
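With this in place an XDP program can, for instance, check that an AF_XDP socket is really bound to the packet's queue before redirecting. A hypothetical BPF-side sketch in the legacy map-definition style of this kernel generation (map and program names are made up):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct bpf_map_def SEC("maps") xsks_map = {
	.type = BPF_MAP_TYPE_XSKMAP,
	.key_size = sizeof(int),
	.value_size = sizeof(int),
	.max_entries = 64,
};

SEC("xdp")
int xdp_sock_prog(struct xdp_md *ctx)
{
	int index = ctx->rx_queue_index;
	struct bpf_xdp_sock *xsk;

	/* The lookup now returns a struct bpf_xdp_sock pointer instead
	 * of being rejected by the verifier.
	 */
	xsk = bpf_map_lookup_elem(&xsks_map, &index);
	if (!xsk)
		return XDP_PASS;	/* no socket bound to this queue */

	if (xsk->queue_id != index)	/* queue_id is the exposed field */
		return XDP_PASS;

	return bpf_redirect_map(&xsks_map, index, 0);
}

char _license[] SEC("license") = "GPL";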
Change-Id: Icfad5e0d4f996eb52bc36e87344e57343354e834 Signed-off-by: Jonathan Lemon Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ include/net/xdp_sock.h | 4 ++-- include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/verifier.c | 26 ++++++++++++++++++++++++-- kernel/bpf/xskmap.c | 7 +++++++ net/core/filter.c | 40 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80252adb522d..1479a2d88129 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -284,6 +284,7 @@ enum bpf_reg_type { PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ + PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ }; /* The information passed from prog-specific *_is_valid_access @@ -737,6 +738,13 @@ void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); void __cpu_map_flush(struct bpf_map *map); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); +bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info); +u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index c2c10cc9ffa0..de1336dabe11 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -54,11 +54,11 @@ struct xdp_sock { struct xdp_umem *umem; struct list_head flush_node; u16 queue_id; - struct xsk_queue *tx ____cacheline_aligned_in_smp; - struct list_head list; bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; + struct xsk_queue *tx ____cacheline_aligned_in_smp; + struct list_head list; /* Mutual exclusion of NAPI TX thread and sendmsg error paths * in the SKB destructor callback. */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a18b81f8421b..59a31ea9a002 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2937,6 +2937,10 @@ struct bpf_sock_tuple { }; }; +struct bpf_xdp_sock { + __u32 queue_id; +}; + #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 469990884f5d..074c5d0c0563 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -325,7 +325,8 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) { return type == PTR_TO_SOCKET || type == PTR_TO_SOCK_COMMON || - type == PTR_TO_TCP_SOCK; + type == PTR_TO_TCP_SOCK || + type == PTR_TO_XDP_SOCK; } static bool reg_type_may_be_null(enum bpf_reg_type type) @@ -397,6 +398,7 @@ static const char * const reg_type_str[] = { [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", }; static char slot_type_char[] = { @@ -1354,6 +1356,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: return true; default: return false; @@ -1842,6 +1845,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, case PTR_TO_TCP_SOCK: valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); break; + case PTR_TO_XDP_SOCK: + valid = bpf_xdp_sock_is_valid_access(off, size, t, &info); + break; default: valid = false; } @@ -2006,6 +2012,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_TCP_SOCK: pointer_desc = "tcp_sock "; break; + case PTR_TO_XDP_SOCK: + pointer_desc = "xdp_sock "; + break; default: break; } @@ -2911,10 +2920,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, * appear. */ case BPF_MAP_TYPE_CPUMAP: - case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; + case BPF_MAP_TYPE_XSKMAP: + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) @@ -4001,6 +4014,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -5224,6 +5238,9 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (reg->map_ptr->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; reg->map_ptr = reg->map_ptr->inner_map_meta; + } else if (reg->map_ptr->map_type == + BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; } else { reg->type = PTR_TO_MAP_VALUE; } @@ -6506,6 +6523,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -6890,6 +6908,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: return false; default: return true; @@ -8026,6 +8045,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_TCP_SOCK: convert_ctx_access = bpf_tcp_sock_convert_ctx_access; break; + case PTR_TO_XDP_SOCK: + convert_ctx_access = bpf_xdp_sock_convert_ctx_access; + break; default: continue; } diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 413d75f4fc72..ef7338cebd18 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -151,6 +151,12 @@ void __xsk_map_flush(struct bpf_map *map) } static void 
*xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return __xsk_map_lookup_elem(map, *(u32 *)key); +} + +static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } @@ -218,6 +224,7 @@ const struct bpf_map_ops xsk_map_ops = { .map_free = xsk_map_free, .map_get_next_key = xsk_map_get_next_key, .map_lookup_elem = xsk_map_lookup_elem, + .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, diff --git a/net/core/filter.c b/net/core/filter.c index eae293d6ee2d..0529b7ef022f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5252,6 +5252,46 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) return INET_ECN_set_ce(skb); } +bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + default: + return size == sizeof(__u32); + } +} + +u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_XDP_SOCK_GET(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) > \ + FIELD_SIZEOF(struct bpf_xdp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct xdp_sock, FIELD)); \ + } while (0) + + switch (si->off) { + case offsetof(struct bpf_xdp_sock, queue_id): + BPF_XDP_SOCK_GET(queue_id); + break; + } + + return insn - insn_buf; +} + static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .func = bpf_skb_ecn_set_ce, .gpl_only = false, From 0373fd40fde33852865bd3e263bd66c47c5bd077 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Jun 2019 16:22:18 -0700 Subject: [PATCH 1215/1640] BACKPORT: sysctl: define proc_do_static_key() Convert proc_dointvec_minmax_bpf_stats() into a more generic helper, since we are going to use jump labels more often. Note that sysctl_bpf_stats_enabled is removed, since it is no longer needed/used. Change-Id: I701cd21310062f420120c88baab2b0ea9c4405fe Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 - include/linux/sysctl.h | 3 +++ kernel/bpf/core.c | 1 - kernel/sysctl.c | 42 ++++++++++++++++++++++++------------------ 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1479a2d88129..d1326e64a4ac 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -672,7 +672,6 @@ void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); extern int sysctl_unprivileged_bpf_disabled; -extern int sysctl_bpf_stats_enabled; int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 02c9b18bbbf7..9ba832445964 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -66,6 +66,9 @@ extern int proc_do_large_bitmap(struct ctl_table *, int, extern int proc_douintvec_capacity(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int proc_do_static_key(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); /* * Register a set of sysctl names by calling register_sysctl_table diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a15a38492ec9..f96675683936 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2133,7 +2133,6 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); -int sysctl_bpf_stats_enabled __read_mostly; /* All definitions of tracepoints related to BPF. */ #define CREATE_TRACE_POINTS diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 16d4b91cd991..2afaec84bbc0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -234,9 +234,6 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif -static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ @@ -1433,12 +1430,10 @@ static struct ctl_table kern_table[] = { #endif { .procname = "bpf_stats_enabled", - .data = &sysctl_bpf_stats_enabled, - .maxlen = sizeof(sysctl_bpf_stats_enabled), + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), .mode = 0644, - .proc_handler = proc_dointvec_minmax_bpf_stats, - .extra1 = &zero, - .extra2 = &one, + .proc_handler = proc_do_static_key, }, #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) { @@ -3545,27 +3540,38 @@ int proc_douintvec_capacity(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ -static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +#if defined(CONFIG_SYSCTL) +int proc_do_static_key(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) { - int ret, bpf_stats = *(int *)table->data; - struct ctl_table tmp = *table; + struct static_key *key = (struct static_key *)table->data; + static DEFINE_MUTEX(static_key_mutex); + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = &zero, + .extra2 = &one, + }; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; - tmp.data = &bpf_stats; + mutex_lock(&static_key_mutex); + val = 
static_key_enabled(key);
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 	if (write && !ret) {
-		*(int *)table->data = bpf_stats;
-		if (bpf_stats)
-			static_branch_enable(&bpf_stats_enabled_key);
+		if (val)
+			static_key_enable(key);
 		else
-			static_branch_disable(&bpf_stats_enabled_key);
+			static_key_disable(key);
 	}
+	mutex_unlock(&static_key_mutex);
 	return ret;
 }
+#endif
 
 /*
  * No sense putting this after each symbol definition, twice,

From 11c5c355445545911b62d5dd5995a510f555364d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Sat, 15 Jun 2019 12:12:20 -0700
Subject: [PATCH 1216/1640] UPSTREAM: bpf: introduce bounded loops

Allow the verifier to validate the loops by simulating their execution.
Existing programs have used '#pragma unroll' to unroll the loops by the
compiler. Instead let the verifier simulate all iterations of the loop.
In order to do that introduce parentage chain of bpf_verifier_state and
'branches' counter for the number of branches left to explore.
See more detailed algorithm description in bpf_verifier.h

This algorithm borrows the key idea from Edward Cree's approach:
https://patchwork.ozlabs.org/patch/877222/
Additional state pruning heuristics make such brute force loop walk
practical even for large loops.

Change-Id: Ia586b471755764f7f819d86c9da14d41e074c755
Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
---
 include/linux/bpf_verifier.h |  51 ++++++++++++-
 kernel/bpf/verifier.c        | 143 ++++++++++++++++++++++++++++++++---
 2 files changed, 181 insertions(+), 13 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e7ea144d4f79..a48d100f215c 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -201,6 +201,53 @@ struct bpf_id_pair {
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
+	struct bpf_verifier_state *parent;
+	/*
+	 * 'branches' field is the number of branches left to explore:
+	 * 0 - all possible paths from this state reached bpf_exit or
+	 *     were safely pruned
+	 * 1 - at least one path is being explored.
+	 *     This state hasn't reached bpf_exit
+	 * 2 - at least two paths are being explored.
+	 *     This state is an immediate parent of two children.
+	 *     One is fallthrough branch with branches==1 and another
+	 *     state is pushed into stack (to be explored later) also with
+	 *     branches==1. The parent of this state has branches==1.
+	 *     The verifier state tree connected via 'parent' pointer looks like:
+	 *     1
+	 *     1
+	 *     2 -> 1 (first 'if' pushed into stack)
+	 *     1
+	 *     2 -> 1 (second 'if' pushed into stack)
+	 *     1
+	 *     1
+	 *     1 bpf_exit.
+	 *
+	 * Once do_check() reaches bpf_exit, it calls update_branch_counts()
+	 * and the verifier state tree will look:
+	 *     1
+	 *     1
+	 *     2 -> 1 (first 'if' pushed into stack)
+	 *     1
+	 *     1 -> 1 (second 'if' pushed into stack)
+	 *     0
+	 *     0
+	 *     0 bpf_exit.
+	 * After pop_stack() the do_check() will resume at second 'if'.
+	 *
+	 * If is_state_visited() sees a state with branches > 0 it means
+	 * there is a loop. If such state is exactly equal to the current state
+	 * it's an infinite loop. Note states_equal() checks for states
+	 * equivalency, so two states being 'states_equal' does not mean
+	 * infinite loop. The exact comparison is provided by
+	 * states_maybe_looping() function. It's a stronger pre-check and
+	 * much faster than states_equal().
+	 *
+	 * This algorithm may not find all possible infinite loops or
+	 * loop iteration count may be too high.
+ * In such cases BPF_COMPLEXITY_LIMIT_INSNS limit kicks in. + */ + u32 branches; u32 insn_idx; u32 curframe; u32 active_spin_lock; @@ -322,7 +369,9 @@ struct bpf_verifier_env { } cfg; u32 subprog_cnt; /* number of instructions analyzed by the verifier */ - u32 insn_processed; + u32 prev_insn_processed, insn_processed; + /* number of jmps, calls, exits analyzed so far */ + u32 prev_jmps_processed, jmps_processed; /* total verification time */ u64 verification_time; /* maximum number of verifier states kept in 'branching' instructions */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 074c5d0c0563..9210a40673d4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -712,6 +712,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; dst_state->active_spin_lock = src->active_spin_lock; + dst_state->branches = src->branches; + dst_state->parent = src->parent; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -727,6 +729,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return 0; } +static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + while (st) { + u32 br = --st->branches; + + /* WARN_ON(br > 1) technically makes sense here, + * but see comment in push_stack(), hence: + */ + WARN_ONCE((int)br < 0, + "BUG update_branch_counts:branches_to_explore=%d\n", + br); + if (br) + break; + st = st->parent; + } +} + static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx) { @@ -780,6 +799,18 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->stack_size); goto err; } + if (elem->st.parent) { + ++elem->st.parent->branches; + /* WARN_ON(branches > 2) technically makes sense here, + * but + * 1. speculative states will bump 'branches' for non-branch + * instructions + * 2. is_state_visited() heuristics may decide not to create + * a new state for a sequence of branches and all such current + * and cloned states will be pointing to a single parent state + * which might have large 'branches' count. 
+ */ + } return &elem->st; err: free_verifier_state(env->cur_state, true); @@ -5850,7 +5881,8 @@ static void init_explored_state(struct bpf_verifier_env *env, int idx) * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, + bool loop_ok) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; @@ -5880,6 +5912,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + if (loop_ok && env->allow_ptr_leaks) + return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); @@ -5931,7 +5965,7 @@ peek_stack: if (opcode == BPF_EXIT) { goto mark_explored; } else if (opcode == BPF_CALL) { - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5940,7 +5974,8 @@ peek_stack: init_explored_state(env, t + 1); if (insns[t].src_reg == BPF_PSEUDO_CALL) { init_explored_state(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, + env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5953,7 +5988,7 @@ peek_stack: } /* unconditional jump with single edge */ ret = push_insn(t, t + insns[t].off + 1, - FALLTHROUGH, env); + FALLTHROUGH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5966,13 +6001,13 @@ peek_stack: } else { /* conditional jump with two edges */ init_explored_state(env, t); - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) goto err_free; - ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); + ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5982,7 +6017,7 @@ peek_stack: /* all other non-branch instructions with single * fall-through edge */ - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -6408,6 +6443,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, sl = *explored_state(env, insn); while (sl) { + if (sl->state.branches) + goto next; if (sl->state.insn_idx != insn || sl->state.curframe != cur->curframe) goto next; @@ -6764,12 +6801,32 @@ static int propagate_liveness(struct bpf_verifier_env *env, return 0; } +static bool states_maybe_looping(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + struct bpf_func_state *fold, *fcur; + int i, fr = cur->curframe; + + if (old->curframe != fr) + return false; + + fold = old->frame[fr]; + fcur = cur->frame[fr]; + for (i = 0; i < MAX_BPF_REG; i++) + if (memcmp(&fold->regs[i], &fcur->regs[i], + offsetof(struct bpf_reg_state, parent))) + return false; + return true; +} + + static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; + bool add_new_state = false; if (!env->insn_aux_data[insn_idx].prune_point) /* this 'insn_idx' instruction wasn't marked, so we will not @@ -6777,6 +6834,18 @@ static int is_state_visited(struct bpf_verifier_env *env, 
int insn_idx) */ return 0; + /* bpf progs typically have pruning point every 4 instructions + * http://vger.kernel.org/bpfconf2019.html#session-1 + * Do not add new state for future pruning if the verifier hasn't seen + * at least 2 jumps and at least 8 instructions. + * This heuristics helps decrease 'total_states' and 'peak_states' metric. + * In tests that amounts to up to 50% reduction into total verifier + * memory consumption and 20% verifier time speedup. + */ + if (env->jmps_processed - env->prev_jmps_processed >= 2 && + env->insn_processed - env->prev_insn_processed >= 8) + add_new_state = true; + pprev = explored_state(env, insn_idx); sl = *pprev; @@ -6786,6 +6855,30 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) states_cnt++; if (sl->state.insn_idx != insn_idx) goto next; + if (sl->state.branches) { + if (states_maybe_looping(&sl->state, cur) && + states_equal(env, &sl->state, cur)) { + verbose_linfo(env, insn_idx, "; "); + verbose(env, "infinite loop detected at insn %d\n", insn_idx); + return -EINVAL; + } + /* if the verifier is processing a loop, avoid adding new state + * too often, since different loop iterations have distinct + * states and may not help future pruning. + * This threshold shouldn't be too low to make sure that + * a loop with large bound will be rejected quickly. + * The most abusive loop will be: + * r1 += 1 + * if r1 < 1000000 goto pc-2 + * 1M insn_procssed limit / 100 == 10k peak states. + * This threshold shouldn't be too high either, since states + * at the end of the loop are likely to be useful in pruning. + */ + if (env->jmps_processed - env->prev_jmps_processed < 20 && + env->insn_processed - env->prev_insn_processed < 100) + add_new_state = false; + goto miss; + } if (states_equal(env, &sl->state, cur)) { sl->hit_cnt++; /* reached equivalent register/stack state, @@ -6803,7 +6896,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return err; return 1; } - sl->miss_cnt++; +miss: + /* when new state is not going to be added do not increase miss count. + * Otherwise several loop iterations will remove the state + * recorded earlier. The goal of these heuristics is to have + * states from some iterations of the loop (some in the beginning + * and some at the end) to help pruning. + */ + if (add_new_state) + sl->miss_cnt++; /* heuristic to determine whether this state is beneficial * to keep checking from state equivalence point of view. * Higher numbers increase max_states_per_insn and verification time, @@ -6815,6 +6916,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) */ *pprev = sl->next; if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + u32 br = sl->state.branches; + + WARN_ONCE(br, + "BUG live_done but branches_to_explore %d\n", + br); free_verifier_state(&sl->state, false); kfree(sl); env->peak_states--; @@ -6840,18 +6946,25 @@ next: if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return 0; - /* there were no equivalent states, remember current one. - * technically the current state is not proven to be safe yet, + if (!add_new_state) + return 0; + + /* There were no equivalent states, remember the current one. + * Technically the current state is not proven to be safe yet, * but it will either reach outer most bpf_exit (which means it's safe) - * or it will be rejected. Since there are no loops, we won't be + * or it will be rejected. 
When there are no loops the verifier won't be * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) - * again on the way to bpf_exit + * again on the way to bpf_exit. + * When looping the sl->state.branches will be > 0 and this state + * will not be considered for equivalence until branches == 0. */ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; env->total_states++; env->peak_states++; + env->prev_jmps_processed = env->jmps_processed; + env->prev_insn_processed = env->insn_processed; /* add new state to the head of linked list */ new = &new_sl->state; @@ -6862,6 +6975,9 @@ next: return err; } new->insn_idx = insn_idx; + WARN_ONCE(new->branches != 1, + "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); + cur->parent = new; new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -6948,6 +7064,7 @@ static int do_check(struct bpf_verifier_env *env) return -ENOMEM; state->curframe = 0; state->speculative = false; + state->branches = 1; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); @@ -7154,6 +7271,7 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_JMP || class == BPF_JMP32) { u8 opcode = BPF_OP(insn->code); + env->jmps_processed++; if (opcode == BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || @@ -7239,6 +7357,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: + update_branch_counts(env, env->cur_state); err = pop_stack(env, &env->prev_insn_idx, &env->insn_idx); if (err < 0) { From 55e13de05c50a15d31297d18c98a5e3daa5be9e7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 15 Jun 2019 12:12:21 -0700 Subject: [PATCH 1217/1640] UPSTREAM: bpf: fix callees pruning callers The commit 7640ead93924 partially resolved the issue of callees incorrectly pruning the callers. With introduction of bounded loops and jmps_processed heuristic single verifier state may contain multiple branches and calls. It's possible that new verifier state (for future pruning) will be allocated inside callee. Then callee will exit (still within the same verifier state). It will go back to the caller and there R6-R9 registers will be read and will trigger mark_reg_read. But the reg->live for all frames but the top frame is not set to LIVE_NONE. Hence mark_reg_read will fail to propagate liveness into parent and future walking will incorrectly conclude that the states are equivalent because LIVE_READ is not set. In other words the rule for parent/live should be: whenever register parentage chain is set the reg->live should be set to LIVE_NONE. is_state_visited logic already follows this rule for spilled registers. 
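[Illustration, not from this series: the kind of loop that the
bounded-loop support completed by this fix allows. The program and
section names and the 64-byte bound are made up, and the headers are as
in the earlier XDP sketch.]

	SEC("xdp")
	int sum_first_bytes(struct xdp_md *ctx)
	{
		void *data = (void *)(long)ctx->data;
		void *data_end = (void *)(long)ctx->data_end;
		__u8 *p = data;
		__u32 sum = 0;
		int i;

		/* Bounded to 64 iterations: the verifier now simulates each
		 * iteration and prunes converged states, where it previously
		 * rejected the back-edge unless the loop was unrolled with
		 * '#pragma unroll'.
		 */
		for (i = 0; i < 64; i++) {
			if ((void *)(p + 1) > data_end)
				break;
			sum += *p;
			p++;
		}
		return (sum & 1) ? XDP_DROP : XDP_PASS;
	}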
Fixes: 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)")
Change-Id: I7130cf0296505860f7a903b7efe5c3ff104a579f
Signed-off-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 kernel/bpf/verifier.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9210a40673d4..a2f40cbe3399 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6987,17 +6987,18 @@ next:
 	 * the state of the call instruction (with WRITTEN set), and r0 comes
 	 * from callee with its full parentage chain, anyway.
 	 */
-	for (j = 0; j <= cur->curframe; j++)
-		for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
-			cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
 	/* clear write marks in current state: the writes we did are not writes
 	 * our child did, so they don't screen off its reads from us.
 	 * (There are no read marks in current state, because reads always mark
 	 * their parent and current state never has children yet. Only
 	 * explored_states can get read marks.)
 	 */
-	for (i = 0; i < BPF_REG_FP; i++)
-		cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE;
+	for (j = 0; j <= cur->curframe; j++) {
+		for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
+			cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
+		for (i = 0; i < BPF_REG_FP; i++)
+			cur->frame[j]->regs[i].live = REG_LIVE_NONE;
+	}
 
 	/* all stack frames are accessible from callee, clear them all */
 	for (j = 0; j <= cur->curframe; j++) {

From cd879d257b65dd7c2fd6bf717f0b2caf27b74d57 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Sat, 15 Jun 2019 12:12:25 -0700
Subject: [PATCH 1218/1640] BACKPORT: bpf: precise scalar_value tracking

Introduce precision tracking logic that helps cilium programs the most:

                        old clang  old clang          new clang  new clang
                                   with all patches              with all patches
bpf_lb-DLB_L3.o              1838       2283               1923       1863
bpf_lb-DLB_L4.o              3218       2657               3077       2468
bpf_lb-DUNKNOWN.o            1064        545               1062        544
bpf_lxc-DDROP_ALL.o         26935      23045             166729      22629
bpf_lxc-DUNKNOWN.o          34439      35240             174607      28805
bpf_netdev.o                 9721       8753               8407       6801
bpf_overlay.o                6184       7901               5420       4754
bpf_lxc_jit.o               39389      50925              39389      50925

Consider code:
654: (85) call bpf_get_hash_recalc#34
655: (bf) r7 = r0
656: (15) if r8 == 0x0 goto pc+29
657: (bf) r2 = r10
658: (07) r2 += -48
659: (18) r1 = 0xffff8881e41e1b00
661: (85) call bpf_map_lookup_elem#1
662: (15) if r0 == 0x0 goto pc+23
663: (69) r1 = *(u16 *)(r0 +0)
664: (15) if r1 == 0x0 goto pc+21
665: (bf) r8 = r7
666: (57) r8 &= 65535
667: (bf) r2 = r8
668: (3f) r2 /= r1
669: (2f) r2 *= r1
670: (bf) r1 = r8
671: (1f) r1 -= r2
672: (57) r1 &= 255
673: (25) if r1 > 0x1e goto pc+12
 R0=map_value(id=0,off=0,ks=20,vs=64,imm=0) R1_w=inv(id=0,umax_value=30,var_off=(0x0; 0x1f))
674: (67) r1 <<= 1
675: (0f) r0 += r1

At this point the verifier will notice that scalar R1 is used in map
pointer adjustment. R1 has to be precise for later operations on R0 to
be validated properly.
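[For orientation only, a hypothetical C-level equivalent of insns
654..675; the real source is a cilium object file, and the struct, map
and section names here are invented. The instruction-level backtracking
walkthrough continues right after this sketch.]

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>
	#include <bpf/bpf_helpers.h>

	struct lb_service {
		__u16 count;			/* loaded at insn 663 */
		__u16 slot[32];
	};

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, __u32);
		__type(value, struct lb_service);
	} services SEC(".maps");

	SEC("tc")
	int pick_slot(struct __sk_buff *skb)
	{
		struct lb_service *svc;
		__u32 key = 0, hash, idx;

		svc = bpf_map_lookup_elem(&services, &key);	/* insns 661..662 */
		if (!svc || !svc->count)			/* insn 664 */
			return TC_ACT_SHOT;

		hash = bpf_get_hash_recalc(skb) & 0xffff;	/* insns 654, 666 */
		idx = (hash % svc->count) & 0xff;		/* insns 667..672 */
		if (idx > 0x1e)					/* insn 673 */
			return TC_ACT_SHOT;

		/* insns 674..675: 'idx' scales a map-value pointer, which is
		 * what forces the verifier to mark it precise by backtracking.
		 */
		return svc->slot[idx] ? TC_ACT_OK : TC_ACT_SHOT;
	}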
The verifier will backtrack the above code in the following way:
last_idx 675 first_idx 664
regs=2 stack=0 before 675: (0f) r0 += r1        // started backtracking R1; regs=2 is a bitmask
regs=2 stack=0 before 674: (67) r1 <<= 1
regs=2 stack=0 before 673: (25) if r1 > 0x1e goto pc+12
regs=2 stack=0 before 672: (57) r1 &= 255
regs=2 stack=0 before 671: (1f) r1 -= r2        // now both R1 and R2 have to be precise -> regs=6 mask
regs=6 stack=0 before 670: (bf) r1 = r8         // after this insn R8 and R2 have to be precise
regs=104 stack=0 before 669: (2f) r2 *= r1      // after this one R8, R2, and R1
regs=106 stack=0 before 668: (3f) r2 /= r1
regs=106 stack=0 before 667: (bf) r2 = r8
regs=102 stack=0 before 666: (57) r8 &= 65535
regs=102 stack=0 before 665: (bf) r8 = r7
regs=82 stack=0 before 664: (15) if r1 == 0x0 goto pc+21
        // this is the end of verifier state. The following regs will be marked precise:
 R1_rw=invP(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R7_rw=invP(id=0)
parent didn't have regs=82 stack=0 marks        // so backtracking continues into parent state
last_idx 663 first_idx 655
regs=82 stack=0 before 663: (69) r1 = *(u16 *)(r0 +0)        // R1 was assigned, no need to track it further
regs=80 stack=0 before 662: (15) if r0 == 0x0 goto pc+23     // keep tracking R7
regs=80 stack=0 before 661: (85) call bpf_map_lookup_elem#1  // keep tracking R7
regs=80 stack=0 before 659: (18) r1 = 0xffff8881e41e1b00
regs=80 stack=0 before 658: (07) r2 += -48
regs=80 stack=0 before 657: (bf) r2 = r10
regs=80 stack=0 before 656: (15) if r8 == 0x0 goto pc+29
regs=80 stack=0 before 655: (bf) r7 = r0        // here the assignment into R7
        // mark R0 to be precise:
 R0_rw=invP(id=0)
parent didn't have regs=1 stack=0 marks         // regs=1 -> tracking R0
last_idx 654 first_idx 644
regs=1 stack=0 before 654: (85) call bpf_get_hash_recalc#34  // and in the parent frame it was a return value
        // nothing further to backtrack

Two scalar registers not marked precise are equivalent from state
pruning point of view. More details in the patch comments.

It doesn't support bpf2bpf calls yet and is enabled for root only.

Change-Id: I51f87dc10346934d02024845a9df2312ca2738ed
Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
---
 include/linux/bpf_verifier.h |  18 ++
 kernel/bpf/verifier.c        | 492 ++++++++++++++++++++++++++++++++++-
 2 files changed, 498 insertions(+), 12 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index a48d100f215c..e462b2420a44 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -139,6 +139,8 @@ struct bpf_reg_state {
 	 */
 	s32 subreg_def;
 	enum bpf_reg_liveness live;
+	/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
+	bool precise;
 };
 
 enum bpf_stack_slot_type {
@@ -197,6 +199,11 @@ struct bpf_id_pair {
 /* Maximum number of register states that can exist at once */
 #define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
 
+struct bpf_idx_pair {
+	u32 prev_idx;
+	u32 idx;
+};
+
 #define MAX_CALL_FRAMES 8
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
 	u32 curframe;
 	u32 active_spin_lock;
 	bool speculative;
+
+	/* first and last insn idx of this verifier state */
+	u32 first_insn_idx;
+	u32 last_insn_idx;
+	/* jmp history recorded from first to last.
+	 * backtracking is using it to go from last to first.
+	 * For most states jmp_history_cnt is [0-3].
+	 * For loops can go up to ~40.
+ */ + struct bpf_idx_pair *jmp_history; + u32 jmp_history_cnt; }; #define bpf_get_spilled_reg(slot, frame) \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a2f40cbe3399..e23128cb93a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -446,12 +446,12 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, " R%d", i); print_liveness(env, reg->live); verbose(env, "=%s", reg_type_str[t]); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); - if (t == PTR_TO_STACK) - verbose(env, ",call_%d", func(env, reg)->callsite); } else { verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -513,11 +513,17 @@ static void print_verifier_state(struct bpf_verifier_env *env, continue; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, state->stack[i].spilled_ptr.live); - if (state->stack[i].slot_type[0] == STACK_SPILL) - verbose(env, "=%s", - reg_type_str[state->stack[i].spilled_ptr.type]); - else + if (state->stack[i].slot_type[0] == STACK_SPILL) { + reg = &state->stack[i].spilled_ptr; + t = reg->type; + verbose(env, "=%s", reg_type_str[t]); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) + verbose(env, "%lld", reg->var_off.value + reg->off); + } else { verbose(env, "=%s", types_buf); + } } if (state->acquired_refs && state->refs[0].id) { verbose(env, " refs=%d", state->refs[0].id); @@ -666,6 +672,13 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } +static void clear_jmp_history(struct bpf_verifier_state *state) +{ + kfree(state->jmp_history); + state->jmp_history = NULL; + state->jmp_history_cnt = 0; +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -675,6 +688,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, free_func_state(state->frame[i]); state->frame[i] = NULL; } + clear_jmp_history(state); if (free_self) kfree(state); } @@ -702,8 +716,18 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, const struct bpf_verifier_state *src) { struct bpf_func_state *dst; + u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt; int i, err; + if (dst_state->jmp_history_cnt < src->jmp_history_cnt) { + kfree(dst_state->jmp_history); + dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER); + if (!dst_state->jmp_history) + return -ENOMEM; + } + memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz); + dst_state->jmp_history_cnt = src->jmp_history_cnt; + /* if dst has more stack frames then src frame, free them */ for (i = src->curframe + 1; i <= dst_state->curframe; i++) { free_func_state(dst_state->frame[i]); @@ -714,6 +738,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->active_spin_lock = src->active_spin_lock; dst_state->branches = src->branches; dst_state->parent = src->parent; + dst_state->first_insn_idx = src->first_insn_idx; + dst_state->last_insn_idx = src->last_insn_idx; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -958,6 +984,9 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) reg->smax_value = S64_MAX; reg->umin_value = 0; reg->umax_value = U64_MAX; + + /* constant backtracking is enabled for root only for now */ + reg->precise = capable(CAP_SYS_ADMIN) ? 
false : true; } /* Mark a register as having a completely unknown (scalar) value. */ @@ -1369,6 +1398,389 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +/* for any branch, call, exit record the history of jmps in the given state */ +static int push_jmp_history(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur) +{ + u32 cnt = cur->jmp_history_cnt; + struct bpf_idx_pair *p; + + cnt++; + p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER); + if (!p) + return -ENOMEM; + p[cnt - 1].idx = env->insn_idx; + p[cnt - 1].prev_idx = env->prev_insn_idx; + cur->jmp_history = p; + cur->jmp_history_cnt = cnt; + return 0; +} + +/* Backtrack one insn at a time. If idx is not at the top of recorded + * history then previous instruction came from straight line execution. + */ +static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, + u32 *history) +{ + u32 cnt = *history; + + if (cnt && st->jmp_history[cnt - 1].idx == i) { + i = st->jmp_history[cnt - 1].prev_idx; + (*history)--; + } else { + i--; + } + return i; +} + +/* For given verifier state backtrack_insn() is called from the last insn to + * the first insn. Its purpose is to compute a bitmask of registers and + * stack slots that needs precision in the parent verifier state. + */ +static int backtrack_insn(struct bpf_verifier_env *env, int idx, + u32 *reg_mask, u64 *stack_mask) +{ + const struct bpf_insn_cbs cbs = { + .cb_print = verbose, + .private_data = env, + }; + struct bpf_insn *insn = env->prog->insnsi + idx; + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u32 dreg = 1u << insn->dst_reg; + u32 sreg = 1u << insn->src_reg; + u32 spi; + + if (insn->code == 0) + return 0; + if (env->log.level & BPF_LOG_LEVEL) { + verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); + verbose(env, "%d: ", idx); + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + } + + if (class == BPF_ALU || class == BPF_ALU64) { + if (!(*reg_mask & dreg)) + return 0; + if (opcode == BPF_MOV) { + if (BPF_SRC(insn->code) == BPF_X) { + /* dreg = sreg + * dreg needs precision after this insn + * sreg needs precision before this insn + */ + *reg_mask &= ~dreg; + *reg_mask |= sreg; + } else { + /* dreg = K + * dreg needs precision after this insn. + * Corresponding register is already marked + * as precise=true in this verifier state. + * No further markings in parent are necessary + */ + *reg_mask &= ~dreg; + } + } else { + if (BPF_SRC(insn->code) == BPF_X) { + /* dreg += sreg + * both dreg and sreg need precision + * before this insn + */ + *reg_mask |= sreg; + } /* else dreg += K + * dreg still needs precision before this insn + */ + } + } else if (class == BPF_LDX) { + if (!(*reg_mask & dreg)) + return 0; + *reg_mask &= ~dreg; + + /* scalars can only be spilled into stack w/o losing precision. + * Load from any other memory can be zero extended. + * The desire to keep that precision is already indicated + * by 'precise' mark in corresponding register of this state. + * No further tracking necessary. + */ + if (insn->src_reg != BPF_REG_FP) + return 0; + if (BPF_SIZE(insn->code) != BPF_DW) + return 0; + + /* dreg = *(u64 *)[fp - off] was a fill from the stack. 
+ * that [fp - off] slot contains scalar that needs to be + * tracked with precision + */ + spi = (-insn->off - 1) / BPF_REG_SIZE; + if (spi >= 64) { + verbose(env, "BUG spi %d\n", spi); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + *stack_mask |= 1ull << spi; + } else if (class == BPF_STX) { + if (*reg_mask & dreg) + /* stx shouldn't be using _scalar_ dst_reg + * to access memory. It means backtracking + * encountered a case of pointer subtraction. + */ + return -ENOTSUPP; + /* scalars can only be spilled into stack */ + if (insn->dst_reg != BPF_REG_FP) + return 0; + if (BPF_SIZE(insn->code) != BPF_DW) + return 0; + spi = (-insn->off - 1) / BPF_REG_SIZE; + if (spi >= 64) { + verbose(env, "BUG spi %d\n", spi); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + if (!(*stack_mask & (1ull << spi))) + return 0; + *stack_mask &= ~(1ull << spi); + *reg_mask |= sreg; + } else if (class == BPF_JMP || class == BPF_JMP32) { + if (opcode == BPF_CALL) { + if (insn->src_reg == BPF_PSEUDO_CALL) + return -ENOTSUPP; + /* regular helper call sets R0 */ + *reg_mask &= ~1; + if (*reg_mask & 0x3f) { + /* if backtracing was looking for registers R1-R5 + * they should have been found already. + */ + verbose(env, "BUG regs %x\n", *reg_mask); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + } else if (opcode == BPF_EXIT) { + return -ENOTSUPP; + } + } else if (class == BPF_LD) { + if (!(*reg_mask & dreg)) + return 0; + *reg_mask &= ~dreg; + /* It's ld_imm64 or ld_abs or ld_ind. + * For ld_imm64 no further tracking of precision + * into parent is necessary + */ + if (mode == BPF_IND || mode == BPF_ABS) + /* to be analyzed */ + return -ENOTSUPP; + } else if (class == BPF_ST) { + if (*reg_mask & dreg) + /* likely pointer subtraction */ + return -ENOTSUPP; + } + return 0; +} + +/* the scalar precision tracking algorithm: + * . at the start all registers have precise=false. + * . scalar ranges are tracked as normal through alu and jmp insns. + * . once precise value of the scalar register is used in: + * . ptr + scalar alu + * . if (scalar cond K|scalar) + * . helper_call(.., scalar, ...) where ARG_CONST is expected + * backtrack through the verifier states and mark all registers and + * stack slots with spilled constants that these scalar regisers + * should be precise. + * . during state pruning two registers (or spilled stack slots) + * are equivalent if both are not precise. + * + * Note the verifier cannot simply walk register parentage chain, + * since many different registers and stack slots could have been + * used to compute single precise scalar. + * + * The approach of starting with precise=true for all registers and then + * backtrack to mark a register as not precise when the verifier detects + * that program doesn't care about specific value (e.g., when helper + * takes register as ARG_ANYTHING parameter) is not safe. + * + * It's ok to walk single parentage chain of the verifier states. + * It's possible that this backtracking will go all the way till 1st insn. + * All other branches will be explored for needing precision later. + * + * The backtracking needs to deal with cases like: + * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) + * r9 -= r8 + * r5 = r9 + * if r5 > 0x79f goto pc+7 + * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) + * r5 += 1 + * ... 
+ * call bpf_perf_event_output#25 + * where .arg5_type = ARG_CONST_SIZE_OR_ZERO + * + * and this case: + * r6 = 1 + * call foo // uses callee's r6 inside to compute r0 + * r0 += r6 + * if r0 == 0 goto + * + * to track above reg_mask/stack_mask needs to be independent for each frame. + * + * Also if parent's curframe > frame where backtracking started, + * the verifier need to mark registers in both frames, otherwise callees + * may incorrectly prune callers. This is similar to + * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") + * + * For now backtracking falls back into conservative marking. + */ +static void mark_all_scalars_precise(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + /* big hammer: mark all scalars precise in this path. + * pop_stack may still get !precise scalars. + */ + for (; st; st = st->parent) + for (i = 0; i <= st->curframe; i++) { + func = st->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = true; + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (func->stack[j].slot_type[0] != STACK_SPILL) + continue; + reg = &func->stack[j].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = true; + } + } +} + +static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +{ + struct bpf_verifier_state *st = env->cur_state; + int first_idx = st->first_insn_idx; + int last_idx = env->insn_idx; + struct bpf_func_state *func; + struct bpf_reg_state *reg; + u32 reg_mask = 1u << regno; + u64 stack_mask = 0; + bool skip_first = true; + int i, err; + + if (!env->allow_ptr_leaks) + /* backtracking is root only for now */ + return 0; + + func = st->frame[st->curframe]; + reg = &func->regs[regno]; + if (reg->type != SCALAR_VALUE) { + WARN_ONCE(1, "backtracing misuse"); + return -EFAULT; + } + if (reg->precise) + return 0; + func->regs[regno].precise = true; + + for (;;) { + DECLARE_BITMAP(mask, 64); + bool new_marks = false; + u32 history = st->jmp_history_cnt; + + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); + for (i = last_idx;;) { + if (skip_first) { + err = 0; + skip_first = false; + } else { + err = backtrack_insn(env, i, ®_mask, &stack_mask); + } + if (err == -ENOTSUPP) { + mark_all_scalars_precise(env, st); + return 0; + } else if (err) { + return err; + } + if (!reg_mask && !stack_mask) + /* Found assignment(s) into tracked register in this state. + * Since this state is already marked, just return. + * Nothing to be tracked further in the parent state. + */ + return 0; + if (i == first_idx) + break; + i = get_prev_insn_idx(st, i, &history); + if (i >= env->prog->len) { + /* This can happen if backtracking reached insn 0 + * and there are still reg_mask or stack_mask + * to backtrack. + * It means the backtracking missed the spot where + * particular register was initialized with a constant. 
+ */ + verbose(env, "BUG backtracking idx %d\n", i); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + } + st = st->parent; + if (!st) + break; + + func = st->frame[st->curframe]; + bitmap_from_u64(mask, reg_mask); + for_each_set_bit(i, mask, 32) { + reg = &func->regs[i]; + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->precise) + new_marks = true; + reg->precise = true; + } + + bitmap_from_u64(mask, stack_mask); + for_each_set_bit(i, mask, 64) { + if (i >= func->allocated_stack / BPF_REG_SIZE) { + /* This can happen if backtracking + * is propagating stack precision where + * caller has larger stack frame + * than callee, but backtrack_insn() should + * have returned -ENOTSUPP. + */ + verbose(env, "BUG spi %d stack_size %d\n", + i, func->allocated_stack); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + + if (func->stack[i].slot_type[0] != STACK_SPILL) + continue; + reg = &func->stack[i].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->precise) + new_marks = true; + reg->precise = true; + } + if (env->log.level & BPF_LOG_LEVEL) { + print_verifier_state(env, func); + verbose(env, "parent %s regs=%x stack=%llx marks\n", + new_marks ? "didn't have" : "already had", + reg_mask, stack_mask); + } + + if (!new_marks) + break; + + last_idx = st->last_insn_idx; + first_idx = st->first_insn_idx; + } + return 0; +} + + static bool is_spillable_regtype(enum bpf_reg_type type) { switch (type) { @@ -1426,6 +1838,7 @@ static int check_stack_write(struct bpf_verifier_env *env, { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; struct bpf_reg_state *reg = NULL; err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), @@ -1463,6 +1876,17 @@ static int check_stack_write(struct bpf_verifier_env *env, if (reg && size == BPF_REG_SIZE && register_is_const(reg) && !register_is_null(reg) && env->allow_ptr_leaks) { + if (dst_reg != BPF_REG_FP) { + /* The backtracking logic can only recognize explicit + * stack slot address like [fp - 8]. Other spill of + * scalar via different register has to be conervative. + * Backtrack from here and mark all registers as precise + * that contributed into 'reg' being a constant. + */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; + } save_register_state(state, spi, reg); } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ @@ -1497,8 +1921,13 @@ static int check_stack_write(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; /* when we zero initialize stack slots mark them as such */ - if (reg && register_is_null(reg)) + if (reg && register_is_null(reg)) { + /* backtracking doesn't work for STACK_ZERO yet. */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; type = STACK_ZERO; + } /* Mark slots affected by this stack write. */ for (i = 0; i < size; i++) @@ -1577,6 +2006,17 @@ static int check_stack_read(struct bpf_verifier_env *env, * so the whole register == const_zero */ __mark_reg_const_zero(&state->regs[value_regno]); + /* backtracking doesn't support STACK_ZERO yet, + * so mark it precise here, so that later + * backtracking can stop here. + * Backtracking may not need this if this register + * doesn't participate in pointer adjustment. + * Forward propagation of precise flag is not + * necessary either. 
This mark is only to stop + * backtracking. Any register that contributed + * to const 0 was marked precise before spill. + */ + state->regs[value_regno].precise = true; } else { /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, value_regno); @@ -2891,6 +3331,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno - 1, reg->umax_value, zero_size_allowed, meta); + if (!err) + err = mark_chain_precision(env, regno); } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -4507,6 +4949,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); + int err; dst_reg = ®s[insn->dst_reg]; src_reg = NULL; @@ -4533,11 +4976,17 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * This is legal, but we have to reverse our * src/dest handling in computing the range */ + err = mark_chain_precision(env, insn->dst_reg); + if (err) + return err; return adjust_ptr_min_max_vals(env, insn, src_reg, dst_reg); } } else if (ptr_reg) { /* pointer += scalar */ + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; return adjust_ptr_min_max_vals(env, insn, dst_reg, src_reg); } @@ -5494,7 +5943,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, tnum_is_const(src_reg->var_off)) pred = is_branch_taken(dst_reg, src_reg->var_off.value, opcode, is_jmp32); - + if (pred >= 0) { + err = mark_chain_precision(env, insn->dst_reg); + if (BPF_SRC(insn->code) == BPF_X && !err) + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; + } if (pred == 1) { /* Only follow the goto, ignore fall-through. If needed, push * the fall-through branch for simulation under speculative @@ -5993,6 +6448,11 @@ peek_stack: goto peek_stack; else if (ret < 0) goto err_free; + /* unconditional jmp is not a good pruning point, + * but it's marked, since backtracking needs + * to record jmp history in is_state_visited(). + */ + init_explored_state(env, t + insns[t].off + 1); /* tell verifier to check for equivalent states * after every call and jump */ @@ -6488,6 +6948,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, if (env->explore_alu_limits) return false; if (rcur->type == SCALAR_VALUE) { + if (!rold->precise && !rcur->precise) + return true; /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); @@ -6828,6 +7290,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) int i, j, err, states_cnt = 0; bool add_new_state = false; + cur->last_insn_idx = env->prev_insn_idx; if (!env->insn_aux_data[insn_idx].prune_point) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here @@ -6944,10 +7407,10 @@ next: env->max_states_per_insn = states_cnt; if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) - return 0; + return push_jmp_history(env, cur); if (!add_new_state) - return 0; + return push_jmp_history(env, cur); /* There were no equivalent states, remember the current one. 
* Technically the current state is not proven to be safe yet, @@ -6977,7 +7440,10 @@ next: new->insn_idx = insn_idx; WARN_ONCE(new->branches != 1, "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); + cur->parent = new; + cur->first_insn_idx = insn_idx; + clear_jmp_history(cur); new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -7057,6 +7523,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; + int prev_insn_idx = -1; env->prev_linfo = NULL; @@ -7082,6 +7549,7 @@ static int do_check(struct bpf_verifier_env *env) u8 class; int err; + env->prev_insn_idx = prev_insn_idx; if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", env->insn_idx, insn_cnt); @@ -7154,6 +7622,7 @@ static int do_check(struct bpf_verifier_env *env) regs = cur_regs(env); sanitize_mark_insn_seen(env); + prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); @@ -7327,7 +7796,6 @@ static int do_check(struct bpf_verifier_env *env) if (state->curframe) { /* exit from nested function */ - env->prev_insn_idx = env->insn_idx; err = prepare_func_exit(env, &env->insn_idx); if (err) return err; @@ -7359,7 +7827,7 @@ static int do_check(struct bpf_verifier_env *env) return err; process_bpf_exit: update_branch_counts(env, env->cur_state); - err = pop_stack(env, &env->prev_insn_idx, + err = pop_stack(env, &prev_insn_idx, &env->insn_idx); if (err < 0) { if (err != -ENOENT) From 21391eb5198d44b79dc347da7607189f0a51edaa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Jun 2019 10:10:45 +0200 Subject: [PATCH 1219/1640] UPSTREAM: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 451 Based on 1 normalized pattern(s): this file is subject to the terms and conditions of version 2 of the gnu general public license see the file copying in the main directory of the linux distribution for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 5 file(s). Change-Id: I8d95ed3d46b0fab9bdcfcb286a583054ddd7d8ea Signed-off-by: Thomas Gleixner Reviewed-by: Enrico Weigelt Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190604081200.872755311@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/cgroup_rdma.h | 5 +---- kernel/bpf/cgroup.c | 5 +---- kernel/bpf/lpm_trie.c | 5 +---- kernel/cgroup/pids.c | 5 +---- kernel/cgroup/rdma.c | 5 +---- 5 files changed, 5 insertions(+), 20 deletions(-) diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h index e94290b29e99..aa5b50b249de 100644 --- a/include/linux/cgroup_rdma.h +++ b/include/linux/cgroup_rdma.h @@ -1,9 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2016 Parav Pandit - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. 
*/ #ifndef _CGROUP_RDMA_H diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 1b65ab0df457..c225c42e114a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1,11 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Functions to manage eBPF programs attached to cgroups * * Copyright (c) 2016 Daniel Mack - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 709bd91161ba..c372be6df264 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Longest prefix match list implementation * * Copyright (c) 2016,2017 Daniel Mack * Copyright (c) 2016 David Herrmann - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 940d2e8db776..138059eb730d 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Process number limiting controller for cgroups. * @@ -25,10 +26,6 @@ * a superset of parent/child/pids.current. * * Copyright (C) 2015 Aleksa Sarai - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index defad3c5e7dc..8db8eb22c218 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RDMA resource limiting controller for cgroups. * @@ -5,10 +6,6 @@ * additional RDMA resources after a certain limit is reached. * * Copyright (C) 2016 Parav Pandit - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include From 2aeb1456180f6b62899a53e63af41848ee4f579a Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Wed, 15 Nov 2017 17:32:53 -0800 Subject: [PATCH 1220/1640] UPSTREAM: mm: update comments for struct page.mapping struct page.mapping can be NULL or points to one object of type address_space, anon_vma or KSM private structure. Link: http://lkml.kernel.org/r/1506485067-15954-1-git-send-email-changbin.du@intel.com Signed-off-by: Changbin Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 92f7c16cef5f..5b0c4faaf26c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -48,8 +48,10 @@ struct page { * inode address_space, or NULL. * If page mapped as anonymous * memory, low bit is set, and - * it points to anon_vma object: - * see PAGE_MAPPING_ANON below. + * it points to anon_vma object + * or KSM private structure. See + * PAGE_MAPPING_ANON and + * PAGE_MAPPING_KSM. */ void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ From 87a44b114660781a8921ea5e5b6f4bd130eebe1d Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Wed, 15 Nov 2017 17:35:33 -0800 Subject: [PATCH 1221/1640] BACKPORT: mm: account pud page tables On a machine with 5-level paging support a process can allocate significant amount of memory and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PUD page tables. We don't account PUD page tables, only PMD and PTE. We already addressed the same issue for PMD page tables, see commit dc6c9a35b66b ("mm: account pmd page tables to the process"). Introduction of 5-level paging brings the same issue for PUD page tables. The patch expands accounting to PUD level. [kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/] Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com [heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting] Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com Change-Id: I95df97bc8a42ea1e615b9eb50ab20fa1e0f81cf7 Signed-off-by: Kirill A. Shutemov Signed-off-by: Heiko Carstens Acked-by: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 8 +++---- arch/powerpc/mm/hugetlbpage.c | 1 + arch/s390/include/asm/mmu_context.h | 4 +++- arch/sparc/mm/hugetlbpage.c | 1 + fs/proc/task_mmu.c | 5 +++- include/linux/mm.h | 36 ++++++++++++++++++++++++++--- include/linux/mm_types.h | 3 +++ kernel/fork.c | 4 ++++ mm/debug.c | 6 +++-- mm/memory.c | 15 +++++++----- mm/oom_kill.c | 8 ++++--- 11 files changed, 71 insertions(+), 20 deletions(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 946748424871..45b5a39af038 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -641,10 +641,10 @@ oom_dump_tasks Enables a system-wide task dump (excluding kernel threads) to be produced when the kernel performs an OOM-killing and includes such information as -pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj -score, and name. This is helpful to determine why the OOM killer was -invoked, to identify the rogue task that caused it, and to determine why -the OOM killer chose the task it did to kill. +pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents, +oom_score_adj score, and name. This is helpful to determine why the OOM +killer was invoked, to identify the rogue task that caused it, and to +determine why the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed. 
On very large systems with thousands of tasks it may not be feasible to dump diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index ad08e13138de..7551f7435f06 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -437,6 +437,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } /* diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index a6cc744ff5fb..3ffc56124725 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -44,6 +44,8 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.asce_limit = STACK_TOP_MAX; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION3; + /* pgd_alloc() did not account this pud */ + mm_inc_nr_puds(mm); break; case -PAGE_SIZE: /* forked 5-level task, set new asce with new_mm->pgd */ @@ -59,7 +61,7 @@ static inline int init_new_context(struct task_struct *tsk, /* forked 2-level compat task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; - /* pgd_alloc() did not increase mm->nr_pmds */ + /* pgd_alloc() did not account this pmd */ mm_inc_nr_pmds(mm); } crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index c572e573c540..f69e0b59d262 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -472,6 +472,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } void hugetlb_free_pgd_range(struct mmu_gather *tlb, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3310f533f448..9a98a39676f7 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -35,7 +35,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; + unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -61,6 +61,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) swap = get_mm_counter(mm, MM_SWAPENTS); ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); + puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -77,6 +78,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" "VmPMD:\t%8lu kB\n" + "VmPUD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -91,6 +93,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, pmds >> 10, + puds >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d7c2168e13c..45a13f7b2712 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1756,14 +1756,44 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif -#ifdef __PAGETABLE_PUD_FOLDED +#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline 
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } + +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_nr_puds_init(struct mm_struct *mm) {} +static inline void mm_inc_nr_puds(struct mm_struct *mm) {} +static inline void mm_dec_nr_puds(struct mm_struct *mm) {} + #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); + +static inline void mm_nr_puds_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_puds, 0); +} + +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_puds); +} + +static inline void mm_inc_nr_puds(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_puds); +} + +static inline void mm_dec_nr_puds(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_puds); +} #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) @@ -1775,7 +1805,7 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, static inline void mm_nr_pmds_init(struct mm_struct *mm) {} -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) { return 0; } @@ -1791,7 +1821,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm) atomic_long_set(&mm->nr_pmds, 0); } -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) { return atomic_long_read(&mm->nr_pmds); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5b0c4faaf26c..c546dc35079d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -408,6 +408,9 @@ struct mm_struct { atomic_long_t nr_ptes; /* PTE page table pages */ #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ +#endif +#if CONFIG_PGTABLE_LEVELS > 3 + atomic_long_t nr_puds; /* PUD page table pages */ #endif int map_count; /* number of VMAs */ diff --git a/kernel/fork.c b/kernel/fork.c index c086a57371be..e529b24d63c5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -840,6 +840,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); mm_nr_pmds_init(mm); + mm_nr_puds_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -907,6 +908,9 @@ static void check_mm(struct mm_struct *mm) if (mm_nr_pmds(mm)) pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", mm_nr_pmds(mm)); + if (mm_nr_puds(mm)) + pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", + mm_nr_puds(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); diff --git a/mm/debug.c b/mm/debug.c index 97609290dd51..6a6b413c565a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -105,7 +105,8 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %px\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %px mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" + "pgd %px mm_users %d mm_count %d\n" + "nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -136,7 +137,8 @@ void dump_mm(const struct mm_struct *mm) mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), atomic_long_read((atomic_long_t *)&mm->nr_ptes), - mm_nr_pmds((struct mm_struct *)mm), + mm_nr_pmds(mm), + 
mm_nr_puds(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/memory.c b/mm/memory.c index 14847ad0cbb0..0a7a59815f11 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -542,6 +542,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -4362,15 +4363,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_5LEVEL_HACK - if (p4d_present(*p4d)) /* Another has populated it */ - pud_free(mm, new); - else + if (!p4d_present(*p4d)) { + mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); -#else - if (pgd_present(*p4d)) /* Another has populated it */ + } else /* Another has populated it */ pud_free(mm, new); - else +#else + if (!pgd_present(*p4d)) { + mm_inc_nr_puds(mm); pgd_populate(mm, p4d, new); + } else /* Another has populated it */ + pud_free(mm, new); #endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b67a1183700c..29803c2ac0f5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -206,7 +206,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); + atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) + + mm_nr_puds(p->mm); task_unlock(p); /* Normalize to oom_score_adj units */ @@ -388,7 +389,7 @@ void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) swap_comp_nrpages = get_swap_comp_pool_nrpages(); pr_info("[ pid ] uid tgid total_vm total_rss ( rss swap ) nr_ptes nr_pmds swapents oom_score_adj name\n"); #else - pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n"); #endif rcu_read_lock(); for_each_process(p) { @@ -410,7 +411,7 @@ void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) swap_comp_nrpages / swap_orig_nrpages; pr_info("[%5d] %5d %5d %8lu %8lu (%8lu %8lu) %7ld %7ld %8lu %5hd %s\n", #else - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n", #endif task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, @@ -422,6 +423,7 @@ void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) #endif atomic_long_read(&task->mm->nr_ptes), mm_nr_pmds(task->mm), + mm_nr_puds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); cur_rss_sum = get_mm_rss(task->mm) + From fad514cb8e3d6b1039a71bb1702e9f3537d60a2a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:37 -0800 Subject: [PATCH 1222/1640] BACKPORT: mm: introduce wrappers to access mm->nr_ptes Let's add wrappers for ->nr_ptes with the same interface as for nr_pmd and nr_pud. The patch also makes nr_ptes accounting dependent onto CONFIG_MMU. Page table accounting doesn't make sense if you don't have page tables. It's preparation for consolidation of page-table counters in mm_struct. 
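As a caller-side sketch, teardown paths end up looking roughly like the following (example_free_pte_page() is a hypothetical helper named here for illustration, not part of this patch):

static void example_free_pte_page(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t token = pmd_pgtable(*pmd);	/* PTE page backing this PMD entry */

	pmd_clear(pmd);
	pte_free(mm, token);
	mm_dec_nr_ptes(mm);	/* was: atomic_long_dec(&mm->nr_ptes) */
}

Because the !CONFIG_MMU stubs are empty inline functions, such callers need no #ifdef of their own.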
Link: http://lkml.kernel.org/r/20171006100651.44742-1-kirill.shutemov@linux.intel.com Change-Id: I02b5ffc25ebc0d393d0a77f6213489024db68de6 Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/pgd.c | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- arch/unicore32/mm/pgd.c | 2 +- fs/proc/task_mmu.c | 2 +- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 2 ++ kernel/fork.c | 6 +++--- mm/debug.c | 2 +- mm/huge_memory.c | 10 +++++----- mm/khugepaged.c | 1 + mm/memory.c | 8 ++++---- mm/oom_kill.c | 5 ++--- 12 files changed, 54 insertions(+), 20 deletions(-) diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 6a1e9b44be99..006421cc9257 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -142,7 +142,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); no_pmd: pud_clear(pud); pmd_free(mm, pmd); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index f69e0b59d262..3a2886fcc10b 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -397,7 +397,7 @@ static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - atomic_long_dec(&tlb->mm->nr_ptes); + mm_dec_nr_ptes(tlb->mm); } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c index c572a28c76c9..a830a300aaa1 100644 --- a/arch/unicore32/mm/pgd.c +++ b/arch/unicore32/mm/pgd.c @@ -97,7 +97,7 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); free: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9a98a39676f7..6cff72912504 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); + ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm); pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); seq_printf(m, diff --git a/include/linux/mm.h b/include/linux/mm.h index 45a13f7b2712..3c235dbb4f4a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1837,6 +1837,38 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) } #endif +#ifdef CONFIG_MMU +static inline void mm_nr_ptes_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_ptes, 0); +} + +static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_ptes); +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_ptes); +} + +static inline void mm_dec_nr_ptes(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_ptes); +} +#else +static inline void mm_nr_ptes_init(struct mm_struct *mm) {} + +static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} +static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} +#endif + int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); int 
__pte_alloc_kernel(pmd_t *pmd, unsigned long address); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c546dc35079d..e557f7778793 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -405,7 +405,9 @@ struct mm_struct { */ atomic_t mm_count; +#ifdef CONFIG_MMU atomic_long_t nr_ptes; /* PTE page table pages */ +#endif #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index e529b24d63c5..26e583842e31 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -838,7 +838,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - atomic_long_set(&mm->nr_ptes, 0); + mm_nr_ptes_init(mm); mm_nr_pmds_init(mm); mm_nr_puds_init(mm); mm->map_count = 0; @@ -902,9 +902,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (atomic_long_read(&mm->nr_ptes)) + if (mm_nr_ptes(mm)) pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - atomic_long_read(&mm->nr_ptes)); + mm_nr_ptes(mm)); if (mm_nr_pmds(mm)) pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", mm_nr_pmds(mm)); diff --git a/mm/debug.c b/mm/debug.c index 6a6b413c565a..ffec146806c1 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -136,7 +136,7 @@ void dump_mm(const struct mm_struct *mm) mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), - atomic_long_read((atomic_long_t *)&mm->nr_ptes), + mm_nr_ptes(mm), mm_nr_pmds(mm), mm_nr_puds(mm), mm->map_count, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5932e9160606..676f1d4107df 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -610,7 +610,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -666,7 +666,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, if (pgtable) pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); return true; } @@ -750,7 +750,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); } set_pmd_at(mm, addr, pmd, entry); @@ -974,7 +974,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); pmdp_set_wrprotect(src_mm, addr, src_pmd); @@ -1666,7 +1666,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) pgtable = pgtable_trans_huge_withdraw(mm, pmd); pte_free(mm, pgtable); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 644f0a9c8a55..f5205e7fb629 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1313,6 +1313,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) /* 
assume page table is clear */ _pmd = pmdp_collapse_flush(vma, addr, pmd); spin_unlock(ptl); + mm_dec_nr_ptes(vma->vm_mm); atomic_long_dec(&mm->nr_ptes); tlb_remove_table_sync_one(); pte_free(mm, pmd_pgtable(_pmd)); diff --git a/mm/memory.c b/mm/memory.c index 0a7a59815f11..d68a43b3d6d2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -474,7 +474,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - atomic_long_dec(&tlb->mm->nr_ptes); + mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -702,7 +702,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); pmd_populate(mm, pmd, new); new = NULL; } @@ -3408,7 +3408,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf) goto map_pte; } - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; @@ -3467,7 +3467,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) * We are going to consume the prealloc table, * count that as nr_ptes. */ - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); vmf->prealloc_pte = NULL; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 29803c2ac0f5..08be45369e39 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -206,8 +206,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) + - mm_nr_puds(p->mm); + mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm); task_unlock(p); /* Normalize to oom_score_adj units */ @@ -421,7 +420,7 @@ void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) #else get_mm_rss(task->mm), #endif - atomic_long_read(&task->mm->nr_ptes), + mm_nr_ptes(task->mm), mm_nr_pmds(task->mm), mm_nr_puds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), From fb16082a9e2a19af2115c4f28afa98160ea3da9e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:40 -0800 Subject: [PATCH 1223/1640] BACKPORT: mm: consolidate page table accounting Currently, we account page tables separately for each page table level, but that's redundant -- we only make use of total memory allocated to page tables for oom_badness calculation. We also provide the information to userspace, but it has dubious value there too. This patch switches page table accounting to single counter. mm->pgtables_bytes is now used to account all page table levels. We use bytes, because page table size for different levels of page table tree may be different. The change has user-visible effect: we don't have VmPMD and VmPUD reported in /proc/[pid]/status. Not sure if anybody uses them. (As alternative, we can always report 0 kB for them.) OOM-killer report is also slightly changed: we now report pgtables_bytes instead of nr_ptes, nr_pmd, nr_puds. Apart from reducing number of counters per-mm, the benefit is that we now calculate oom_badness() more correctly for machines which have different size of page tables depending on level or where page tables are less than a page in size. The only downside can be debuggability because we do not know which page table level could leak. 
But I do not remember many bugs that would be caught by separate counters so I wouldn't lose sleep over this. [akpm@linux-foundation.org: fix mm/huge_memory.c] Link: http://lkml.kernel.org/r/20171006100651.44742-2-kirill.shutemov@linux.intel.com Change-Id: I16d36b9be0b0eed208cae82a3905bde9b8678fbd Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko [kirill.shutemov@linux.intel.com: fix build] Link: http://lkml.kernel.org/r/20171016150113.ikfxy3e7zzfvsr4w@black.fi.intel.com Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 1 - Documentation/sysctl/vm.txt | 8 ++--- fs/proc/task_mmu.c | 11 ++---- include/linux/mm.h | 58 +++++++----------------------- include/linux/mm_types.h | 8 +---- kernel/fork.c | 16 +++------ mm/debug.c | 7 ++-- mm/huge_memory.c | 2 +- mm/oom_kill.c | 16 ++++----- 9 files changed, 33 insertions(+), 94 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index df2f752ab0b5..b9baeea45d9e 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -251,7 +251,6 @@ Table 1-2: Contents of the status files (as of 4.8) VmExe size of text segment VmLib size of shared library code VmPTE size of page table entries - VmPMD size of second level page tables VmSwap amount of swap used by anonymous private data (shmem swap usage is not included) HugetlbPages size of hugetlb memory portions diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 45b5a39af038..33c6df730a6b 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -641,10 +641,10 @@ oom_dump_tasks Enables a system-wide task dump (excluding kernel threads) to be produced when the kernel performs an OOM-killing and includes such information as -pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents, -oom_score_adj score, and name. This is helpful to determine why the OOM -killer was invoked, to identify the rogue task that caused it, and to -determine why the OOM killer chose the task it did to kill. +pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj +score, and name. This is helpful to determine why the OOM killer was +invoked, to identify the rogue task that caused it, and to determine why +the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed. 
On very large systems with thousands of tasks it may not be feasible to dump diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6cff72912504..132edcbe187f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -35,7 +35,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem; + unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -59,9 +59,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm); - pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); - puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -77,8 +74,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" - "VmPMD:\t%8lu kB\n" - "VmPUD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -91,9 +86,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) shmem << (PAGE_SHIFT-10), mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - ptes >> 10, - pmds >> 10, - puds >> 10, + mm_pgtables_bytes(mm) >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 3c235dbb4f4a..627ff9f3765f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1762,37 +1762,20 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, { return 0; } - -static inline unsigned long mm_nr_puds(const struct mm_struct *mm) -{ - return 0; -} - -static inline void mm_nr_puds_init(struct mm_struct *mm) {} static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); -static inline void mm_nr_puds_init(struct mm_struct *mm) -{ - atomic_long_set(&mm->nr_puds, 0); -} - -static inline unsigned long mm_nr_puds(const struct mm_struct *mm) -{ - return atomic_long_read(&mm->nr_puds); -} - static inline void mm_inc_nr_puds(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_puds); + atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_puds); + atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif @@ -1803,64 +1786,47 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, return 0; } -static inline void mm_nr_pmds_init(struct mm_struct *mm) {} - -static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) -{ - return 0; -} - static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); -static inline void mm_nr_pmds_init(struct mm_struct *mm) -{ - atomic_long_set(&mm->nr_pmds, 0); -} - -static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) -{ - return atomic_long_read(&mm->nr_pmds); -} - static inline void mm_inc_nr_pmds(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_pmds); + atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } static inline void 
mm_dec_nr_pmds(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_pmds); + atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } #endif #ifdef CONFIG_MMU -static inline void mm_nr_ptes_init(struct mm_struct *mm) +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) { - atomic_long_set(&mm->nr_ptes, 0); + atomic_long_set(&mm->pgtables_bytes, 0); } -static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { - return atomic_long_read(&mm->nr_ptes); + return atomic_long_read(&mm->pgtables_bytes); } static inline void mm_inc_nr_ptes(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_ptes); + atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_ptes(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_ptes); + atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } #else -static inline void mm_nr_ptes_init(struct mm_struct *mm) {} -static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return 0; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e557f7778793..ee0d1520494f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -406,13 +406,7 @@ struct mm_struct { atomic_t mm_count; #ifdef CONFIG_MMU - atomic_long_t nr_ptes; /* PTE page table pages */ -#endif -#if CONFIG_PGTABLE_LEVELS > 2 - atomic_long_t nr_pmds; /* PMD page table pages */ -#endif -#if CONFIG_PGTABLE_LEVELS > 3 - atomic_long_t nr_puds; /* PUD page table pages */ + atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif int map_count; /* number of VMAs */ diff --git a/kernel/fork.c b/kernel/fork.c index 26e583842e31..ce8aa2b8bf41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -838,9 +838,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - mm_nr_ptes_init(mm); - mm_nr_pmds_init(mm); - mm_nr_puds_init(mm); + mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -902,15 +900,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (mm_nr_ptes(mm)) - pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - mm_nr_ptes(mm)); - if (mm_nr_pmds(mm)) - pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", - mm_nr_pmds(mm)); - if (mm_nr_puds(mm)) - pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", - mm_nr_puds(mm)); + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); diff --git a/mm/debug.c b/mm/debug.c index ffec146806c1..81442951281c 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -105,8 +105,7 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %px\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %px mm_users %d mm_count %d\n" - "nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n" + "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -136,9 +135,7 @@ void dump_mm(const struct mm_struct *mm) 
mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), - mm_nr_ptes(mm), - mm_nr_pmds(mm), - mm_nr_puds(mm), + mm_pgtables_bytes(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 676f1d4107df..9737b3975b05 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -938,7 +938,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pmd_at(src_mm, addr, src_pmd, pmd); } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 08be45369e39..a478c5b8218f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -206,7 +206,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm); + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); /* Normalize to oom_score_adj units */ @@ -367,8 +367,8 @@ static void select_bad_process(struct oom_control *oc) * Dumps the current memory state of all eligible tasks. Tasks not in the same * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes * are not shown. - * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, - * swapents, oom_score_adj value, and name. + * State information includes task's pid, uid, tgid, vm size, rss, + * pgtables_bytes, swapents, oom_score_adj value, and name. */ void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) { @@ -388,7 +388,7 @@ void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) swap_comp_nrpages = get_swap_comp_pool_nrpages(); pr_info("[ pid ] uid tgid total_vm total_rss ( rss swap ) nr_ptes nr_pmds swapents oom_score_adj name\n"); #else - pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); #endif rcu_read_lock(); for_each_process(p) { @@ -408,9 +408,9 @@ void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) #if defined(CONFIG_SWAP) task_swap = get_mm_counter(task->mm, MM_SWAPENTS) * swap_comp_nrpages / swap_orig_nrpages; - pr_info("[%5d] %5d %5d %8lu %8lu (%8lu %8lu) %7ld %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu (%8lu %8lu) %8ld %5hd %s\n", #else - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", #endif task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, @@ -420,9 +420,7 @@ void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) #else get_mm_rss(task->mm), #endif - mm_nr_ptes(task->mm), - mm_nr_pmds(task->mm), - mm_nr_puds(task->mm), + mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); cur_rss_sum = get_mm_rss(task->mm) + From c3702b2293fe4f4a5c3c77c229b161feb087bb32 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:44 -0800 Subject: [PATCH 1224/1640] UPSTREAM: mm: align struct page more aesthetically Patch series "Restructure struct page", v2. 
This series does not attempt any grand restructuring. Instead, it cures the worst of the indentitis, fixes the documentation and reduces the ifdeffery. The only layout change is compound_dtor and compound_order are each reduced to one byte. This patch (of 8): Instead of an ifdef block at the end of the struct, which needed its own comment, define _struct_page_alignment up at the top where it fits nicely with the existing comment. Link: http://lkml.kernel.org/r/20171220155552.15884-2-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ee0d1520494f..798dffcd798b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -39,6 +39,12 @@ struct hmm; * allows the use of atomic double word operations on the flags/mapping * and lru list pointers also. */ +#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE +#define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) +#else +#define _struct_page_alignment +#endif + struct page { /* First double word block */ unsigned long flags; /* Atomic flags, some possibly @@ -212,15 +218,7 @@ struct page { #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif -} -/* - * The struct page can be forced to be double word aligned so that atomic ops - * on double words work. The SLUB allocator can make use of such a feature. - */ -#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE - __aligned(2 * sizeof(unsigned long)) -#endif -; +} _struct_page_alignment; #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) From c7687c2f75ec7aa771fb23324ef75896236fa596 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:47 -0800 Subject: [PATCH 1225/1640] UPSTREAM: mm: de-indent struct page I found the struct { union { struct { union { struct { } } } } } layout rather confusing. Fortunately, there is an easier way to write this. The innermost union is of four things which are the size of an int, so the ones which are used by slab/slob/slub can be pulled up two levels to be in the outermost union with 'counters'. That leaves us with struct { union { struct { atomic_t; atomic_t; } } } which has the same layout, but is easier to read. Output from the current git version of pahole, diffed with -uw to ignore the whitespace changes from the indentation: }; /* 16 8 */ union { long unsigned int counters; /* 24 8 */ - struct { - union { - atomic_t _mapcount; /* 24 4 */ unsigned int active; /* 24 4 */ struct { unsigned int inuse:16; /* 24:16 4 */ @@ -21,7 +18,8 @@ unsigned int frozen:1; /* 24: 0 4 */ }; /* 24 4 */ int units; /* 24 4 */ - }; /* 24 4 */ + struct { + atomic_t _mapcount; /* 24 4 */ atomic_t _refcount; /* 28 4 */ }; /* 24 8 */ }; /* 24 8 */ Link: http://lkml.kernel.org/r/20171220155552.15884-3-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 798dffcd798b..5b67f58fec60 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -84,28 +84,26 @@ struct page { */ unsigned counters; #endif - struct { + unsigned int active; /* SLAB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + int units; /* SLOB */ - union { - /* - * Count of ptes mapped in mms, to show when - * page is mapped & limit reverse map searches. - * - * Extra information about page type may be - * stored here for pages that are never mapped, - * in which case the value MUST BE <= -2. - * See page-flags.h for more details. - */ - atomic_t _mapcount; + struct { /* Page cache */ + /* + * Count of ptes mapped in mms, to show when + * page is mapped & limit reverse map searches. + * + * Extra information about page type may be + * stored here for pages that are never mapped, + * in which case the value MUST BE <= -2. + * See page-flags.h for more details. + */ + atomic_t _mapcount; - unsigned int active; /* SLAB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; - int units; /* SLOB */ - }; /* * Usage count, *USE WRAPPER FUNCTION* when manual * accounting. See page_ref.h From 8d14608a85f8dd904ae815c63277960cc2132df2 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:51 -0800 Subject: [PATCH 1226/1640] UPSTREAM: mm: remove misleading alignment claims The "third double word block" isn't on 32-bit systems. The layout looks like this: unsigned long flags; struct address_space *mapping pgoff_t index; atomic_t _mapcount; atomic_t _refcount; which is 32 bytes on 64-bit, but 20 bytes on 32-bit. Nobody is trying to use the fact that it's double-word aligned today, so just remove the misleading claims. Link: http://lkml.kernel.org/r/20171220155552.15884-4-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Christoph Lameter Cc: Michal Hocko Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5b67f58fec60..8bae3627509d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -33,11 +33,11 @@ struct hmm; * a page, though if it is a pagecache page, rmap structures can tell us * who is mapping it. * - * The objects in struct page are organized in double word blocks in - * order to allows us to use atomic double word operations on portions - * of struct page. That is currently only used by slub but the arrangement - * allows the use of atomic double word operations on the flags/mapping - * and lru list pointers also. + * SLUB uses cmpxchg_double() to atomically update its freelist and + * counters. That requires that freelist & counters be adjacent and + * double-word aligned. We align all struct pages to double-word + * boundaries, and ensure that 'freelist' is aligned within the + * struct. 
*/ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) @@ -113,8 +113,6 @@ struct page { }; /* - * Third double word block - * * WARNING: bit 0 of the first word encode PageTail(). That means * the rest users of the storage space MUST NOT use the bit to * avoid collision and false-positive PageTail(). @@ -175,7 +173,6 @@ struct page { #endif }; - /* Remainder is not double word aligned */ union { unsigned long private; /* Mapping-private opaque data: * usually used for buffer_heads From 63c045b2059c3452846ee80bbb255f79fc536aff Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:55 -0800 Subject: [PATCH 1227/1640] UPSTREAM: mm: improve comment on page->mapping The comment on page->mapping is terse, and out of date (it does not mention the possibility of PAGE_MAPPING_MOVABLE). Instead, point the interested reader to page-flags.h where there is a much better comment. Link: http://lkml.kernel.org/r/20171220155552.15884-5-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8bae3627509d..4ca6f853eea4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -50,15 +50,9 @@ struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ union { - struct address_space *mapping; /* If low bit clear, points to - * inode address_space, or NULL. - * If page mapped as anonymous - * memory, low bit is set, and - * it points to anon_vma object - * or KSM private structure. See - * PAGE_MAPPING_ANON and - * PAGE_MAPPING_KSM. - */ + /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ + struct address_space *mapping; + void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ /* page_deferred_list().next -- second tail page */ From 8b29ab6b74aedf48d71867fa866733a8e0f13e42 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:58 -0800 Subject: [PATCH 1228/1640] UPSTREAM: mm: introduce _slub_counter_t Instead of putting the ifdef in the middle of the definition of struct page, pull it forward to the rest of the ifdeffery around the SLUB cmpxchg_double optimisation. Link: http://lkml.kernel.org/r/20171220155552.15884-6-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Cc: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4ca6f853eea4..52a11d12354c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -41,9 +41,15 @@ struct hmm; */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) +#define _slub_counter_t unsigned long #else -#define _struct_page_alignment +#define _slub_counter_t unsigned int #endif +#else /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ +#define _struct_page_alignment +#define _slub_counter_t unsigned int +#endif /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ struct page { /* First double word block */ @@ -66,18 +72,7 @@ struct page { }; union { -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - /* Used for cmpxchg_double in slub */ - unsigned long counters; -#else - /* - * Keep _refcount separate from slub cmpxchg_double data. - * As the rest of the double word is protected by slab_lock - * but _refcount is not. - */ - unsigned counters; -#endif + _slub_counter_t counters; unsigned int active; /* SLAB */ struct { /* SLUB */ unsigned inuse:16; From dac0765eec184355c55f9619dc45f53a4274ae4f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:19:02 -0800 Subject: [PATCH 1229/1640] UPSTREAM: mm: store compound_dtor / compound_order as bytes Neither of these values get even close to 256; compound_dtor is currently at a maximum of 3, and compound_order can't be over 64. No machine has inefficient access to bytes since EV5, and while those are still supported, we don't optimise for them any more. This does not shrink struct page, but it removes an ifdef and frees up 2-6 bytes for future use. diff of pahole output: struct callback_head callback_head; /* 32 16 */ struct { long unsigned int compound_head; /* 32 8 */ - unsigned int compound_dtor; /* 40 4 */ - unsigned int compound_order; /* 44 4 */ + unsigned char compound_dtor; /* 40 1 */ + unsigned char compound_order; /* 41 1 */ }; /* 32 16 */ }; /* 32 16 */ union { [mawilcox@microsoft.com: add comment] Link: http://lkml.kernel.org/r/20171221000144.GB2980@bombadil.infradead.org Link: http://lkml.kernel.org/r/20171220155552.15884-7-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 52a11d12354c..1da4318d8e24 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -136,19 +136,9 @@ struct page { unsigned long compound_head; /* If bit zero is set */ /* First tail page only */ -#ifdef CONFIG_64BIT - /* - * On 64 bit system we have enough space in struct page - * to encode compound_dtor and compound_order with - * unsigned int. It can help compiler generate better or - * smaller code on some archtectures. 
- */ - unsigned int compound_dtor; - unsigned int compound_order; -#else - unsigned short int compound_dtor; - unsigned short int compound_order; -#endif + unsigned char compound_dtor; + unsigned char compound_order; + /* two/six bytes available here */ }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS From fe1661532278083047aaa1b528ad1663a4fcf1f9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:19:06 -0800 Subject: [PATCH 1230/1640] UPSTREAM: mm: document how to use struct page Be really explicit about what bits / bytes are reserved for users that want to store extra information about the pages they allocate. Link: http://lkml.kernel.org/r/20171220155552.15884-8-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Reviewed-by: Randy Dunlap Acked-by: Michal Hocko Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1da4318d8e24..4be1549003b6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -31,7 +31,29 @@ struct hmm; * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us - * who is mapping it. + * who is mapping it. If you allocate the page using alloc_pages(), you + * can use some of the space in struct page for your own purposes. + * + * Pages that were once in the page cache may be found under the RCU lock + * even after they have been recycled to a different purpose. The page + * cache reads and writes some of the fields in struct page to pin the + * page before checking that it's still in the page cache. It is vital + * that all users of struct page: + * 1. Use the first word as PageFlags. + * 2. Clear or preserve bit 0 of page->compound_head. It is used as + * PageTail for compound pages, and the page cache must not see false + * positives. Some users put a pointer here (guaranteed to be at least + * 4-byte aligned), other users avoid using the field altogether. + * 3. page->_refcount must either not be used, or must be used in such a + * way that other CPUs temporarily incrementing and then decrementing the + * refcount does not cause problems. On receiving the page from + * alloc_pages(), the refcount will be positive. + * 4. Either preserve page->_mapcount or restore it to -1 before freeing it. + * + * If you allocate pages of order > 0, you can use the fields in the struct + * page associated with each page, but bear in mind that the pages may have + * been inserted individually into the page cache, so you must use the above + * four fields in a compatible way for each struct page. * * SLUB uses cmpxchg_double() to atomically update its freelist and * counters. That requires that freelist & counters be adjacent and From aa238b53d20695ca76bf8b715facad8fb0c39efc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:19:11 -0800 Subject: [PATCH 1231/1640] UPSTREAM: mm: remove reference to PG_buddy PG_buddy doesn't exist any more. It's called PageBuddy now. Link: http://lkml.kernel.org/r/20171220155552.15884-9-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4be1549003b6..65a3c2b4dc8a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -175,13 +175,13 @@ struct page { }; union { - unsigned long private; /* Mapping-private opaque data: - * usually used for buffer_heads - * if PagePrivate set; used for - * swp_entry_t if PageSwapCache; - * indicates order in the buddy - * system if PG_buddy is set. - */ + /* + * Mapping-private opaque data: + * Usually used for buffer_heads if PagePrivate + * Used for swp_entry_t if PageSwapCache + * Indicates order in the buddy system if PageBuddy + */ + unsigned long private; #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; From 28740c475437629d687016c8ce0f93db4d3989ce Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 5 Apr 2018 16:25:23 -0700 Subject: [PATCH 1232/1640] UPSTREAM: mm: change return type to vm_fault_t The plan for these patches is to introduce the typedef, initially just as documentation ("These functions should return a VM_FAULT_ status"). We'll trickle the patches to individual drivers/filesystems through the maintainers, as far as possible. Then we'll change the typedef to an unsigned int and break the compilation of any unconverted drivers/filesystems. vmf_insert_page(), vmf_insert_mixed() and vmf_insert_pfn() are three newly added functions. The various drivers/filesystems where the return values of fault(), huge_fault(), page_mkwrite() and pfn_mkwrite() get converted will need them. These functions return the correct VM_FAULT_ code based on the err value. We've had bugs before where drivers returned -EFOO. And we have this silly inefficiency where vm_insert_xxx() returns an errno which (afaict) every driver then converts into a VM_FAULT code. In many cases drivers failed to return the correct VM_FAULT code even when vm_insert_xxx() failed. We have identified and cleaned up all those existing bugs and silly inefficiencies in drivers/filesystems by adding these three new inline wrappers. As mentioned above, we will trickle those patches to individual drivers/filesystems through the maintainers after these three wrapper functions are merged. Eventually we can convert vm_insert_xxx() into vmf_insert_xxx() and remove these inline wrappers, but these are a good intermediate step. 
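For illustration, a driver fault handler can then return the wrapper's result directly instead of hand-translating errnos (example_fault() and example_page are hypothetical names, not part of this patch):

static struct page *example_page;	/* assume this was allocated at init time */

static vm_fault_t example_fault(struct vm_fault *vmf)
{
	/* vmf_insert_page() maps -ENOMEM, -EBUSY etc. to VM_FAULT_* codes */
	return vmf_insert_page(vmf->vma, vmf->address, example_page);
}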
Link: http://lkml.kernel.org/r/20180310162351.GA7422@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Acked-by: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 47 ++++++++++++++++++++++++++++++++++++---- include/linux/mm_types.h | 2 ++ 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 627ff9f3765f..4c144755d2d3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -453,17 +453,18 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*split)(struct vm_area_struct * area, unsigned long addr); int (*mremap)(struct vm_area_struct * area); - int (*fault)(struct vm_fault *vmf); - int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); + vm_fault_t (*fault)(struct vm_fault *vmf); + vm_fault_t (*huge_fault)(struct vm_fault *vmf, + enum page_entry_size pe_size); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_fault *vmf); + vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ - int (*pfn_mkwrite)(struct vm_fault *vmf); + vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware @@ -2546,6 +2547,44 @@ int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); +static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + int err = vm_insert_page(vma, addr, page); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} + +static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn) +{ + int err = vm_insert_mixed(vma, addr, pfn); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} + +static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn) +{ + int err = vm_insert_pfn(vma, addr, pfn); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 65a3c2b4dc8a..da1098f10b54 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -22,6 +22,8 @@ #endif #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) +typedef int vm_fault_t; + struct address_space; struct mem_cgroup; struct hmm; From 0c2fe72a99d1698715febbb6574c902127b231ee Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:08:04 -0700 Subject: [PATCH 1233/1640] UPSTREAM: mm: change return type to vm_fault_t Use new return type vm_fault_t for fault handler in struct vm_operations_struct. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. 
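The staging can be sketched as follows; the first typedef is what this series introduces, while the second is only an indication of the later plan (the __bitwise form matches what upstream eventually adopted, noted here as context rather than quoted from this patch):

/* stage 1 (this series): documentation only, ABI-identical to int */
typedef int vm_fault_t;

/* stage 2 (later): a distinct type, so handlers still returning plain int
 * no longer match the vm_operations_struct function-pointer types, and
 * sparse's __bitwise checking catches mixing VM_FAULT_* codes with errnos
 */
typedef unsigned int __bitwise vm_fault_t;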
See commit 1c8f422059ae ("mm: change return type to vm_fault_t") Link: http://lkml.kernel.org/r/20180512063745.GA26866@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Joe Perches Cc: Michal Hocko Cc: Hugh Dickins Cc: Dan Williams Cc: David Rientjes Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 +++--- mm/hugetlb.c | 2 +- mm/mmap.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index da1098f10b54..b82115da008b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -641,9 +641,9 @@ struct vm_special_mapping { * If non-NULL, then this is called to resolve page faults * on the special mapping. If used, .pages is not checked. */ - int (*fault)(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, - struct vm_fault *vmf); + vm_fault_t (*fault)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, + struct vm_fault *vmf); int (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7869c4a63756..e3775096a338 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3209,7 +3209,7 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ -static int hugetlb_vm_op_fault(struct vm_fault *vmf) +static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/mmap.c b/mm/mmap.c index 40074c94c6b6..795a22380a2c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3305,7 +3305,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) mm->data_vm += npages; } -static int special_mapping_fault(struct vm_fault *vmf); +static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Having a close hook prevents vma merging regardless of flags. @@ -3344,7 +3344,7 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = { .fault = special_mapping_fault, }; -static int special_mapping_fault(struct vm_fault *vmf) +static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; From f113669e2f2d69b81f5e35268fac7b643b5b1a4a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:18 -0700 Subject: [PATCH 1234/1640] UPSTREAM: mm: split page_type out from _mapcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We're already using a union of many fields here, so stop abusing the _mapcount and make page_type its own field. That implies renaming some of the machinery that creates PageBuddy, PageBalloon and PageKmemcg; bring back the PG_buddy, PG_balloon and PG_kmemcg names. As suggested by Kirill, make page_type a bitmask. Because it starts out life as -1 (thanks to sharing the storage with _mapcount), setting a page flag means clearing the appropriate bit. This gives us space for probably twenty or so extra bits (depending how paranoid we want to be about _mapcount underflow). Link: http://lkml.kernel.org/r/20180518194519.3820-3-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. 
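For illustration, a standalone userspace sketch (added in editing, not part of the patch) of the inverted encoding described above, using the constants the patch introduces. Because page_type starts life as all-ones (shared storage with _mapcount == -1), "setting" a type clears its bit, and the test compares the reserved high bits against PAGE_TYPE_BASE:

#include <stdio.h>

#define PAGE_TYPE_BASE	0xf0000000
#define PG_buddy	0x00000080

int main(void)
{
	unsigned int page_type = 0xffffffff;	/* fresh page: _mapcount == -1 */

	page_type &= ~PG_buddy;			/* __SetPageBuddy() */
	printf("PageBuddy: %d\n",		/* prints 1 */
	       (page_type & (PAGE_TYPE_BASE | PG_buddy)) == PAGE_TYPE_BASE);

	page_type |= PG_buddy;			/* __ClearPageBuddy() */
	printf("PageBuddy: %d\n",		/* prints 0 */
	       (page_type & (PAGE_TYPE_BASE | PG_buddy)) == PAGE_TYPE_BASE);
	return 0;
}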
Shutemov Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 13 ++++++----- include/linux/page-flags.h | 45 ++++++++++++++++++++++---------------- kernel/crash_core.c | 1 + mm/page_alloc.c | 13 +++++------ scripts/tags.sh | 6 ++--- 5 files changed, 43 insertions(+), 35 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b82115da008b..046a6ddb3151 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -96,6 +96,14 @@ struct page { }; union { + /* + * If the page is neither PageSlab nor mappable to userspace, + * the value stored here may help determine what this page + * is used for. See page-flags.h for a list of page types + * which are currently stored here. + */ + unsigned int page_type; + _slub_counter_t counters; unsigned int active; /* SLAB */ struct { /* SLUB */ @@ -109,11 +117,6 @@ struct page { /* * Count of ptes mapped in mms, to show when * page is mapped & limit reverse map searches. - * - * Extra information about page type may be - * stored here for pages that are never mapped, - * in which case the value MUST BE <= -2. - * See page-flags.h for more details. */ atomic_t _mapcount; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a02794afc4dc..7da4687ca781 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -654,49 +654,56 @@ PAGEFLAG_FALSE(DoubleMap) #endif /* - * For pages that are never mapped to userspace, page->mapcount may be - * used for storing extra information about page type. Any value used - * for this purpose must be <= -2, but it's better start not too close - * to -2 so that an underflow of the page_mapcount() won't be mistaken - * for a special page. + * For pages that are never mapped to userspace (and aren't PageSlab), + * page_type may be used. Because it is initialised to -1, we invert the + * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and + * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and + * low bits so that an underflow or overflow of page_mapcount() won't be + * mistaken for a page type value. */ -#define PAGE_MAPCOUNT_OPS(uname, lname) \ + +#define PAGE_TYPE_BASE 0xf0000000 +/* Reserve 0x0000007f to catch underflows of page_mapcount */ +#define PG_buddy 0x00000080 +#define PG_balloon 0x00000100 +#define PG_kmemcg 0x00000200 + +#define PageType(page, flag) \ + ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + +#define PAGE_TYPE_OPS(uname, lname) \ static __always_inline int Page##uname(struct page *page) \ { \ - return atomic_read(&page->_mapcount) == \ - PAGE_##lname##_MAPCOUNT_VALUE; \ + return PageType(page, PG_##lname); \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); \ - atomic_set(&page->_mapcount, PAGE_##lname##_MAPCOUNT_VALUE); \ + VM_BUG_ON_PAGE(!PageType(page, 0), page); \ + page->page_type &= ~PG_##lname; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ - atomic_set(&page->_mapcount, -1); \ + page->page_type |= PG_##lname; \ } /* - * PageBuddy() indicate that the page is free and in the buddy system + * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). 
*/ -#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) -PAGE_MAPCOUNT_OPS(Buddy, BUDDY) +PAGE_TYPE_OPS(Buddy, buddy) /* - * PageBalloon() is set on pages that are on the balloon page list + * PageBalloon() is true for pages that are on the balloon page list * (see mm/balloon_compaction.c). */ -#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) -PAGE_MAPCOUNT_OPS(Balloon, BALLOON) +PAGE_TYPE_OPS(Balloon, balloon) /* * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. */ -#define PAGE_KMEMCG_MAPCOUNT_VALUE (-512) -PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG) +PAGE_TYPE_OPS(Kmemcg, kmemcg) extern bool is_free_buddy_page(struct page *page); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 2d90996dbe77..1001b581ae8f 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -457,6 +457,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_hwpoison); #endif VMCOREINFO_NUMBER(PG_head_mask); +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 13de09a6c709..0866c6d26d7c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -803,16 +803,14 @@ static inline void rmv_page_order(struct page *page) /* * This function checks whether a page is free && is the buddy - * we can do coalesce a page and its buddy if + * we can coalesce a page and its buddy if * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we set ->_mapcount - * PAGE_BUDDY_MAPCOUNT_VALUE. - * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is - * serialized by zone->lock. + * For recording whether a page is in the buddy system, we set PageBuddy. + * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -857,9 +855,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with _mapcount - * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) - * field. + * free pages of length of (1 << order) and marked with PageBuddy. + * Page's order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. diff --git a/scripts/tags.sh b/scripts/tags.sh index 086341e10aa8..4936366e9bd6 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -195,9 +195,9 @@ regex_c=( '/\ Date: Thu, 7 Jun 2018 17:08:26 -0700 Subject: [PATCH 1235/1640] UPSTREAM: mm: switch s_mem and slab_cache in struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will allow us to store slub's counters in the same bits as slab's s_mem. slub now needs to set page->mapping to NULL as it frees the page, just like slab does. 
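For illustration, a reduced model (added in editing, not part of the patch) of why the NULL assignment is needed: after the swap, slab_cache shares storage with page->mapping, so a slab page must clear it on free or the next user of the page would see a stale kmem_cache pointer where a mapping is expected. The demo_* names below are illustrative only:

/* Simplified model of the union overlap this patch creates */
struct demo_page {
	union {
		struct address_space *mapping;	/* page cache use */
		struct kmem_cache *slab_cache;	/* SL[AU]B use */
	};
};

static void demo_free_slab(struct demo_page *page)
{
	/* slab_cache aliases mapping; NULL it before the page is freed */
	page->mapping = NULL;
}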
Link: http://lkml.kernel.org/r/20180518194519.3820-5-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Christoph Lameter Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 4 ++-- mm/slub.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 046a6ddb3151..a610629cf5ff 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -83,7 +83,7 @@ struct page { /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ struct address_space *mapping; - void *s_mem; /* slab first object */ + struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ atomic_t compound_mapcount; /* first tail page */ /* page_deferred_list().next -- second tail page */ }; @@ -194,7 +194,7 @@ struct page { spinlock_t ptl; #endif #endif - struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ + void *s_mem; /* slab first object */ }; #ifdef CONFIG_MEMCG diff --git a/mm/slub.c b/mm/slub.c index b71c85f98566..215617adc783 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1982,6 +1982,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); page_mapcount_reset(page); + page->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; From eabce47bce017101db962545ca42cd69ccc58e11 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:31 -0700 Subject: [PATCH 1236/1640] UPSTREAM: mm: move 'private' union within struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By moving page->private to the fourth word of struct page, we can put the SLUB counters in the same word as SLAB's s_mem and still do the cmpxchg_double trick. Now the SLUB counters no longer overlap with the mapcount or refcount so we can drop the call to page_mapcount_reset() and simplify set_page_slub_counters() to a single line. Link: http://lkml.kernel.org/r/20180518194519.3820-6-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. 
Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 56 ++++++++++++++++++---------------------- mm/slub.c | 20 ++------------ 2 files changed, 27 insertions(+), 49 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a610629cf5ff..4f22e834278f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -65,15 +65,9 @@ struct hmm; */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) -#define _slub_counter_t unsigned long #else -#define _slub_counter_t unsigned int -#endif -#else /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ #define _struct_page_alignment -#define _slub_counter_t unsigned int -#endif /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ +#endif struct page { /* First double word block */ @@ -95,6 +89,30 @@ struct page { /* page_deferred_list().prev -- second tail page */ }; + union { + /* + * Mapping-private opaque data: + * Usually used for buffer_heads if PagePrivate + * Used for swp_entry_t if PageSwapCache + * Indicates order in the buddy system if PageBuddy + */ + unsigned long private; +#if USE_SPLIT_PTE_PTLOCKS +#if ALLOC_SPLIT_PTLOCKS + spinlock_t *ptl; +#else + spinlock_t ptl; +#endif +#endif + void *s_mem; /* slab first object */ + unsigned long counters; /* SLUB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; + union { /* * If the page is neither PageSlab nor mappable to userspace, @@ -104,13 +122,7 @@ struct page { */ unsigned int page_type; - _slub_counter_t counters; unsigned int active; /* SLAB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; int units; /* SLOB */ struct { /* Page cache */ @@ -179,24 +191,6 @@ struct page { #endif }; - union { - /* - * Mapping-private opaque data: - * Usually used for buffer_heads if PagePrivate - * Used for swp_entry_t if PageSwapCache - * Indicates order in the buddy system if PageBuddy - */ - unsigned long private; -#if USE_SPLIT_PTE_PTLOCKS -#if ALLOC_SPLIT_PTLOCKS - spinlock_t *ptl; -#else - spinlock_t ptl; -#endif -#endif - void *s_mem; /* slab first object */ - }; - #ifdef CONFIG_MEMCG struct mem_cgroup *mem_cgroup; #endif diff --git a/mm/slub.c b/mm/slub.c index 215617adc783..71f92f6fbaae 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -383,21 +383,6 @@ static __always_inline void slab_unlock(struct page *page) __bit_spin_unlock(PG_locked, &page->flags); } -static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) -{ - struct page tmp; - tmp.counters = counters_new; - /* - * page->counters can cover frozen/inuse/objects as well - * as page->_refcount. If we assign to ->counters directly - * we run the risk of losing updates to page->_refcount, so - * be careful and only assign to the fields we need. 
- */ - page->frozen = tmp.frozen; - page->inuse = tmp.inuse; - page->objects = tmp.objects; -} - /* Interrupts must be disabled (for the fallback code to work right) */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, @@ -419,7 +404,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - set_page_slub_counters(page, counters_new); + page->counters = counters_new; slab_unlock(page); return true; } @@ -458,7 +443,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - set_page_slub_counters(page, counters_new); + page->counters = counters_new; slab_unlock(page); local_irq_restore(flags); return true; @@ -1981,7 +1966,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - page_mapcount_reset(page); page->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; From db128daa6e7ad00430cb5ba3c53f0dd3517db830 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:35 -0700 Subject: [PATCH 1237/1640] UPSTREAM: mm: move _refcount out of struct page union MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keeping the refcount in the union only encourages people to put something else in the union which will overlap with _refcount and eventually explode messily. pahole reports no fields change location. Link: http://lkml.kernel.org/r/20180518194519.3820-7-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4f22e834278f..a2fd8bd814fc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -113,7 +113,13 @@ struct page { }; }; - union { + union { /* This union is 4 bytes in size. */ + /* + * If the page can be mapped to userspace, encodes the number + * of times this page is referenced by a page table. + */ + atomic_t _mapcount; + /* * If the page is neither PageSlab nor mappable to userspace, * the value stored here may help determine what this page @@ -124,22 +130,11 @@ struct page { unsigned int active; /* SLAB */ int units; /* SLOB */ - - struct { /* Page cache */ - /* - * Count of ptes mapped in mms, to show when - * page is mapped & limit reverse map searches. - */ - atomic_t _mapcount; - - /* - * Usage count, *USE WRAPPER FUNCTION* when manual - * accounting. See page_ref.h - */ - atomic_t _refcount; - }; }; + /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ + atomic_t _refcount; + /* * WARNING: bit 0 of the first word encode PageTail(). 
That means * the rest users of the storage space MUST NOT use the bit to From 4d41c395125dd3a8a96d87b17b6723650c62a33f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:39 -0700 Subject: [PATCH 1238/1640] UPSTREAM: mm: combine first three unions in struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By combining these three one-word unions into one three-word union, we make it easier for users to add their own multi-word fields to struct page, as well as making it obvious that SLUB needs to keep its double-word alignment for its freelist & counters. No field moves position; verified with pahole. Link: http://lkml.kernel.org/r/20180518194519.3820-8-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 66 ++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a2fd8bd814fc..98a1eabc7598 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -70,46 +70,46 @@ struct hmm; #endif struct page { - /* First double word block */ unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ + /* Three words (12/24 bytes) are available in this union. */ union { - /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ - struct address_space *mapping; - - struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ + struct { /* Page cache and anonymous pages */ + /* See page-flags.h for PAGE_MAPPING_FLAGS */ + struct address_space *mapping; + pgoff_t index; /* Our offset within mapping. */ + /** + * @private: Mapping-private opaque data. + * Usually used for buffer_heads if PagePrivate. + * Used for swp_entry_t if PageSwapCache. + * Indicates order in the buddy system if PageBuddy. + */ + unsigned long private; + }; + struct { /* slab, slob and slub */ + struct kmem_cache *slab_cache; /* not slob */ + /* Double-word boundary */ + void *freelist; /* first free object */ + union { + void *s_mem; /* slab: first object */ + unsigned long counters; /* SLUB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; + }; atomic_t compound_mapcount; /* first tail page */ - /* page_deferred_list().next -- second tail page */ - }; - - /* Second double word */ - union { - pgoff_t index; /* Our offset within mapping. 
*/ - void *freelist; /* sl[aou]b first free object */ - /* page_deferred_list().prev -- second tail page */ - }; - - union { - /* - * Mapping-private opaque data: - * Usually used for buffer_heads if PagePrivate - * Used for swp_entry_t if PageSwapCache - * Indicates order in the buddy system if PageBuddy - */ - unsigned long private; -#if USE_SPLIT_PTE_PTLOCKS + struct list_head deferred_list; /* second tail page */ + struct { /* Page table pages */ + unsigned long _pt_pad_2; /* mapping */ + unsigned long _pt_pad_3; #if ALLOC_SPLIT_PTLOCKS - spinlock_t *ptl; + spinlock_t *ptl; #else - spinlock_t ptl; + spinlock_t ptl; #endif -#endif - void *s_mem; /* slab first object */ - unsigned long counters; /* SLUB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; }; }; From db3186c9d41d880cb31a2906191dd74030484a53 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:46 -0700 Subject: [PATCH 1239/1640] UPSTREAM: mm: move lru union within struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the LRU is two words, this does not affect the double-word alignment of SLUB's freelist. Link: http://lkml.kernel.org/r/20180518194519.3820-10-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 102 +++++++++++++++++++-------------------- mm/slub.c | 8 +-- 2 files changed, 55 insertions(+), 55 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 98a1eabc7598..bab2bbcb8af3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -72,6 +72,57 @@ struct hmm; struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ + /* + * WARNING: bit 0 of the first word encode PageTail(). That means + * the rest users of the storage space MUST NOT use the bit to + * avoid collision and false-positive PageTail(). + */ + union { + struct list_head lru; /* Pageout list, eg. active_list + * protected by zone_lru_lock ! + * Can be used as a generic list + * by the page owner. + */ + struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an + * lru or handled by a slab + * allocator, this points to the + * hosting device page map. + */ + struct { /* slub per cpu partial pages */ + struct page *next; /* Next partial slab */ +#ifdef CONFIG_64BIT + int pages; /* Nr of partial slabs left */ + int pobjects; /* Approximate # of objects */ +#else + short int pages; + short int pobjects; +#endif + }; + + struct rcu_head rcu_head; /* Used by SLAB + * when destroying via RCU + */ + /* Tail pages of compound page */ + struct { + unsigned long compound_head; /* If bit zero is set */ + + /* First tail page only */ + unsigned char compound_dtor; + unsigned char compound_order; + /* two/six bytes available here */ + }; + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS + struct { + unsigned long __pad; /* do not overlay pmd_huge_pte + * with compound_head to avoid + * possible bit 0 collision. + */ + pgtable_t pmd_huge_pte; /* protected by page->ptl */ + }; +#endif + }; + /* Three words (12/24 bytes) are available in this union. */ union { struct { /* Page cache and anonymous pages */ @@ -135,57 +186,6 @@ struct page { /* Usage count. 
*DO NOT USE DIRECTLY*. See page_ref.h */ atomic_t _refcount; - /* - * WARNING: bit 0 of the first word encode PageTail(). That means - * the rest users of the storage space MUST NOT use the bit to - * avoid collision and false-positive PageTail(). - */ - union { - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone_lru_lock ! - * Can be used as a generic list - * by the page owner. - */ - struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an - * lru or handled by a slab - * allocator, this points to the - * hosting device page map. - */ - struct { /* slub per cpu partial pages */ - struct page *next; /* Next partial slab */ -#ifdef CONFIG_64BIT - int pages; /* Nr of partial slabs left */ - int pobjects; /* Approximate # of objects */ -#else - short int pages; - short int pobjects; -#endif - }; - - struct rcu_head rcu_head; /* Used by SLAB - * when destroying via RCU - */ - /* Tail pages of compound page */ - struct { - unsigned long compound_head; /* If bit zero is set */ - - /* First tail page only */ - unsigned char compound_dtor; - unsigned char compound_order; - /* two/six bytes available here */ - }; - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS - struct { - unsigned long __pad; /* do not overlay pmd_huge_pte - * with compound_head to avoid - * possible bit 0 collision. - */ - pgtable_t pmd_huge_pte; /* protected by page->ptl */ - }; -#endif - }; - #ifdef CONFIG_MEMCG struct mem_cgroup *mem_cgroup; #endif diff --git a/mm/slub.c b/mm/slub.c index 71f92f6fbaae..1684c7fa0fa5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -68,11 +68,11 @@ do { \ * and to synchronize major metadata changes to slab cache structures. * * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects the second - * double word in the page struct. Meaning + * have the ability to do a cmpxchg_double. It only protects: * A. page->freelist -> List of object free in a page - * B. page->counters -> Counters of objects - * C. page->frozen -> frozen state + * B. page->inuse -> Number of objects in use + * C. page->objects -> Number of objects in page + * D. page->frozen -> frozen state * * If a slab is frozen then it is exempt from list management. It is not * on any list. The processor that froze the slab is the one who can From 7e2b9520b67a9456fd8b3677a03ca87a7b017954 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:50 -0700 Subject: [PATCH 1240/1640] UPSTREAM: mm: combine LRU and main union in struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives us five words of space in a single union in struct page. The compound_mapcount moves position (from offset 24 to offset 20) on 64-bit systems, but that does not seem likely to cause any trouble. Link: http://lkml.kernel.org/r/20180518194519.3820-11-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. 
Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 97 +++++++++++++++++++--------------------- mm/page_alloc.c | 2 +- 2 files changed, 47 insertions(+), 52 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bab2bbcb8af3..b917c1b31903 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -73,59 +73,19 @@ struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ /* - * WARNING: bit 0 of the first word encode PageTail(). That means - * the rest users of the storage space MUST NOT use the bit to + * Five words (20/40 bytes) are available in this union. + * WARNING: bit 0 of the first word is used for PageTail(). That + * means the other users of this union MUST NOT use the bit to * avoid collision and false-positive PageTail(). */ - union { - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone_lru_lock ! - * Can be used as a generic list - * by the page owner. - */ - struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an - * lru or handled by a slab - * allocator, this points to the - * hosting device page map. - */ - struct { /* slub per cpu partial pages */ - struct page *next; /* Next partial slab */ -#ifdef CONFIG_64BIT - int pages; /* Nr of partial slabs left */ - int pobjects; /* Approximate # of objects */ -#else - short int pages; - short int pobjects; -#endif - }; - - struct rcu_head rcu_head; /* Used by SLAB - * when destroying via RCU - */ - /* Tail pages of compound page */ - struct { - unsigned long compound_head; /* If bit zero is set */ - - /* First tail page only */ - unsigned char compound_dtor; - unsigned char compound_order; - /* two/six bytes available here */ - }; - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS - struct { - unsigned long __pad; /* do not overlay pmd_huge_pte - * with compound_head to avoid - * possible bit 0 collision. - */ - pgtable_t pmd_huge_pte; /* protected by page->ptl */ - }; -#endif - }; - - /* Three words (12/24 bytes) are available in this union. */ union { struct { /* Page cache and anonymous pages */ + /** + * @lru: Pageout list, eg. active_list protected by + * zone_lru_lock. Sometimes used as a generic list + * by the page owner. + */ + struct list_head lru; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; pgoff_t index; /* Our offset within mapping. 
*/ @@ -138,6 +98,19 @@ struct page { unsigned long private; }; struct { /* slab, slob and slub */ + union { + struct list_head slab_list; /* uses lru */ + struct { /* Partial pages */ + struct page *next; +#ifdef CONFIG_64BIT + int pages; /* Nr of pages left */ + int pobjects; /* Approximate count */ +#else + short int pages; + short int pobjects; +#endif + }; + }; struct kmem_cache *slab_cache; /* not slob */ /* Double-word boundary */ void *freelist; /* first free object */ @@ -151,9 +124,22 @@ struct page { }; }; }; - atomic_t compound_mapcount; /* first tail page */ - struct list_head deferred_list; /* second tail page */ + struct { /* Tail pages of compound page */ + unsigned long compound_head; /* Bit zero is set */ + + /* First tail page only */ + unsigned char compound_dtor; + unsigned char compound_order; + atomic_t compound_mapcount; + }; + struct { /* Second tail page of compound page */ + unsigned long _compound_pad_1; /* compound_head */ + unsigned long _compound_pad_2; + struct list_head deferred_list; + }; struct { /* Page table pages */ + unsigned long _pt_pad_1; /* compound_head */ + pgtable_t pmd_huge_pte; /* protected by page->ptl */ unsigned long _pt_pad_2; /* mapping */ unsigned long _pt_pad_3; #if ALLOC_SPLIT_PTLOCKS @@ -162,6 +148,15 @@ struct page { spinlock_t ptl; #endif }; + + /** @rcu_head: You can use this to free a page by RCU. */ + struct rcu_head rcu_head; + + /** + * @pgmap: For ZONE_DEVICE pages, this points to the hosting + * device page map. + */ + struct dev_pagemap *pgmap; }; union { /* This union is 4 bytes in size. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0866c6d26d7c..93dddfd211dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1041,7 +1041,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) } switch (page - head_page) { case 1: - /* the first tail page: ->mapping is compound_mapcount() */ + /* the first tail page: ->mapping may be compound_mapcount() */ if (unlikely(compound_mapcount(page))) { bad_page(page, "nonzero compound_mapcount", 0); goto out; From a1755ea5bffa0eb704066eb90a823bb4cc5fa9dd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:53 -0700 Subject: [PATCH 1241/1640] UPSTREAM: mm: improve struct page documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite the documentation to describe what you can use in struct page rather than what you can't. Link: http://lkml.kernel.org/r/20180518194519.3820-12-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Randy Dunlap Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b917c1b31903..859f0a972f04 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -33,29 +33,27 @@ struct hmm; * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us - * who is mapping it. If you allocate the page using alloc_pages(), you - * can use some of the space in struct page for your own purposes. + * who is mapping it. 
* - * Pages that were once in the page cache may be found under the RCU lock - * even after they have been recycled to a different purpose. The page - * cache reads and writes some of the fields in struct page to pin the - * page before checking that it's still in the page cache. It is vital - * that all users of struct page: - * 1. Use the first word as PageFlags. - * 2. Clear or preserve bit 0 of page->compound_head. It is used as - * PageTail for compound pages, and the page cache must not see false - * positives. Some users put a pointer here (guaranteed to be at least - * 4-byte aligned), other users avoid using the field altogether. - * 3. page->_refcount must either not be used, or must be used in such a - * way that other CPUs temporarily incrementing and then decrementing the - * refcount does not cause problems. On receiving the page from - * alloc_pages(), the refcount will be positive. - * 4. Either preserve page->_mapcount or restore it to -1 before freeing it. + * If you allocate the page using alloc_pages(), you can use some of the + * space in struct page for your own purposes. The five words in the main + * union are available, except for bit 0 of the first word which must be + * kept clear. Many users use this word to store a pointer to an object + * which is guaranteed to be aligned. If you use the same storage as + * page->mapping, you must restore it to NULL before freeing the page. * - * If you allocate pages of order > 0, you can use the fields in the struct - * page associated with each page, but bear in mind that the pages may have - * been inserted individually into the page cache, so you must use the above - * four fields in a compatible way for each struct page. + * If your page will not be mapped to userspace, you can also use the four + * bytes in the mapcount union, but you must call page_mapcount_reset() + * before freeing it. + * + * If you want to use the refcount field, it must be used in such a way + * that other CPUs temporarily incrementing and then decrementing the + * refcount does not cause problems. On receiving the page from + * alloc_pages(), the refcount will be positive. + * + * If you allocate pages of order > 0, you can use some of the fields + * in each subpage, but you may need to restore some of their values + * afterwards. * * SLUB uses cmpxchg_double() to atomically update its freelist and * counters. That requires that freelist & counters be adjacent and From 39afa9cf9a52575c5e3a54173f42703d3dcdb0a5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:57 -0700 Subject: [PATCH 1242/1640] UPSTREAM: mm: add pt_mm to struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For pgd page table pages, x86 overloads the page->index field to store a pointer to the mm_struct. Rename this to pt_mm so it's visible to other users. Link: http://lkml.kernel.org/r/20180518194519.3820-13-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . 
Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pgtable.c | 5 ++--- include/linux/mm_types.h | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 84e5c5bdaa74..e5b62984483e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -114,13 +114,12 @@ static inline void pgd_list_del(pgd_t *pgd) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { - BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); - virt_to_page(pgd)->index = (pgoff_t)mm; + virt_to_page(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { - return (struct mm_struct *)page->index; + return page->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 859f0a972f04..4f07f4815bec 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -139,7 +139,7 @@ struct page { unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ unsigned long _pt_pad_2; /* mapping */ - unsigned long _pt_pad_3; + struct mm_struct *pt_mm; /* x86 pgds only */ #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; #else From 998c10e89fe7c9123526ec9ddf65931670dcd25b Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 15 Nov 2017 17:34:00 -0800 Subject: [PATCH 1243/1640] UPSTREAM: mm/hmm: constify hmm_devmem_page_get_drvdata() parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Constify pointer parameter to avoid issue when use from code that only has const struct page pointer to use in the first place. Link: http://lkml.kernel.org/r/1506972774-10191-1-git-send-email-jglisse@redhat.com Signed-off-by: Ralph Campbell Signed-off-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 7d799c4d2669..805daeee9e94 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -469,9 +469,9 @@ static inline void hmm_devmem_page_set_drvdata(struct page *page, * @page: pointer to struct page * Return: driver data value */ -static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page) +static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) { - unsigned long *drvdata = (unsigned long *)&page->pgmap; + const unsigned long *drvdata = (const unsigned long *)&page->pgmap; return drvdata[1]; } From 26a54435c3c514274e005d77b8eba82e6154db87 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:09:01 -0700 Subject: [PATCH 1244/1640] UPSTREAM: mm: add hmm_data to struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make hmm_data an explicit member of the struct page union. Link: http://lkml.kernel.org/r/20180518194519.3820-14-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . 
Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 8 ++------ include/linux/mm_types.h | 12 ++++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 805daeee9e94..6cacf5155499 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -458,9 +458,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, static inline void hmm_devmem_page_set_drvdata(struct page *page, unsigned long data) { - unsigned long *drvdata = (unsigned long *)&page->pgmap; - - drvdata[1] = data; + page->hmm_data = data; } /* @@ -471,9 +469,7 @@ static inline void hmm_devmem_page_set_drvdata(struct page *page, */ static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) { - const unsigned long *drvdata = (const unsigned long *)&page->pgmap; - - return drvdata[1]; + return page->hmm_data; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4f07f4815bec..7a5022fb0541 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -146,15 +146,15 @@ struct page { spinlock_t ptl; #endif }; + struct { /* ZONE_DEVICE pages */ + /** @pgmap: Points to the hosting device page map. */ + struct dev_pagemap *pgmap; + unsigned long hmm_data; + unsigned long _zd_pad_1; /* uses mapping */ + }; /** @rcu_head: You can use this to free a page by RCU. */ struct rcu_head rcu_head; - - /** - * @pgmap: For ZONE_DEVICE pages, this points to the hosting - * device page map. - */ - struct dev_pagemap *pgmap; }; union { /* This union is 4 bytes in size. */ From f46f558e901a58efd457d7e1c5a81c290688eed5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 13 Feb 2019 02:55:40 +0100 Subject: [PATCH 1245/1640] UPSTREAM: mm: add dma_addr_t to struct page The page_pool API is using page->private to store DMA addresses. As pointed out by David Miller we can't use that on 32-bit architectures with 64-bit DMA This patch adds a new dma_addr_t struct to allow storing DMA addresses Change-Id: I475de16b573d76567869be8939858970e8c0d8de Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Ilias Apalodimas Acked-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/mm_types.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7a5022fb0541..e94d52ab9388 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -95,6 +95,13 @@ struct page { */ unsigned long private; }; + struct { /* page_pool used by netstack */ + /** + * @dma_addr: might require a 64-bit value even on + * 32-bit architectures. + */ + dma_addr_t dma_addr; + }; struct { /* slab, slob and slub */ union { struct list_head slab_list; /* uses lru */ From 285a046ddf16b8ec8fa179604323a65177dec3aa Mon Sep 17 00:00:00 2001 From: Ilias Apalodimas Date: Tue, 18 Jun 2019 15:05:12 +0200 Subject: [PATCH 1246/1640] UPSTREAM: net: page_pool: add helper function to retrieve dma addresses On a previous patch dma addr was stored in 'struct page'. Use that to retrieve DMA addresses used by network drivers Change-Id: Ida4c075b66bd37de87986b817fd350c2ea55760c Signed-off-by: Ilias Apalodimas Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- include/net/page_pool.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 694d055e01ef..b885d86cb7a1 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -132,6 +132,11 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, __page_pool_put_page(pool, page, true); } +static inline dma_addr_t page_pool_get_dma_addr(struct page *page) +{ + return page->dma_addr; +} + static inline bool is_page_pool_compiled_in(void) { #ifdef CONFIG_PAGE_POOL From 601f125c3839186b73a0f540c889ae89a8489c44 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 18 Jun 2019 15:05:27 +0200 Subject: [PATCH 1247/1640] UPSTREAM: xdp: page_pool related fix to cpumap When converting an xdp_frame into an SKB and sending it into the network stack, the underlying XDP memory model needs to release associated resources, because the network stack doesn't have callbacks for XDP memory models. The only memory model that needs this is page_pool, when a driver uses the DMA-mapping feature. Introduce page_pool_release_page(), which basically does the same as page_pool_unmap_page(). Add xdp_release_frame() as the XDP memory model interface for calling it if the memory model matches MEM_TYPE_PAGE_POOL, to save the function call overhead for others. Have cpumap call xdp_release_frame() before xdp_scrub_frame(). Change-Id: I2e93e6e669bb57f62ee93f3cf665810c72a23aa4 Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/page_pool.h | 14 ++++++++++++++ include/net/xdp.h | 15 +++++++++++++++ kernel/bpf/cpumap.c | 3 +++ net/core/xdp.c | 15 +++++++++++++++ 4 files changed, 47 insertions(+) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index b885d86cb7a1..e240fac4c5b9 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -132,6 +132,20 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, __page_pool_put_page(pool, page, true); } +/* Disconnects a page (from a page_pool). API users can have a need + * to disconnect a page (from a page_pool), to allow it to be used as + * a regular page (that will eventually be returned to the normal + * page-allocator via put_page). + */ +void page_pool_unmap_page(struct page_pool *pool, struct page *page); +static inline void page_pool_release_page(struct page_pool *pool, + struct page *page) +{ +#ifdef CONFIG_PAGE_POOL + page_pool_unmap_page(pool, page); +#endif +} + static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { return page->dma_addr; diff --git a/include/net/xdp.h b/include/net/xdp.h index be8ceb836b19..254595f30a70 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -128,6 +128,21 @@ void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); +/* When sending xdp_frame into the network stack, then there is no + * return point callback, which is needed to release e.g. DMA-mapping + * resources with page_pool. Thus, have explicit function to release + * frame resources.
+ */ void __xdp_release_frame(void *data, struct xdp_mem_info *mem); static inline void xdp_release_frame(struct xdp_frame *xdpf) { struct xdp_mem_info *mem = &xdpf->mem; + + /* Curr only page_pool needs this */ + if (mem->type == MEM_TYPE_PAGE_POOL) + __xdp_release_frame(xdpf->data, mem); +} + int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 5a614e62a9f1..d39b34e0384e 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -208,6 +208,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * - RX ring dev queue index (skb_record_rx_queue) */ + /* Until page_pool get SKB return path, release DMA here */ + xdp_release_frame(xdpf); + /* Allow SKB to reuse area used by xdp_frame */ xdp_scrub_frame(xdpf); diff --git a/net/core/xdp.c b/net/core/xdp.c index 1a19e2b69aba..5df56e7eeeba 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -376,6 +376,21 @@ void xdp_return_buff(struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_return_buff); +/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ +void __xdp_release_frame(void *data, struct xdp_mem_info *mem) +{ + struct xdp_mem_allocator *xa; + struct page *page; + + rcu_read_lock(); + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); + if (xa) + page_pool_release_page(xa->page_pool, page); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(__xdp_release_frame); + int xdp_attachment_query(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { From 41dcb524e353b339879566a227f91c0168c9f990 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 2 Feb 2019 01:16:31 +0200 Subject: [PATCH 1248/1640] UPSTREAM: mac80211_hwsim: Support boottime in scan results This makes the age information for cfg80211 scan results more accurate and fixes issues with wpa_supplicant dropping "old" scan results (e.g., "wlan0: Own scan request started a scan in 0.000456 seconds") that looked as if they had been received before the scan started, due to the inaccuracy of the default timing mechanism for calculating the BSS entry age. This makes hwsim test cases significantly more robust to run. Change-Id: I0518fad6cbe0d56a4ce816f021351b1167ddb75e Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index d3e946de86b5..1bad9ea40919 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1343,10 +1343,12 @@ static bool mac80211_hwsim_tx_frame_no_nl(struct ieee80211_hw *hw, * probably doesn't really matter. */ if (ieee80211_is_beacon(hdr->frame_control) || - ieee80211_is_probe_resp(hdr->frame_control)) + ieee80211_is_probe_resp(hdr->frame_control)) { + rx_status.boottime_ns = ktime_get_boot_ns(); now = data->abs_bcn_ts; - else + } else { now = mac80211_hwsim_get_tsf_raw(); + } /* Copy skb to all enabled radios that are on the current frequency */ spin_lock(&hwsim_radio_lock); From fb997644b886597d70957343d35971e340506215 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Jul 2018 14:41:00 +0200 Subject: [PATCH 1249/1640] UPSTREAM: drm/amdkfd: use modern ktime accessors getrawmonotonic64() and get_monotonic_boottime64() are deprecated because of the nonstandard naming.
The replacement functions ktime_get_raw_ns() and ktime_get_boot_ns() also simplify the callers. Reviewed-by: Felix Kuehling . Change-Id: I19131a401f765c1969058d7b780d725043b7dd6b Signed-off-by: Arnd Bergmann Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 8a05efa7edf0..a8b56f6495c3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -713,7 +713,6 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, { struct kfd_ioctl_get_clock_counters_args *args = data; struct kfd_dev *dev; - struct timespec64 time; dev = kfd_device_by_id(args->gpu_id); if (dev) @@ -725,11 +724,8 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, args->gpu_clock_counter = 0; /* No access to rdtsc. Using raw monotonic time */ - getrawmonotonic64(&time); - args->cpu_clock_counter = (uint64_t)timespec64_to_ns(&time); - - get_monotonic_boottime64(&time); - args->system_clock_counter = (uint64_t)timespec64_to_ns(&time); + args->cpu_clock_counter = ktime_get_raw_ns(); + args->system_clock_counter = ktime_get_boot_ns(); /* Since the counter is in nano-seconds we use 1GHz frequency */ args->system_clock_freq = 1000000000; From 93694edc8a108ec921596f7ea72e85394a779b7a Mon Sep 17 00:00:00 2001 From: Harald Geyer Date: Sun, 11 Feb 2018 11:09:40 +0000 Subject: [PATCH 1250/1640] UPSTREAM: iio: dht11: Improve detection of sensor type The old code was based on a DHT11 datasheet which specifies a measurement range of 20%-90% RH. Turns out the sensor actually reports values outside this range, so we should support it as far as possible. Reported-by: Edward Attfield Change-Id: If164e31473125a5f97a885e35b917de0bd0c246a Signed-off-by: Harald Geyer Signed-off-by: Jonathan Cameron --- drivers/iio/humidity/dht11.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/humidity/dht11.c b/drivers/iio/humidity/dht11.c index 2a22ad920333..7ccabcd5614a 100644 --- a/drivers/iio/humidity/dht11.c +++ b/drivers/iio/humidity/dht11.c @@ -159,7 +159,7 @@ static int dht11_decode(struct dht11 *dht11, int offset) } dht11->timestamp = ktime_get_boot_ns(); - if (hum_int < 20) { /* DHT22 */ + if (hum_int < 4) { /* DHT22: 100000 = (3*256+232)*100 */ dht11->temperature = (((temp_int & 0x7f) << 8) + temp_dec) * ((temp_int & 0x80) ? -100 : 100); dht11->humidity = ((hum_int << 8) + hum_dec) * 100; From 6da1fb2e5b17eaf6427bba965e6e34ddfcb91162 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 27 Apr 2018 15:40:15 +0200 Subject: [PATCH 1251/1640] UPSTREAM: timekeeping: Add ktime_get_coarse_with_offset I have run into a couple of drivers using current_kernel_time() suffering from the y2038 problem, and they could be converted to using ktime_t, but don't have interfaces that skip the nanosecond calculation at the moment. This introduces ktime_get_coarse_with_offset() as a simpler variant of ktime_get_with_offset(), and adds wrappers for the three time domains we support with the existing function. 
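For illustration, a minimal sketch (added in editing, not part of the patch) of a caller that only needs tick-granularity timestamps; demo_stamp_event() is a hypothetical function, while ktime_get_coarse_boottime() is one of the three wrappers added below:

#include <linux/timekeeping.h>

/* Hypothetical caller: tick-resolution boottime stamp, no ns readout */
static u64 demo_stamp_event(void)
{
	ktime_t now = ktime_get_coarse_boottime();

	return ktime_to_ns(now);
}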
Change-Id: Ieedd1596482ce9c740a820d92f957f19a486e15f Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Stephen Boyd Cc: y2038@lists.linaro.org Cc: John Stultz Link: https://lkml.kernel.org/r/20180427134016.2525989-5-arnd@arndb.de --- include/linux/timekeeping.h | 16 ++++++++++++++++ kernel/time/timekeeping.c | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 0021575fe871..3045aee98b06 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -173,6 +173,7 @@ enum tk_offsets { extern ktime_t ktime_get(void); extern ktime_t ktime_get_with_offset(enum tk_offsets offs); +extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs); extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs); extern ktime_t ktime_get_raw(void); extern u32 ktime_get_resolution_ns(void); @@ -185,6 +186,11 @@ static inline ktime_t ktime_get_real(void) return ktime_get_with_offset(TK_OFFS_REAL); } +static inline ktime_t ktime_get_coarse_real(void) +{ + return ktime_get_coarse_with_offset(TK_OFFS_REAL); +} + /** * ktime_get_boottime - Returns monotonic time since boot in ktime_t format * @@ -196,6 +202,11 @@ static inline ktime_t ktime_get_boottime(void) return ktime_get_with_offset(TK_OFFS_BOOT); } +static inline ktime_t ktime_get_coarse_boottime(void) +{ + return ktime_get_coarse_with_offset(TK_OFFS_BOOT); +} + /** * ktime_get_clocktai - Returns the TAI time of day in ktime_t format */ @@ -204,6 +215,11 @@ static inline ktime_t ktime_get_clocktai(void) return ktime_get_with_offset(TK_OFFS_TAI); } +static inline ktime_t ktime_get_coarse_clocktai(void) +{ + return ktime_get_coarse_with_offset(TK_OFFS_TAI); +} + /** * ktime_mono_to_real - Convert monotonic time to clock realtime */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6c14e0389ddb..d4cb7152ba5b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -816,6 +816,25 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) } EXPORT_SYMBOL_GPL(ktime_get_with_offset); +ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base, *offset = offsets[offs]; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = ktime_add(tk->tkr_mono.base, *offset); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return base; + +} +EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); + /** * ktime_mono_to_any() - convert mononotic time to any other time * @tmono: time to convert. From 45386e68b8bbdf36bbc0b466fb7ec846ce3c7b28 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:49 +0200 Subject: [PATCH 1252/1640] UPSTREAM: time: Move old timekeeping interfaces to timekeeping32.h The interfaces based on 'struct timespec' and 'unsigned long' seconds are no longer recommended for new code, and we are trying to migrate to ktime_t based interfaces and other y2038-safe variants. This moves all the legacy interfaces from linux/timekeeping.h into a new timekeeping32.h to better document this. 
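For illustration, a sketch (added in editing, not part of the patch) of the kind of conversion this split is meant to encourage; the demo_*() wrappers are hypothetical, while getnstimeofday() and ktime_get_real_ts64() are the interfaces shown in the diff below:

#include <linux/timekeeping.h>

/* Legacy, y2038-unsafe on 32-bit; now lives in timekeeping32.h */
static void demo_old(struct timespec *ts)
{
	getnstimeofday(ts);
}

/* Replacement: timespec64-based, y2038-safe */
static void demo_new(struct timespec64 *ts)
{
	ktime_get_real_ts64(ts);
}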
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Change-Id: I0b3bb1e7f3404f356dba7f2b9714ec0e3937d142 Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- include/linux/ktime.h | 1 + include/linux/timekeeping.h | 137 +----------------------------- include/linux/timekeeping32.h | 151 ++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+), 135 deletions(-) create mode 100644 include/linux/timekeeping32.h diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 0c8bd45c8206..5b9fddbaac41 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -270,5 +270,6 @@ static inline ktime_t ms_to_ktime(u64 ms) } # include +# include #endif diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 3045aee98b06..50a79eb8c607 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -16,27 +16,16 @@ extern void xtime_update(unsigned long ticks); /* * Get and set timeofday */ -extern void do_gettimeofday(struct timeval *tv); extern int do_settimeofday64(const struct timespec64 *ts); extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); /* * Kernel time accessors */ -unsigned long get_seconds(void); struct timespec64 current_kernel_time64(void); -/* does not take xtime_lock */ -struct timespec __current_kernel_time(void); - -static inline struct timespec current_kernel_time(void) -{ - struct timespec64 now = current_kernel_time64(); - - return timespec64_to_timespec(now); -} /* - * timespec based interfaces + * timespec64 based interfaces */ struct timespec64 get_monotonic_coarse64(void); extern void getrawmonotonic64(struct timespec64 *ts); @@ -48,116 +37,6 @@ extern int __getnstimeofday64(struct timespec64 *tv); extern void getnstimeofday64(struct timespec64 *tv); extern void getboottime64(struct timespec64 *ts); -#if BITS_PER_LONG == 64 -/** - * Deprecated. Use do_settimeofday64(). - */ -static inline int do_settimeofday(const struct timespec *ts) -{ - return do_settimeofday64(ts); -} - -static inline int __getnstimeofday(struct timespec *ts) -{ - return __getnstimeofday64(ts); -} - -static inline void getnstimeofday(struct timespec *ts) -{ - getnstimeofday64(ts); -} - -static inline void ktime_get_ts(struct timespec *ts) -{ - ktime_get_ts64(ts); -} - -static inline void ktime_get_real_ts(struct timespec *ts) -{ - getnstimeofday64(ts); -} - -static inline void getrawmonotonic(struct timespec *ts) -{ - getrawmonotonic64(ts); -} - -static inline struct timespec get_monotonic_coarse(void) -{ - return get_monotonic_coarse64(); -} - -static inline void getboottime(struct timespec *ts) -{ - return getboottime64(ts); -} -#else -/** - * Deprecated. Use do_settimeofday64(). 
- */ -static inline int do_settimeofday(const struct timespec *ts) -{ - struct timespec64 ts64; - - ts64 = timespec_to_timespec64(*ts); - return do_settimeofday64(&ts64); -} - -static inline int __getnstimeofday(struct timespec *ts) -{ - struct timespec64 ts64; - int ret = __getnstimeofday64(&ts64); - - *ts = timespec64_to_timespec(ts64); - return ret; -} - -static inline void getnstimeofday(struct timespec *ts) -{ - struct timespec64 ts64; - - getnstimeofday64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void ktime_get_ts(struct timespec *ts) -{ - struct timespec64 ts64; - - ktime_get_ts64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void ktime_get_real_ts(struct timespec *ts) -{ - struct timespec64 ts64; - - getnstimeofday64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void getrawmonotonic(struct timespec *ts) -{ - struct timespec64 ts64; - - getrawmonotonic64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline struct timespec get_monotonic_coarse(void) -{ - return timespec64_to_timespec(get_monotonic_coarse64()); -} - -static inline void getboottime(struct timespec *ts) -{ - struct timespec64 ts64; - - getboottime64(&ts64); - *ts = timespec64_to_timespec(ts64); -} -#endif - #define ktime_get_real_ts64(ts) getnstimeofday64(ts) /* @@ -258,23 +137,13 @@ extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_boot_fast_ns(void); /* - * Timespec interfaces utilizing the ktime based ones + * timespec64 interfaces utilizing the ktime based ones */ -static inline void get_monotonic_boottime(struct timespec *ts) -{ - *ts = ktime_to_timespec(ktime_get_boottime()); -} - static inline void get_monotonic_boottime64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_boottime()); } -static inline void timekeeping_clocktai(struct timespec *ts) -{ - *ts = ktime_to_timespec(ktime_get_clocktai()); -} - static inline void timekeeping_clocktai64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_clocktai()); @@ -357,10 +226,8 @@ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); */ extern int persistent_clock_is_local; -extern void read_persistent_clock(struct timespec *ts); extern void read_persistent_clock64(struct timespec64 *ts); extern void read_boot_clock64(struct timespec64 *ts); -extern int update_persistent_clock(struct timespec now); extern int update_persistent_clock64(struct timespec64 now); diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h new file mode 100644 index 000000000000..af4114d5dc17 --- /dev/null +++ b/include/linux/timekeeping32.h @@ -0,0 +1,151 @@ +#ifndef _LINUX_TIMEKEEPING32_H +#define _LINUX_TIMEKEEPING32_H +/* + * These interfaces are all based on the old timespec type + * and should get replaced with the timespec64 based versions + * over time so we can remove the file here. + */ + +extern void do_gettimeofday(struct timeval *tv); +unsigned long get_seconds(void); + +/* does not take xtime_lock */ +struct timespec __current_kernel_time(void); + +static inline struct timespec current_kernel_time(void) +{ + struct timespec64 now = current_kernel_time64(); + + return timespec64_to_timespec(now); +} + +#if BITS_PER_LONG == 64 +/** + * Deprecated. Use do_settimeofday64(). 
+ */ +static inline int do_settimeofday(const struct timespec *ts) +{ + return do_settimeofday64(ts); +} + +static inline int __getnstimeofday(struct timespec *ts) +{ + return __getnstimeofday64(ts); +} + +static inline void getnstimeofday(struct timespec *ts) +{ + getnstimeofday64(ts); +} + +static inline void ktime_get_ts(struct timespec *ts) +{ + ktime_get_ts64(ts); +} + +static inline void ktime_get_real_ts(struct timespec *ts) +{ + getnstimeofday64(ts); +} + +static inline void getrawmonotonic(struct timespec *ts) +{ + getrawmonotonic64(ts); +} + +static inline struct timespec get_monotonic_coarse(void) +{ + return get_monotonic_coarse64(); +} + +static inline void getboottime(struct timespec *ts) +{ + return getboottime64(ts); +} +#else +/** + * Deprecated. Use do_settimeofday64(). + */ +static inline int do_settimeofday(const struct timespec *ts) +{ + struct timespec64 ts64; + + ts64 = timespec_to_timespec64(*ts); + return do_settimeofday64(&ts64); +} + +static inline int __getnstimeofday(struct timespec *ts) +{ + struct timespec64 ts64; + int ret = __getnstimeofday64(&ts64); + + *ts = timespec64_to_timespec(ts64); + return ret; +} + +static inline void getnstimeofday(struct timespec *ts) +{ + struct timespec64 ts64; + + getnstimeofday64(&ts64); + *ts = timespec64_to_timespec(ts64); +} + +static inline void ktime_get_ts(struct timespec *ts) +{ + struct timespec64 ts64; + + ktime_get_ts64(&ts64); + *ts = timespec64_to_timespec(ts64); +} + +static inline void ktime_get_real_ts(struct timespec *ts) +{ + struct timespec64 ts64; + + getnstimeofday64(&ts64); + *ts = timespec64_to_timespec(ts64); +} + +static inline void getrawmonotonic(struct timespec *ts) +{ + struct timespec64 ts64; + + getrawmonotonic64(&ts64); + *ts = timespec64_to_timespec(ts64); +} + +static inline struct timespec get_monotonic_coarse(void) +{ + return timespec64_to_timespec(get_monotonic_coarse64()); +} + +static inline void getboottime(struct timespec *ts) +{ + struct timespec64 ts64; + + getboottime64(&ts64); + *ts = timespec64_to_timespec(ts64); +} +#endif + +/* + * Timespec interfaces utilizing the ktime based ones + */ +static inline void get_monotonic_boottime(struct timespec *ts) +{ + *ts = ktime_to_timespec(ktime_get_boottime()); +} + +static inline void timekeeping_clocktai(struct timespec *ts) +{ + *ts = ktime_to_timespec(ktime_get_clocktai()); +} + +/* + * Persistent clock related interfaces + */ +extern void read_persistent_clock(struct timespec *ts); +extern int update_persistent_clock(struct timespec now); + +#endif From ab623d4d783ced70aeff2e25fbebf4ed1d604f79 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:44 +0200 Subject: [PATCH 1253/1640] BACKPORT: timekeeping: Consolidate timekeeping_inject_offset code The code to check the adjtimex() or clock_adjtime() arguments is spread out across multiple files for presumably only historic reasons. As a preparatation for a rework to get rid of the use of 'struct timeval' and 'struct timespec' in there, this moves all the portions into kernel/time/timekeeping.c and marks them as 'static'. The warp_clock() function here is not as closely related as the others, but I feel it still makes sense to move it here in order to consolidate all callers of timekeeping_inject_offset(). 
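For context, the consolidated checks guard the ADJ_SETOFFSET path that user space reaches via adjtimex()/clock_adjtime(). A minimal illustrative caller (error handling omitted, requires CAP_SYS_TIME):

	#include <sys/timex.h>

	struct timex txc = { 0 };

	txc.modes = ADJ_SETOFFSET | ADJ_NANO;
	txc.time.tv_sec = 0;
	txc.time.tv_usec = 500000000;	/* with ADJ_NANO, tv_usec carries nanoseconds */
	adjtimex(&txc);			/* steps the clock forward by 0.5s */

The moved timespec_inject_offset_valid()/timeval_inject_offset_valid() helpers are what reject out-of-range tv_usec/tv_nsec values on this path.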
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Change-Id: Ib268add953c41504b8a88067c18b4086fa37beb8 Signed-off-by: Arnd Bergmann [jstultz: Whitespace fixup] Signed-off-by: John Stultz --- include/linux/time.h | 26 -------- kernel/time/ntp.c | 61 ------------------- kernel/time/ntp_internal.h | 1 - kernel/time/time.c | 36 +---------- kernel/time/timekeeping.c | 122 ++++++++++++++++++++++++++++++++++++- kernel/time/timekeeping.h | 2 +- 6 files changed, 122 insertions(+), 126 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index 21086c5143d9..d5e5ecb231f9 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -135,32 +135,6 @@ static inline bool timeval_valid(const struct timeval *tv) extern struct timespec timespec_trunc(struct timespec t, unsigned gran); -/* - * Validates if a timespec/timeval used to inject a time offset is valid. - * Offsets can be postive or negative. The value of the timeval/timespec - * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must - * always be non-negative. - */ -static inline bool timeval_inject_offset_valid(const struct timeval *tv) -{ - /* We don't check the tv_sec as it can be positive or negative */ - - /* Can't have more microseconds then a second */ - if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) - return false; - return true; -} - -static inline bool timespec_inject_offset_valid(const struct timespec *ts) -{ - /* We don't check the tv_sec as it can be positive or negative */ - - /* Can't have more nanoseconds then a second */ - if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) - return false; - return true; -} - /* Some architectures do not supply their own clocksource. * This is mainly the case in architectures that get their * inter-tick times by reading the counter on their interval diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 64443ce1d28e..b5fd65049aa6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -655,67 +655,6 @@ static inline void process_adjtimex_modes(struct timex *txc, } - -/** - * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex - */ -int ntp_validate_timex(struct timex *txc) -{ - if (txc->modes & ADJ_ADJTIME) { - /* singleshot must not be used with any other mode bits */ - if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) - return -EINVAL; - if (!(txc->modes & ADJ_OFFSET_READONLY) && - !capable(CAP_SYS_TIME)) - return -EPERM; - } else { - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - /* - * if the quartz is off by more than 10% then - * something is VERY wrong! - */ - if (txc->modes & ADJ_TICK && - (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ)) - return -EINVAL; - } - - if (txc->modes & ADJ_SETOFFSET) { - /* In order to inject time, you gotta be super-user! 
*/ - if (!capable(CAP_SYS_TIME)) - return -EPERM; - - if (txc->modes & ADJ_NANO) { - struct timespec ts; - - ts.tv_sec = txc->time.tv_sec; - ts.tv_nsec = txc->time.tv_usec; - if (!timespec_inject_offset_valid(&ts)) - return -EINVAL; - - } else { - if (!timeval_inject_offset_valid(&txc->time)) - return -EINVAL; - } - } - - /* - * Check for potential multiplication overflows that can - * only happen on 64-bit systems: - */ - if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { - if (LLONG_MIN / PPM_SCALE > txc->freq) - return -EINVAL; - if (LLONG_MAX / PPM_SCALE < txc->freq) - return -EINVAL; - } - - return 0; -} - - /* * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 0a53e6ea47b1..909bd1f1bfb1 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,7 +8,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec64 *, const struct timespec64 *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/time.c b/kernel/time/time.c index 319935af02fb..4aede6a56c20 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -158,40 +158,6 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, return 0; } -/* - * Indicates if there is an offset between the system clock and the hardware - * clock/persistent clock/rtc. - */ -int persistent_clock_is_local; - -/* - * Adjust the time obtained from the CMOS to be UTC time instead of - * local time. - * - * This is ugly, but preferable to the alternatives. Otherwise we - * would either need to write a program to do it in /etc/rc (and risk - * confusion if the program gets run more than once; it would also be - * hard to make the program warp the clock precisely n hours) or - * compile in the timezone information into the kernel. Bad, bad.... - * - * - TYT, 1992-01-01 - * - * The best thing to do is to keep the CMOS clock in universal time (UTC) - * as real UNIX machines always do it. This avoids all headaches about - * daylight saving times and warping kernel clocks. - */ -static inline void warp_clock(void) -{ - if (sys_tz.tz_minuteswest != 0) { - struct timespec adjust; - - persistent_clock_is_local = 1; - adjust.tv_sec = sys_tz.tz_minuteswest * 60; - adjust.tv_nsec = 0; - timekeeping_inject_offset(&adjust); - } -} - /* * In case for some reason the CMOS clock has not already been running * in UTC, but in some local time: The first time we set the timezone, @@ -225,7 +191,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz if (firsttime) { firsttime = 0; if (!tv) - warp_clock(); + timekeeping_warp_clock(); } } if (tv) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d4cb7152ba5b..c7f7d4da647c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1282,13 +1282,39 @@ out: } EXPORT_SYMBOL(do_settimeofday64); +/* + * Validates if a timespec/timeval used to inject a time offset is valid. + * Offsets can be postive or negative. The value of the timeval/timespec + * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must + * always be non-negative. 
+ */ +static inline bool timeval_inject_offset_valid(const struct timeval *tv) +{ + /* We don't check the tv_sec as it can be positive or negative */ + + /* Can't have more microseconds then a second */ + if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) + return false; + return true; +} + +static inline bool timespec_inject_offset_valid(const struct timespec *ts) +{ + /* We don't check the tv_sec as it can be positive or negative */ + + /* Can't have more nanoseconds then a second */ + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) + return false; + return true; +} + /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @tv: pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ -int timekeeping_inject_offset(struct timespec *ts) +static int timekeeping_inject_offset(struct timespec *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; @@ -1327,7 +1353,40 @@ error: /* even if we error out, we forwarded the time, so call update */ return ret; } -EXPORT_SYMBOL(timekeeping_inject_offset); + +/* + * Indicates if there is an offset between the system clock and the hardware + * clock/persistent clock/rtc. + */ +int persistent_clock_is_local; + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... + * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. + */ +void timekeeping_warp_clock(void) +{ + if (sys_tz.tz_minuteswest != 0) { + struct timespec adjust; + + persistent_clock_is_local = 1; + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); + } +} /** * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic @@ -2283,6 +2342,65 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, return base; } +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex + */ +static int ntp_validate_timex(struct timex *txc) +{ + if (txc->modes & ADJ_ADJTIME) { + /* singleshot must not be used with any other mode bits */ + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) + return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + /* + * if the quartz is off by more than 10% then + * something is VERY wrong! + */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + } + + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! 
*/ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } + } + + /* + * Check for potential multiplication overflows that can + * only happen on 64-bit systems: + */ + if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { + if (LLONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LLONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + + return 0; +} + /** * random_get_entropy_fallback - Returns the raw clock source value, * used by random.c for platforms with no valid random_get_entropy(). diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index c9f9af339914..7a9b4eb7a1d5 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -11,7 +11,7 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); -extern int timekeeping_inject_offset(struct timespec *ts); +extern void timekeeping_warp_clock(void); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); From 36ffe3f67ba22d56b32e714c2b3ae8dde3bf0a95 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:46 +0200 Subject: [PATCH 1254/1640] UPSTREAM: time: Remove unused functions The (slow but) ongoing work on conversion from timespec to timespec64 has led some timespec based helper functions to become unused. No new code should use them, so we can remove the functions entirely. I'm planning to obsolete additional interfaces next and remove more of these. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Change-Id: Ib2c46b76e8186b0f0d06252ccc328884bfb359e9 Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- include/linux/time.h | 18 ------------------ include/linux/time64.h | 28 ---------------------------- kernel/time/time.c | 18 ------------------ 3 files changed, 64 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index d5e5ecb231f9..9a20e4ffd06c 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -40,15 +40,6 @@ static inline int timespec_compare(const struct timespec *lhs, const struct time return lhs->tv_nsec - rhs->tv_nsec; } -static inline int timeval_compare(const struct timeval *lhs, const struct timeval *rhs) -{ - if (lhs->tv_sec < rhs->tv_sec) - return -1; - if (lhs->tv_sec > rhs->tv_sec) - return 1; - return lhs->tv_usec - rhs->tv_usec; -} - extern time64_t mktime64(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec); @@ -66,15 +57,6 @@ static inline unsigned long mktime(const unsigned int year, extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); -/* - * timespec_add_safe assumes both values are positive and checks - * for overflow. It will return TIME_T_MAX if the reutrn would be - * smaller then either of the arguments. 
- */ -extern struct timespec timespec_add_safe(const struct timespec lhs, - const struct timespec rhs); - - static inline struct timespec timespec_add(struct timespec lhs, struct timespec rhs) { diff --git a/include/linux/time64.h b/include/linux/time64.h index 99ab4a686c30..dea5ed22429c 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -54,16 +54,6 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) return ts; } -static inline struct itimerspec itimerspec64_to_itimerspec(struct itimerspec64 *its64) -{ - return *its64; -} - -static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *its) -{ - return *its; -} - # define timespec64_equal timespec_equal # define timespec64_compare timespec_compare # define set_normalized_timespec64 set_normalized_timespec @@ -95,24 +85,6 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) return ret; } -static inline struct itimerspec itimerspec64_to_itimerspec(struct itimerspec64 *its64) -{ - struct itimerspec ret; - - ret.it_interval = timespec64_to_timespec(its64->it_interval); - ret.it_value = timespec64_to_timespec(its64->it_value); - return ret; -} - -static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *its) -{ - struct itimerspec64 ret; - - ret.it_interval = timespec_to_timespec64(its->it_interval); - ret.it_value = timespec_to_timespec64(its->it_value); - return ret; -} - static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { diff --git a/kernel/time/time.c b/kernel/time/time.c index 4aede6a56c20..7acef179c9f1 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -820,24 +820,6 @@ unsigned long nsecs_to_jiffies(u64 n) } EXPORT_SYMBOL_GPL(nsecs_to_jiffies); -/* - * Add two timespec values and do a safety check for overflow. - * It's assumed that both values are valid (>= 0) - */ -struct timespec timespec_add_safe(const struct timespec lhs, - const struct timespec rhs) -{ - struct timespec res; - - set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, - lhs.tv_nsec + rhs.tv_nsec); - - if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) - res.tv_sec = TIME_T_MAX; - - return res; -} - /* * Add two timespec64 values and do a safety check for overflow. * It's assumed that both values are valid (>= 0). From 468e0c9024b88f5df4e95f22dedd4b2af2079503 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:47 +0200 Subject: [PATCH 1255/1640] UPSTREAM: time: Move time_t based interfaces to time32.h Interfaces based on 'struct timespec' or 'struct timeval' should no longer be used for new code, which can use either ktime_t or 'struct timespec64' instead. To make this a little clearer, this moves the various helpers into a new time32.h header. For the moment, this gets included by the normal time.h, but we may be able to separate it entirely when most users of time32.h are gone. Individual helpers in the new file can get removed once they become unused in the future. Since the contents of time32.h look a lot like what's in time64.h, I'm reordering them during the move to make them more similar, and to allow a follow-up patch to redirect the 'timespec' based functions to thei 'timespec64' based counterparts on 64-bit architectures later. 
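A hedged example of the migration this separation encourages (variable names are illustrative):

	/* old time32.h helpers, overflow in 2038 on 32-bit: */
	struct timespec delta = timespec_sub(end, start);
	s64 ns = timespec_to_ns(&delta);

	/* time64.h counterparts: */
	struct timespec64 delta64 = timespec64_sub(end64, start64);
	s64 ns64 = timespec64_to_ns(&delta64);

Each helper in time32.h can then be deleted as its last such caller is converted.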
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Change-Id: I10bbc02d2639115fec39d34480ace5b00dc3c8d2 Signed-off-by: Arnd Bergmann [jstultz: Whitespace & checkpatch fixups] Signed-off-by: John Stultz --- include/linux/time.h | 163 +------------------------------------- include/linux/time32.h | 176 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 162 deletions(-) create mode 100644 include/linux/time32.h diff --git a/include/linux/time.h b/include/linux/time.h index 9a20e4ffd06c..9f1501f83e42 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -18,105 +18,10 @@ int get_itimerspec64(struct itimerspec64 *it, int put_itimerspec64(const struct itimerspec64 *it, struct itimerspec __user *uit); -#define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) - -static inline int timespec_equal(const struct timespec *a, - const struct timespec *b) -{ - return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); -} - -/* - * lhs < rhs: return <0 - * lhs == rhs: return 0 - * lhs > rhs: return >0 - */ -static inline int timespec_compare(const struct timespec *lhs, const struct timespec *rhs) -{ - if (lhs->tv_sec < rhs->tv_sec) - return -1; - if (lhs->tv_sec > rhs->tv_sec) - return 1; - return lhs->tv_nsec - rhs->tv_nsec; -} - extern time64_t mktime64(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec); -/** - * Deprecated. Use mktime64(). - */ -static inline unsigned long mktime(const unsigned int year, - const unsigned int mon, const unsigned int day, - const unsigned int hour, const unsigned int min, - const unsigned int sec) -{ - return mktime64(year, mon, day, hour, min, sec); -} - -extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); - -static inline struct timespec timespec_add(struct timespec lhs, - struct timespec rhs) -{ - struct timespec ts_delta; - set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec, - lhs.tv_nsec + rhs.tv_nsec); - return ts_delta; -} - -/* - * sub = lhs - rhs, in normalized form - */ -static inline struct timespec timespec_sub(struct timespec lhs, - struct timespec rhs) -{ - struct timespec ts_delta; - set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec, - lhs.tv_nsec - rhs.tv_nsec); - return ts_delta; -} - -/* - * Returns true if the timespec is norm, false if denorm: - */ -static inline bool timespec_valid(const struct timespec *ts) -{ - /* Dates before 1970 are bogus */ - if (ts->tv_sec < 0) - return false; - /* Can't have more nanoseconds then a second */ - if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) - return false; - return true; -} - -static inline bool timespec_valid_strict(const struct timespec *ts) -{ - if (!timespec_valid(ts)) - return false; - /* Disallow values that could overflow ktime_t */ - if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) - return false; - return true; -} - -static inline bool timeval_valid(const struct timeval *tv) -{ - /* Dates before 1970 are bogus */ - if (tv->tv_sec < 0) - return false; - - /* Can't have more microseconds then a second */ - if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) - return false; - - return true; -} - -extern struct timespec timespec_trunc(struct timespec t, unsigned gran); - /* Some architectures do not supply their own clocksource. 
* This is mainly the case in architectures that get their * inter-tick times by reading the counter on their interval @@ -165,73 +70,7 @@ struct tm { void time64_to_tm(time64_t totalsecs, int offset, struct tm *result); -/** - * time_to_tm - converts the calendar time to local broken-down time - * - * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, - * Coordinated Universal Time (UTC). - * @offset offset seconds adding to totalsecs. - * @result pointer to struct tm variable to receive broken-down time - */ -static inline void time_to_tm(time_t totalsecs, int offset, struct tm *result) -{ - time64_to_tm(totalsecs, offset, result); -} - -/** - * timespec_to_ns - Convert timespec to nanoseconds - * @ts: pointer to the timespec variable to be converted - * - * Returns the scalar nanosecond representation of the timespec - * parameter. - */ -static inline s64 timespec_to_ns(const struct timespec *ts) -{ - return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; -} - -/** - * timeval_to_ns - Convert timeval to nanoseconds - * @ts: pointer to the timeval variable to be converted - * - * Returns the scalar nanosecond representation of the timeval - * parameter. - */ -static inline s64 timeval_to_ns(const struct timeval *tv) -{ - return ((s64) tv->tv_sec * NSEC_PER_SEC) + - tv->tv_usec * NSEC_PER_USEC; -} - -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -extern struct timespec ns_to_timespec(const s64 nsec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -extern struct timeval ns_to_timeval(const s64 nsec); - -/** - * timespec_add_ns - Adds nanoseconds to a timespec - * @a: pointer to timespec to be incremented - * @ns: unsigned nanoseconds value to be added - * - * This must always be inlined because its used from the x86-64 vdso, - * which cannot call other kernel functions. - */ -static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) -{ - a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); - a->tv_nsec = ns; -} +# include static inline bool itimerspec64_valid(const struct itimerspec64 *its) { diff --git a/include/linux/time32.h b/include/linux/time32.h new file mode 100644 index 000000000000..9b9c43f0d39b --- /dev/null +++ b/include/linux/time32.h @@ -0,0 +1,176 @@ +#ifndef _LINUX_TIME32_H +#define _LINUX_TIME32_H +/* + * These are all interfaces based on the old time_t definition + * that overflows in 2038 on 32-bit architectures. New code + * should use the replacements based on time64_t and timespec64. + * + * Any interfaces in here that become unused as we migrate + * code to time64_t should get removed. 
+ */ + +#include + +#define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) + +static inline int timespec_equal(const struct timespec *a, + const struct timespec *b) +{ + return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); +} + +/* + * lhs < rhs: return <0 + * lhs == rhs: return 0 + * lhs > rhs: return >0 + */ +static inline int timespec_compare(const struct timespec *lhs, const struct timespec *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_nsec - rhs->tv_nsec; +} + +extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); + +static inline struct timespec timespec_add(struct timespec lhs, + struct timespec rhs) +{ + struct timespec ts_delta; + + set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + return ts_delta; +} + +/* + * sub = lhs - rhs, in normalized form + */ +static inline struct timespec timespec_sub(struct timespec lhs, + struct timespec rhs) +{ + struct timespec ts_delta; + + set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec, + lhs.tv_nsec - rhs.tv_nsec); + return ts_delta; +} + +/* + * Returns true if the timespec is norm, false if denorm: + */ +static inline bool timespec_valid(const struct timespec *ts) +{ + /* Dates before 1970 are bogus */ + if (ts->tv_sec < 0) + return false; + /* Can't have more nanoseconds then a second */ + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return false; + return true; +} + +static inline bool timespec_valid_strict(const struct timespec *ts) +{ + if (!timespec_valid(ts)) + return false; + /* Disallow values that could overflow ktime_t */ + if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + return false; + return true; +} + +/** + * timespec_to_ns - Convert timespec to nanoseconds + * @ts: pointer to the timespec variable to be converted + * + * Returns the scalar nanosecond representation of the timespec + * parameter. + */ +static inline s64 timespec_to_ns(const struct timespec *ts) +{ + return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; +} + +/** + * ns_to_timespec - Convert nanoseconds to timespec + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec representation of the nsec parameter. + */ +extern struct timespec ns_to_timespec(const s64 nsec); + +/** + * timespec_add_ns - Adds nanoseconds to a timespec + * @a: pointer to timespec to be incremented + * @ns: unsigned nanoseconds value to be added + * + * This must always be inlined because its used from the x86-64 vdso, + * which cannot call other kernel functions. + */ +static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) +{ + a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); + a->tv_nsec = ns; +} + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. 
+ * @result pointer to struct tm variable to receive broken-down time + */ +static inline void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + time64_to_tm(totalsecs, offset, result); +} + +static inline unsigned long mktime(const unsigned int year, + const unsigned int mon, const unsigned int day, + const unsigned int hour, const unsigned int min, + const unsigned int sec) +{ + return mktime64(year, mon, day, hour, min, sec); +} + +static inline bool timeval_valid(const struct timeval *tv) +{ + /* Dates before 1970 are bogus */ + if (tv->tv_sec < 0) + return false; + + /* Can't have more microseconds then a second */ + if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) + return false; + + return true; +} + +extern struct timespec timespec_trunc(struct timespec t, unsigned int gran); + +/** + * timeval_to_ns - Convert timeval to nanoseconds + * @ts: pointer to the timeval variable to be converted + * + * Returns the scalar nanosecond representation of the timeval + * parameter. + */ +static inline s64 timeval_to_ns(const struct timeval *tv) +{ + return ((s64) tv->tv_sec * NSEC_PER_SEC) + + tv->tv_usec * NSEC_PER_USEC; +} + +/** + * ns_to_timeval - Convert nanoseconds to timeval + * @nsec: the nanoseconds value to be converted + * + * Returns the timeval representation of the nsec parameter. + */ +extern struct timeval ns_to_timeval(const s64 nsec); + +#endif From 8729a147e234587aeb92239dd1a204e0d7feaa4e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:48 +0200 Subject: [PATCH 1256/1640] UPSTREAM: time: Move time_t conversion helpers to time32.h On 64-bit architectures, the timespec64 based helpers in linux/time.h are defined as macros pointing to their timespec based counterparts. This made sense when they were first introduced, but as we are migrating away from timespec in general, it's much less intuitive now. This changes the macros to work in the exact opposite way: we always provide the timespec64 based helpers and define the old interfaces as macros for them. Now we can move those macros into linux/time32.h, which already contains the respective helpers for 32-bit architectures. 
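To make the redirection concrete (a sketch mirroring the hunks below): after this patch, a 64-bit build that calls

	timespec_valid(&ts)

really compiles timespec64_valid(), because time32.h now provides

	# define timespec_valid timespec64_valid

rather than time64.h defining the 64-bit names in terms of the 32-bit ones.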
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Change-Id: I0305f4df7eebc94e364c702c39dc5ca79c63f67e Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- include/linux/time32.h | 45 +++++++++++++++++++++++++++++++++++++ include/linux/time64.h | 50 +----------------------------------------- kernel/time/time.c | 5 +++-- 3 files changed, 49 insertions(+), 51 deletions(-) diff --git a/include/linux/time32.h b/include/linux/time32.h index 9b9c43f0d39b..65b1de25198d 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -13,6 +13,49 @@ #define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) +#if __BITS_PER_LONG == 64 + +/* timespec64 is defined as timespec here */ +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) +{ + return ts64; +} + +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + return ts; +} + +# define timespec_equal timespec64_equal +# define timespec_compare timespec64_compare +# define set_normalized_timespec set_normalized_timespec64 +# define timespec_add timespec64_add +# define timespec_sub timespec64_sub +# define timespec_valid timespec64_valid +# define timespec_valid_strict timespec64_valid_strict +# define timespec_to_ns timespec64_to_ns +# define ns_to_timespec ns_to_timespec64 +# define timespec_add_ns timespec64_add_ns + +#else +static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) +{ + struct timespec ret; + + ret.tv_sec = (time_t)ts64.tv_sec; + ret.tv_nsec = ts64.tv_nsec; + return ret; +} + +static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) +{ + struct timespec64 ret; + + ret.tv_sec = ts.tv_sec; + ret.tv_nsec = ts.tv_nsec; + return ret; +} + static inline int timespec_equal(const struct timespec *a, const struct timespec *b) { @@ -116,6 +159,8 @@ static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) a->tv_nsec = ns; } +#endif + /** * time_to_tm - converts the calendar time to local broken-down time * diff --git a/include/linux/time64.h b/include/linux/time64.h index dea5ed22429c..54ed6955f81b 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -8,11 +8,8 @@ typedef __s64 time64_t; typedef __u64 timeu64_t; -/* - * This wants to go into uapi/linux/time.h once we agreed about the - * userspace interfaces. 
- */ #if __BITS_PER_LONG == 64 +/* this trick allows us to optimize out timespec64_to_timespec */ # define timespec64 timespec #define itimerspec64 itimerspec #else @@ -42,49 +39,6 @@ struct itimerspec64 { #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) -#if __BITS_PER_LONG == 64 - -static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) -{ - return ts64; -} - -static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) -{ - return ts; -} - -# define timespec64_equal timespec_equal -# define timespec64_compare timespec_compare -# define set_normalized_timespec64 set_normalized_timespec -# define timespec64_add timespec_add -# define timespec64_sub timespec_sub -# define timespec64_valid timespec_valid -# define timespec64_valid_strict timespec_valid_strict -# define timespec64_to_ns timespec_to_ns -# define ns_to_timespec64 ns_to_timespec -# define timespec64_add_ns timespec_add_ns - -#else - -static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) -{ - struct timespec ret; - - ret.tv_sec = (time_t)ts64.tv_sec; - ret.tv_nsec = ts64.tv_nsec; - return ret; -} - -static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) -{ - struct timespec64 ret; - - ret.tv_sec = ts.tv_sec; - ret.tv_nsec = ts.tv_nsec; - return ret; -} - static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { @@ -190,8 +144,6 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) a->tv_nsec = ns; } -#endif - /* * timespec64_add_safe assumes both values are positive and checks for * overflow. It will return TIME64_MAX in case of overflow. diff --git a/kernel/time/time.c b/kernel/time/time.c index 7acef179c9f1..64e0fdb62426 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -409,6 +409,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); +#if __BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -469,6 +470,7 @@ struct timespec ns_to_timespec(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec); +#endif /** * ns_to_timeval - Convert nanoseconds to timeval @@ -488,7 +490,6 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); -#if BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -549,7 +550,7 @@ struct timespec64 ns_to_timespec64(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec64); -#endif + /** * msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds From 7a6335b790dd01e65e6994efb840204298a19b2d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 15 Mar 2018 17:12:40 +0100 Subject: [PATCH 1257/1640] UPSTREAM: y2038: Introduce struct __kernel_old_timeval Dealing with 'struct timeval' users in the y2038 series is a bit tricky: We have two definitions of timeval that are visible to user space, one comes from glibc (or some other C library), the other comes from linux/time.h. The kernel copy is what we want to be used for a number of structures defined by the kernel itself, e.g. elf_prstatus (used it core dumps), sysinfo and rusage (used in system calls). These generally tend to be used for passing time intervals rather than absolute (epoch-based) times, so they do not suffer from the y2038 overflow. 
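(For reference, simplified and not part of this patch, both visible definitions look roughly like:

	struct timeval {
		time_t		tv_sec;		/* __kernel_time_t in the uapi header */
		suseconds_t	tv_usec;	/* __kernel_suseconds_t in the uapi header */
	};

so an absolute time stored in either one overflows in 2038 wherever time_t is 32-bit.)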
Some of them could be changed to use 64-bit timestamps by creating new system calls, others like the core files cannot easily be changed. An application using these interfaces likely also uses gettimeofday() or other interfaces that use absolute times, and pass 'struct timeval' pointers directly into kernel interfaces, so glibc must redefine their timeval based on a 64-bit time_t when they introduce their y2038-safe interfaces. The only reasonable way forward I see is to remove the 'timeval' definion from the kernel's uapi headers, and change the interfaces that we do not want to (or cannot) duplicate for 64-bit times to use a new __kernel_old_timeval definition instead. This type should be avoided for all new interfaces (those can use 64-bit nanoseconds, or the 64-bit version of timespec instead), and should be used with great care when converting existing interfaces from timeval, to be sure they don't suffer from the y2038 overflow, and only with consensus for the particular user that using __kernel_old_timeval is better than moving to a 64-bit based interface. The structure name is intentionally chosen to not conflict with user space types, and to be ugly enough to discourage its use. Note that ioctl based interfaces that pass a bare 'timeval' pointer cannot change to '__kernel_old_timeval' because the user space source code refers to 'timeval' instead, and we don't want to modify the user space sources if possible. However, any application that relies on a structure to contain an embedded 'timeval' (e.g. by passing a pointer to the member into a function call that expects a timeval pointer) is broken when that structure gets converted to __kernel_old_timeval. I don't see any way around that, and we have to rely on the compiler to produce a warning or compile failure that will alert users when they recompile their sources against a new libc. Change-Id: I89fee6b13307424100b8a630f75414c77129ee5c Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Stephen Boyd Cc: John Stultz Cc: Al Viro Link: https://lkml.kernel.org/r/20180315161739.576085-1-arnd@arndb.de --- include/linux/time32.h | 1 + include/uapi/linux/time.h | 12 ++++++++++++ kernel/time/time.c | 12 ++++++++++++ 3 files changed, 25 insertions(+) diff --git a/include/linux/time32.h b/include/linux/time32.h index 65b1de25198d..d2bcd4377b56 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -217,5 +217,6 @@ static inline s64 timeval_to_ns(const struct timeval *tv) * Returns the timeval representation of the nsec parameter. */ extern struct timeval ns_to_timeval(const s64 nsec); +extern struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec); #endif diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 53f8dd84beb5..4c0338ea308a 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -42,6 +42,18 @@ struct itimerval { struct timeval it_value; /* current value */ }; +/* + * legacy timeval structure, only embedded in structures that + * traditionally used 'timeval' to pass time intervals (not absolute + * times). Do not add new users. If user space fails to compile + * here, this is probably because it is not y2038 safe and needs to + * be changed to use another interface. 
+ */ +struct __kernel_old_timeval { + __kernel_long_t tv_sec; + __kernel_long_t tv_usec; +}; + /* * The IDs of the various system clocks (for POSIX.1b interval timers): */ diff --git a/kernel/time/time.c b/kernel/time/time.c index 64e0fdb62426..607e7f5c1654 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -490,6 +490,18 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); +struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) +{ + struct timespec64 ts = ns_to_timespec64(nsec); + struct __kernel_old_timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000; + + return tv; +} +EXPORT_SYMBOL(ns_to_kernel_old_timeval); + /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * From a454ae18f8b88a2cdb4b384e3c1a9d9c48271007 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 13 Mar 2018 21:03:30 -0700 Subject: [PATCH 1258/1640] UPSTREAM: time: Add new y2038 safe __kernel_timespec The new struct __kernel_timespec is similar to current internal kernel struct timespec64 on 64 bit architecture. The compat structure however is similar to below on little endian systems (padding and tv_nsec are switched for big endian systems): typedef s32 compat_long_t; typedef s64 compat_kernel_time64_t; struct compat_kernel_timespec { compat_kernel_time64_t tv_sec; compat_long_t tv_nsec; compat_long_t padding; }; This allows for both the native and compat representations to be the same and syscalls using this type as part of their ABI can have a single entry point to both. Note that the compat define is not included anywhere in the kernel explicitly to avoid confusion. These types will be used by the new syscalls that will be introduced in the consequent patches. Most of the new syscalls are just an update to the existing native ones with this new type. Hence, put this new type under an ifdef so that the architectures can define CONFIG_64BIT_TIME when they are ready to handle this switch. Cc: linux-arch@vger.kernel.org Change-Id: I06fbf455714d0f1bae69b9ac9def451e9a5abd53 Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/time64.h | 10 +++++++++- include/uapi/asm-generic/posix_types.h | 1 + include/uapi/linux/time.h | 7 +++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/linux/time64.h b/include/linux/time64.h index 54ed6955f81b..6a632b41bd14 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -2,12 +2,20 @@ #ifndef _LINUX_TIME64_H #define _LINUX_TIME64_H -#include #include typedef __s64 time64_t; typedef __u64 timeu64_t; +/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path + * and 32-bit emulation. 
+ */ +#ifndef CONFIG_64BIT_TIME +#define __kernel_timespec timespec +#endif + +#include + #if __BITS_PER_LONG == 64 /* this trick allows us to optimize out timespec64_to_timespec */ # define timespec64 timespec diff --git a/include/uapi/asm-generic/posix_types.h b/include/uapi/asm-generic/posix_types.h index 5e6ea22bd525..f0733a26ebfc 100644 --- a/include/uapi/asm-generic/posix_types.h +++ b/include/uapi/asm-generic/posix_types.h @@ -87,6 +87,7 @@ typedef struct { typedef __kernel_long_t __kernel_off_t; typedef long long __kernel_loff_t; typedef __kernel_long_t __kernel_time_t; +typedef long long __kernel_time64_t; typedef __kernel_long_t __kernel_clock_t; typedef int __kernel_timer_t; typedef int __kernel_clockid_t; diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 4c0338ea308a..fcf936656493 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -42,6 +42,13 @@ struct itimerval { struct timeval it_value; /* current value */ }; +#ifndef __kernel_timespec +struct __kernel_timespec { + __kernel_time64_t tv_sec; /* seconds */ + long long tv_nsec; /* nanoseconds */ +}; +#endif + /* * legacy timeval structure, only embedded in structures that * traditionally used 'timeval' to pass time intervals (not absolute From 6c67fb5d31d4506438903037b7beaf7c6668d8d8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 27 Apr 2018 15:40:12 +0200 Subject: [PATCH 1259/1640] UPSTREAM: timekeeping: Remove timespec64 hack At this point, we have converted most of the kernel to use timespec64 consistently in place of timespec, so it seems it's time to make timespec64 the native structure and define timespec in terms of that one on 64-bit architectures. Starting with gcc-5, the compiler can completely optimize away the timespec_to_timespec64 and timespec64_to_timespec functions on 64-bit architectures. With older compilers, we introduce a couple of extra copies of local variables, but those are easily avoided by using the timespec64 based interfaces consistently, as we do in most of the important code paths already. The main upside of removing the hack is that printing the tv_sec field of a timespec64 structure can now use the %lld format string on all architectures without a cast to time64_t. Without this patch, the field is a 'long' type and would have to be printed using %ld on 64-bit architectures. 
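Concretely (an illustrative fragment, not from this patch):

	struct timespec64 ts;

	ktime_get_real_ts64(&ts);
	pr_info("now: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);

builds warning-free on both 32-bit and 64-bit once tv_sec is always time64_t, with no (s64) cast needed for the format string.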
Change-Id: I5f90acafd007c8f2a2d0a0a597c5b10b7803a0d5 Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Stephen Boyd Cc: y2038@lists.linaro.org Cc: John Stultz Link: https://lkml.kernel.org/r/20180427134016.2525989-2-arnd@arndb.de --- include/linux/time32.h | 18 +++----------- include/linux/time64.h | 7 ------ include/linux/timekeeping32.h | 45 ----------------------------------- kernel/time/time.c | 2 -- 4 files changed, 3 insertions(+), 69 deletions(-) diff --git a/include/linux/time32.h b/include/linux/time32.h index d2bcd4377b56..0b14f936100a 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -18,25 +18,14 @@ /* timespec64 is defined as timespec here */ static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) { - return ts64; + return *(const struct timespec *)&ts64; } static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) { - return ts; + return *(const struct timespec64 *)&ts; } -# define timespec_equal timespec64_equal -# define timespec_compare timespec64_compare -# define set_normalized_timespec set_normalized_timespec64 -# define timespec_add timespec64_add -# define timespec_sub timespec64_sub -# define timespec_valid timespec64_valid -# define timespec_valid_strict timespec64_valid_strict -# define timespec_to_ns timespec64_to_ns -# define ns_to_timespec ns_to_timespec64 -# define timespec_add_ns timespec64_add_ns - #else static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) { @@ -55,6 +44,7 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) ret.tv_nsec = ts.tv_nsec; return ret; } +#endif static inline int timespec_equal(const struct timespec *a, const struct timespec *b) @@ -159,8 +149,6 @@ static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) a->tv_nsec = ns; } -#endif - /** * time_to_tm - converts the calendar time to local broken-down time * diff --git a/include/linux/time64.h b/include/linux/time64.h index 6a632b41bd14..fbb7fd156e4b 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -16,11 +16,6 @@ typedef __u64 timeu64_t; #include -#if __BITS_PER_LONG == 64 -/* this trick allows us to optimize out timespec64_to_timespec */ -# define timespec64 timespec -#define itimerspec64 itimerspec -#else struct timespec64 { time64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ @@ -31,8 +26,6 @@ struct itimerspec64 { struct timespec64 it_value; }; -#endif - /* Parameters used to convert the timespec values: */ #define MSEC_PER_SEC 1000L #define USEC_PER_MSEC 1000L diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h index af4114d5dc17..f707ddb4dad7 100644 --- a/include/linux/timekeeping32.h +++ b/include/linux/timekeeping32.h @@ -19,50 +19,6 @@ static inline struct timespec current_kernel_time(void) return timespec64_to_timespec(now); } -#if BITS_PER_LONG == 64 -/** - * Deprecated. Use do_settimeofday64(). 
- */ -static inline int do_settimeofday(const struct timespec *ts) -{ - return do_settimeofday64(ts); -} - -static inline int __getnstimeofday(struct timespec *ts) -{ - return __getnstimeofday64(ts); -} - -static inline void getnstimeofday(struct timespec *ts) -{ - getnstimeofday64(ts); -} - -static inline void ktime_get_ts(struct timespec *ts) -{ - ktime_get_ts64(ts); -} - -static inline void ktime_get_real_ts(struct timespec *ts) -{ - getnstimeofday64(ts); -} - -static inline void getrawmonotonic(struct timespec *ts) -{ - getrawmonotonic64(ts); -} - -static inline struct timespec get_monotonic_coarse(void) -{ - return get_monotonic_coarse64(); -} - -static inline void getboottime(struct timespec *ts) -{ - return getboottime64(ts); -} -#else /** * Deprecated. Use do_settimeofday64(). */ @@ -127,7 +83,6 @@ static inline void getboottime(struct timespec *ts) getboottime64(&ts64); *ts = timespec64_to_timespec(ts64); } -#endif /* * Timespec interfaces utilizing the ktime based ones diff --git a/kernel/time/time.c b/kernel/time/time.c index 607e7f5c1654..a92241f410e0 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -409,7 +409,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -#if __BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -470,7 +469,6 @@ struct timespec ns_to_timespec(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec); -#endif /** * ns_to_timeval - Convert nanoseconds to timeval From cefbb1b7adc169eaecbb2f7fab5a030c9a4acfb6 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 13 Apr 2018 13:27:58 +0800 Subject: [PATCH 1260/1640] UPSTREAM: timekeeping: Remove __current_kernel_time() The __current_kernel_time() function based on 'struct timespec' is no longer recommended for new code, and the only user of this function has been replaced by commit 6909e29fdefb ("kdb: use __ktime_get_real_seconds instead of __current_kernel_time"). Remove the obsolete interface. 
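For illustration (not part of this patch), a caller that only needs a seconds value now does:

	time64_t now = ktime_get_real_seconds();

instead of reading tv_sec out of the timespec that __current_kernel_time() returned.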
Change-Id: If68b9be1b08d508237ce15003cacb2d730f915e4 Signed-off-by: Baolin Wang Signed-off-by: Thomas Gleixner Cc: arnd@arndb.de Cc: sboyd@kernel.org Cc: broonie@kernel.org Cc: john.stultz@linaro.org Link: https://lkml.kernel.org/r/1a9dbea7ee2cda7efe9ed330874075cf17fdbff6.1523596316.git.baolin.wang@linaro.org --- include/linux/timekeeping32.h | 3 --- kernel/time/timekeeping.c | 7 ------- 2 files changed, 10 deletions(-) diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h index f707ddb4dad7..4ea45d0df1d4 100644 --- a/include/linux/timekeeping32.h +++ b/include/linux/timekeeping32.h @@ -9,9 +9,6 @@ extern void do_gettimeofday(struct timeval *tv); unsigned long get_seconds(void); -/* does not take xtime_lock */ -struct timespec __current_kernel_time(void); - static inline struct timespec current_kernel_time(void) { struct timespec64 now = current_kernel_time64(); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c7f7d4da647c..6114c66424f3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2246,13 +2246,6 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); -struct timespec __current_kernel_time(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - return timespec64_to_timespec(tk_xtime(tk)); -} - struct timespec64 current_kernel_time64(void) { struct timekeeper *tk = &tk_core.timekeeper; From 6afec5e1e1e0e71a8d7809b646c73e3b838ceece Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 27 Apr 2018 15:40:13 +0200 Subject: [PATCH 1261/1640] UPSTREAM: timekeeping: Clean up ktime_get_real_ts64 In a move to make ktime_get_*() the preferred driver interface into the timekeeping code, sanitizes ktime_get_real_ts64() to be a proper exported symbol rather than an alias for getnstimeofday64(). The internal __getnstimeofday64() is no longer used, so remove that and merge it into ktime_get_real_ts64(). 
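A sketch of the caller-visible difference (illustrative):

	struct timespec64 ts;

	/* before: getnstimeofday64(&ts), a wrapper that WARNed on the
	 * -EAGAIN returned by __getnstimeofday64() while suspended */
	ktime_get_real_ts64(&ts);	/* after: one exported function, WARNs directly */

getnstimeofday64() survives only as a deprecated alias macro for the transition.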
Change-Id: I79e7ad3d71807eafb7eb0827e3945ccbfcd1cfdc Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Stephen Boyd Cc: y2038@lists.linaro.org Cc: John Stultz Link: https://lkml.kernel.org/r/20180427134016.2525989-3-arnd@arndb.de --- include/linux/timekeeping.h | 8 +++++--- include/linux/timekeeping32.h | 13 ++----------- kernel/time/timekeeping.c | 31 ++++++------------------------- 3 files changed, 13 insertions(+), 39 deletions(-) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 50a79eb8c607..9c02d23a6a95 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -30,14 +30,12 @@ struct timespec64 current_kernel_time64(void); struct timespec64 get_monotonic_coarse64(void); extern void getrawmonotonic64(struct timespec64 *ts); extern void ktime_get_ts64(struct timespec64 *ts); +extern void ktime_get_real_ts64(struct timespec64 *tv); extern time64_t ktime_get_seconds(void); extern time64_t ktime_get_real_seconds(void); -extern int __getnstimeofday64(struct timespec64 *tv); -extern void getnstimeofday64(struct timespec64 *tv); extern void getboottime64(struct timespec64 *ts); -#define ktime_get_real_ts64(ts) getnstimeofday64(ts) /* * ktime_t based interfaces @@ -230,5 +228,9 @@ extern void read_persistent_clock64(struct timespec64 *ts); extern void read_boot_clock64(struct timespec64 *ts); extern int update_persistent_clock64(struct timespec64 now); +/* + * deprecated aliases, don't use in new code + */ +#define getnstimeofday64(ts) ktime_get_real_ts64(ts) #endif diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h index 4ea45d0df1d4..5abff52d07fd 100644 --- a/include/linux/timekeeping32.h +++ b/include/linux/timekeeping32.h @@ -27,20 +27,11 @@ static inline int do_settimeofday(const struct timespec *ts) return do_settimeofday64(&ts64); } -static inline int __getnstimeofday(struct timespec *ts) -{ - struct timespec64 ts64; - int ret = __getnstimeofday64(&ts64); - - *ts = timespec64_to_timespec(ts64); - return ret; -} - static inline void getnstimeofday(struct timespec *ts) { struct timespec64 ts64; - getnstimeofday64(&ts64); + ktime_get_real_ts64(&ts64); *ts = timespec64_to_timespec(ts64); } @@ -56,7 +47,7 @@ static inline void ktime_get_real_ts(struct timespec *ts) { struct timespec64 ts64; - getnstimeofday64(&ts64); + ktime_get_real_ts64(&ts64); *ts = timespec64_to_timespec(ts64); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6114c66424f3..2a3f04300128 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -707,18 +707,19 @@ static void timekeeping_forward_now(struct timekeeper *tk) } /** - * __getnstimeofday64 - Returns the time of day in a timespec64. + * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * - * Updates the time of day in the timespec. - * Returns 0 on success, or -ve when suspended (timespec will be undefined). + * Returns the time of day in a timespec64 (WARN if suspended). */ -int __getnstimeofday64(struct timespec64 *ts) +void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; u64 nsecs; + WARN_ON(timekeeping_suspended); + do { seq = read_seqcount_begin(&tk_core.seq); @@ -729,28 +730,8 @@ int __getnstimeofday64(struct timespec64 *ts) ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); - - /* - * Do not bail out early, in case there were callers still using - * the value, even in the face of the WARN_ON. 
- */ - if (unlikely(timekeeping_suspended)) - return -EAGAIN; - return 0; } -EXPORT_SYMBOL(__getnstimeofday64); - -/** - * getnstimeofday64 - Returns the time of day in a timespec64. - * @ts: pointer to the timespec64 to be set - * - * Returns the time of day in a timespec64 (WARN if suspended). - */ -void getnstimeofday64(struct timespec64 *ts) -{ - WARN_ON(__getnstimeofday64(ts)); -} -EXPORT_SYMBOL(getnstimeofday64); +EXPORT_SYMBOL(ktime_get_real_ts64); ktime_t ktime_get(void) { From a698ea2efc5870bd43c69cf8f56bc229ce2c7884 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 27 Apr 2018 15:40:14 +0200 Subject: [PATCH 1262/1640] UPSTREAM: timekeeping: Standardize on ktime_get_*() naming The current_kernel_time64, get_monotonic_coarse64, getrawmonotonic64, get_monotonic_boottime64 and timekeeping_clocktai64 interfaces have rather inconsistent naming, and they differ in the calling conventions by passing the output either by reference or as a return value. Rename them to ktime_get_coarse_real_ts64, ktime_get_coarse_ts64, ktime_get_raw_ts64, ktime_get_boottime_ts64 and ktime_get_clocktai_ts64 respectively, and provide the interfaces with macros or inline functions as needed. Change-Id: Ifa617b3cb1bff3821c1387efaeca899418e8bc9a Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Stephen Boyd Cc: y2038@lists.linaro.org Cc: John Stultz Link: https://lkml.kernel.org/r/20180427134016.2525989-4-arnd@arndb.de --- include/linux/timekeeping.h | 43 ++++++++++++++++++++++++++--------- include/linux/timekeeping32.h | 14 ++++++++---- kernel/time/timekeeping.c | 23 ++++++++----------- 3 files changed, 51 insertions(+), 29 deletions(-) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 9c02d23a6a95..d5ff77dc5348 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -19,24 +19,24 @@ extern void xtime_update(unsigned long ticks); extern int do_settimeofday64(const struct timespec64 *ts); extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); -/* - * Kernel time accessors - */ -struct timespec64 current_kernel_time64(void); /* * timespec64 based interfaces */ -struct timespec64 get_monotonic_coarse64(void); -extern void getrawmonotonic64(struct timespec64 *ts); +extern void ktime_get_raw_ts64(struct timespec64 *ts); extern void ktime_get_ts64(struct timespec64 *ts); extern void ktime_get_real_ts64(struct timespec64 *tv); +extern void ktime_get_coarse_ts64(struct timespec64 *ts); +extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); + +void getboottime64(struct timespec64 *ts); + +/* + * time64_t base interfaces + */ extern time64_t ktime_get_seconds(void); extern time64_t ktime_get_real_seconds(void); -extern void getboottime64(struct timespec64 *ts); - - /* * ktime_t based interfaces */ @@ -137,12 +137,12 @@ extern u64 ktime_get_boot_fast_ns(void); /* * timespec64 interfaces utilizing the ktime based ones */ -static inline void get_monotonic_boottime64(struct timespec64 *ts) +static inline void ktime_get_boottime_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_boottime()); } -static inline void timekeeping_clocktai64(struct timespec64 *ts) +static inline void ktime_get_clocktai_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_clocktai()); } @@ -232,5 +232,26 @@ extern int update_persistent_clock64(struct timespec64 now); * deprecated aliases, don't use in new code */ #define getnstimeofday64(ts) ktime_get_real_ts64(ts) +#define get_monotonic_boottime64(ts) 
ktime_get_boottime_ts64(ts) +#define getrawmonotonic64(ts) ktime_get_raw_ts64(ts) +#define timekeeping_clocktai64(ts) ktime_get_clocktai_ts64(ts) + +static inline struct timespec64 current_kernel_time64(void) +{ + struct timespec64 ts; + + ktime_get_coarse_real_ts64(&ts); + + return ts; +} + +static inline struct timespec64 get_monotonic_coarse64(void) +{ + struct timespec64 ts; + + ktime_get_coarse_ts64(&ts); + + return ts; +} #endif diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h index 5abff52d07fd..8762c2f45f8b 100644 --- a/include/linux/timekeeping32.h +++ b/include/linux/timekeeping32.h @@ -11,9 +11,11 @@ unsigned long get_seconds(void); static inline struct timespec current_kernel_time(void) { - struct timespec64 now = current_kernel_time64(); + struct timespec64 ts64; - return timespec64_to_timespec(now); + ktime_get_coarse_real_ts64(&ts64); + + return timespec64_to_timespec(ts64); } /** @@ -55,13 +57,17 @@ static inline void getrawmonotonic(struct timespec *ts) { struct timespec64 ts64; - getrawmonotonic64(&ts64); + ktime_get_raw_ts64(&ts64); *ts = timespec64_to_timespec(ts64); } static inline struct timespec get_monotonic_coarse(void) { - return timespec64_to_timespec(get_monotonic_coarse64()); + struct timespec64 ts64; + + ktime_get_coarse_ts64(&ts64); + + return timespec64_to_timespec(ts64); } static inline void getboottime(struct timespec *ts) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2a3f04300128..5b10c4ac8d3d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1438,12 +1438,12 @@ int timekeeping_notify(struct clocksource *clock) } /** - * getrawmonotonic64 - Returns the raw monotonic time in a timespec + * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely un-modified by ntp) */ -void getrawmonotonic64(struct timespec64 *ts) +void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; @@ -1459,7 +1459,7 @@ void getrawmonotonic64(struct timespec64 *ts) ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } -EXPORT_SYMBOL(getrawmonotonic64); +EXPORT_SYMBOL(ktime_get_raw_ts64); /** @@ -2227,23 +2227,20 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); -struct timespec64 current_kernel_time64(void) +void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 now; unsigned long seq; do { seq = read_seqcount_begin(&tk_core.seq); - now = tk_xtime(tk); + *ts = tk_xtime(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); - - return now; } -EXPORT_SYMBOL(current_kernel_time64); +EXPORT_SYMBOL(ktime_get_coarse_real_ts64); -struct timespec64 get_monotonic_coarse64(void) +void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; @@ -2256,12 +2253,10 @@ struct timespec64 get_monotonic_coarse64(void) mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); - set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, + set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); - - return now; } -EXPORT_SYMBOL(get_monotonic_coarse64); +EXPORT_SYMBOL(ktime_get_coarse_ts64); /* * Must hold jiffies_lock From cc17fd640942307e4c48d229d2a7e290901110ef Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 17:31:36 +0200 Subject: [PATCH 1263/1640] UPSTREAM: iio: use 
timespec64 based interfaces for iio_get_time_ns() We have replacements for all the deprecated timespec based interfaces now, so this can finally convert iio_get_time_ns() to consistently use the nanosecond or timespec64 based interfaces instead, avoiding the y2038 overflow. Change-Id: I358a958789f8a8389ac3fb67e7f58ea6b3088a6f Signed-off-by: Arnd Bergmann Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 12d73ebcadfa..9df9785026da 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -207,35 +207,27 @@ static int iio_device_set_clock(struct iio_dev *indio_dev, clockid_t clock_id) */ s64 iio_get_time_ns(const struct iio_dev *indio_dev) { - struct timespec tp; + struct timespec64 tp; switch (iio_device_get_clock(indio_dev)) { case CLOCK_REALTIME: - ktime_get_real_ts(&tp); - break; + return ktime_get_real_ns(); case CLOCK_MONOTONIC: - ktime_get_ts(&tp); - break; + return ktime_get_ns(); case CLOCK_MONOTONIC_RAW: - getrawmonotonic(&tp); - break; + return ktime_get_raw_ns(); case CLOCK_REALTIME_COARSE: - tp = current_kernel_time(); - break; + return ktime_to_ns(ktime_get_coarse_real()); case CLOCK_MONOTONIC_COARSE: - tp = get_monotonic_coarse(); - break; + ktime_get_coarse_ts64(&tp); + return timespec64_to_ns(&tp); case CLOCK_BOOTTIME: - get_monotonic_boottime(&tp); - break; + return ktime_get_boot_ns(); case CLOCK_TAI: - timekeeping_clocktai(&tp); - break; + return ktime_get_tai_ns(); default: BUG(); } - - return timespec_to_ns(&tp); } EXPORT_SYMBOL(iio_get_time_ns); From cdec44e48f7792505344c30d2d08e7522b6b688f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Nov 2017 12:48:19 +0100 Subject: [PATCH 1264/1640] UPSTREAM: wlcore: use boottime for fw time sync Using getnstimeofday()/timespec_to_ns() causes an overflow on 32-bit architectures in 2038, and may suffer from time jumps due to settimeofday() or leap seconds. I don't see a reason why this needs to be UTC, so either monotonic or boot time would be better here. Assuming that the fw time keeps running during suspend, boottime is better than monotonic, and ktime_get_boot_ns() will also save the additional conversion to nanoseconds. 
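Condensed to a sketch (identifiers as in the hunks below), the resulting
pattern is:

    /* CLOCK_MONOTONIC stops across suspend; boottime keeps counting, which
     * matches a firmware clock that runs while the host sleeps. The >> 10
     * cheaply approximates a ns-to-us conversion (divide by 1024).
     */
    wl->time_offset = (ktime_get_boot_ns() >> 10) - (s64)(status->fw_localtime);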
Change-Id: I1857989fcef22ff6112fed7f8897b6ee86590c0a Signed-off-by: Arnd Bergmann Signed-off-by: Kalle Valo --- drivers/net/wireless/ti/wlcore/main.c | 4 +--- drivers/net/wireless/ti/wlcore/tx.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index 9f568034deb3..620885133e88 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -388,7 +388,6 @@ static void wl12xx_irq_update_links_status(struct wl1271 *wl, static int wlcore_fw_status(struct wl1271 *wl, struct wl_fw_status *status) { struct wl12xx_vif *wlvif; - struct timespec ts; u32 old_tx_blk_count = wl->tx_blocks_available; int avail, freed_blocks; int i; @@ -485,8 +484,7 @@ static int wlcore_fw_status(struct wl1271 *wl, struct wl_fw_status *status) } /* update the host-chipset time offset */ - getnstimeofday(&ts); - wl->time_offset = (timespec_to_ns(&ts) >> 10) - + wl->time_offset = (ktime_get_boot_ns() >> 10) - (s64)(status->fw_localtime); wl->fw_fast_lnk_map = status->link_fast_bitmap; diff --git a/drivers/net/wireless/ti/wlcore/tx.c b/drivers/net/wireless/ti/wlcore/tx.c index a3f5e9ca492a..00e9b4624dcf 100644 --- a/drivers/net/wireless/ti/wlcore/tx.c +++ b/drivers/net/wireless/ti/wlcore/tx.c @@ -264,7 +264,6 @@ static void wl1271_tx_fill_hdr(struct wl1271 *wl, struct wl12xx_vif *wlvif, struct sk_buff *skb, u32 extra, struct ieee80211_tx_info *control, u8 hlid) { - struct timespec ts; struct wl1271_tx_hw_descr *desc; int ac, rate_idx; s64 hosttime; @@ -287,8 +286,7 @@ static void wl1271_tx_fill_hdr(struct wl1271 *wl, struct wl12xx_vif *wlvif, } /* configure packet life time */ - getnstimeofday(&ts); - hosttime = (timespec_to_ns(&ts) >> 10); + hosttime = (ktime_get_boot_ns() >> 10); desc->start_time = cpu_to_le32(hosttime - wl->time_offset); is_dummy = wl12xx_is_dummy_packet(wl, skb); From 6dcdc94007500f8bc38ddf5046122b6dbfe27b45 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 21 Jun 2019 22:32:48 +0200 Subject: [PATCH 1265/1640] BACKPORT: timekeeping: Use proper clock specifier names in functions This makes boot uniformly boottime and tai uniformly clocktai, to address the remaining oversights. Change-Id: I3463b9045bddeba00d6f9fcf78d63008459c1b9a Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Thomas Gleixner Reviewed-by: Arnd Bergmann Link: https://lkml.kernel.org/r/20190621203249.3909-2-Jason@zx2c4.com --- arch/x86/kvm/x86.c | 12 ++++++------ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +- drivers/iio/humidity/dht11.c | 8 ++++---- drivers/iio/industrialio-core.c | 4 ++-- drivers/infiniband/hw/mlx4/alias_GUID.c | 6 +++--- drivers/misc/memory_state_time.c | 8 ++++---- drivers/net/wireless/intel/iwlwifi/mvm/rx.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/utils.c | 2 +- drivers/net/wireless/mac80211_hwsim.c | 2 +- drivers/net/wireless/ti/wlcore/main.c | 2 +- drivers/net/wireless/ti/wlcore/rx.c | 2 +- drivers/net/wireless/ti/wlcore/tx.c | 2 +- drivers/net/wireless/virt_wifi.c | 2 +- .../qca-wifi-host-cmn/qdf/linux/src/i_qdf_time.h | 2 +- include/linux/timekeeping.h | 4 ++-- include/net/cfg80211.h | 2 +- include/trace/events/net.h | 10 +++++----- kernel/bpf/syscall.c | 2 +- kernel/events/core.c | 4 ++-- kernel/fork.c | 2 +- 21 files changed, 41 insertions(+), 41 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e41fd810cfa5..6f5250680405 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1610,7 +1610,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = ktime_get_boot_ns(); + ns = ktime_get_boottime_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { @@ -1927,7 +1927,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { spin_unlock(&ka->pvclock_gtod_sync_lock); - return ktime_get_boot_ns() + ka->kvmclock_offset; + return ktime_get_boottime_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; @@ -1943,7 +1943,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) &hv_clock.tsc_to_system_mul); ret = __pvclock_read_cycles(&hv_clock, rdtsc()); } else - ret = ktime_get_boot_ns() + ka->kvmclock_offset; + ret = ktime_get_boottime_ns() + ka->kvmclock_offset; put_cpu(); @@ -2042,7 +2042,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } if (!use_master_clock) { host_tsc = rdtsc(); - kernel_ns = ktime_get_boot_ns(); + kernel_ns = ktime_get_boottime_ns(); } tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); @@ -8172,7 +8172,7 @@ int kvm_arch_hardware_enable(void) * before any KVM threads can be running. Unfortunately, we can't * bring the TSCs fully up to date with real time, as we aren't yet far * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, ktime_get_boot_ns() will be using boot + * elapsed; our helper function, ktime_get_boottime_ns() will be using boot * variables that haven't been updated yet. 
* * So we simply find the maximum observed TSC above, then record the @@ -8411,7 +8411,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_init(&kvm->arch.hyperv.hv_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); + kvm->arch.kvmclock_offset = -ktime_get_boottime_ns(); pvclock_update_vm_gtod_copy(kvm); INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index a8b56f6495c3..e0d6f0d598dc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -725,7 +725,7 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, /* No access to rdtsc. Using raw monotonic time */ args->cpu_clock_counter = ktime_get_raw_ns(); - args->system_clock_counter = ktime_get_boot_ns(); + args->system_clock_counter = ktime_get_boottime_ns(); /* Since the counter is in nano-seconds we use 1GHz frequency */ args->system_clock_freq = 1000000000; diff --git a/drivers/iio/humidity/dht11.c b/drivers/iio/humidity/dht11.c index 7ccabcd5614a..43e502fcf199 100644 --- a/drivers/iio/humidity/dht11.c +++ b/drivers/iio/humidity/dht11.c @@ -158,7 +158,7 @@ static int dht11_decode(struct dht11 *dht11, int offset) return -EIO; } - dht11->timestamp = ktime_get_boot_ns(); + dht11->timestamp = ktime_get_boottime_ns(); if (hum_int < 4) { /* DHT22: 100000 = (3*256+232)*100 */ dht11->temperature = (((temp_int & 0x7f) << 8) + temp_dec) * ((temp_int & 0x80) ? -100 : 100); @@ -186,7 +186,7 @@ static irqreturn_t dht11_handle_irq(int irq, void *data) /* TODO: Consider making the handler safe for IRQ sharing */ if (dht11->num_edges < DHT11_EDGES_PER_READ && dht11->num_edges >= 0) { - dht11->edges[dht11->num_edges].ts = ktime_get_boot_ns(); + dht11->edges[dht11->num_edges].ts = ktime_get_boottime_ns(); dht11->edges[dht11->num_edges++].value = gpio_get_value(dht11->gpio); @@ -205,7 +205,7 @@ static int dht11_read_raw(struct iio_dev *iio_dev, int ret, timeres, offset; mutex_lock(&dht11->lock); - if (dht11->timestamp + DHT11_DATA_VALID_TIME < ktime_get_boot_ns()) { + if (dht11->timestamp + DHT11_DATA_VALID_TIME < ktime_get_boottime_ns()) { timeres = ktime_get_resolution_ns(); dev_dbg(dht11->dev, "current timeresolution: %dns\n", timeres); if (timeres > DHT11_MIN_TIMERES) { @@ -332,7 +332,7 @@ static int dht11_probe(struct platform_device *pdev) return -EINVAL; } - dht11->timestamp = ktime_get_boot_ns() - DHT11_DATA_VALID_TIME - 1; + dht11->timestamp = ktime_get_boottime_ns() - DHT11_DATA_VALID_TIME - 1; dht11->num_edges = -1; platform_set_drvdata(pdev, iio); diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 9df9785026da..298afc096a24 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -222,9 +222,9 @@ s64 iio_get_time_ns(const struct iio_dev *indio_dev) ktime_get_coarse_ts64(&tp); return timespec64_to_ns(&tp); case CLOCK_BOOTTIME: - return ktime_get_boot_ns(); + return ktime_get_boottime_ns(); case CLOCK_TAI: - return ktime_get_tai_ns(); + return ktime_get_clocktai_ns(); default: BUG(); } diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index f2d975c2659d..620a4075bcb4 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -310,7 +310,7 @@ static void aliasguid_query_handler(int status, if (status) { pr_debug("(port: %d) failed: status = %d\n", 
cb_ctx->port, status); - rec->time_to_run = ktime_get_boot_ns() + 1 * NSEC_PER_SEC; + rec->time_to_run = ktime_get_boottime_ns() + 1 * NSEC_PER_SEC; goto out; } @@ -416,7 +416,7 @@ next_entry: be64_to_cpu((__force __be64)rec->guid_indexes), be64_to_cpu((__force __be64)applied_guid_indexes), be64_to_cpu((__force __be64)declined_guid_indexes)); - rec->time_to_run = ktime_get_boot_ns() + + rec->time_to_run = ktime_get_boottime_ns() + resched_delay_sec * NSEC_PER_SEC; } else { rec->status = MLX4_GUID_INFO_STATUS_SET; @@ -709,7 +709,7 @@ static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, } } if (resched_delay_sec) { - u64 curr_time = ktime_get_boot_ns(); + u64 curr_time = ktime_get_boottime_ns(); *resched_delay_sec = (low_record_time < curr_time) ? 0 : div_u64((low_record_time - curr_time), NSEC_PER_SEC); diff --git a/drivers/misc/memory_state_time.c b/drivers/misc/memory_state_time.c index ba94dcf09169..3e449bfced41 100644 --- a/drivers/misc/memory_state_time.c +++ b/drivers/misc/memory_state_time.c @@ -129,7 +129,7 @@ static ssize_t show_stat_show(struct kobject *kobj, } } } - pr_debug("Current Time: %llu\n", ktime_get_boot_ns()); + pr_debug("Current Time: %llu\n", ktime_get_boottime_ns()); return len; } KERNEL_ATTR_RO(show_stat); @@ -212,7 +212,7 @@ static void memory_state_freq_update(struct memory_state_update_block *ub, return; INIT_WORK(&freq_container->update_state, freq_update_do_work); - freq_container->time_now = ktime_get_boot_ns(); + freq_container->time_now = ktime_get_boottime_ns(); freq_container->value = value; pr_debug("Scheduling freq update in work queue\n"); queue_work(memory_wq, &freq_container->update_state); @@ -234,7 +234,7 @@ static void memory_state_bw_update(struct memory_state_update_block *ub, return; INIT_WORK(&bw_container->update_state, bw_update_do_work); - bw_container->time_now = ktime_get_boot_ns(); + bw_container->time_now = ktime_get_boottime_ns(); bw_container->value = value; bw_container->id = ub->id; pr_debug("Scheduling bandwidth update in work queue\n"); @@ -400,7 +400,7 @@ static int memory_state_time_probe(struct platform_device *pdev) error = freq_buckets_init(&pdev->dev); if (error) return error; - last_update = ktime_get_boot_ns(); + last_update = ktime_get_boottime_ns(); init_success = true; pr_debug("memory_state_time initialized with num_freqs %d\n", diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rx.c b/drivers/net/wireless/intel/iwlwifi/mvm/rx.c index c31303d13069..f8a481818031 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rx.c @@ -489,7 +489,7 @@ void iwl_mvm_rx_rx_mpdu(struct iwl_mvm *mvm, struct napi_struct *napi, if (unlikely(ieee80211_is_beacon(hdr->frame_control) || ieee80211_is_probe_resp(hdr->frame_control))) - rx_status->boottime_ns = ktime_get_boot_ns(); + rx_status->boottime_ns = ktime_get_boottime_ns(); /* Take a reference briefly to kick off a d0i3 entry delay so * we can handle bursts of RX packets without toggling the diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index 1a12e829e98b..aff8b2ddd5d1 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -1046,7 +1046,7 @@ void iwl_mvm_rx_mpdu_mq(struct iwl_mvm *mvm, struct napi_struct *napi, if (unlikely(ieee80211_is_beacon(hdr->frame_control) || ieee80211_is_probe_resp(hdr->frame_control))) - rx_status->boottime_ns = ktime_get_boot_ns(); + rx_status->boottime_ns = 
ktime_get_boottime_ns(); } if (iwl_mvm_create_skb(mvm, skb, hdr, len, crypt_len, rxb)) { diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c index 3303fc85d76f..0b307410c640 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c @@ -1409,7 +1409,7 @@ void iwl_mvm_get_sync_time(struct iwl_mvm *mvm, u32 *gp2, u64 *boottime) } *gp2 = iwl_read_prph(mvm->trans, DEVICE_SYSTEM_TIME_REG); - *boottime = ktime_get_boot_ns(); + *boottime = ktime_get_boottime_ns(); if (!ps_disabled) { mvm->ps_disabled = ps_disabled; diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 1bad9ea40919..7f11f420129f 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1344,7 +1344,7 @@ static bool mac80211_hwsim_tx_frame_no_nl(struct ieee80211_hw *hw, */ if (ieee80211_is_beacon(hdr->frame_control) || ieee80211_is_probe_resp(hdr->frame_control)) { - rx_status.boottime_ns = ktime_get_boot_ns(); + rx_status.boottime_ns = ktime_get_boottime_ns(); now = data->abs_bcn_ts; } else { now = mac80211_hwsim_get_tsf_raw(); diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index 620885133e88..e21fe7c6c718 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -484,7 +484,7 @@ static int wlcore_fw_status(struct wl1271 *wl, struct wl_fw_status *status) } /* update the host-chipset time offset */ - wl->time_offset = (ktime_get_boot_ns() >> 10) - + wl->time_offset = (ktime_get_boottime_ns() >> 10) - (s64)(status->fw_localtime); wl->fw_fast_lnk_map = status->link_fast_bitmap; diff --git a/drivers/net/wireless/ti/wlcore/rx.c b/drivers/net/wireless/ti/wlcore/rx.c index 078a4940bc5c..cb4b17bfba23 100644 --- a/drivers/net/wireless/ti/wlcore/rx.c +++ b/drivers/net/wireless/ti/wlcore/rx.c @@ -107,7 +107,7 @@ static void wl1271_rx_status(struct wl1271 *wl, } if (beacon || probe_rsp) - status->boottime_ns = ktime_get_boot_ns(); + status->boottime_ns = ktime_get_boottime_ns(); if (beacon) wlcore_set_pending_regdomain_ch(wl, (u16)desc->channel, diff --git a/drivers/net/wireless/ti/wlcore/tx.c b/drivers/net/wireless/ti/wlcore/tx.c index 00e9b4624dcf..1d12b293c8a5 100644 --- a/drivers/net/wireless/ti/wlcore/tx.c +++ b/drivers/net/wireless/ti/wlcore/tx.c @@ -286,7 +286,7 @@ static void wl1271_tx_fill_hdr(struct wl1271 *wl, struct wl12xx_vif *wlvif, } /* configure packet life time */ - hosttime = (ktime_get_boot_ns() >> 10); + hosttime = (ktime_get_boottime_ns() >> 10); desc->start_time = cpu_to_le32(hosttime - wl->time_offset); is_dummy = wl12xx_is_dummy_packet(wl, skb); diff --git a/drivers/net/wireless/virt_wifi.c b/drivers/net/wireless/virt_wifi.c index c06b11c84732..ff4837a1dc7d 100644 --- a/drivers/net/wireless/virt_wifi.c +++ b/drivers/net/wireless/virt_wifi.c @@ -174,7 +174,7 @@ static void virt_wifi_scan_result(struct work_struct *work) scan_result.work); struct wiphy *wiphy = priv_to_wiphy(priv); struct cfg80211_scan_info scan_info = { .aborted = false }; - u64 tsf = div_u64(ktime_get_boot_ns(), 1000); + u64 tsf = div_u64(ktime_get_boottime_ns(), 1000); informed_bss = cfg80211_inform_bss(wiphy, &channel_5ghz, CFG80211_BSS_FTYPE_PRESP, diff --git a/drivers/staging/qca-wifi-host-cmn/qdf/linux/src/i_qdf_time.h b/drivers/staging/qca-wifi-host-cmn/qdf/linux/src/i_qdf_time.h index 7f0b41e914ce..c661b2a799bd 100644 --- 
a/drivers/staging/qca-wifi-host-cmn/qdf/linux/src/i_qdf_time.h +++ b/drivers/staging/qca-wifi-host-cmn/qdf/linux/src/i_qdf_time.h @@ -314,7 +314,7 @@ static inline uint64_t __qdf_get_log_timestamp(void) #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0)) static inline uint64_t __qdf_get_bootbased_boottime_ns(void) { - return ktime_get_boot_ns(); + return ktime_get_boottime_ns(); } #else diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index d5ff77dc5348..baf4d6b84d46 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -115,12 +115,12 @@ static inline u64 ktime_get_real_ns(void) return ktime_to_ns(ktime_get_real()); } -static inline u64 ktime_get_boot_ns(void) +static inline u64 ktime_get_boottime_ns(void) { return ktime_to_ns(ktime_get_boottime()); } -static inline u64 ktime_get_tai_ns(void) +static inline u64 ktime_get_clocktai_ns(void) { return ktime_to_ns(ktime_get_clocktai()); } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b0f3b530415e..e9b2b46f2fd4 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1935,7 +1935,7 @@ enum cfg80211_signal_type { * received by the device (not just by the host, in case it was * buffered on the device) and be accurate to about 10ms. * If the frame isn't buffered, just passing the return value of - * ktime_get_boot_ns() is likely appropriate. + * ktime_get_boottime_ns() is likely appropriate. * @parent_tsf: the time at the start of reception of the first octet of the * timestamp field of the frame. The time is the TSF of the BSS specified * by %parent_bssid. diff --git a/include/trace/events/net.h b/include/trace/events/net.h index a9869b69e96d..cfac7d3a471d 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -59,7 +59,7 @@ TRACE_EVENT(net_dev_start_xmit, __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_segs = skb_shinfo(skb)->gso_segs; __entry->gso_type = skb_shinfo(skb)->gso_type; - __entry->utctime = ktime_get_tai_ns(); + __entry->utctime = ktime_get_clocktai_ns(); ), @@ -93,7 +93,7 @@ TRACE_EVENT(net_receive_skb_exit, TP_fast_assign( __entry->skbaddr = skb; - __entry->utctime = ktime_get_tai_ns(); + __entry->utctime = ktime_get_clocktai_ns(); ), @@ -122,7 +122,7 @@ TRACE_EVENT(net_dev_xmit, __entry->len = skb_len; __entry->rc = rc; __assign_str(name, dev->name); - __entry->utctime = ktime_get_tai_ns(); + __entry->utctime = ktime_get_clocktai_ns(); ), TP_printk("dev=%s skbaddr=%pK len=%u rc=%d UTC: %ld", @@ -147,7 +147,7 @@ DECLARE_EVENT_CLASS(net_dev_template, __entry->skbaddr = skb; __entry->len = skb->len; __assign_str(name, skb->dev->name); - __entry->utctime = ktime_get_tai_ns(); + __entry->utctime = ktime_get_clocktai_ns(); ), TP_printk("dev=%s skbaddr=%pK len=%u UTC: %ld", @@ -223,7 +223,7 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, __entry->nr_frags = skb_shinfo(skb)->nr_frags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_type = skb_shinfo(skb)->gso_type; - __entry->utctime = ktime_get_tai_ns(); + __entry->utctime = ktime_get_clocktai_ns(); ), diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 72a3de6cf3ba..94e1c43bc584 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1682,7 +1682,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (err < 0) goto free_prog; - prog->aux->load_time = ktime_get_boot_ns(); + prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); if (err) goto free_prog; diff 
--git a/kernel/events/core.c b/kernel/events/core.c
index 0ff0ce06df10..d7ea0f35f34d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10557,11 +10557,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
 		break;
 
 	case CLOCK_BOOTTIME:
-		event->clock = &ktime_get_boot_ns;
+		event->clock = &ktime_get_boottime_ns;
 		break;
 
 	case CLOCK_TAI:
-		event->clock = &ktime_get_tai_ns;
+		event->clock = &ktime_get_clocktai_ns;
 		break;
 
 	default:
diff --git a/kernel/fork.c b/kernel/fork.c
index ce8aa2b8bf41..6e3637bc035a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2145,7 +2145,7 @@ static __latent_entropy struct task_struct *copy_process(
	 */
 	p->start_time = ktime_get_ns();
-	p->real_start_time = ktime_get_boot_ns();
+	p->real_start_time = ktime_get_boottime_ns();
 
 	/*
 	 * Make it visible to the rest of the system, but dont wake it up yet.

From 477ae2b7badf051eb2d6eede2a4e75a2ef4a4077 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani
Date: Fri, 4 Aug 2017 21:12:31 -0700
Subject: [PATCH 1266/1640] select: Use get/put_timespec64

Using these APIs and their compat versions makes the select family of
syscalls and their compat implementations simpler.

This is a preparatory patch to isolate data conversions to struct
timespec64 at userspace boundaries. This helps contain the changes
needed to transition to new y2038 safe types.

Change-Id: I5fa34f0baccc0eba709d450dd7616bb6e45613f2
Signed-off-by: Deepa Dinamani
Signed-off-by: Al Viro
---
 fs/select.c | 60 +++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 36 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index 7ceaa6754c96..4c6ade444327 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -292,8 +292,7 @@ static int poll_select_copy_remaining(struct timespec64 *end_time,
				      void __user *p,
				      int timeval, int ret)
 {
-	struct timespec64 rts64;
-	struct timespec rts;
+	struct timespec64 rts;
 	struct timeval rtv;
 
 	if (!p)
@@ -306,23 +305,22 @@ static int poll_select_copy_remaining(struct timespec64 *end_time,
 	if (!end_time->tv_sec && !end_time->tv_nsec)
 		return ret;
 
-	ktime_get_ts64(&rts64);
-	rts64 = timespec64_sub(*end_time, rts64);
-	if (rts64.tv_sec < 0)
-		rts64.tv_sec = rts64.tv_nsec = 0;
+	ktime_get_ts64(&rts);
+	rts = timespec64_sub(*end_time, rts);
+	if (rts.tv_sec < 0)
+		rts.tv_sec = rts.tv_nsec = 0;
 
-	rts = timespec64_to_timespec(rts64);
 	if (timeval) {
 		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
 			memset(&rtv, 0, sizeof(rtv));
-		rtv.tv_sec = rts64.tv_sec;
-		rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;
+		rtv.tv_sec = rts.tv_sec;
+		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
 		if (!copy_to_user(p, &rtv, sizeof(rtv)))
 			return ret;
-	} else if (!copy_to_user(p, &rts, sizeof(rts)))
+	} else if (!put_timespec64(&rts, p))
 		return ret;
 
 	/*
@@ -705,17 +703,15 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
		       const sigset_t __user *sigmask, size_t sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts;
-	struct timespec64 ts64, end_time, *to = NULL;
+	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
-		if (copy_from_user(&ts, tsp, sizeof(ts)))
+		if (get_timespec64(&ts, tsp))
 			return -EFAULT;
 
-		ts64 = timespec_to_timespec64(ts);
 		to = &end_time;
-		if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec))
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 			return -EINVAL;
 	}
 
@@ -1050,12 +1046,11 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
		size_t, sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
-	struct timespec
ts; - struct timespec64 end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1101,10 +1096,10 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) static -int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, +int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user *p, int timeval, int ret) { - struct timespec ts; + struct timespec64 ts; if (!p) return ret; @@ -1116,8 +1111,8 @@ int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!end_time->tv_sec && !end_time->tv_nsec) return ret; - ktime_get_ts(&ts); - ts = timespec_sub(*end_time, ts); + ktime_get_ts64(&ts); + ts = timespec64_sub(*end_time, ts); if (ts.tv_sec < 0) ts.tv_sec = ts.tv_nsec = 0; @@ -1130,12 +1125,7 @@ int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } else { - struct compat_timespec rts; - - rts.tv_sec = ts.tv_sec; - rts.tv_nsec = ts.tv_nsec; - - if (!copy_to_user(p, &rts, sizeof(rts))) + if (!compat_put_timespec64(&ts, p)) return ret; } /* @@ -1193,7 +1183,7 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, */ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct timespec *end_time) + struct timespec64 *end_time) { fd_set_bits fds; void *bits; @@ -1266,7 +1256,7 @@ COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct compat_timeval __user *, tvp) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; struct compat_timeval tv; int ret; @@ -1312,12 +1302,11 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, { compat_sigset_t ss32; sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (compat_get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1381,12 +1370,11 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, { compat_sigset_t ss32; sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (compat_get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; From a220c2eb3886177f801f00f806e0f879ca049a57 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 28 Aug 2017 08:21:53 -0400 Subject: [PATCH 1267/1640] timekeeping: Make fast accessors return 0 before timekeeping is initialized printk timestamps will be extended to include mono and boot time by using the fast timekeeping accessors ktime_get_mono|boot_fast_ns(). The functions can return garbage before timekeeping is initialized resulting in garbage timestamps. Initialize the fast timekeepers with dummy clocks which guarantee a 0 readout up to timekeeping_init(). 
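The observable effect, sketched for a hypothetical caller that runs before
timekeeping_init():

    /* Both snapshot bases now point at dummy_clock, which reads
     * cycles_at_suspend (still zero this early in boot), so the accessors
     * return 0 instead of garbage:
     */
    u64 mono = ktime_get_mono_fast_ns();    /* 0 until timekeeping_init() */
    u64 boot = ktime_get_boot_fast_ns();    /* likewise 0 */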
Suggested-by: Peter Zijlstra Change-Id: Ia92cfef7f56ea216c40bf18b32c1bf334a1aea72 Signed-off-by: Prarit Bhargava Signed-off-by: Thomas Gleixner Cc: Stephen Boyd Cc: John Stultz Link: http://lkml.kernel.org/r/1503922914-10660-2-git-send-email-prarit@redhat.com --- kernel/time/timekeeping.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5b10c4ac8d3d..b5694d411b86 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -67,8 +67,27 @@ struct tk_fast { struct tk_read_base base[2]; }; -static struct tk_fast tk_fast_mono ____cacheline_aligned; -static struct tk_fast tk_fast_raw ____cacheline_aligned; +/* Suspend-time cycles value for halted fast timekeeper. */ +static u64 cycles_at_suspend; + +static u64 dummy_clock_read(struct clocksource *cs) +{ + return cycles_at_suspend; +} + +static struct clocksource dummy_clock = { + .read = dummy_clock_read, +}; + +static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .base[0] = { .clock = &dummy_clock, }, + .base[1] = { .clock = &dummy_clock, }, +}; + +static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .base[0] = { .clock = &dummy_clock, }, + .base[1] = { .clock = &dummy_clock, }, +}; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -484,18 +503,6 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); -/* Suspend-time cycles value for halted fast timekeeper. */ -static u64 cycles_at_suspend; - -static u64 dummy_clock_read(struct clocksource *cs) -{ - return cycles_at_suspend; -} - -static struct clocksource dummy_clock = { - .read = dummy_clock_read, -}; - /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. From 1a13aa1be7d801998c82802f98bd0fa75776d015 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Aug 2017 17:12:48 +0200 Subject: [PATCH 1268/1640] timekeeping: Provide NMI safe access to clock realtime The configurable printk timestamping wants access to clock realtime. Right now there is no ktime_get_real_fast_ns() accessor because reading the monotonic base and the realtime offset cannot be done atomically. Contrary to boot time this offset can change during runtime and cause half updated readouts. struct tk_read_base was fully packed when the fast timekeeper access was implemented. commit ceea5e3771ed ("time: Fix clock->read(clock) race around clocksource changes") removed the 'read' function pointer from the structure, but of course left the comment stale. So now the structure can fit a new 64bit member w/o violating the cache line constraints. Add real_base to tk_read_base and update it in the fast timekeeper update sequence. Implement an accessor which follows the same scheme as the accessor to clock monotonic, but uses the new real_base to access clock real time. The runtime overhead for updating real_base is minimal as it just adds two cache hot values and stores them into an already dirtied cache line along with the other fast timekeeper updates. 
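A consumer then needs a single call that is valid in any context, including
NMI (sketch; the configurable printk timestamping mentioned above is the
intended user):

    u64 ns = ktime_get_real_fast_ns();      /* lockless, seqcount-latch based */
    struct timespec64 ts = ns_to_timespec64(ns);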
Change-Id: I67297bb116aba7400645dd593f83dba149ae65d3 Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: John Stultz Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/1505757060-2004-3-git-send-email-prarit@redhat.com --- include/linux/timekeeper_internal.h | 6 ++++- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 35 +++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 97154c61e5d2..7e9011101cb0 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -14,19 +14,22 @@ /** * struct tk_read_base - base structure for timekeeping readout * @clock: Current clocksource used for timekeeping. - * @read: Read function of @clock * @mask: Bitmask for two's complement subtraction of non 64bit clocks * @cycle_last: @clock cycle value at last update * @mult: (NTP adjusted) multiplier for scaled math conversion * @shift: Shift value for scaled math conversion * @xtime_nsec: Shifted (fractional) nano seconds offset for readout * @base: ktime_t (nanoseconds) base time for readout + * @base_real: Nanoseconds base value for clock REALTIME readout * * This struct has size 56 byte on 64 bit. Together with a seqcount it * occupies a single 64byte cache line. * * The struct is separate from struct timekeeper as it is also used * for a fast NMI safe accessors. + * + * @base_real is for the fast NMI safe accessor to allow reading clock + * realtime from any context. */ struct tk_read_base { struct clocksource *clock; @@ -36,6 +39,7 @@ struct tk_read_base { u32 shift; u64 xtime_nsec; ktime_t base; + u64 base_real; }; /** diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index baf4d6b84d46..acab1e03bb1f 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -133,6 +133,7 @@ static inline u64 ktime_get_raw_ns(void) extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_boot_fast_ns(void); +extern u64 ktime_get_real_fast_ns(void); /* * timespec64 interfaces utilizing the ktime based ones diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b5694d411b86..a65acae90dcd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -503,6 +503,39 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + +/* + * See comment for __ktime_get_fast_ns() vs. timestamp ordering + */ +static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount_latch(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base_real); + + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tk_clock_read(tkr), + tkr->cycle_last, + tkr->mask)); + } while (read_seqcount_retry(&tkf->seq, seq)); + + return now; +} + +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + */ +u64 ktime_get_real_fast_ns(void) +{ + return __ktime_get_real_fast_ns(&tk_fast_mono); +} + /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. 
@@ -521,6 +554,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk) memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tk_clock_read(tkr); tkr_dummy.clock = &dummy_clock; + tkr_dummy.base_real = tkr->base + tk->offs_real; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; @@ -668,6 +702,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); From d900f78f79b38e8e4d882689d753ddb22b635e0c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 13:14:45 +0200 Subject: [PATCH 1269/1640] timekeeping: Use timespec64 in timekeeping_inject_offset As part of changing all the timekeeping code to use 64-bit time_t consistently, this removes the uses of timeval and timespec as much as possible from do_adjtimex() and timekeeping_inject_offset(). The timeval_inject_offset_valid() and timespec_inject_offset_valid() just complicate this, so I'm folding them into the respective callers. This leaves the actual 'struct timex' definition, which is part of the user-space ABI and should be dealt with separately when we have agreed on the ABI change. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Change-Id: I7e05c6e81c919d6174a759af6d8a734265df6d01 Signed-off-by: Arnd Bergmann Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 72 ++++++++++++++------------------------- 1 file changed, 25 insertions(+), 47 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a65acae90dcd..3f9006af86a1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1305,65 +1305,37 @@ out: } EXPORT_SYMBOL(do_settimeofday64); -/* - * Validates if a timespec/timeval used to inject a time offset is valid. - * Offsets can be postive or negative. The value of the timeval/timespec - * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must - * always be non-negative. - */ -static inline bool timeval_inject_offset_valid(const struct timeval *tv) -{ - /* We don't check the tv_sec as it can be positive or negative */ - - /* Can't have more microseconds then a second */ - if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) - return false; - return true; -} - -static inline bool timespec_inject_offset_valid(const struct timespec *ts) -{ - /* We don't check the tv_sec as it can be positive or negative */ - - /* Can't have more nanoseconds then a second */ - if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) - return false; - return true; -} - /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @tv: pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. 
*/ -static int timekeeping_inject_offset(struct timespec *ts) +static int timekeeping_inject_offset(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - struct timespec64 ts64, tmp; + struct timespec64 tmp; int ret = 0; - if (!timespec_inject_offset_valid(ts)) + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - ts64 = timespec_to_timespec64(*ts); - raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* Make sure the proposed value is valid */ - tmp = timespec64_add(tk_xtime(tk), ts64); - if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 || + tmp = timespec64_add(tk_xtime(tk), *ts); + if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || !timespec64_valid_strict(&tmp)) { ret = -EINVAL; goto error; } - tk_xtime_add(tk, &ts64); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64)); + tk_xtime_add(tk, ts); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1402,7 +1374,7 @@ int persistent_clock_is_local; void timekeeping_warp_clock(void) { if (sys_tz.tz_minuteswest != 0) { - struct timespec adjust; + struct timespec64 adjust; persistent_clock_is_local = 1; adjust.tv_sec = sys_tz.tz_minuteswest * 60; @@ -2354,9 +2326,9 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, } /** - * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex + * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int ntp_validate_timex(struct timex *txc) +static int timekeeping_validate_timex(struct timex *txc) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2384,16 +2356,22 @@ static int ntp_validate_timex(struct timex *txc) if (!capable(CAP_SYS_TIME)) return -EPERM; + /* + * Validate if a timespec/timeval used to inject a time + * offset is valid. Offsets can be postive or negative, so + * we don't check tv_sec. The value of the timeval/timespec + * is the sum of its fields,but *NOTE*: + * The field tv_usec/tv_nsec must always be non-negative and + * we can't have more nanoseconds/microseconds than a second. 
+	 */
+	if (txc->time.tv_usec < 0)
+		return -EINVAL;
+
 		if (txc->modes & ADJ_NANO) {
-			struct timespec ts;
-
-			ts.tv_sec = txc->time.tv_sec;
-			ts.tv_nsec = txc->time.tv_usec;
-			if (!timespec_inject_offset_valid(&ts))
+			if (txc->time.tv_usec >= NSEC_PER_SEC)
 				return -EINVAL;
-		} else {
-			if (!timeval_inject_offset_valid(&txc->time))
+			if (txc->time.tv_usec >= USEC_PER_SEC)
 				return -EINVAL;
 		}
 	}
@@ -2439,12 +2417,12 @@ int do_adjtimex(struct timex *txc)
 	int ret;
 
 	/* Validate the data before disabling interrupts */
-	ret = ntp_validate_timex(txc);
+	ret = timekeeping_validate_timex(txc);
 	if (ret)
 		return ret;
 
 	if (txc->modes & ADJ_SETOFFSET) {
-		struct timespec delta;
+		struct timespec64 delta;
 		delta.tv_sec = txc->time.tv_sec;
 		delta.tv_nsec = txc->time.tv_usec;
 		if (!(txc->modes & ADJ_NANO))

From 297e763402364ef4fbe1030d788546d2ef38eb0c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Fri, 10 Nov 2017 16:25:04 +0100
Subject: [PATCH 1270/1640] pstore: Use ktime_get_real_fast_ns() instead of __getnstimeofday()

__getnstimeofday() is a rather odd interface, with a number of quirks:

- The caller may come from NMI context, but the implementation is not
  NMI safe, one way to get there from NMI is

      NMI handler:
        something bad
          panic()
            kmsg_dump()
              pstore_dump()
                pstore_record_init()
                  __getnstimeofday()

- The calling conventions are different from any other timekeeping
  functions, to deal with returning an error code during suspended
  timekeeping.

Address the above issues by using a completely different method to get the
time: ktime_get_real_fast_ns() is NMI safe and has a reasonable behavior
when timekeeping is suspended: it returns the time at which it got
suspended. As Thomas Gleixner explained, this is safe, as
ktime_get_real_fast_ns() does not call into the clocksource driver that
might be suspended. The result can easily be transformed into a timespec
structure. Since ktime_get_real_fast_ns() was not exported to modules, add
the export.

The pstore behavior for the suspended case changes slightly, as it now
stores the timestamp at which timekeeping was suspended instead of storing
a zero timestamp.

This change is not addressing y2038-safety, that's subject to a more
complex follow up patch.

Change-Id: Id259b99ce7a7cba52abad39f61420ec977e26e37
Signed-off-by: Arnd Bergmann
Signed-off-by: Thomas Gleixner
Acked-by: Kees Cook
Cc: Tony Luck
Cc: Anton Vorontsov
Cc: Stephen Boyd
Cc: John Stultz
Cc: Colin Cross
Link: https://lkml.kernel.org/r/20171110152530.1926955-1-arnd@arndb.de
---
 fs/pstore/platform.c      | 5 +----
 kernel/time/timekeeping.c | 1 +
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 8af9efa0ca0a..c44a53277018 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -483,10 +483,7 @@ void pstore_record_init(struct pstore_record *record,
 	record->psi = psinfo;
 
 	/* Report zeroed timestamp if called before timekeeping has resumed. */
-	if (__getnstimeofday(&record->time)) {
-		record->time.tv_sec = 0;
-		record->time.tv_nsec = 0;
-	}
+	record->time = ns_to_timespec(ktime_get_real_fast_ns());
 }
 
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3f9006af86a1..6365d1131ed5 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -535,6 +535,7 @@ u64 ktime_get_real_fast_ns(void)
 {
 	return __ktime_get_real_fast_ns(&tk_fast_mono);
 }
+EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
 
 /**
  * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
From 3a3cdd1a9cc21c8016fa2d4c9bff328adeb22b20 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 14 May 2018 15:50:52 -0700 Subject: [PATCH 1271/1640] pstore: Convert internal records to timespec64 This prepares pstore for converting the VFS layer to timespec64. Change-Id: Id2dfb977f0e3d41a935eb03344555eeace394875 Signed-off-by: Kees Cook Signed-off-by: Deepa Dinamani --- drivers/firmware/efi/efi-pstore.c | 27 ++++++++++++++------------- fs/pstore/inode.c | 3 ++- fs/pstore/platform.c | 2 +- fs/pstore/ram.c | 21 ++++++++++++++------- include/linux/pstore.h | 2 +- 5 files changed, 32 insertions(+), 23 deletions(-) diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index 4277147f7140..0f7d97917197 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -28,10 +28,9 @@ static int efi_pstore_close(struct pstore_info *psi) return 0; } -static inline u64 generic_id(unsigned long timestamp, - unsigned int part, int count) +static inline u64 generic_id(u64 timestamp, unsigned int part, int count) { - return ((u64) timestamp * 100 + part) * 1000 + count; + return (timestamp * 100 + part) * 1000 + count; } static int efi_pstore_read_func(struct efivar_entry *entry, @@ -42,7 +41,8 @@ static int efi_pstore_read_func(struct efivar_entry *entry, int i; int cnt; unsigned int part; - unsigned long time, size; + unsigned long size; + u64 time; if (efi_guidcmp(entry->var.VendorGuid, vendor)) return 0; @@ -50,7 +50,7 @@ static int efi_pstore_read_func(struct efivar_entry *entry, for (i = 0; i < DUMP_NAME_LEN; i++) name[i] = entry->var.VariableName[i]; - if (sscanf(name, "dump-type%u-%u-%d-%lu-%c", + if (sscanf(name, "dump-type%u-%u-%d-%llu-%c", &record->type, &part, &cnt, &time, &data_type) == 5) { record->id = generic_id(time, part, cnt); record->part = part; @@ -62,7 +62,7 @@ static int efi_pstore_read_func(struct efivar_entry *entry, else record->compressed = false; record->ecc_notice_size = 0; - } else if (sscanf(name, "dump-type%u-%u-%d-%lu", + } else if (sscanf(name, "dump-type%u-%u-%d-%llu", &record->type, &part, &cnt, &time) == 4) { record->id = generic_id(time, part, cnt); record->part = part; @@ -71,7 +71,7 @@ static int efi_pstore_read_func(struct efivar_entry *entry, record->time.tv_nsec = 0; record->compressed = false; record->ecc_notice_size = 0; - } else if (sscanf(name, "dump-type%u-%u-%lu", + } else if (sscanf(name, "dump-type%u-%u-%llu", &record->type, &part, &time) == 3) { /* * Check if an old format, @@ -250,9 +250,10 @@ static int efi_pstore_write(struct pstore_record *record) /* Since we copy the entire length of name, make sure it is wiped. */ memset(name, 0, sizeof(name)); - snprintf(name, sizeof(name), "dump-type%u-%u-%d-%lu-%c", + snprintf(name, sizeof(name), "dump-type%u-%u-%d-%lld-%c", record->type, record->part, record->count, - record->time.tv_sec, record->compressed ? 'C' : 'D'); + (long long)record->time.tv_sec, + record->compressed ? 
'C' : 'D'); for (i = 0; i < DUMP_NAME_LEN; i++) efi_name[i] = name[i]; @@ -326,15 +327,15 @@ static int efi_pstore_erase(struct pstore_record *record) char name[DUMP_NAME_LEN]; int ret; - snprintf(name, sizeof(name), "dump-type%u-%u-%d-%lu", + snprintf(name, sizeof(name), "dump-type%u-%u-%d-%lld", record->type, record->part, record->count, - record->time.tv_sec); + (long long)record->time.tv_sec); ret = efi_pstore_erase_name(name); if (ret != -ENOENT) return ret; - snprintf(name, sizeof(name), "dump-type%u-%u-%lu", - record->type, record->part, record->time.tv_sec); + snprintf(name, sizeof(name), "dump-type%u-%u-%lld", + record->type, record->part, (long long)record->time.tv_sec); ret = efi_pstore_erase_name(name); return ret; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d814723fb27d..d4747c552404 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -392,7 +392,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) inode->i_private = private; if (record->time.tv_sec) - inode->i_mtime = inode->i_ctime = record->time; + inode->i_mtime = inode->i_ctime = + timespec64_to_timespec(record->time); d_add(dentry, inode); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index c44a53277018..1100bef3081b 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -483,7 +483,7 @@ void pstore_record_init(struct pstore_record *record, record->psi = psinfo; /* Report zeroed timestamp if called before timekeeping has resumed. */ - record->time = ns_to_timespec(ktime_get_real_fast_ns()); + record->time = ns_to_timespec64(ktime_get_real_fast_ns()); } /* diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 66e55bc795e3..73aa3c297701 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -38,6 +38,11 @@ #define RAMOOPS_KERNMSG_HDR "====" #define MIN_MEM_SIZE 4096UL +#if __BITS_PER_LONG == 64 +# define TVSEC_FMT "%ld" +#else +# define TVSEC_FMT "%lld" +#endif static ulong record_size = MIN_MEM_SIZE; module_param(record_size, ulong, 0400); @@ -153,21 +158,23 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, return prz; } -static int ramoops_read_kmsg_hdr(char *buffer, struct timespec *time, +static int ramoops_read_kmsg_hdr(char *buffer, struct timespec64 *time, bool *compressed) { char data_type; int header_length = 0; - if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n%n", &time->tv_sec, - &time->tv_nsec, &data_type, &header_length) == 3) { + if (sscanf(buffer, RAMOOPS_KERNMSG_HDR TVSEC_FMT ".%lu-%c\n%n", + &time->tv_sec, &time->tv_nsec, &data_type, + &header_length) == 3) { if (data_type == 'C') *compressed = true; else *compressed = false; - } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu\n%n", - &time->tv_sec, &time->tv_nsec, &header_length) == 2) { - *compressed = false; + } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR TVSEC_FMT ".%lu\n%n", + &time->tv_sec, &time->tv_nsec, + &header_length) == 2) { + *compressed = false; } else { time->tv_sec = 0; time->tv_nsec = 0; @@ -360,7 +367,7 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, char *hdr; size_t len; - hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n", + hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR TVSEC_FMT ".%lu-%c\n", record->time.tv_sec, record->time.tv_nsec / 1000, record->compressed ? 
'C' : 'D'); diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 70913ec87785..de9093d6e660 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -71,7 +71,7 @@ struct pstore_record { struct pstore_info *psi; enum pstore_type_id type; u64 id; - struct timespec time; + struct timespec64 time; char *buf; ssize_t size; ssize_t ecc_notice_size; From 795401cf595a815de4f29a95f4c94872df0caa2f Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 8 May 2018 19:36:02 -0700 Subject: [PATCH 1272/1640] vfs: change inode times to use struct timespec64 struct timespec is not y2038 safe. Transition vfs to use y2038 safe struct timespec64 instead. The change was made with the help of the following cocinelle script. This catches about 80% of the changes. All the header file and logic changes are included in the first 5 rules. The rest are trivial substitutions. I avoid changing any of the function signatures or any other filesystem specific data structures to keep the patch simple for review. The script can be a little shorter by combining different cases. But, this version was sufficient for my usecase. virtual patch @ depends on patch @ identifier now; @@ - struct timespec + struct timespec64 current_time ( ... ) { - struct timespec now = current_kernel_time(); + struct timespec64 now = current_kernel_time64(); ... - return timespec_trunc( + return timespec64_trunc( ... ); } @ depends on patch @ identifier xtime; @@ struct \( iattr \| inode \| kstat \) { ... - struct timespec xtime; + struct timespec64 xtime; ... } @ depends on patch @ identifier t; @@ struct inode_operations { ... int (*update_time) (..., - struct timespec t, + struct timespec64 t, ...); ... } @ depends on patch @ identifier t; identifier fn_update_time =~ "update_time$"; @@ fn_update_time (..., - struct timespec *t, + struct timespec64 *t, ...) { ... } @ depends on patch @ identifier t; @@ lease_get_mtime( ... , - struct timespec *t + struct timespec64 *t ) { ... } @te depends on patch forall@ identifier ts; local idexpression struct inode *inode_node; identifier i_xtime =~ "^i_[acm]time$"; identifier ia_xtime =~ "^ia_[acm]time$"; identifier fn_update_time =~ "update_time$"; identifier fn; expression e, E3; local idexpression struct inode *node1; local idexpression struct inode *node2; local idexpression struct iattr *attr1; local idexpression struct iattr *attr2; local idexpression struct iattr attr; identifier i_xtime1 =~ "^i_[acm]time$"; identifier i_xtime2 =~ "^i_[acm]time$"; identifier ia_xtime1 =~ "^ia_[acm]time$"; identifier ia_xtime2 =~ "^ia_[acm]time$"; @@ ( ( - struct timespec ts; + struct timespec64 ts; | - struct timespec ts = current_time(inode_node); + struct timespec64 ts = current_time(inode_node); ) <+... when != ts ( - timespec_equal(&inode_node->i_xtime, &ts) + timespec64_equal(&inode_node->i_xtime, &ts) | - timespec_equal(&ts, &inode_node->i_xtime) + timespec64_equal(&ts, &inode_node->i_xtime) | - timespec_compare(&inode_node->i_xtime, &ts) + timespec64_compare(&inode_node->i_xtime, &ts) | - timespec_compare(&ts, &inode_node->i_xtime) + timespec64_compare(&ts, &inode_node->i_xtime) | ts = current_time(e) | fn_update_time(..., &ts,...) | inode_node->i_xtime = ts | node1->i_xtime = ts | ts = inode_node->i_xtime | <+... attr1->ia_xtime ...+> = ts | ts = attr1->ia_xtime | ts.tv_sec | ts.tv_nsec | btrfs_set_stack_timespec_sec(..., ts.tv_sec) | btrfs_set_stack_timespec_nsec(..., ts.tv_nsec) | - ts = timespec64_to_timespec( + ts = ... 
-) | - ts = ktime_to_timespec( + ts = ktime_to_timespec64( ...) | - ts = E3 + ts = timespec_to_timespec64(E3) | - ktime_get_real_ts(&ts) + ktime_get_real_ts64(&ts) | fn(..., - ts + timespec64_to_timespec(ts) ,...) ) ...+> ( <... when != ts - return ts; + return timespec64_to_timespec(ts); ...> ) | - timespec_equal(&node1->i_xtime1, &node2->i_xtime2) + timespec64_equal(&node1->i_xtime2, &node2->i_xtime2) | - timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2) + timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2) | - timespec_compare(&node1->i_xtime1, &node2->i_xtime2) + timespec64_compare(&node1->i_xtime1, &node2->i_xtime2) | node1->i_xtime1 = - timespec_trunc(attr1->ia_xtime1, + timespec64_trunc(attr1->ia_xtime1, ...) | - attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2, + attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2, ...) | - ktime_get_real_ts(&attr1->ia_xtime1) + ktime_get_real_ts64(&attr1->ia_xtime1) | - ktime_get_real_ts(&attr.ia_xtime1) + ktime_get_real_ts64(&attr.ia_xtime1) ) @ depends on patch @ struct inode *node; struct iattr *attr; identifier fn; identifier i_xtime =~ "^i_[acm]time$"; identifier ia_xtime =~ "^ia_[acm]time$"; expression e; @@ ( - fn(node->i_xtime); + fn(timespec64_to_timespec(node->i_xtime)); | fn(..., - node->i_xtime); + timespec64_to_timespec(node->i_xtime)); | - e = fn(attr->ia_xtime); + e = fn(timespec64_to_timespec(attr->ia_xtime)); ) @ depends on patch forall @ struct inode *node; struct iattr *attr; identifier i_xtime =~ "^i_[acm]time$"; identifier ia_xtime =~ "^ia_[acm]time$"; identifier fn; @@ { + struct timespec ts; <+... ( + ts = timespec64_to_timespec(node->i_xtime); fn (..., - &node->i_xtime, + &ts, ...); | + ts = timespec64_to_timespec(attr->ia_xtime); fn (..., - &attr->ia_xtime, + &ts, ...); ) ...+> } @ depends on patch forall @ struct inode *node; struct iattr *attr; struct kstat *stat; identifier ia_xtime =~ "^ia_[acm]time$"; identifier i_xtime =~ "^i_[acm]time$"; identifier xtime =~ "^[acm]time$"; identifier fn, ret; @@ { + struct timespec ts; <+... ( + ts = timespec64_to_timespec(node->i_xtime); ret = fn (..., - &node->i_xtime, + &ts, ...); | + ts = timespec64_to_timespec(node->i_xtime); ret = fn (..., - &node->i_xtime); + &ts); | + ts = timespec64_to_timespec(attr->ia_xtime); ret = fn (..., - &attr->ia_xtime, + &ts, ...); | + ts = timespec64_to_timespec(attr->ia_xtime); ret = fn (..., - &attr->ia_xtime); + &ts); | + ts = timespec64_to_timespec(stat->xtime); ret = fn (..., - &stat->xtime); + &ts); ) ...+> } @ depends on patch @ struct inode *node; struct inode *node2; identifier i_xtime1 =~ "^i_[acm]time$"; identifier i_xtime2 =~ "^i_[acm]time$"; identifier i_xtime3 =~ "^i_[acm]time$"; struct iattr *attrp; struct iattr *attrp2; struct iattr attr ; identifier ia_xtime1 =~ "^ia_[acm]time$"; identifier ia_xtime2 =~ "^ia_[acm]time$"; struct kstat *stat; struct kstat stat1; struct timespec64 ts; identifier xtime =~ "^[acmb]time$"; expression e; @@ ( ( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ; | node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \); | node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \); | node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) 
\); | stat->xtime = node2->i_xtime1; | stat1.xtime = node2->i_xtime1; | ( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ; | ( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2; | - e = node->i_xtime1; + e = timespec64_to_timespec( node->i_xtime1 ); | - e = attrp->ia_xtime1; + e = timespec64_to_timespec( attrp->ia_xtime1 ); | node->i_xtime1 = current_time(...); | node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = - e; + timespec_to_timespec64(e); | node->i_xtime1 = node->i_xtime3 = - e; + timespec_to_timespec64(e); | - node->i_xtime1 = e; + node->i_xtime1 = timespec_to_timespec64(e); ) Signed-off-by: Deepa Dinamani Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Change-Id: I4f16fc85ebf37bc034518b00c331080aa65aa600 --- drivers/tty/tty_io.c | 15 ++++++++-- drivers/usb/gadget/function/f_fs.c | 2 +- fs/adfs/inode.c | 7 +++-- fs/attr.c | 14 +++++----- fs/bad_inode.c | 2 +- fs/btrfs/file.c | 6 ++-- fs/btrfs/inode.c | 8 +++--- fs/btrfs/ioctl.c | 4 +-- fs/btrfs/root-tree.c | 4 +-- fs/btrfs/transaction.c | 2 +- fs/ceph/addr.c | 12 ++++---- fs/ceph/cache.c | 2 +- fs/ceph/caps.c | 6 ++-- fs/ceph/file.c | 6 ++-- fs/ceph/inode.c | 44 ++++++++++++++++-------------- fs/ceph/mds_client.c | 7 +++-- fs/ceph/snap.c | 6 ++-- fs/cifs/cache.c | 4 +-- fs/cifs/inode.c | 26 ++++++++++-------- fs/coda/coda_linux.c | 12 ++++---- fs/configfs/inode.c | 12 ++++---- fs/cramfs/inode.c | 2 +- fs/ext4/ext4.h | 34 +++++++++++++---------- fs/ext4/ialloc.c | 4 +-- fs/ext4/namei.c | 2 +- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 12 ++++---- fs/f2fs/inode.c | 12 ++++---- fs/f2fs/namei.c | 4 +-- fs/fat/inode.c | 20 ++++++++++---- fs/fat/namei_msdos.c | 21 ++++++++------ fs/fat/namei_vfat.c | 22 +++++++++------ fs/fuse/inode.c | 2 +- fs/gfs2/dir.c | 6 ++-- fs/gfs2/glops.c | 4 +-- fs/hfs/inode.c | 4 +-- fs/hfsplus/inode.c | 12 ++++---- fs/hostfs/hostfs_kern.c | 12 ++++---- fs/inode.c | 34 +++++++++++------------ fs/jffs2/dir.c | 18 ++++++------ fs/jffs2/file.c | 2 +- fs/jffs2/fs.c | 12 ++++---- fs/kernfs/dir.c | 4 +-- fs/kernfs/inode.c | 8 +++--- fs/locks.c | 2 +- fs/nfs/callback_proc.c | 4 +-- fs/nfs/fscache-index.c | 4 +-- fs/nfs/inode.c | 34 +++++++++++++---------- fs/nfs/nfs2xdr.c | 25 ++++++++++------- fs/nfs/nfs3xdr.c | 8 ++++-- fs/nfs/nfs4xdr.c | 9 +++--- fs/nfsd/blocklayout.c | 8 ++++-- fs/nfsd/nfs3xdr.c | 10 +++++-- fs/nfsd/nfs4xdr.c | 7 +++-- fs/nfsd/nfsxdr.c | 2 +- fs/ntfs/inode.c | 30 ++++++++++---------- fs/ocfs2/dlmglue.c | 20 ++++++++++---- fs/ocfs2/file.c | 6 ++-- fs/overlayfs/inode.c | 2 +- fs/overlayfs/overlayfs.h | 2 +- fs/proc/uptime.c | 2 +- fs/pstore/inode.c | 3 +- fs/reiserfs/namei.c | 2 +- fs/reiserfs/xattr.c | 4 +-- fs/ubifs/dir.c | 4 +-- fs/ubifs/file.c | 23 ++++++++-------- fs/ubifs/ubifs.h | 2 +- fs/udf/ialloc.c | 4 +-- fs/udf/inode.c | 43 +++++++++++++++++++---------- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_iops.c | 2 +- fs/xfs/xfs_trans_inode.c | 2 +- include/linux/fs.h | 23 ++++++++-------- include/linux/stat.h | 8 +++--- 74 files changed, 423 insertions(+), 327 deletions(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index bf834387fb0d..9b83b078667b 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -868,8 +868,13 @@ static ssize_t tty_read(struct file *file, char __user *buf, size_t count, i = -EIO; tty_ldisc_deref(ld); - if (i > 0) - tty_update_time(&inode->i_atime); + if (i > 0) { + struct timespec ts; + + ts = timespec64_to_timespec(inode->i_atime); + tty_update_time(&ts); + inode->i_atime = 
timespec_to_timespec64(ts); + } return i; } @@ -970,7 +975,11 @@ static inline ssize_t do_tty_write( cond_resched(); } if (written) { - tty_update_time(&file_inode(file)->i_mtime); + struct timespec ts; + + ts = timespec64_to_timespec(file_inode(file)->i_mtime); + tty_update_time(&ts); + file_inode(file)->i_mtime = timespec_to_timespec64(ts); ret = written; } out: diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 3d55cf1a0cf1..d2f08c194053 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1469,7 +1469,7 @@ ffs_sb_make_inode(struct super_block *sb, void *data, inode = new_inode(sb); if (likely(inode)) { - struct timespec ts = current_time(inode); + struct timespec64 ts = current_time(inode); inode->i_ino = get_next_ino(); inode->i_mode = perms->mode; diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 8dbd36f5e581..c836c425ca94 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -199,7 +199,7 @@ adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) return; cur_time: - *tv = current_time(inode); + *tv = timespec64_to_timespec(current_time(inode)); return; too_early: @@ -242,6 +242,7 @@ adfs_unix2adfs_time(struct inode *inode, unsigned int secs) struct inode * adfs_iget(struct super_block *sb, struct object_info *obj) { + struct timespec ts; struct inode *inode; inode = new_inode(sb); @@ -270,7 +271,9 @@ adfs_iget(struct super_block *sb, struct object_info *obj) ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000); inode->i_mode = adfs_atts2mode(sb, inode); - adfs_adfs2unix_time(&inode->i_mtime, inode); + ts = timespec64_to_timespec(inode->i_mtime); + adfs_adfs2unix_time(&ts, inode); + inode->i_mtime = timespec_to_timespec64(ts); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; diff --git a/fs/attr.c b/fs/attr.c index 784d1625bf44..8b270ac51fe4 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -166,14 +166,14 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; if (ia_valid & ATTR_ATIME) - inode->i_atime = timespec_trunc(attr->ia_atime, - inode->i_sb->s_time_gran); + inode->i_atime = timespec64_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_MTIME) - inode->i_mtime = timespec_trunc(attr->ia_mtime, - inode->i_sb->s_time_gran); + inode->i_mtime = timespec64_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_CTIME) - inode->i_ctime = timespec_trunc(attr->ia_ctime, - inode->i_sb->s_time_gran); + inode->i_ctime = timespec64_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; @@ -210,7 +210,7 @@ int notify_change2(struct vfsmount *mnt, struct dentry * dentry, struct iattr * struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; int error; - struct timespec now; + struct timespec64 now; unsigned int ia_valid = attr->ia_valid; WARN_ON_ONCE(!inode_is_locked(inode)); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 213b51dbbb60..125e8bbd22a2 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -126,7 +126,7 @@ static int bad_inode_fiemap(struct inode *inode, return -EIO; } -static int bad_inode_update_time(struct inode *inode, struct timespec *time, +static int bad_inode_update_time(struct inode *inode, struct timespec64 *time, int flags) { return -EIO; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2f386d8dbd0e..01c8fab75777 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1852,16 +1852,16 
@@ out: static void update_time_for_write(struct inode *inode) { - struct timespec now; + struct timespec64 now; if (IS_NOCMTIME(inode)) return; now = current_time(inode); - if (!timespec_equal(&inode->i_mtime, &now)) + if (!timespec64_equal(&inode->i_mtime, &now)) inode->i_mtime = now; - if (!timespec_equal(&inode->i_ctime, &now)) + if (!timespec64_equal(&inode->i_ctime, &now)) inode->i_ctime = now; if (IS_I_VERSION(inode)) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f95f9934d1bc..276116c5c64c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5907,7 +5907,7 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; - BTRFS_I(inode)->i_otime = inode->i_mtime; + BTRFS_I(inode)->i_otime = timespec64_to_timespec(inode->i_mtime); return inode; } @@ -6257,7 +6257,7 @@ static int btrfs_dirty_inode(struct inode *inode) * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes. */ -static int btrfs_update_time(struct inode *inode, struct timespec *now, +static int btrfs_update_time(struct inode *inode, struct timespec64 *now, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -6511,7 +6511,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; - BTRFS_I(inode)->i_otime = inode->i_mtime; + BTRFS_I(inode)->i_otime = timespec64_to_timespec(inode->i_mtime); inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); @@ -9821,7 +9821,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; - struct timespec ctime = current_time(old_inode); + struct timespec64 ctime = current_time(old_inode); struct dentry *parent; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 71849ca061fb..a2ca196283a6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -431,7 +431,7 @@ static noinline int create_subvol(struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; - struct timespec cur_time = current_time(dir); + struct timespec64 cur_time = current_time(dir); struct inode *inode; int ret; int err; @@ -5210,7 +5210,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; - struct timespec ct = current_time(inode); + struct timespec64 ct = current_time(inode); int ret = 0; int received_uuid_changed; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 7bae7cff150e..fafbd1f317e0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -510,9 +510,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_root_item *item = &root->root_item; - struct timespec ct; + struct timespec64 ct; - ktime_get_real_ts(&ct); + ktime_get_real_ts64(&ct); spin_lock(&root->root_item_lock); btrfs_set_root_ctransid(item, trans->transid); btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); diff --git a/fs/btrfs/transaction.c 
b/fs/btrfs/transaction.c index 77563b200744..2b006a62af85 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1460,7 +1460,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; - struct timespec cur_time; + struct timespec64 cur_time; int ret = 0; u64 to_reserve = 0; u64 index = 0; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index db00d49b5a15..2de502fba315 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -560,6 +560,7 @@ static u64 get_writepages_data_length(struct inode *inode, */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { + struct timespec ts; struct inode *inode; struct ceph_inode_info *ci; struct ceph_fs_client *fsc; @@ -612,11 +613,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); + ts = timespec64_to_timespec(inode->i_mtime); err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, snapc, page_off, len, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, - &inode->i_mtime, &page, 1); + &ts, &page, 1); if (err < 0) { struct writeback_control tmp_wbc; if (!wbc) @@ -1114,7 +1116,7 @@ new_request: pages = NULL; } - req->r_mtime = inode->i_mtime; + req->r_mtime = timespec64_to_timespec(inode->i_mtime); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); req = NULL; @@ -1713,7 +1715,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out; } - req->r_mtime = inode->i_mtime; + req->r_mtime = timespec64_to_timespec(inode->i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1755,7 +1757,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out_put; } - req->r_mtime = inode->i_mtime; + req->r_mtime = timespec64_to_timespec(inode->i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1916,7 +1918,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, 0, false, true); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - wr_req->r_mtime = ci->vfs_inode.i_mtime; + wr_req->r_mtime = timespec64_to_timespec(ci->vfs_inode.i_mtime); wr_req->r_abort_on_full = true; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a3ab265d3215..e2591e99c1a5 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -184,7 +184,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( memset(&aux, 0, sizeof(aux)); aux.version = ci->i_version; - aux.mtime = inode->i_mtime; + aux.mtime = timespec64_to_timespec(inode->i_mtime); aux.size = i_size_read(inode); if (memcmp(data, &aux, sizeof(aux)) != 0) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6146b89bfffe..2653fa2aeeef 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1242,9 +1242,9 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.xattr_buf = NULL; } - arg.mtime = inode->i_mtime; - arg.atime = inode->i_atime; - arg.ctime = inode->i_ctime; + arg.mtime = timespec64_to_timespec(inode->i_mtime); + arg.atime = timespec64_to_timespec(inode->i_atime); + arg.ctime = timespec64_to_timespec(inode->i_ctime); arg.op = op; arg.caps = cap->implemented; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 
ddfa6ce3a0fb..d4f498bef7a4 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -843,7 +843,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, int num_pages = 0; int flags; int ret; - struct timespec mtime = current_time(inode); + struct timespec mtime = timespec64_to_timespec(current_time(inode)); size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; bool write = iov_iter_rw(iter) == WRITE; @@ -1052,7 +1052,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, int flags; int ret; bool check_caps = false; - struct timespec mtime = current_time(inode); + struct timespec mtime = timespec64_to_timespec(current_time(inode)); size_t count = iov_iter_count(from); if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) @@ -1575,7 +1575,7 @@ static int ceph_zero_partial_object(struct inode *inode, goto out; } - req->r_mtime = inode->i_mtime; + req->r_mtime = timespec64_to_timespec(inode->i_mtime); ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { ret = ceph_osdc_wait_request(&fsc->client->osdc, req); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4dd7c77dcf9e..0606c7bb6677 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -650,6 +650,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, struct timespec *mtime, struct timespec *atime) { struct ceph_inode_info *ci = ceph_inode(inode); + struct timespec64 ctime64 = timespec_to_timespec64(*ctime); + struct timespec64 mtime64 = timespec_to_timespec64(*mtime); + struct timespec64 atime64 = timespec_to_timespec64(*atime); int warn = 0; if (issued & (CEPH_CAP_FILE_EXCL| @@ -657,11 +660,11 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_FILE_BUFFER| CEPH_CAP_AUTH_EXCL| CEPH_CAP_XATTR_EXCL)) { - if (timespec_compare(ctime, &inode->i_ctime) > 0) { + if (timespec64_compare(&ctime64, &inode->i_ctime) > 0) { dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, ctime->tv_sec, ctime->tv_nsec); - inode->i_ctime = *ctime; + inode->i_ctime = ctime64; } if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { /* the MDS did a utimes() */ @@ -671,24 +674,24 @@ void ceph_fill_file_time(struct inode *inode, int issued, mtime->tv_sec, mtime->tv_nsec, ci->i_time_warp_seq, (int)time_warp_seq); - inode->i_mtime = *mtime; - inode->i_atime = *atime; + inode->i_mtime = mtime64; + inode->i_atime = atime64; ci->i_time_warp_seq = time_warp_seq; } else if (time_warp_seq == ci->i_time_warp_seq) { /* nobody did utimes(); take the max */ - if (timespec_compare(mtime, &inode->i_mtime) > 0) { + if (timespec64_compare(&mtime64, &inode->i_mtime) > 0) { dout("mtime %ld.%09ld -> %ld.%09ld inc\n", inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, mtime->tv_sec, mtime->tv_nsec); - inode->i_mtime = *mtime; + inode->i_mtime = mtime64; } - if (timespec_compare(atime, &inode->i_atime) > 0) { + if (timespec64_compare(&atime64, &inode->i_atime) > 0) { dout("atime %ld.%09ld -> %ld.%09ld inc\n", inode->i_atime.tv_sec, inode->i_atime.tv_nsec, atime->tv_sec, atime->tv_nsec); - inode->i_atime = *atime; + inode->i_atime = atime64; } } else if (issued & CEPH_CAP_FILE_EXCL) { /* we did a utimes(); ignore mds values */ @@ -698,9 +701,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, } else { /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { - inode->i_ctime = *ctime; - inode->i_mtime = *mtime; - inode->i_atime = *atime; + inode->i_ctime = ctime64; + inode->i_mtime = 
mtime64; + inode->i_atime = atime64; ci->i_time_warp_seq = time_warp_seq; } else { warn = 1; @@ -1914,6 +1917,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) int err = 0; int inode_dirty_flags = 0; bool lock_snap_rwsem = false; + struct timespec ts; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) @@ -1996,14 +2000,14 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) inode->i_atime = attr->ia_atime; dirtied |= CEPH_CAP_FILE_EXCL; } else if ((issued & CEPH_CAP_FILE_WR) && - timespec_compare(&inode->i_atime, + timespec64_compare(&inode->i_atime, &attr->ia_atime) < 0) { inode->i_atime = attr->ia_atime; dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - !timespec_equal(&inode->i_atime, &attr->ia_atime)) { - ceph_encode_timespec(&req->r_args.setattr.atime, - &attr->ia_atime); + !timespec64_equal(&inode->i_atime, &attr->ia_atime)) { + ts = timespec64_to_timespec(attr->ia_atime); + ceph_encode_timespec(&req->r_args.setattr.atime, &ts); mask |= CEPH_SETATTR_ATIME; release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; @@ -2018,14 +2022,14 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) inode->i_mtime = attr->ia_mtime; dirtied |= CEPH_CAP_FILE_EXCL; } else if ((issued & CEPH_CAP_FILE_WR) && - timespec_compare(&inode->i_mtime, + timespec64_compare(&inode->i_mtime, &attr->ia_mtime) < 0) { inode->i_mtime = attr->ia_mtime; dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) { - ceph_encode_timespec(&req->r_args.setattr.mtime, - &attr->ia_mtime); + !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) { + ts = timespec64_to_timespec(attr->ia_mtime); + ceph_encode_timespec(&req->r_args.setattr.mtime, &ts); mask |= CEPH_SETATTR_MTIME; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; @@ -2099,7 +2103,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; - req->r_stamp = attr->ia_ctime; + req->r_stamp = timespec64_to_timespec(attr->ia_ctime); err = ceph_mdsc_do_request(mdsc, NULL, req); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 06109314d93c..56d444263dfe 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2953,12 +2953,15 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v2.pathbase = cpu_to_le64(pathbase); rec.v2.flock_len = 0; } else { + struct timespec ts; rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v1.issued = cpu_to_le32(cap->issued); rec.v1.size = cpu_to_le64(inode->i_size); - ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); - ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); + ts = timespec64_to_timespec(inode->i_mtime); + ceph_encode_timespec(&rec.v1.mtime, &ts); + ts = timespec64_to_timespec(inode->i_atime); + ceph_encode_timespec(&rec.v1.atime, &ts); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v1.pathbase = cpu_to_le64(pathbase); } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index ec917ad13595..2160f728056e 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -601,9 +601,9 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, BUG_ON(capsnap->writing); capsnap->size = inode->i_size; - capsnap->mtime = inode->i_mtime; - capsnap->atime = inode->i_atime; - capsnap->ctime = inode->i_ctime; + 
capsnap->mtime = timespec64_to_timespec(inode->i_mtime); + capsnap->atime = timespec64_to_timespec(inode->i_atime); + capsnap->ctime = timespec64_to_timespec(inode->i_ctime); capsnap->time_warp_seq = ci->i_time_warp_seq; capsnap->truncate_size = ci->i_truncate_size; capsnap->truncate_seq = ci->i_truncate_seq; diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 2c14020e5e1d..3659c5987e8e 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -283,8 +283,8 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, memset(&auxdata, 0, sizeof(auxdata)); auxdata.eof = cifsi->server_eof; - auxdata.last_write_time = cifsi->vfs_inode.i_mtime; - auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + auxdata.last_write_time = timespec64_to_timespec(cifsi->vfs_inode.i_mtime); + auxdata.last_change_time = timespec64_to_timespec(cifsi->vfs_inode.i_ctime); if (memcmp(data, &auxdata, datalen) != 0) return FSCACHE_CHECKAUX_OBSOLETE; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b76e73395299..5b8f4614a9ff 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -95,6 +95,7 @@ static void cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct timespec ts; cifs_dbg(FYI, "%s: revalidating inode %llu\n", __func__, cifs_i->uniqueid); @@ -113,7 +114,8 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) } /* revalidate if mtime or size have changed */ - if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) && + ts = timespec64_to_timespec(inode->i_mtime); + if (timespec_equal(&ts, &fattr->cf_mtime) && cifs_i->server_eof == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", __func__, cifs_i->uniqueid); @@ -162,9 +164,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) cifs_revalidate_cache(inode, fattr); spin_lock(&inode->i_lock); - inode->i_atime = fattr->cf_atime; - inode->i_mtime = fattr->cf_mtime; - inode->i_ctime = fattr->cf_ctime; + inode->i_atime = timespec_to_timespec64(fattr->cf_atime); + inode->i_mtime = timespec_to_timespec64(fattr->cf_mtime); + inode->i_ctime = timespec_to_timespec64(fattr->cf_ctime); inode->i_rdev = fattr->cf_rdev; cifs_nlink_fattr_to_inode(inode, fattr); inode->i_uid = fattr->cf_uid; @@ -1148,14 +1150,14 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, if (attrs->ia_valid & ATTR_ATIME) { set_time = true; info_buf.LastAccessTime = - cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime)); + cpu_to_le64(cifs_UnixTimeToNT(timespec64_to_timespec(attrs->ia_atime))); } else info_buf.LastAccessTime = 0; if (attrs->ia_valid & ATTR_MTIME) { set_time = true; info_buf.LastWriteTime = - cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime)); + cpu_to_le64(cifs_UnixTimeToNT(timespec64_to_timespec(attrs->ia_mtime))); } else info_buf.LastWriteTime = 0; @@ -1168,7 +1170,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, if (set_time && (attrs->ia_valid & ATTR_CTIME)) { cifs_dbg(FYI, "CIFS - CTIME changed\n"); info_buf.ChangeTime = - cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); + cpu_to_le64(cifs_UnixTimeToNT(timespec64_to_timespec(attrs->ia_ctime))); } else info_buf.ChangeTime = 0; @@ -2093,8 +2095,8 @@ int cifs_getattr(const struct path *path, struct kstat *stat, /* old CIFS Unix Extensions doesn't return create time */ if (CIFS_I(inode)->createtime) { stat->result_mask |= STATX_BTIME; - stat->btime = - cifs_NTtimeToUnix(cpu_to_le64(CIFS_I(inode)->createtime)); + stat->btime = timespec_to_timespec64( + 
cifs_NTtimeToUnix(cpu_to_le64(CIFS_I(inode)->createtime))); } stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED); @@ -2305,17 +2307,17 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) args->gid = INVALID_GID; /* no change */ if (attrs->ia_valid & ATTR_ATIME) - args->atime = cifs_UnixTimeToNT(attrs->ia_atime); + args->atime = cifs_UnixTimeToNT(timespec64_to_timespec(attrs->ia_atime)); else args->atime = NO_CHANGE_64; if (attrs->ia_valid & ATTR_MTIME) - args->mtime = cifs_UnixTimeToNT(attrs->ia_mtime); + args->mtime = cifs_UnixTimeToNT(timespec64_to_timespec(attrs->ia_mtime)); else args->mtime = NO_CHANGE_64; if (attrs->ia_valid & ATTR_CTIME) - args->ctime = cifs_UnixTimeToNT(attrs->ia_ctime); + args->ctime = cifs_UnixTimeToNT(timespec64_to_timespec(attrs->ia_ctime)); else args->ctime = NO_CHANGE_64; diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index ca599df0dcb1..f3d543dd9a98 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -105,11 +105,11 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_size != -1) inode->i_blocks = (attr->va_size + 511) >> 9; if (attr->va_atime.tv_sec != -1) - inode->i_atime = attr->va_atime; + inode->i_atime = timespec_to_timespec64(attr->va_atime); if (attr->va_mtime.tv_sec != -1) - inode->i_mtime = attr->va_mtime; + inode->i_mtime = timespec_to_timespec64(attr->va_mtime); if (attr->va_ctime.tv_sec != -1) - inode->i_ctime = attr->va_ctime; + inode->i_ctime = timespec_to_timespec64(attr->va_ctime); } @@ -175,13 +175,13 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) vattr->va_size = iattr->ia_size; } if ( valid & ATTR_ATIME ) { - vattr->va_atime = iattr->ia_atime; + vattr->va_atime = timespec64_to_timespec(iattr->ia_atime); } if ( valid & ATTR_MTIME ) { - vattr->va_mtime = iattr->ia_mtime; + vattr->va_mtime = timespec64_to_timespec(iattr->ia_mtime); } if ( valid & ATTR_CTIME ) { - vattr->va_ctime = iattr->ia_ctime; + vattr->va_ctime = timespec64_to_timespec(iattr->ia_ctime); } } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index ad718e5e37bb..28ef9e528853 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -90,14 +90,14 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) if (ia_valid & ATTR_GID) sd_iattr->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) - sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, - inode->i_sb->s_time_gran); + sd_iattr->ia_atime = timespec64_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_MTIME) - sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, - inode->i_sb->s_time_gran); + sd_iattr->ia_mtime = timespec64_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_CTIME) - sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, - inode->i_sb->s_time_gran); + sd_iattr->ia_ctime = timespec64_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_MODE) { umode_t mode = iattr->ia_mode; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 011c6f53dcda..573b4a4fa87b 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -81,7 +81,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, const struct cramfs_inode *cramfs_inode, unsigned int offset) { struct inode *inode; - static struct timespec zerotime; + static struct timespec64 zerotime; inode = iget_locked(sb, cramino(cramfs_inode, offset)); if (!inode) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0ef34c1d15ff..5c79b040bfe3 100644 --- a/fs/ext4/ext4.h 
+++ b/fs/ext4/ext4.h @@ -864,12 +864,14 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; } -#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ -do { \ - (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ - (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&(inode)->xtime); \ +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ + struct timespec ts = timespec64_to_timespec((inode)->xtime); \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&ts); \ + } \ } while (0) #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ @@ -881,16 +883,20 @@ do { \ ext4_encode_extra_time(&(einode)->xtime); \ } while (0) -#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ -do { \ - (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ - ext4_decode_extra_time(&(inode)->xtime, \ - raw_inode->xtime ## _extra); \ - else \ - (inode)->xtime.tv_nsec = 0; \ +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + struct timespec ts = timespec64_to_timespec((inode)->xtime); \ + ext4_decode_extra_time(&ts, \ + raw_inode->xtime ## _extra); \ + (inode)->xtime = timespec_to_timespec64(ts); \ + } \ + else \ + (inode)->xtime.tv_nsec = 0; \ } while (0) + #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 2d953f097e6a..9821435544ba 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1125,8 +1125,8 @@ got: inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = - current_time(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + ei->i_crtime = timespec64_to_timespec(inode->i_mtime); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 9b5e5aea8c26..5238799b4064 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -4149,7 +4149,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, }; u8 new_file_type; int retval; - struct timespec ctime; + struct timespec64 ctime; if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(new_dir)->i_projid, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c497290298ca..13f0129c71eb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3161,6 +3161,7 @@ static inline bool f2fs_is_time_consistent(struct inode *inode) static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + struct timespec ts; bool ret; if (dsync) { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8327036b2055..a6e0fb1972b9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -887,14 +887,14 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; if (ia_valid & ATTR_ATIME) - inode->i_atime = 
timespec_trunc(attr->ia_atime, - inode->i_sb->s_time_gran); + inode->i_atime = timespec64_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_MTIME) - inode->i_mtime = timespec_trunc(attr->ia_mtime, - inode->i_sb->s_time_gran); + inode->i_mtime = timespec64_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_CTIME) - inode->i_ctime = timespec_trunc(attr->ia_ctime, - inode->i_sb->s_time_gran); + inode->i_ctime = timespec64_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2090a95edc27..e2b77cd0ea97 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -460,9 +460,9 @@ static int do_read_inode(struct inode *inode) } } - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[0] = timespec64_to_timespec(inode->i_atime); + F2FS_I(inode)->i_disk_time[1] = timespec64_to_timespec(inode->i_ctime); + F2FS_I(inode)->i_disk_time[2] = timespec64_to_timespec(inode->i_mtime); F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; if (unlikely((inode->i_mode & S_IFMT) == 0)) { @@ -651,9 +651,9 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_inline_node(node_page); - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[0] = timespec64_to_timespec(inode->i_atime); + F2FS_I(inode)->i_disk_time[1] = timespec64_to_timespec(inode->i_ctime); + F2FS_I(inode)->i_disk_time[2] = timespec64_to_timespec(inode->i_mtime); F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; #ifdef CONFIG_F2FS_CHECK_FS diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 81083e9d9c7c..2a742a62afb9 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -53,8 +53,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (IS_I_VERSION(inode)) inode->i_version++; - inode->i_mtime = inode->i_atime = inode->i_ctime = - F2FS_I(inode)->i_crtime = current_time(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + F2FS_I(inode)->i_crtime = timespec64_to_timespec(inode->i_mtime); inode->i_generation = prandom_u32(); if (S_ISDIR(inode->i_mode)) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 209f836792cd..e4b2bf7bf52e 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -504,6 +504,7 @@ static int fat_validate_dir(struct inode *dir) /* doesn't deal with root inode */ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { + struct timespec ts; struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int error; @@ -554,11 +555,14 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; - fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); + fat_time_fat2unix(sbi, &ts, de->time, de->date, 0); + inode->i_mtime = timespec_to_timespec64(ts); if (sbi->options.isvfat) { - fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime, + fat_time_fat2unix(sbi, &ts, de->ctime, de->cdate, de->ctime_cs); - fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); + inode->i_ctime = timespec_to_timespec64(ts); + fat_time_fat2unix(sbi, &ts, 0, de->adate, 0); + inode->i_atime = timespec_to_timespec64(ts); } else inode->i_ctime = inode->i_atime = 
inode->i_mtime; @@ -846,6 +850,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) static int __fat_write_inode(struct inode *inode, int wait) { + struct timespec ts; struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; @@ -883,13 +888,16 @@ retry: raw_entry->size = cpu_to_le32(inode->i_size); raw_entry->attr = fat_make_attrs(inode); fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart); - fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time, + ts = timespec64_to_timespec(inode->i_mtime); + fat_time_unix2fat(sbi, &ts, &raw_entry->time, &raw_entry->date, NULL); if (sbi->options.isvfat) { __le16 atime; - fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime, + ts = timespec64_to_timespec(inode->i_ctime); + fat_time_unix2fat(sbi, &ts, &raw_entry->ctime, &raw_entry->cdate, &raw_entry->ctime_cs); - fat_time_unix2fat(sbi, &inode->i_atime, &atime, + ts = timespec64_to_timespec(inode->i_atime); + fat_time_unix2fat(sbi, &ts, &atime, &raw_entry->adate, NULL); } spin_unlock(&sbi->inode_hash_lock); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 7d6a105d601b..a14241c675b9 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -249,7 +249,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, if (err) return err; - dir->i_ctime = dir->i_mtime = *ts; + dir->i_ctime = dir->i_mtime = timespec_to_timespec64(*ts); if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else @@ -265,7 +265,8 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct super_block *sb = dir->i_sb; struct inode *inode = NULL; struct fat_slot_info sinfo; - struct timespec ts; + struct timespec64 ts; + struct timespec t; unsigned char msdos_name[MSDOS_NAME]; int err, is_hid; @@ -284,7 +285,8 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, } ts = current_time(dir); - err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo); + t = timespec64_to_timespec(ts); + err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &t, &sinfo); if (err) goto out; inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); @@ -347,7 +349,8 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct fat_slot_info sinfo; struct inode *inode; unsigned char msdos_name[MSDOS_NAME]; - struct timespec ts; + struct timespec64 ts; + struct timespec t; int err, is_hid, cluster; mutex_lock(&MSDOS_SB(sb)->s_lock); @@ -365,12 +368,13 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } ts = current_time(dir); - cluster = fat_alloc_new_dir(dir, &ts); + t = timespec64_to_timespec(ts); + cluster = fat_alloc_new_dir(dir, &t); if (cluster < 0) { err = cluster; goto out; } - err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo); + err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &t, &sinfo); if (err) goto out_free; inc_nlink(dir); @@ -435,7 +439,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, struct msdos_dir_entry *dotdot_de; struct inode *old_inode, *new_inode; struct fat_slot_info old_sinfo, sinfo; - struct timespec ts; + struct timespec64 ts; loff_t new_i_pos; int err, old_attrs, is_dir, update_dotdot, corrupt = 0; @@ -502,8 +506,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, new_i_pos = MSDOS_I(new_inode)->i_pos; fat_detach(new_inode); } else { + struct timespec t = timespec64_to_timespec(ts); err = msdos_add_entry(new_dir, new_name, 
is_dir, is_hid, 0, - &ts, &sinfo); + &t, &sinfo); if (err) goto out; new_i_pos = sinfo.i_pos; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 8ef01a0b1f94..be09be9b6867 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -678,7 +678,7 @@ static int vfat_add_entry(struct inode *dir, const struct qstr *qname, goto cleanup; /* update timestamp */ - dir->i_ctime = dir->i_mtime = dir->i_atime = *ts; + dir->i_ctime = dir->i_mtime = dir->i_atime = timespec_to_timespec64(*ts); if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else @@ -784,13 +784,15 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct super_block *sb = dir->i_sb; struct inode *inode; struct fat_slot_info sinfo; - struct timespec ts; + struct timespec64 ts; + struct timespec t; int err; mutex_lock(&MSDOS_SB(sb)->s_lock); ts = current_time(dir); - err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); + t = timespec64_to_timespec(ts); + err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &t, &sinfo); if (err) goto out; dir->i_version++; @@ -873,18 +875,20 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct super_block *sb = dir->i_sb; struct inode *inode; struct fat_slot_info sinfo; - struct timespec ts; + struct timespec64 ts; + struct timespec t; int err, cluster; mutex_lock(&MSDOS_SB(sb)->s_lock); ts = current_time(dir); - cluster = fat_alloc_new_dir(dir, &ts); + t = timespec64_to_timespec(ts); + cluster = fat_alloc_new_dir(dir, &t); if (cluster < 0) { err = cluster; goto out; } - err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo); + err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &t, &sinfo); if (err) goto out_free; dir->i_version++; @@ -922,7 +926,8 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct msdos_dir_entry *dotdot_de; struct inode *old_inode, *new_inode; struct fat_slot_info old_sinfo, sinfo; - struct timespec ts; + struct timespec64 ts; + struct timespec t; loff_t new_i_pos; int err, is_dir, update_dotdot, corrupt = 0; struct super_block *sb = old_dir->i_sb; @@ -957,8 +962,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, new_i_pos = MSDOS_I(new_inode)->i_pos; fat_detach(new_inode); } else { + t = timespec64_to_timespec(ts); err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0, - &ts, &sinfo); + &t, &sinfo); if (err) goto out; new_i_pos = sinfo.i_pos; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3e5ca5b7c92a..26a5df719f10 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -223,7 +223,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, return; } - old_mtime = inode->i_mtime; + old_mtime = timespec64_to_timespec(inode->i_mtime); fuse_change_attributes_common(inode, attr, attr_valid); oldsize = inode->i_size; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 06a0d1947c77..cb8dea65ec65 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -872,7 +872,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct buffer_head *bh; struct gfs2_leaf *leaf; struct gfs2_dirent *dent; - struct timespec tv = current_time(inode); + struct timespec64 tv = current_time(inode); error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) @@ -1803,7 +1803,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *bh = da->bh; struct gfs2_dirent *dent = da->dent; - struct timespec tv; + struct timespec64 tv; struct 
gfs2_leaf *leaf; int error; @@ -1881,7 +1881,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; - struct timespec tv = current_time(&dip->i_inode); + struct timespec64 tv = current_time(&dip->i_inode); /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 4838e26c06f7..17b0b294412c 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -335,7 +335,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); const struct gfs2_dinode *str = buf; - struct timespec atime; + struct timespec64 atime; u16 height, depth; if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) @@ -358,7 +358,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); - if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0) + if (timespec64_compare(&ip->i_inode.i_atime, &atime) < 0) ip->i_inode.i_atime = atime; ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index fb416af35896..4410f3e96013 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -354,7 +354,7 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_mode &= ~hsb->s_file_umask; inode->i_mode |= S_IFREG; inode->i_ctime = inode->i_atime = inode->i_mtime = - hfs_m_to_utime(rec->file.MdDat); + timespec_to_timespec64(hfs_m_to_utime(rec->file.MdDat)); inode->i_op = &hfs_file_inode_operations; inode->i_fop = &hfs_file_operations; inode->i_mapping->a_ops = &hfs_aops; @@ -365,7 +365,7 @@ static int hfs_read_inode(struct inode *inode, void *data) HFS_I(inode)->fs_blocks = 0; inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); inode->i_ctime = inode->i_atime = inode->i_mtime = - hfs_m_to_utime(rec->dir.MdDat); + timespec_to_timespec64(hfs_m_to_utime(rec->dir.MdDat)); inode->i_op = &hfs_dir_inode_operations; inode->i_fop = &hfs_dir_operations; break; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4924a489c8ac..16b1418d4f54 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -498,9 +498,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfsplus_get_perms(inode, &folder->permissions, 1); set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); - inode->i_atime = hfsp_mt2ut(folder->access_date); - inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); - inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); + inode->i_atime = timespec_to_timespec64(hfsp_mt2ut(folder->access_date)); + inode->i_mtime = timespec_to_timespec64(hfsp_mt2ut(folder->content_mod_date)); + inode->i_ctime = timespec_to_timespec64(hfsp_mt2ut(folder->attribute_mod_date)); HFSPLUS_I(inode)->create_date = folder->create_date; HFSPLUS_I(inode)->fs_blocks = 0; if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { @@ -539,9 +539,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) init_special_inode(inode, inode->i_mode, be32_to_cpu(file->permissions.dev)); } - inode->i_atime = hfsp_mt2ut(file->access_date); - inode->i_mtime = hfsp_mt2ut(file->content_mod_date); - inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); + 
inode->i_atime = timespec_to_timespec64(hfsp_mt2ut(file->access_date)); + inode->i_mtime = timespec_to_timespec64(hfsp_mt2ut(file->content_mod_date)); + inode->i_ctime = timespec_to_timespec64(hfsp_mt2ut(file->attribute_mod_date)); HFSPLUS_I(inode)->create_date = file->create_date; } else { pr_err("bad catalog entry used to create inode\n"); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index c148e7f4f451..6ad66e985d59 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -555,9 +555,9 @@ static int read_name(struct inode *ino, char *name) set_nlink(ino, st.nlink); i_uid_write(ino, st.uid); i_gid_write(ino, st.gid); - ino->i_atime = st.atime; - ino->i_mtime = st.mtime; - ino->i_ctime = st.ctime; + ino->i_atime = timespec_to_timespec64(st.atime); + ino->i_mtime = timespec_to_timespec64(st.mtime); + ino->i_ctime = timespec_to_timespec64(st.ctime); ino->i_size = st.size; ino->i_blocks = st.blocks; return 0; @@ -838,15 +838,15 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_ATIME) { attrs.ia_valid |= HOSTFS_ATTR_ATIME; - attrs.ia_atime = attr->ia_atime; + attrs.ia_atime = timespec64_to_timespec(attr->ia_atime); } if (attr->ia_valid & ATTR_MTIME) { attrs.ia_valid |= HOSTFS_ATTR_MTIME; - attrs.ia_mtime = attr->ia_mtime; + attrs.ia_mtime = timespec64_to_timespec(attr->ia_mtime); } if (attr->ia_valid & ATTR_CTIME) { attrs.ia_valid |= HOSTFS_ATTR_CTIME; - attrs.ia_ctime = attr->ia_ctime; + attrs.ia_ctime = timespec64_to_timespec(attr->ia_ctime); } if (attr->ia_valid & ATTR_ATIME_SET) { attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET; diff --git a/fs/inode.c b/fs/inode.c index 7aabf51462ea..83669a3b0a60 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1615,8 +1615,8 @@ static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode, if (upperdentry) { struct inode *realinode = d_inode(upperdentry); - if ((!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || - !timespec_equal(&inode->i_ctime, &realinode->i_ctime))) { + if ((!timespec64_equal(&inode->i_mtime, &realinode->i_mtime) || + !timespec64_equal(&inode->i_ctime, &realinode->i_ctime))) { inode->i_mtime = realinode->i_mtime; inode->i_ctime = realinode->i_ctime; } @@ -1639,12 +1639,12 @@ static int relatime_need_update(const struct path *path, struct inode *inode, /* * Is mtime younger than atime? If yes, update atime: */ - if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0) + if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0) return 1; /* * Is ctime younger than atime? If yes, update atime: */ - if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0) + if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0) return 1; /* @@ -1659,7 +1659,7 @@ static int relatime_need_update(const struct path *path, struct inode *inode, return 0; } -int generic_update_time(struct inode *inode, struct timespec *time, int flags) +int generic_update_time(struct inode *inode, struct timespec64 *time, int flags) { int iflags = I_DIRTY_TIME; @@ -1683,9 +1683,9 @@ EXPORT_SYMBOL(generic_update_time); * This does the actual work of updating an inodes time or version. Must have * had called mnt_want_write() before calling this. */ -static int update_time(struct inode *inode, struct timespec *time, int flags) +static int update_time(struct inode *inode, struct timespec64 *time, int flags) { - int (*update_time)(struct inode *, struct timespec *, int); + int (*update_time)(struct inode *, struct timespec64 *, int); update_time = inode->i_op->update_time ? 
inode->i_op->update_time : generic_update_time; @@ -1706,7 +1706,7 @@ bool __atime_needs_update(const struct path *path, struct inode *inode, bool rcu) { struct vfsmount *mnt = path->mnt; - struct timespec now; + struct timespec64 now; if (inode->i_flags & S_NOATIME) return false; @@ -1729,10 +1729,10 @@ bool __atime_needs_update(const struct path *path, struct inode *inode, now = current_time(inode); - if (!relatime_need_update(path, inode, now, rcu)) + if (!relatime_need_update(path, inode, timespec64_to_timespec(now), rcu)) return false; - if (timespec_equal(&inode->i_atime, &now)) + if (timespec64_equal(&inode->i_atime, &now)) return false; return true; @@ -1742,7 +1742,7 @@ void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = d_inode(path->dentry); - struct timespec now; + struct timespec64 now; if (!__atime_needs_update(path, inode, false)) return; @@ -1881,7 +1881,7 @@ EXPORT_SYMBOL(file_remove_privs); int file_update_time(struct file *file) { struct inode *inode = file_inode(file); - struct timespec now; + struct timespec64 now; int sync_it = 0; int need_sync = 0; int ret; @@ -1891,10 +1891,10 @@ int file_update_time(struct file *file) return 0; now = current_time(inode); - if (!timespec_equal(&inode->i_mtime, &now)) + if (!timespec64_equal(&inode->i_mtime, &now)) sync_it = S_MTIME; - if (!timespec_equal(&inode->i_ctime, &now)) + if (!timespec64_equal(&inode->i_ctime, &now)) sync_it |= S_CTIME; /* iversion impacts on "write" performance. This code just filter inodes @@ -2165,16 +2165,16 @@ EXPORT_SYMBOL(inode_nohighmem); * Note that inode and inode->sb cannot be NULL. * Otherwise, the function warns and returns time without truncation. */ -struct timespec current_time(struct inode *inode) +struct timespec64 current_time(struct inode *inode) { - struct timespec now = current_kernel_time(); + struct timespec64 now = current_kernel_time64(); if (unlikely(!inode->i_sb)) { WARN(1, "current_time() called with uninitialized super_block in the inode"); return now; } - return timespec_trunc(now, inode->i_sb->s_time_gran); + return timespec64_trunc(now, inode->i_sb->s_time_gran); } EXPORT_SYMBOL(current_time); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index f4a5ec92f5dc..9258e90bb1ae 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -201,7 +201,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, if (ret) goto fail; - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(je32_to_cpu(ri->ctime))); jffs2_free_raw_inode(ri); @@ -234,7 +234,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) if (dead_f->inocache) set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink); if (!ret) - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(now)); return ret; } /***********************************************************************/ @@ -268,7 +268,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink); mutex_unlock(&f->sem); d_instantiate(dentry, d_inode(old_dentry)); - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(now)); ihold(d_inode(old_dentry)); } return ret; @@ -418,7 +418,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char goto fail; } - dir_i->i_mtime = dir_i->i_ctime = 
ITIME(je32_to_cpu(rd->mctime)); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(je32_to_cpu(rd->mctime))); jffs2_free_raw_dirent(rd); @@ -561,7 +561,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(je32_to_cpu(rd->mctime))); inc_nlink(dir_i); jffs2_free_raw_dirent(rd); @@ -602,7 +602,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, f, now); if (!ret) { - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(now)); clear_nlink(d_inode(dentry)); drop_nlink(dir_i); } @@ -737,7 +737,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + dir_i->i_mtime = dir_i->i_ctime = timespec_to_timespec64(ITIME(je32_to_cpu(rd->mctime))); jffs2_free_raw_dirent(rd); @@ -857,14 +857,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, * caller won't do it on its own since we are returning an error. */ d_invalidate(new_dentry); - new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); + new_dir_i->i_mtime = new_dir_i->i_ctime = timespec_to_timespec64(ITIME(now)); return ret; } if (d_is_dir(old_dentry)) drop_nlink(old_dir_i); - new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now); + new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = timespec_to_timespec64(ITIME(now)); return 0; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 221eb2bd205e..bce8fa1b32d3 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -318,7 +318,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, inode->i_size = pos + writtenlen; inode->i_blocks = (inode->i_size + 511) >> 9; - inode->i_ctime = inode->i_mtime = ITIME(je32_to_cpu(ri->ctime)); + inode->i_ctime = inode->i_mtime = timespec_to_timespec64(ITIME(je32_to_cpu(ri->ctime))); } } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index dd7c6fbd2cc5..dc43f61d7e8d 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -146,9 +146,9 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) return PTR_ERR(new_metadata); } /* It worked. 
Update the inode */ - inode->i_atime = ITIME(je32_to_cpu(ri->atime)); - inode->i_ctime = ITIME(je32_to_cpu(ri->ctime)); - inode->i_mtime = ITIME(je32_to_cpu(ri->mtime)); + inode->i_atime = timespec_to_timespec64(ITIME(je32_to_cpu(ri->atime))); + inode->i_ctime = timespec_to_timespec64(ITIME(je32_to_cpu(ri->ctime))); + inode->i_mtime = timespec_to_timespec64(ITIME(je32_to_cpu(ri->mtime))); inode->i_mode = jemode_to_cpu(ri->mode); i_uid_write(inode, je16_to_cpu(ri->uid)); i_gid_write(inode, je16_to_cpu(ri->gid)); @@ -280,9 +280,9 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) i_uid_write(inode, je16_to_cpu(latest_node.uid)); i_gid_write(inode, je16_to_cpu(latest_node.gid)); inode->i_size = je32_to_cpu(latest_node.isize); - inode->i_atime = ITIME(je32_to_cpu(latest_node.atime)); - inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); - inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); + inode->i_atime = timespec_to_timespec64(ITIME(je32_to_cpu(latest_node.atime))); + inode->i_mtime = timespec_to_timespec64(ITIME(je32_to_cpu(latest_node.mtime))); + inode->i_ctime = timespec_to_timespec64(ITIME(je32_to_cpu(latest_node.ctime))); set_nlink(inode, f->inocache->pino_nlink); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index da25f36827eb..066cf82df21e 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -785,7 +785,7 @@ int kernfs_add_one(struct kernfs_node *kn) ps_iattr = parent->iattr; if (ps_iattr) { struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ktime_get_real_ts(&ps_iattrs->ia_ctime); + ktime_get_real_ts64(&ps_iattrs->ia_ctime); ps_iattrs->ia_mtime = ps_iattrs->ia_ctime; } @@ -1311,7 +1311,7 @@ static void __kernfs_remove(struct kernfs_node *kn) /* update timestamps on the parent */ if (ps_iattr) { - ktime_get_real_ts(&ps_iattr->ia_iattr.ia_ctime); + ktime_get_real_ts64(&ps_iattr->ia_iattr.ia_ctime); ps_iattr->ia_iattr.ia_mtime = ps_iattr->ia_iattr.ia_ctime; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index a34303981deb..3d73fe9d56e2 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -52,7 +52,7 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) iattrs->ia_uid = GLOBAL_ROOT_UID; iattrs->ia_gid = GLOBAL_ROOT_GID; - ktime_get_real_ts(&iattrs->ia_atime); + ktime_get_real_ts64(&iattrs->ia_atime); iattrs->ia_mtime = iattrs->ia_atime; iattrs->ia_ctime = iattrs->ia_atime; @@ -176,9 +176,9 @@ static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) struct super_block *sb = inode->i_sb; inode->i_uid = iattr->ia_uid; inode->i_gid = iattr->ia_gid; - inode->i_atime = timespec_trunc(iattr->ia_atime, sb->s_time_gran); - inode->i_mtime = timespec_trunc(iattr->ia_mtime, sb->s_time_gran); - inode->i_ctime = timespec_trunc(iattr->ia_ctime, sb->s_time_gran); + inode->i_atime = timespec64_trunc(iattr->ia_atime, sb->s_time_gran); + inode->i_mtime = timespec64_trunc(iattr->ia_mtime, sb->s_time_gran); + inode->i_ctime = timespec64_trunc(iattr->ia_ctime, sb->s_time_gran); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) diff --git a/fs/locks.c b/fs/locks.c index 0b09c1bbf8b8..3d6884a54fd2 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1562,7 +1562,7 @@ EXPORT_SYMBOL(__break_lease); * exclusive leases. The justification is that if someone has an * exclusive lease, then they could be modifying it. 
*/ -void lease_get_mtime(struct inode *inode, struct timespec *time) +void lease_get_mtime(struct inode *inode, struct timespec64 *time) { bool has_lease = false; struct file_lock_context *ctx; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 825b3166605d..fe82f1bae47f 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -54,8 +54,8 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, res->change_attr = delegation->change_attr; if (nfs_have_writebacks(inode)) res->change_attr++; - res->ctime = inode->i_ctime; - res->mtime = inode->i_mtime; + res->ctime = timespec64_to_timespec(inode->i_ctime); + res->mtime = timespec64_to_timespec(inode->i_mtime); res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & args->bitmap[0]; res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 3025fe8584a0..2ac0fc0ff8e0 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -239,8 +239,8 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, memset(&auxdata, 0, sizeof(auxdata)); auxdata.size = nfsi->vfs_inode.i_size; - auxdata.mtime = nfsi->vfs_inode.i_mtime; - auxdata.ctime = nfsi->vfs_inode.i_ctime; + auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime); + auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime); if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) auxdata.change_attr = nfsi->vfs_inode.i_version; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ac836e202ff7..ae078829c200 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -496,15 +496,15 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; + inode->i_atime = timespec_to_timespec64(fattr->atime); else if (nfs_server_capable(inode, NFS_CAP_ATIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; + inode->i_mtime = timespec_to_timespec64(fattr->mtime); else if (nfs_server_capable(inode, NFS_CAP_MTIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode->i_ctime = timespec_to_timespec64(fattr->ctime); else if (nfs_server_capable(inode, NFS_CAP_CTIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) @@ -1288,6 +1288,7 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { unsigned long ret = 0; + struct timespec ts; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) @@ -1298,16 +1299,18 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr ret |= NFS_INO_INVALID_ATTR; } /* If we have atomic WCC data, we may update some attributes */ + ts = timespec64_to_timespec(inode->i_ctime); if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) - && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { + && timespec_equal(&ts, &fattr->pre_ctime)) { memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); ret |= NFS_INO_INVALID_ATTR; } + ts = timespec64_to_timespec(inode->i_mtime); if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) - && 
timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + && timespec_equal(&ts, &fattr->pre_mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); @@ -1338,7 +1341,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_size, new_isize; unsigned long invalid = 0; - + struct timespec ts; if (nfs_have_delegated_attributes(inode)) return 0; @@ -1353,10 +1356,12 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + ts = timespec64_to_timespec(inode->i_mtime); + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&ts, &fattr->mtime)) invalid |= NFS_INO_INVALID_ATTR; - if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) + ts = timespec64_to_timespec(inode->i_ctime); + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&ts, &fattr->ctime)) invalid |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -1379,7 +1384,8 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_ATTR; - if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) + ts = timespec64_to_timespec(inode->i_atime); + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&ts, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) @@ -1649,12 +1655,12 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { - memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); + fattr->pre_ctime = timespec64_to_timespec(inode->i_ctime); fattr->valid |= NFS_ATTR_FATTR_PRECTIME; } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { - memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); + fattr->pre_mtime = timespec64_to_timespec(inode->i_mtime); fattr->valid |= NFS_ATTR_FATTR_PREMTIME; } if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && @@ -1800,7 +1806,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } if (fattr->valid & NFS_ATTR_FATTR_MTIME) { - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + inode->i_mtime = timespec_to_timespec64(fattr->mtime); } else if (server->caps & NFS_CAP_MTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR @@ -1809,7 +1815,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } if (fattr->valid & NFS_ATTR_FATTR_CTIME) { - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + inode->i_ctime = timespec_to_timespec64(fattr->ctime); } else if (server->caps & NFS_CAP_CTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR @@ -1846,7 +1852,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_ATIME) - memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + inode->i_atime = timespec_to_timespec64(fattr->atime); else if 
(server->caps & NFS_CAP_ATIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATIME diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 85e4b4a233f9..350675e3ed47 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -354,6 +354,7 @@ static __be32 *xdr_time_not_set(__be32 *p) static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) { + struct timespec ts; __be32 *p; p = xdr_reserve_space(xdr, NFS_sattr_sz << 2); @@ -375,17 +376,21 @@ static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); - if (attr->ia_valid & ATTR_ATIME_SET) - p = xdr_encode_time(p, &attr->ia_atime); - else if (attr->ia_valid & ATTR_ATIME) - p = xdr_encode_current_server_time(p, &attr->ia_atime); - else + if (attr->ia_valid & ATTR_ATIME_SET) { + ts = timespec64_to_timespec(attr->ia_atime); + p = xdr_encode_time(p, &ts); + } else if (attr->ia_valid & ATTR_ATIME) { + ts = timespec64_to_timespec(attr->ia_atime); + p = xdr_encode_current_server_time(p, &ts); + } else p = xdr_time_not_set(p); - if (attr->ia_valid & ATTR_MTIME_SET) - xdr_encode_time(p, &attr->ia_mtime); - else if (attr->ia_valid & ATTR_MTIME) - xdr_encode_current_server_time(p, &attr->ia_mtime); - else + if (attr->ia_valid & ATTR_MTIME_SET) { + ts = timespec64_to_timespec(attr->ia_mtime); + xdr_encode_time(p, &ts); + } else if (attr->ia_valid & ATTR_MTIME) { + ts = timespec64_to_timespec(attr->ia_mtime); + xdr_encode_current_server_time(p, &ts); + } else xdr_time_not_set(p); } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index be666aee28cc..e72d451ae393 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -562,6 +562,7 @@ static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) */ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) { + struct timespec ts; u32 nbytes; __be32 *p; @@ -611,8 +612,9 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) *p++ = xdr_zero; if (attr->ia_valid & ATTR_ATIME_SET) { *p++ = xdr_two; - p = xdr_encode_nfstime3(p, &attr->ia_atime); + ts = timespec64_to_timespec(attr->ia_atime); + p = xdr_encode_nfstime3(p, &ts); } else if (attr->ia_valid & ATTR_ATIME) { *p++ = xdr_one; } else @@ -620,7 +622,8 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) if (attr->ia_valid & ATTR_MTIME_SET) { *p++ = xdr_two; - xdr_encode_nfstime3(p, &attr->ia_mtime); + ts = timespec64_to_timespec(attr->ia_mtime); + xdr_encode_nfstime3(p, &ts); } else if (attr->ia_valid & ATTR_MTIME) { *p = xdr_one; } else diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e604c0e02f4d..f148f53d3f90 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1018,6 +1018,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server, const uint32_t attrmask[]) { + struct timespec ts; char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; int owner_namelen = 0; @@ -1119,16 +1120,16 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { if (iap->ia_valid & ATTR_ATIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); - *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); + ts = timespec64_to_timespec(iap->ia_atime); + p = xdr_encode_nfstime4(p, &ts); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { if (iap->ia_valid & ATTR_MTIME_SET) {
*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); - *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); + ts = timespec64_to_timespec(iap->ia_mtime); + p = xdr_encode_nfstime4(p, &ts); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 3f880ae0966b..06266f7ac19f 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -121,13 +121,15 @@ nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp, { loff_t new_size = lcp->lc_last_wr + 1; struct iattr iattr = { .ia_valid = 0 }; + struct timespec ts; int error; + ts = timespec64_to_timespec(inode->i_mtime); if (lcp->lc_mtime.tv_nsec == UTIME_NOW || - timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) - lcp->lc_mtime = current_time(inode); + timespec_compare(&lcp->lc_mtime, &ts) < 0) + lcp->lc_mtime = timespec64_to_timespec(current_time(inode)); iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; - iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; + iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = timespec_to_timespec64(lcp->lc_mtime); if (new_size > i_size_read(inode)) { iattr.ia_valid |= ATTR_SIZE; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 6fbc48e074be..d431688fcf6d 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -165,6 +165,7 @@ static __be32 * encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) { + struct timespec ts; *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); *p++ = htonl((u32) (stat->mode & S_IALLUGO)); *p++ = htonl((u32) stat->nlink); @@ -180,9 +181,12 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, *p++ = htonl((u32) MINOR(stat->rdev)); p = encode_fsid(p, fhp); p = xdr_encode_hyper(p, stat->ino); - p = encode_time3(p, &stat->atime); - p = encode_time3(p, &stat->mtime); - p = encode_time3(p, &stat->ctime); + ts = timespec64_to_timespec(stat->atime); + p = encode_time3(p, &ts); + ts = timespec64_to_timespec(stat->mtime); + p = encode_time3(p, &ts); + ts = timespec64_to_timespec(stat->ctime); + p = encode_time3(p, &ts); return p; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 997d3134beb3..21a01fef136d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -320,6 +320,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, struct nfs4_acl **acl, struct xdr_netobj *label, int *umask) { + struct timespec ts; int expected_len, len = 0; u32 dummy32; char *buf; @@ -421,7 +422,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: len += 12; - status = nfsd4_decode_time(argp, &iattr->ia_atime); + status = nfsd4_decode_time(argp, &ts); + iattr->ia_atime = timespec_to_timespec64(ts); if (status) return status; iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); @@ -440,7 +442,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: len += 12; - status = nfsd4_decode_time(argp, &iattr->ia_mtime); + status = nfsd4_decode_time(argp, &ts); + iattr->ia_mtime = timespec_to_timespec64(ts); if (status) return status; iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 644a0342f0e0..bcd22d3efe4d 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -147,7 +147,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, { struct dentry *dentry = fhp->fh_dentry; int 
type; - struct timespec time; + struct timespec64 time; u32 f; type = (stat->mode & S_IFMT); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 8ad21fd98198..2b65e5c6b75a 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -680,18 +680,18 @@ static int ntfs_read_locked_inode(struct inode *vi) * mtime is the last change of the data within the file. Not changed * when only metadata is changed, e.g. a rename doesn't affect mtime. */ - vi->i_mtime = ntfs2utc(si->last_data_change_time); + vi->i_mtime = timespec_to_timespec64(ntfs2utc(si->last_data_change_time)); /* * ctime is the last change of the metadata of the file. This obviously * always changes, when mtime is changed. ctime can be changed on its * own, mtime is then not changed, e.g. when a file is renamed. */ - vi->i_ctime = ntfs2utc(si->last_mft_change_time); + vi->i_ctime = timespec_to_timespec64(ntfs2utc(si->last_mft_change_time)); /* * Last access to the data within the file. Not changed during a rename * for example but changed whenever the file is written to. */ - vi->i_atime = ntfs2utc(si->last_access_time); + vi->i_atime = timespec_to_timespec64(ntfs2utc(si->last_access_time)); /* Find the attribute list attribute if present. */ ntfs_attr_reinit_search_ctx(ctx); @@ -2836,11 +2836,11 @@ done: * for real. */ if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { - struct timespec now = current_time(VFS_I(base_ni)); + struct timespec64 now = current_time(VFS_I(base_ni)); int sync_it = 0; - if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) || - !timespec_equal(&VFS_I(base_ni)->i_ctime, &now)) + if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) || + !timespec64_equal(&VFS_I(base_ni)->i_ctime, &now)) sync_it = 1; VFS_I(base_ni)->i_mtime = now; VFS_I(base_ni)->i_ctime = now; @@ -2955,14 +2955,14 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) } } if (ia_valid & ATTR_ATIME) - vi->i_atime = timespec_trunc(attr->ia_atime, - vi->i_sb->s_time_gran); + vi->i_atime = timespec64_trunc(attr->ia_atime, + vi->i_sb->s_time_gran); if (ia_valid & ATTR_MTIME) - vi->i_mtime = timespec_trunc(attr->ia_mtime, - vi->i_sb->s_time_gran); + vi->i_mtime = timespec64_trunc(attr->ia_mtime, + vi->i_sb->s_time_gran); if (ia_valid & ATTR_CTIME) - vi->i_ctime = timespec_trunc(attr->ia_ctime, - vi->i_sb->s_time_gran); + vi->i_ctime = timespec64_trunc(attr->ia_ctime, + vi->i_sb->s_time_gran); mark_inode_dirty(vi); out: return err; @@ -3029,7 +3029,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si = (STANDARD_INFORMATION*)((u8*)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset)); /* Update the access times if they have changed. 
*/ - nt = utc2ntfs(vi->i_mtime); + nt = utc2ntfs(timespec64_to_timespec(vi->i_mtime)); if (si->last_data_change_time != nt) { ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) @@ -3038,7 +3038,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_data_change_time = nt; modified = true; } - nt = utc2ntfs(vi->i_ctime); + nt = utc2ntfs(timespec64_to_timespec(vi->i_ctime)); if (si->last_mft_change_time != nt) { ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) @@ -3047,7 +3047,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_mft_change_time = nt; modified = true; } - nt = utc2ntfs(vi->i_atime); + nt = utc2ntfs(timespec64_to_timespec(vi->i_atime)); if (si->last_access_time != nt) { ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3aff7dd3fd1a..6ddf427ead03 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2103,6 +2103,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; + struct timespec ts; lvb = ocfs2_dlm_lvb(&lockres->l_lksb); @@ -2123,12 +2124,15 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb->lvb_igid = cpu_to_be32(i_gid_read(inode)); lvb->lvb_imode = cpu_to_be16(inode->i_mode); lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); + ts = timespec64_to_timespec(inode->i_atime); lvb->lvb_iatime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); + cpu_to_be64(ocfs2_pack_timespec(&ts)); + ts = timespec64_to_timespec(inode->i_ctime); lvb->lvb_ictime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); + cpu_to_be64(ocfs2_pack_timespec(&ts)); + ts = timespec64_to_timespec(inode->i_mtime); lvb->lvb_imtime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); + cpu_to_be64(ocfs2_pack_timespec(&ts)); lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features); lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); @@ -2146,6 +2150,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec, static void ocfs2_refresh_inode_from_lvb(struct inode *inode) { + struct timespec ts; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; @@ -2173,12 +2178,15 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) i_gid_write(inode, be32_to_cpu(lvb->lvb_igid)); inode->i_mode = be16_to_cpu(lvb->lvb_imode); set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); - ocfs2_unpack_timespec(&inode->i_atime, + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed)); - ocfs2_unpack_timespec(&inode->i_mtime, + inode->i_atime = timespec_to_timespec64(ts); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed)); - ocfs2_unpack_timespec(&inode->i_ctime, + inode->i_mtime = timespec_to_timespec64(ts); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed)); + inode->i_ctime = timespec_to_timespec64(ts); spin_unlock(&oi->ip_lock); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7242dd43ae8b..36a1493be05c 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -220,7 +220,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, int ocfs2_should_update_atime(struct inode *inode, struct vfsmount *vfsmnt) { - struct timespec now; + struct timespec64 now; 
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) @@ -246,8 +246,8 @@ int ocfs2_should_update_atime(struct inode *inode, return 0; if (vfsmnt->mnt_flags & MNT_RELATIME) { - if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || - (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) + if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) || + (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0)) return 1; return 0; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index d19cdfe53ff8..193422780833 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -366,7 +366,7 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) return err; } -int ovl_update_time(struct inode *inode, struct timespec *ts, int flags) +int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) { struct dentry *alias; struct path upperpath; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 9380b3367bd5..696c42895209 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -287,7 +287,7 @@ int __ovl_xattr_get(struct dentry *dentry, struct inode *inode, ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); struct posix_acl *ovl_get_acl(struct inode *inode, int type); int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); -int ovl_update_time(struct inode *inode, struct timespec *ts, int flags); +int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); bool ovl_is_private_xattr(const char *name); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 95a708d83721..f18e6f949e0c 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -10,7 +10,7 @@ static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec uptime; - struct timespec idle; + struct timespec64 idle; u64 nsec; u32 rem; int i; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d4747c552404..d814723fb27d 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -392,8 +392,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) inode->i_private = private; if (record->time.tv_sec) - inode->i_mtime = inode->i_ctime = - timespec64_to_timespec(record->time); + inode->i_mtime = inode->i_ctime = record->time; d_add(dentry, inode); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index ad4892837c45..2843b7cf4d7a 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1323,7 +1323,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, int jbegin_count; umode_t old_inode_mode; unsigned long savelink = 1; - struct timespec ctime; + struct timespec64 ctime; if (flags & ~RENAME_NOREPLACE) return -EINVAL; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 57a7b9a164a2..8ff806aed994 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -462,10 +462,10 @@ int reiserfs_commit_write(struct file *f, struct page *page, static void update_ctime(struct inode *inode) { - struct timespec now = current_time(inode); + struct timespec64 now = current_time(inode); if (inode_unhashed(inode) || !inode->i_nlink || - timespec_equal(&inode->i_ctime, &now)) + timespec64_equal(&inode->i_ctime, &now)) return; inode->i_ctime = current_time(inode); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index c41c3b6a3251..68bc8b610e22 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1298,7 
+1298,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; - struct timespec time; + struct timespec64 time; unsigned int uninitialized_var(saved_nlink); struct fscrypt_name old_nm, new_nm; @@ -1540,7 +1540,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, int sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); struct inode *fst_inode = d_inode(old_dentry); struct inode *snd_inode = d_inode(new_dentry); - struct timespec time; + struct timespec64 time; int err; struct fscrypt_name fst_nm, snd_nm; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index b1ec5e20e876..c078c1f34ab2 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1092,14 +1092,14 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr) if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; if (attr->ia_valid & ATTR_ATIME) - inode->i_atime = timespec_trunc(attr->ia_atime, - inode->i_sb->s_time_gran); + inode->i_atime = timespec64_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); if (attr->ia_valid & ATTR_MTIME) - inode->i_mtime = timespec_trunc(attr->ia_mtime, - inode->i_sb->s_time_gran); + inode->i_mtime = timespec64_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); if (attr->ia_valid & ATTR_CTIME) - inode->i_ctime = timespec_trunc(attr->ia_ctime, - inode->i_sb->s_time_gran); + inode->i_ctime = timespec64_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); if (attr->ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; @@ -1374,8 +1374,9 @@ out: static inline int mctime_update_needed(const struct inode *inode, const struct timespec *now) { - if (!timespec_equal(&inode->i_mtime, now) || - !timespec_equal(&inode->i_ctime, now)) + struct timespec64 now64 = timespec_to_timespec64(*now); + if (!timespec64_equal(&inode->i_mtime, &now64) || + !timespec64_equal(&inode->i_ctime, &now64)) return 1; return 0; } @@ -1387,7 +1388,7 @@ static inline int mctime_update_needed(const struct inode *inode, * * This function updates time of the inode. 
*/ -int ubifs_update_time(struct inode *inode, struct timespec *time, +int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags) { struct ubifs_inode *ui = ubifs_inode(inode); @@ -1431,7 +1432,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, */ static int update_mctime(struct inode *inode) { - struct timespec now = current_time(inode); + struct timespec now = timespec64_to_timespec(current_time(inode)); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1525,7 +1526,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; - struct timespec now = current_time(inode); + struct timespec now = timespec64_to_timespec(current_time(inode)); struct ubifs_budget_req req = { .new_page = 1 }; int err, update_time; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 3949f8c9b515..10e3884347dd 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1739,7 +1739,7 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc); int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); int ubifs_setattr(struct dentry *dentry, struct iattr *attr); #ifdef CONFIG_UBIFS_ATIME_SUPPORT -int ubifs_update_time(struct inode *inode, struct timespec *time, int flags); +int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); #endif /* dir.c */ diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index c1ed18a10ce4..13574f17bc0f 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -120,8 +120,8 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - inode->i_mtime = inode->i_atime = inode->i_ctime = - iinfo->i_crtime = current_time(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + iinfo->i_crtime = timespec64_to_timespec(inode->i_mtime); if (unlikely(insert_inode_locked(inode) < 0)) { make_bad_inode(inode); iput(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index b2564517e348..d4e3d9c5ca4f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1296,6 +1296,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode) struct udf_inode_info *iinfo = UDF_I(inode); struct udf_sb_info *sbi = UDF_SB(inode->i_sb); struct kernel_lb_addr *iloc = &iinfo->i_location; + struct timespec ts; unsigned int link_count; unsigned int indirections = 0; int bs = inode->i_sb->s_blocksize; @@ -1471,9 +1472,12 @@ reread: inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); - udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime); - udf_disk_stamp_to_time(&inode->i_mtime, fe->modificationTime); - udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime); + udf_disk_stamp_to_time(&ts, fe->accessTime); + inode->i_atime = timespec_to_timespec64(ts); + udf_disk_stamp_to_time(&ts, fe->modificationTime); + inode->i_mtime = timespec_to_timespec64(ts); + udf_disk_stamp_to_time(&ts, fe->attrTime); + inode->i_ctime = timespec_to_timespec64(ts); iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); @@ -1483,10 +1487,13 @@ reread: inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); - udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime); - udf_disk_stamp_to_time(&inode->i_mtime, efe->modificationTime); + 
udf_disk_stamp_to_time(&ts, efe->accessTime); + inode->i_atime = timespec_to_timespec64(ts); + udf_disk_stamp_to_time(&ts, efe->modificationTime); + inode->i_mtime = timespec_to_timespec64(ts); udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime); - udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime); + udf_disk_stamp_to_time(&ts, efe->attrTime); + inode->i_ctime = timespec_to_timespec64(ts); iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); @@ -1736,9 +1743,12 @@ static int udf_update_inode(struct inode *inode, int do_sync) inode->i_sb->s_blocksize - sizeof(struct fileEntry)); fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); - udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); - udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); - udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); + udf_time_to_disk_stamp(&fe->accessTime, + timespec64_to_timespec(inode->i_atime)); + udf_time_to_disk_stamp(&fe->modificationTime, + timespec64_to_timespec(inode->i_mtime)); + udf_time_to_disk_stamp(&fe->attrTime, + timespec64_to_timespec(inode->i_ctime)); memset(&(fe->impIdent), 0, sizeof(struct regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; @@ -1757,14 +1767,17 @@ static int udf_update_inode(struct inode *inode, int do_sync) efe->objectSize = cpu_to_le64(inode->i_size); efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); - udf_adjust_time(iinfo, inode->i_atime); - udf_adjust_time(iinfo, inode->i_mtime); - udf_adjust_time(iinfo, inode->i_ctime); + udf_adjust_time(iinfo, timespec64_to_timespec(inode->i_atime)); + udf_adjust_time(iinfo, timespec64_to_timespec(inode->i_mtime)); + udf_adjust_time(iinfo, timespec64_to_timespec(inode->i_ctime)); - udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime); - udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime); + udf_time_to_disk_stamp(&efe->accessTime, + timespec64_to_timespec(inode->i_atime)); + udf_time_to_disk_stamp(&efe->modificationTime, + timespec64_to_timespec(inode->i_mtime)); udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); - udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); + udf_time_to_disk_stamp(&efe->attrTime, + timespec64_to_timespec(inode->i_ctime)); memset(&(efe->impIdent), 0, sizeof(efe->impIdent)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index cfc7d6e01158..daf5786a3d58 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -778,7 +778,7 @@ xfs_ialloc( xfs_inode_t *ip; uint flags; int error; - struct timespec tv; + struct timespec64 tv; struct inode *inode; /* diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 16d5a949fb11..2d92686a0523 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1060,7 +1060,7 @@ xfs_vn_setattr( STATIC int xfs_vn_update_time( struct inode *inode, - struct timespec *now, + struct timespec64 *now, int flags) { struct xfs_inode *ip = XFS_I(inode); diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index daa7615497f9..d791b7aaf8e6 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -68,7 +68,7 @@ xfs_trans_ichgtime( int flags) { struct inode *inode = VFS_I(ip); - struct timespec tv; + struct timespec64 tv; ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); diff --git a/include/linux/fs.h b/include/linux/fs.h index ae353137c767..05c9add02a80 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -212,9 +212,9 @@ struct iattr { kuid_t 
ia_uid; kgid_t ia_gid; loff_t ia_size; - struct timespec ia_atime; - struct timespec ia_mtime; - struct timespec ia_ctime; + struct timespec64 ia_atime; + struct timespec64 ia_mtime; + struct timespec64 ia_ctime; /* * Not an attribute, but an auxiliary info for filesystems wanting to @@ -620,9 +620,9 @@ struct inode { }; dev_t i_rdev; loff_t i_size; - struct timespec i_atime; - struct timespec i_mtime; - struct timespec i_ctime; + struct timespec64 i_atime; + struct timespec64 i_mtime; + struct timespec64 i_ctime; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; unsigned int i_blkbits; @@ -1121,7 +1121,7 @@ extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); extern int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); -extern void lease_get_mtime(struct inode *, struct timespec *time); +extern void lease_get_mtime(struct inode *, struct timespec64 *time); extern int generic_setlease(struct file *, long, struct file_lock **, void **priv); extern int vfs_setlease(struct file *, long, struct file_lock **, void **); extern int lease_modify(struct file_lock *, int, struct list_head *); @@ -1236,7 +1236,8 @@ static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned return 0; } -static inline void lease_get_mtime(struct inode *inode, struct timespec *time) +static inline void lease_get_mtime(struct inode *inode, + struct timespec64 *time) { return; } @@ -1519,7 +1520,7 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); } -extern struct timespec current_time(struct inode *inode); +extern struct timespec64 current_time(struct inode *inode); /* * Snapshotting support. @@ -1817,7 +1818,7 @@ struct inode_operations { ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); - int (*update_time)(struct inode *, struct timespec *, int); + int (*update_time)(struct inode *, struct timespec64 *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); @@ -2276,7 +2277,7 @@ extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); -extern int generic_update_time(struct inode *, struct timespec *, int); +extern int generic_update_time(struct inode *, struct timespec64 *, int); /* /sys/fs */ extern struct kobject *fs_kobj; diff --git a/include/linux/stat.h b/include/linux/stat.h index 07295841fccd..528c4baad091 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -42,10 +42,10 @@ struct kstat { kuid_t uid; kgid_t gid; loff_t size; - struct timespec atime; - struct timespec mtime; - struct timespec ctime; - struct timespec btime; /* File creation time */ + struct timespec64 atime; + struct timespec64 mtime; + struct timespec64 ctime; + struct timespec64 btime; /* File creation time */ u64 blocks; }; From 60cd194978de624e29f00a64bb9d8b8f3af25794 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sun, 22 Apr 2018 20:18:46 -0700 Subject: [PATCH 1273/1640] fs: add timespec64_truncate() As vfs moves to using struct timespec64 to represent times, update the argument to timespec_truncate() to use struct timespec64. Also change the name of the function. 
The rest of the implementation logic is the same. Move this to fs/inode.c instead of kernel/time/time.c as all the users of this api are filesystems. Change-Id: I19b58d7d1d5ec263b04237a33b8f981e7d407171 Signed-off-by: Deepa Dinamani Cc: --- fs/inode.c | 24 ++++++++++++++++++++++++ include/linux/fs.h | 1 + 2 files changed, 25 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index 83669a3b0a60..9606461ffff6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2155,6 +2155,30 @@ void inode_nohighmem(struct inode *inode) } EXPORT_SYMBOL(inode_nohighmem); +/** + * timespec64_trunc - Truncate timespec64 to a granularity + * @t: Timespec64 + * @gran: Granularity in ns. + * + * Truncate a timespec64 to a granularity. Always rounds down. gran must + * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). + */ +struct timespec64 timespec64_trunc(struct timespec64 t, unsigned gran) +{ + /* Avoid division in the common cases 1 ns and 1 s. */ + if (gran == 1) { + /* nothing */ + } else if (gran == NSEC_PER_SEC) { + t.tv_nsec = 0; + } else if (gran > 1 && gran < NSEC_PER_SEC) { + t.tv_nsec -= t.tv_nsec % gran; + } else { + WARN(1, "illegal file time granularity: %u", gran); + } + return t; +} +EXPORT_SYMBOL(timespec64_trunc); + /** * current_time - Return FS time * @inode: inode. diff --git a/include/linux/fs.h b/include/linux/fs.h index 05c9add02a80..b6f5ba4a80cc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1520,6 +1520,7 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); } +extern struct timespec64 timespec64_trunc(struct timespec64 t, unsigned gran); extern struct timespec64 current_time(struct inode *inode); /* From 8f4ee6cff6833b2016919930cbb4565ba0a5ea6a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jul 2018 22:18:36 +0200 Subject: [PATCH 1274/1640] ceph: use timespec64 for inode timestamp Since the vfs structures are all using timespec64, we can now change the internal representation, using ceph_encode_timespec64 and ceph_decode_timespec64. In case of ceph_aux_inode however, we need to avoid doing a memcmp() on uninitialized padding data, so the members of the i_mtime field get copied individually into 64-bit integers. 
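As an aside, a minimal sketch of the padding problem (illustrative only, with simplified stand-ins for ceph_aux_inode and struct timespec64; not part of the patch):

	/* On 32-bit builds tv_nsec is a 32-bit long, so the compiler
	 * leaves a 4-byte hole after it to align the next 8-byte member.
	 */
	struct aux_padded {
		u64 version;
		struct timespec64 mtime;	/* 8 + 4 bytes, then a hole */
		loff_t size;
	};

	/* Flattened layout: every byte belongs to a named member, so
	 * member-wise assignment initializes the whole object and a
	 * memcmp() against a stored copy never reads garbage.
	 */
	struct aux_dense {
		u64 version;
		u64 mtime_sec;
		u64 mtime_nsec;
		loff_t size;
	};

A memcmp() over the padded layout compares the hole as well, and plain member assignment leaves that hole uninitialized unless every writer remembers to memset() the object first; the dense layout removes the hole entirely.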
Signed-off-by: Arnd Bergmann Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov Change-Id: Ia1a10ea74dc07ce95bfc8ad6ddb5080d5ec41db7 --- fs/ceph/cache.c | 12 ++++++----- fs/ceph/caps.c | 25 +++++++++++----------- fs/ceph/dir.c | 6 +++--- fs/ceph/inode.c | 49 +++++++++++++++++++++----------------------- fs/ceph/mds_client.c | 7 ++----- fs/ceph/snap.c | 6 +++--- fs/ceph/super.h | 9 ++++---- fs/ceph/xattr.c | 4 ++-- 8 files changed, 58 insertions(+), 60 deletions(-) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index e2591e99c1a5..4c92aa81c78c 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -25,9 +25,10 @@ #include "cache.h" struct ceph_aux_inode { - u64 version; - struct timespec mtime; - loff_t size; + u64 version; + u64 mtime_sec; + u64 mtime_nsec; + loff_t size; }; struct fscache_netfs ceph_cache_netfs = { @@ -184,8 +185,9 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( memset(&aux, 0, sizeof(aux)); aux.version = ci->i_version; - aux.mtime = timespec64_to_timespec(inode->i_mtime); - aux.size = i_size_read(inode); + aux.mtime_sec = inode->i_mtime.tv_sec; + aux.mtime_nsec = inode->i_mtime.tv_nsec; + aux.size = i_size_read(inode); if (memcmp(data, &aux, sizeof(aux)) != 0) return FSCACHE_CHECKAUX_OBSOLETE; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2653fa2aeeef..ed3261b382b8 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1002,7 +1002,7 @@ struct cap_msg_args { u64 flush_tid, oldest_flush_tid, size, max_size; u64 xattr_version; struct ceph_buffer *xattr_buf; - struct timespec atime, mtime, ctime; + struct timespec64 atime, mtime, ctime; int op, caps, wanted, dirty; u32 seq, issue_seq, mseq, time_warp_seq; u32 flags; @@ -1023,7 +1023,7 @@ static int send_cap_msg(struct cap_msg_args *arg) struct ceph_msg *msg; void *p; size_t extra_len; - struct timespec zerotime = {0}; + struct timespec64 zerotime = {0}; struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" @@ -1063,9 +1063,9 @@ static int send_cap_msg(struct cap_msg_args *arg) fc->size = cpu_to_le64(arg->size); fc->max_size = cpu_to_le64(arg->max_size); - ceph_encode_timespec(&fc->mtime, &arg->mtime); - ceph_encode_timespec(&fc->atime, &arg->atime); - ceph_encode_timespec(&fc->ctime, &arg->ctime); + ceph_encode_timespec64(&fc->mtime, &arg->mtime); + ceph_encode_timespec64(&fc->atime, &arg->atime); + ceph_encode_timespec64(&fc->ctime, &arg->ctime); fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq); fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid)); @@ -1114,7 +1114,7 @@ static int send_cap_msg(struct cap_msg_args *arg) * We just zero these out for now, as the MDS ignores them unless * the requisite feature flags are set (which we don't do yet). */ - ceph_encode_timespec(p, &zerotime); + ceph_encode_timespec64(p, &zerotime); p += sizeof(struct ceph_timespec); ceph_encode_64(&p, 0); @@ -1242,9 +1242,9 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.xattr_buf = NULL; } - arg.mtime = timespec64_to_timespec(inode->i_mtime); - arg.atime = timespec64_to_timespec(inode->i_atime); - arg.ctime = timespec64_to_timespec(inode->i_ctime); + arg.mtime = inode->i_mtime; + arg.atime = inode->i_atime; + arg.ctime = inode->i_ctime; arg.op = op; arg.caps = cap->implemented; @@ -3069,10 +3069,11 @@ static void handle_cap_grant(struct inode *inode, } if (newcaps & CEPH_CAP_ANY_RD) { + struct timespec64 mtime, atime, ctime; /* ctime/mtime/atime? 
*/ - ceph_decode_timespec(&mtime, &grant->mtime); - ceph_decode_timespec(&atime, &grant->atime); - ceph_decode_timespec(&ctime, &grant->ctime); + ceph_decode_timespec64(&mtime, &grant->mtime); + ceph_decode_timespec64(&atime, &grant->atime); + ceph_decode_timespec64(&ctime, &grant->ctime); ceph_fill_file_time(inode, extra_info->issued, le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, &atime); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 56e8fc896f6b..497d3d706ff4 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1381,7 +1381,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, " rfiles: %20lld\n" " rsubdirs: %20lld\n" "rbytes: %20lld\n" - "rctime: %10ld.%09ld\n", + "rctime: %10lld.%09ld\n", ci->i_files + ci->i_subdirs, ci->i_files, ci->i_subdirs, @@ -1389,8 +1389,8 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, ci->i_rfiles, ci->i_rsubdirs, ci->i_rbytes, - (long)ci->i_rctime.tv_sec, - (long)ci->i_rctime.tv_nsec); + ci->i_rctime.tv_sec, + ci->i_rctime.tv_nsec); } if (*ppos >= cf->dir_info_len) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0606c7bb6677..764c95ccb21a 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -646,13 +646,10 @@ int ceph_fill_file_size(struct inode *inode, int issued, } void ceph_fill_file_time(struct inode *inode, int issued, - u64 time_warp_seq, struct timespec *ctime, - struct timespec *mtime, struct timespec *atime) + u64 time_warp_seq, struct timespec64 *ctime, + struct timespec64 *mtime, struct timespec64 *atime) { struct ceph_inode_info *ci = ceph_inode(inode); - struct timespec64 ctime64 = timespec_to_timespec64(*ctime); - struct timespec64 mtime64 = timespec_to_timespec64(*mtime); - struct timespec64 atime64 = timespec_to_timespec64(*atime); int warn = 0; if (issued & (CEPH_CAP_FILE_EXCL| @@ -660,10 +657,10 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_FILE_BUFFER| CEPH_CAP_AUTH_EXCL| CEPH_CAP_XATTR_EXCL)) { - if (timespec64_compare(&ctime64, &inode->i_ctime) > 0) { + if (timespec64_compare(ctime, &inode->i_ctime) > 0) { dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, ctime->tv_sec, ctime->tv_nsec); - inode->i_ctime = ctime64; + inode->i_ctime = *ctime; } if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { @@ -674,24 +672,24 @@ void ceph_fill_file_time(struct inode *inode, int issued, mtime->tv_sec, mtime->tv_nsec, ci->i_time_warp_seq, (int)time_warp_seq); - inode->i_mtime = mtime64; - inode->i_atime = atime64; + inode->i_mtime = *mtime; + inode->i_atime = *atime; ci->i_time_warp_seq = time_warp_seq; } else if (time_warp_seq == ci->i_time_warp_seq) { /* nobody did utimes(); take the max */ - if (timespec64_compare(&mtime64, &inode->i_mtime) > 0) { - dout("mtime %ld.%09ld -> %ld.%09ld inc\n", + if (timespec64_compare(mtime, &inode->i_mtime) > 0) { + dout("mtime %lld.%09ld -> %lld.%09ld inc\n", inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, mtime->tv_sec, mtime->tv_nsec); - inode->i_mtime = mtime64; + inode->i_mtime = *mtime; } - if (timespec64_compare(&atime64, &inode->i_atime) > 0) { - dout("atime %ld.%09ld -> %ld.%09ld inc\n", + if (timespec64_compare(atime, &inode->i_atime) > 0) { + dout("atime %lld.%09ld -> %lld.%09ld inc\n", inode->i_atime.tv_sec, inode->i_atime.tv_nsec, atime->tv_sec, atime->tv_nsec); - inode->i_atime = atime64; + inode->i_atime = *atime; } } else if (issued & CEPH_CAP_FILE_EXCL) { /* we did a utimes(); ignore mds values */ @@ -701,9 +699,9 @@ void ceph_fill_file_time(struct inode
*inode, int issued, } else { /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { - inode->i_ctime = ctime64; - inode->i_mtime = mtime64; - inode->i_atime = atime64; + inode->i_ctime = *ctime; + inode->i_mtime = *mtime; + inode->i_atime = *atime; ci->i_time_warp_seq = time_warp_seq; } else { warn = 1; @@ -729,7 +727,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); int issued = 0, implemented, new_issued; - struct timespec mtime, atime, ctime; + struct timespec64 mtime, atime, ctime; struct ceph_buffer *xattr_blob = NULL; struct ceph_buffer *old_blob = NULL; struct ceph_string *pool_ns = NULL; @@ -813,9 +811,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { /* be careful with mtime, atime, size */ - ceph_decode_timespec(&atime, &info->atime); - ceph_decode_timespec(&mtime, &info->mtime); - ceph_decode_timespec(&ctime, &info->ctime); + ceph_decode_timespec64(&atime, &info->atime); + ceph_decode_timespec64(&mtime, &info->mtime); + ceph_decode_timespec64(&ctime, &info->ctime); ceph_fill_file_time(inode, issued, le32_to_cpu(info->time_warp_seq), &ctime, &mtime, &atime); @@ -1917,7 +1915,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) int err = 0; int inode_dirty_flags = 0; bool lock_snap_rwsem = false; - struct timespec ts; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) @@ -2006,8 +2003,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || !timespec64_equal(&inode->i_atime, &attr->ia_atime)) { - ts = timespec64_to_timespec(attr->ia_atime); - ceph_encode_timespec(&req->r_args.setattr.atime, &ts); + ceph_encode_timespec64(&req->r_args.setattr.atime, + &attr->ia_atime); mask |= CEPH_SETATTR_ATIME; release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; @@ -2028,8 +2025,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) { - ts = timespec64_to_timespec(attr->ia_mtime); - ceph_encode_timespec(&req->r_args.setattr.mtime, &ts); + ceph_encode_timespec64(&req->r_args.setattr.mtime, + &attr->ia_mtime); mask |= CEPH_SETATTR_MTIME; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 56d444263dfe..c6830a4a32d8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2953,15 +2953,12 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v2.pathbase = cpu_to_le64(pathbase); rec.v2.flock_len = 0; } else { - struct timespec ts; rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v1.issued = cpu_to_le32(cap->issued); rec.v1.size = cpu_to_le64(inode->i_size); - ts = timespec64_to_timespec(inode->i_mtime); - ceph_encode_timespec(&rec.v1.mtime, &ts); - ts = timespec64_to_timespec(inode->i_atime); - ceph_encode_timespec(&rec.v1.atime, &ts); + ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime); + ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v1.pathbase = cpu_to_le64(pathbase); } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 
2160f728056e..ec917ad13595 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -601,9 +601,9 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, BUG_ON(capsnap->writing); capsnap->size = inode->i_size; - capsnap->mtime = timespec64_to_timespec(inode->i_mtime); - capsnap->atime = timespec64_to_timespec(inode->i_atime); - capsnap->ctime = timespec64_to_timespec(inode->i_ctime); + capsnap->mtime = inode->i_mtime; + capsnap->atime = inode->i_atime; + capsnap->ctime = inode->i_ctime; capsnap->time_warp_seq = ci->i_time_warp_seq; capsnap->truncate_size = ci->i_truncate_size; capsnap->truncate_seq = ci->i_truncate_seq; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index dd5257dee6cb..c8dc45530c0a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -192,7 +192,7 @@ struct ceph_cap_snap { u64 xattr_version; u64 size; - struct timespec mtime, atime, ctime; + struct timespec64 mtime, atime, ctime; u64 time_warp_seq; u64 truncate_size; u32 truncate_seq; @@ -305,7 +305,7 @@ struct ceph_inode_info { char *i_symlink; /* for dirs */ - struct timespec i_rctime; + struct timespec64 i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; @@ -803,8 +803,9 @@ extern struct inode *ceph_get_snapdir(struct inode *parent); extern int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size); extern void ceph_fill_file_time(struct inode *inode, int issued, - u64 time_warp_seq, struct timespec *ctime, - struct timespec *mtime, struct timespec *atime); + u64 time_warp_seq, struct timespec64 *ctime, + struct timespec64 *mtime, + struct timespec64 *atime); extern int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req); extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 3a166f860b6c..c8daaa061e1a 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -217,8 +217,8 @@ static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec, - (long)ci->i_rctime.tv_nsec); + return snprintf(val, size, "%lld.09%ld", ci->i_rctime.tv_sec, + ci->i_rctime.tv_nsec); } From f6d682ccd1ea1e3a963ad30e6594418a96e2ab5a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jul 2018 16:35:10 +0200 Subject: [PATCH 1275/1640] fuse: convert last timespec use to timespec64 All of fuse uses 64-bit timestamps with the exception of the fuse_change_attributes(), so let's convert this one as well. 
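For illustration, the conversion boils down to the following sketch, distilled from the hunks below (not the literal hunks; the surrounding auto_inval_data plumbing is elided):

	struct timespec64 old_mtime = inode->i_mtime;	/* no lossy timespec64_to_timespec() round trip */
	struct timespec64 new_mtime = {
		.tv_sec  = attr->mtime,		/* the FUSE attr already carries 64-bit seconds */
		.tv_nsec = attr->mtimensec,
	};

	/* both sides are timespec64 now, so post-2038 values compare exactly */
	if (!timespec64_equal(&old_mtime, &new_mtime))
		inval = true;			/* mtime changed: invalidate cached data */
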
Change-Id: Ibe120305d151ea8c8231ef7a73a1ae593296d2a8 Signed-off-by: Arnd Bergmann Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 26a5df719f10..ed7a05a03794 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -214,7 +214,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, struct fuse_inode *fi = get_fuse_inode(inode); bool is_wb = fc->writeback_cache && !test_bit(FUSE_I_ATTR_FORCE_SYNC, &fi->state); loff_t oldsize; - struct timespec old_mtime; + struct timespec64 old_mtime; spin_lock(&fc->lock); if ((attr_version != 0 && fi->attr_version > attr_version) || @@ -223,7 +223,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, return; } - old_mtime = timespec64_to_timespec(inode->i_mtime); + old_mtime = inode->i_mtime; fuse_change_attributes_common(inode, attr, attr_valid); oldsize = inode->i_size; @@ -243,7 +243,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, truncate_pagecache(inode, attr->size); inval = true; } else if (fc->auto_inval_data) { - struct timespec new_mtime = { + struct timespec64 new_mtime = { .tv_sec = attr->mtime, .tv_nsec = attr->mtimensec, }; @@ -252,7 +252,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, * Auto inval mode also checks and invalidates if mtime * has changed. */ - if (!timespec_equal(&old_mtime, &new_mtime)) + if (!timespec64_equal(&old_mtime, &new_mtime)) inval = true; } From ee8c9c2e042b6056bad1e04ddb81014e9a0080d0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 20 Jun 2018 10:02:19 +0200 Subject: [PATCH 1276/1640] f2fs: use timespec64 for inode timestamps The on-disk representation and the vfs both use 64-bit tv_sec values, so let's change the last missing piece in the middle. 
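To see why the remaining struct timespec uses mattered, consider a minimal sketch (illustrative only, assuming a 32-bit build where the old timespec.tv_sec is a 32-bit long):

	/* a post-2038 timestamp as stored on disk and in the VFS */
	struct timespec64 t = { .tv_sec = 0x80000000LL, .tv_nsec = 0 };	/* 2038-01-19 03:14:08 UTC */

	/* the old comparison path squeezed it through the 32-bit type */
	struct timespec t32 = timespec64_to_timespec(t);	/* tv_sec wraps negative here */

	/* with i_disk_time[] converted to timespec64, the check is exact */
	if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
		return false;	/* on-disk and in-core atime diverged */
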
Change-Id: Iebc60247d79e006d101cd0a63a19591aaa871a16 Signed-off-by: Arnd Bergmann Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 13 ++++++------- fs/f2fs/inode.c | 12 ++++++------ fs/f2fs/namei.c | 2 +- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 13f0129c71eb..405b6a5df0f7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -850,8 +850,8 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ - struct timespec i_crtime; /* inode creation time */ - struct timespec i_disk_time[4]; /* inode disk times */ + struct timespec64 i_crtime; /* inode creation time */ + struct timespec64 i_disk_time[4]; /* inode disk times */ /* for file compress */ atomic_t i_compr_blocks; /* # of compressed blocks */ @@ -3147,13 +3147,13 @@ static inline void clear_file(struct inode *inode, int type) static inline bool f2fs_is_time_consistent(struct inode *inode) { - if (!timespec_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) return false; - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) return false; - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) return false; - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 3, + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, &F2FS_I(inode)->i_crtime)) return false; return true; @@ -3161,7 +3161,6 @@ static inline bool f2fs_is_time_consistent(struct inode *inode) static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { - struct timespec ts; bool ret; if (dsync) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e2b77cd0ea97..2090a95edc27 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -460,9 +460,9 @@ static int do_read_inode(struct inode *inode) } } - F2FS_I(inode)->i_disk_time[0] = timespec64_to_timespec(inode->i_atime); - F2FS_I(inode)->i_disk_time[1] = timespec64_to_timespec(inode->i_ctime); - F2FS_I(inode)->i_disk_time[2] = timespec64_to_timespec(inode->i_mtime); + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; if (unlikely((inode->i_mode & S_IFMT) == 0)) { @@ -651,9 +651,9 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_inline_node(node_page); - F2FS_I(inode)->i_disk_time[0] = timespec64_to_timespec(inode->i_atime); - F2FS_I(inode)->i_disk_time[1] = timespec64_to_timespec(inode->i_ctime); - F2FS_I(inode)->i_disk_time[2] = timespec64_to_timespec(inode->i_mtime); + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; #ifdef CONFIG_F2FS_CHECK_FS diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2a742a62afb9..3915c667a2eb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -54,7 +54,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_version++; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - F2FS_I(inode)->i_crtime = 
timespec64_to_timespec(inode->i_mtime); + F2FS_I(inode)->i_crtime = inode->i_mtime; inode->i_generation = prandom_u32(); if (S_ISDIR(inode->i_mode)) From 05f2ff2b207dab6967c1639cd06ae81bcec51522 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Wed, 31 Aug 2022 17:48:15 +0800 Subject: [PATCH 1277/1640] f2fs: add static init_idisk_time function to reduce the code We can use an inner function to initialize the disk time of f2fs_inode_info and clean up the redundant code. Change-Id: Iafd6cd55a764372c17b7eb90931336a9b9727f1a Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2090a95edc27..631bade524fb 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -326,6 +326,16 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return true; } +static void init_idisk_time(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -460,10 +470,7 @@ static int do_read_inode(struct inode *inode) } } - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + init_idisk_time(inode); if (unlikely((inode->i_mode & S_IFMT) == 0)) { print_block_data(sbi->sb, inode->i_ino, page_address(node_page), @@ -651,11 +658,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_inline_node(node_page); - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; - + init_idisk_time(inode); #ifdef CONFIG_F2FS_CHECK_FS f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); #endif From 4250dd8a0334aa0e7f9d1736217d8a9d5906bf98 Mon Sep 17 00:00:00 2001 From: basamaryan Date: Tue, 23 Sep 2025 19:13:44 -0400 Subject: [PATCH 1278/1640] incfs: use timespec64 for inode timestamps Change-Id: I9d49c388569fb430325e6dda3ef7b5216ded24fd --- fs/incfs/vfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/incfs/vfs.c b/fs/incfs/vfs.c index 4af3e8170423..51e3f9c46063 100644 --- a/fs/incfs/vfs.c +++ b/fs/incfs/vfs.c @@ -404,7 +404,7 @@ static int inode_set(struct inode *inode, void *opaque) } else if (search->ino == INCFS_PENDING_READS_INODE) { /* It's an inode for .pending_reads pseudo file. */ - inode->i_ctime = (struct timespec){}; + inode->i_ctime = (struct timespec64){}; inode->i_mtime = inode->i_ctime; inode->i_atime = inode->i_ctime; inode->i_size = 0; @@ -419,7 +419,7 @@ static int inode_set(struct inode *inode, void *opaque) } else if (search->ino == INCFS_LOG_INODE) { /* It's an inode for .log pseudo file. 
*/ - inode->i_ctime = (struct timespec){}; + inode->i_ctime = (struct timespec64){}; inode->i_mtime = inode->i_ctime; inode->i_atime = inode->i_ctime; inode->i_size = 0; From 2967478361a07b1d27cd34f092cc88aa8358534a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 19 Jun 2019 12:01:05 -0700 Subject: [PATCH 1279/1640] UPSTREAM: bpf: fix NULL deref in btf_type_is_resolve_source_only Commit 1dc92851849c ("bpf: kernel side support for BTF Var and DataSec") added invocations of btf_type_is_resolve_source_only before btf_type_nosize_or_null, which checks for the NULL pointer. Swap the order of btf_type_nosize_or_null and btf_type_is_resolve_source_only to make sure we do the NULL pointer check first. Fixes: 1dc92851849c ("bpf: kernel side support for BTF Var and DataSec") Reported-by: syzbot Change-Id: I707832c90c42a6f94853b0afeae708f4237c03a0 Signed-off-by: Stanislav Fomichev Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d890db3452fa..58ee928c07c8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1928,8 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_is_resolve_source_only(index_type) || - btf_type_nosize_or_null(index_type)) { + if (btf_type_nosize_or_null(index_type) || + btf_type_is_resolve_source_only(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1948,8 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_is_resolve_source_only(elem_type) || - btf_type_nosize_or_null(elem_type)) { + if (btf_type_nosize_or_null(elem_type) || + btf_type_is_resolve_source_only(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -2170,8 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_is_resolve_source_only(member_type) || - btf_type_nosize_or_null(member_type)) { + if (btf_type_nosize_or_null(member_type) || + btf_type_is_resolve_source_only(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; From 149d5aef5aa610db8494a1ffc6850fc70826d46a Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 13 Jun 2019 18:39:58 +0900 Subject: [PATCH 1280/1640] UPSTREAM: xdp: Add tracepoint for bulk XDP_TX This is introduced for admins to check what is happening on XDP_TX when bulk XDP_TX is in use, which will first be introduced in veth in the next commit. v3: - Add act field to be in line with other XDP tracepoints. 
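As a sketch of the intended call site (a hypothetical driver helper shown only for illustration; the real user is the veth bulk-TX path added in the following commit):

	/* hypothetical flush path of a driver that batches XDP_TX frames */
	static void bq_flush(struct net_device *dev, struct xdp_frame **q, int count)
	{
		int err = 0;
		int sent = my_ndo_xmit_bulk(dev, q, count, &err);	/* hypothetical transmit helper */

		/* sent/drops/err feed the tracepoint, matching the TP_PROTO below */
		trace_xdp_bulk_tx(dev, sent, count - sent, err);
	}
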
Change-Id: I0a8904ab72819428a34ad0225c2b094c15efb148 Signed-off-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/trace/events/xdp.h | 29 +++++++++++++++++++++++++++++ kernel/bpf/core.c | 1 + 2 files changed, 30 insertions(+) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index e95cb86b65cf..01389b9c3c6a 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -50,6 +50,35 @@ TRACE_EVENT(xdp_exception, __entry->ifindex) ); +TRACE_EVENT(xdp_bulk_tx, + + TP_PROTO(const struct net_device *dev, + int sent, int drops, int err), + + TP_ARGS(dev, sent, drops, err), + + TP_STRUCT__entry( + __field(int, ifindex) + __field(u32, act) + __field(int, drops) + __field(int, sent) + __field(int, err) + ), + + TP_fast_assign( + __entry->ifindex = dev->ifindex; + __entry->act = XDP_TX; + __entry->drops = drops; + __entry->sent = sent; + __entry->err = err; + ), + + TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d", + __entry->ifindex, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->sent, __entry->drops, __entry->err) +); + DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f96675683936..a49ec9f96984 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2139,3 +2139,4 @@ EXPORT_SYMBOL(bpf_stats_enabled_key); #include EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); From d04d0723700f66605cb8bae17e84adf0b11fc2c9 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Tue, 25 Jun 2019 17:41:50 +0100 Subject: [PATCH 1281/1640] UPSTREAM: bpf: fix BPF_ALU32 | BPF_ARSH on BE arches Yauheni reported that the following code does not work correctly on BE arches: ALU_ARSH_X: DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); CONT; ALU_ARSH_K: DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); CONT; and is causing a failure of the test_verifier test 'arsh32 on imm 2' on BE arches. The code takes the address of the variable and reinterprets the memory directly, so it is not endianness neutral. We should instead perform standard C type casting on the variable. A u64 to s32 conversion drops the high 32 bits and preserves the low 32 bits as a signed integer, which is all we want. Fixes: 2dc6b100f928 ("bpf: interpreter support BPF_ALU | BPF_ARSH") Reported-by: Yauheni Kaliuta Reviewed-by: Jakub Kicinski Reviewed-by: Quentin Monnet Change-Id: Ia69cac3290d9a561d41030a1a516809d24e1edb0 Signed-off-by: Jiong Wang Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a49ec9f96984..0bfaa723469c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1389,10 +1389,10 @@ select_insn: insn++; CONT; ALU_ARSH_X: - DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); + DST = (u64) (u32) (((s32) DST) >> SRC); CONT; ALU_ARSH_K: - DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); + DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; From 9f84dc9b41d31e8788696afca34c90cc3d6be892 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 25 Jun 2019 14:38:58 -0700 Subject: [PATCH 1282/1640] UPSTREAM: bpf: fix cgroup bpf release synchronization Since commit 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself"), cgroup_bpf release occurs asynchronously (from a worker context), and before the release of the cgroup itself. 
This introduced a previously non-existing race between the release and update paths, e.g. when a leaf's cgroup_bpf is released while a new bpf program is attached to one of its ancestor cgroups at the same time. The race may result in a double-free and other memory corruptions. To fix the problem, let's protect the body of cgroup_bpf_release() with cgroup_mutex, as it effectively was previously, when all this code was called from the cgroup release path with the cgroup mutex held. Also, on the update path, let's skip cgroups that have no chance of invoking a bpf program. If the cgroup bpf refcnt has reached 0, it means that the cgroup is offline (no attached processes), and there are no associated sockets left. So there is no point in updating the effective progs array, and doing so can even lead to a leak if it happens after the release. Let's skip such cgroups. Big thanks to Tejun Heo for discovering and debugging this problem! Fixes: 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself") Reported-by: Tejun Heo Change-Id: Ia6343fb624819673a9529d03b4e2f4be897ef8ae Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c225c42e114a..077ed3a19848 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -16,6 +16,8 @@ #include #include +#include "../cgroup/cgroup-internal.h" + DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); EXPORT_SYMBOL(cgroup_bpf_enabled_key); @@ -38,6 +40,8 @@ static void cgroup_bpf_release(struct work_struct *work) struct bpf_prog_array *old_array; unsigned int type; + mutex_lock(&cgroup_mutex); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog_list *pl, *tmp; @@ -54,10 +58,12 @@ static void cgroup_bpf_release(struct work_struct *work) } old_array = rcu_dereference_protected( cgrp->bpf.effective[type], - percpu_ref_is_dying(&cgrp->bpf.refcnt)); + lockdep_is_held(&cgroup_mutex)); bpf_prog_array_free(old_array); } + mutex_unlock(&cgroup_mutex); + percpu_ref_exit(&cgrp->bpf.refcnt); cgroup_put(cgrp); } @@ -229,6 +235,9 @@ static int update_effective_progs(struct cgroup *cgrp, css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + err = compute_effective_progs(desc, type, &desc->bpf.inactive); if (err) goto cleanup; @@ -238,6 +247,14 @@ static int update_effective_progs(struct cgroup *cgrp, css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); + if (percpu_ref_is_zero(&desc->bpf.refcnt)) { + if (unlikely(desc->bpf.inactive)) { + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + continue; + } + activate_effective_progs(desc, type, desc->bpf.inactive); desc->bpf.inactive = NULL; } From 9e57f30904d1aa6b5b9ef138af765dcd4e8db1c2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 27 Jun 2019 13:38:47 -0700 Subject: [PATCH 1283/1640] UPSTREAM: bpf: implement getsockopt and setsockopt hooks Implement new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks. BPF_CGROUP_SETSOCKOPT can modify user setsockopt arguments before passing them down to the kernel or bypass the kernel completely. 
BPF_CGROUP_GETSOCKOPT can inspect/modify getsockopt arguments that the kernel returns. Both hooks reuse existing PTR_TO_PACKET{,_END} infrastructure. The buffer memory is pre-allocated (because I don't think there is a precedent for working with __user memory from bpf). This might be slow to do for each {s,g}etsockopt call, so I've added __cgroup_bpf_prog_array_is_empty, which exits early if there is nothing attached to a cgroup. Note, however, that there is a race between __cgroup_bpf_prog_array_is_empty and BPF_PROG_RUN_ARRAY where cgroup program layout might have changed; this should not be a problem because in general there is a race between multiple calls to {s,g}etsockopt and user adding/removing bpf progs from a cgroup. The return code of the BPF program is handled as follows: * 0: EPERM * 1: success, continue with next BPF program in the cgroup chain v9: * allow overwriting setsockopt arguments (Alexei Starovoitov): * use set_fs (same as kernel_setsockopt) * buffer is always kzalloc'd (no small on-stack buffer) v8: * use s32 for optlen (Andrii Nakryiko) v7: * return only 0 or 1 (Alexei Starovoitov) * always run all progs (Alexei Starovoitov) * use optval=0 as kernel bypass in setsockopt (Alexei Starovoitov) (decided to use optval=-1 instead, optval=0 might be a valid input) * call getsockopt hook after kernel handlers (Alexei Starovoitov) v6: * rework cgroup chaining; stop as soon as bpf program returns 0 or 2; see patch with the documentation for the details * drop Andrii's and Martin's Acked-by (not sure they are comfortable with the new state of things) v5: * skip copy_to_user() and put_user() when ret == 0 (Martin Lau) v4: * don't export bpf_sk_fullsock helper (Martin Lau) * size != sizeof(__u64) for uapi pointers (Martin Lau) * offsetof instead of bpf_ctx_range when checking ctx access (Martin Lau) v3: * typos in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY comments (Andrii Nakryiko) * reverse christmas tree in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY (Andrii Nakryiko) * use __bpf_md_ptr instead of __u32 for optval{,_end} (Martin Lau) * use BPF_FIELD_SIZEOF() for consistency (Martin Lau) * new CG_SOCKOPT_ACCESS macro to wrap repeated parts v2: * moved bpf_sockopt_kern fields around to remove a hole (Martin Lau) * aligned bpf_sockopt_kern->buf to 8 bytes (Martin Lau) * bpf_prog_array_is_empty instead of bpf_prog_array_length (Martin Lau) * added [0,2] return code check to verifier (Martin Lau) * dropped unused buf[64] from the stack (Martin Lau) * use PTR_TO_SOCKET for bpf_sockopt->sk (Martin Lau) * dropped bpf_target_off from ctx rewrites (Martin Lau) * use return code for kernel bypass (Martin Lau & Andrii Nakryiko) Cc: Andrii Nakryiko Cc: Martin Lau Change-Id: I2eb79aea6a353ca33381dcf7859f382affadf205 Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 45 +++++ include/linux/bpf.h | 2 + include/linux/bpf_types.h | 1 + include/linux/filter.h | 10 ++ include/uapi/linux/bpf.h | 14 ++ kernel/bpf/cgroup.c | 333 +++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 9 + kernel/bpf/syscall.c | 19 +++ kernel/bpf/verifier.c | 8 + net/core/filter.c | 2 +- net/socket.c | 30 ++++ 11 files changed, 472 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c60113c46b72..44769d4525b9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -124,6 +124,14 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, loff_t *ppos, void **new_buf, enum bpf_attach_type type); 
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, + int *optname, char __user *optval, + int *optlen, char **kernel_optval); +int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen, int max_optlen, + int retval); + static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { @@ -286,6 +294,38 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, __ret; \ }) +#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ + kernel_optval) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ + optname, optval, \ + optlen, \ + kernel_optval); \ + __ret; \ +}) + +#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + get_user(__ret, optlen); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \ + max_optlen, retval) \ +({ \ + int __ret = retval; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ + optname, optval, \ + optlen, max_optlen, \ + retval); \ + __ret; \ +}) + int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, @@ -357,6 +397,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) +#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ + optlen, max_optlen, retval) ({ retval; }) +#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ + kernel_optval) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d1326e64a4ac..5c2a2fad90e8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -530,6 +530,7 @@ struct bpf_prog_array { struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); +bool bpf_prog_array_is_empty(struct bpf_prog_array *array); int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, __u32 __user *prog_ids, u32 cnt); @@ -1057,6 +1058,7 @@ extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; +extern const struct bpf_func_proto bpf_tcp_sock_proto; /* Shared helpers among cBPF and eBPF. 
*/ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 8c562302e511..ced497a618c4 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -28,6 +28,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt) #endif #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) diff --git a/include/linux/filter.h b/include/linux/filter.h index 1d760b867650..279b24b22b96 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1280,4 +1280,14 @@ struct bpf_sysctl_kern { u64 tmp_reg; }; +struct bpf_sockopt_kern { + struct sock *sk; + u8 *optval; + u8 *optval_end; + s32 level; + s32 optname; + s32 optlen; + s32 retval; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 59a31ea9a002..a76515e7c886 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -170,6 +170,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + BPF_PROG_TYPE_CGROUP_SOCKOPT, }; enum bpf_attach_type { @@ -194,6 +195,8 @@ enum bpf_attach_type { BPF_CGROUP_SYSCTL, BPF_CGROUP_UDP4_RECVMSG, BPF_CGROUP_UDP6_RECVMSG, + BPF_CGROUP_GETSOCKOPT, + BPF_CGROUP_SETSOCKOPT, __MAX_BPF_ATTACH_TYPE }; @@ -3391,4 +3394,15 @@ struct bpf_sysctl { */ }; +struct bpf_sockopt { + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(void *, optval); + __bpf_md_ptr(void *, optval_end); + + __s32 level; + __s32 optname; + __s32 optlen; + __s32 retval; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 077ed3a19848..76fa0076f20d 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "../cgroup/cgroup-internal.h" @@ -938,6 +939,188 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, + enum bpf_attach_type attach_type) +{ + struct bpf_prog_array *prog_array; + bool empty; + + rcu_read_lock(); + prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); + empty = bpf_prog_array_is_empty(prog_array); + rcu_read_unlock(); + + return empty; +} + +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +{ + if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) + return -EINVAL; + + ctx->optval = kzalloc(max_optlen, GFP_USER); + if (!ctx->optval) + return -ENOMEM; + + ctx->optval_end = ctx->optval + max_optlen; + ctx->optlen = max_optlen; + + return 0; +} + +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +{ + kfree(ctx->optval); +} + +int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, + int *optname, char __user *optval, + int *optlen, char **kernel_optval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = *level, + .optname = *optname, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. 
+ */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + return 0; + + ret = sockopt_alloc_buf(&ctx, *optlen); + if (ret) + return ret; + + if (copy_from_user(ctx.optval, optval, *optlen) != 0) { + ret = -EFAULT; + goto out; + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen == -1) { + /* optlen set to -1, bypass kernel */ + ret = 1; + } else if (ctx.optlen > *optlen || ctx.optlen < -1) { + /* optlen is out of bounds */ + ret = -EFAULT; + } else { + /* optlen within bounds, run kernel handler */ + ret = 0; + + /* export any potential modifications */ + *level = ctx.level; + *optname = ctx.optname; + *optlen = ctx.optlen; + *kernel_optval = ctx.optval; + } + +out: + if (ret) + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); + +int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen, int max_optlen, + int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. + */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + return retval; + + ret = sockopt_alloc_buf(&ctx, max_optlen); + if (ret) + return ret; + + if (!retval) { + /* If kernel getsockopt finished successfully, + * copy whatever was returned to the user back + * into our temporary buffer. Set optlen to the + * one that kernel returned as well to let + * BPF programs inspect the value. + */ + + if (get_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + if (ctx.optlen > max_optlen) + ctx.optlen = max_optlen; + + if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { + ret = -EFAULT; + goto out; + } + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen > max_optlen) { + ret = -EFAULT; + goto out; + } + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. 
+ */ + if (ctx.retval != 0 && ctx.retval != retval) { + ret = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ctx.optval, ctx.optlen) || + put_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + ret = ctx.retval; + +out: + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); + static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, size_t *lenp) { @@ -1198,3 +1381,153 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = { const struct bpf_prog_ops cg_sysctl_prog_ops = { }; + +static const struct bpf_func_proto * +cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif + default: + return cgroup_base_func_proto(func_id, prog); + } +} + +static bool cg_sockopt_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sockopt)) + return false; + + if (off % size != 0) + return false; + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_GETSOCKOPT; + case offsetof(struct bpf_sockopt, optname): + /* fallthrough */ + case offsetof(struct bpf_sockopt, level): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_SETSOCKOPT; + case offsetof(struct bpf_sockopt, optlen): + return size == size_default; + default: + return false; + } + } + + switch (off) { + case offsetof(struct bpf_sockopt, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; + case offsetof(struct bpf_sockopt, optval): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct bpf_sockopt, optval_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; + default: + if (size != size_default) + return false; + break; + } + return true; +} + +#define CG_SOCKOPT_ACCESS_FIELD(T, F) \ + T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sockopt_kern, F)) + +static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sockopt, sk): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); + break; + case offsetof(struct bpf_sockopt, level): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); + break; + case offsetof(struct bpf_sockopt, optname): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); + break; + case offsetof(struct bpf_sockopt, optlen): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); + else + *insn++ = 
CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); + break; + case offsetof(struct bpf_sockopt, retval): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); + break; + case offsetof(struct bpf_sockopt, optval): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); + break; + case offsetof(struct bpf_sockopt, optval_end): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); + break; + } + + return insn - insn_buf; +} + +static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, + bool direct_write, + const struct bpf_prog *prog) +{ + /* Nothing to do for sockopt argument. The data is kzalloc'ated. + */ + return 0; +} + +const struct bpf_verifier_ops cg_sockopt_verifier_ops = { + .get_func_proto = cg_sockopt_func_proto, + .is_valid_access = cg_sockopt_is_valid_access, + .convert_ctx_access = cg_sockopt_convert_ctx_access, + .gen_prologue = cg_sockopt_get_prologue, +}; + +const struct bpf_prog_ops cg_sockopt_prog_ops = { +}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0bfaa723469c..338e4be37d96 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1846,6 +1846,15 @@ int bpf_prog_array_length(struct bpf_prog_array *array) return cnt; } +bool bpf_prog_array_is_empty(struct bpf_prog_array *array) +{ + struct bpf_prog_array_item *item; + + for (item = array->items; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) + return false; + return true; +} static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 94e1c43bc584..731253ee9140 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1591,6 +1591,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + switch (expected_attach_type) { + case BPF_CGROUP_SETSOCKOPT: + case BPF_CGROUP_GETSOCKOPT: + return 0; + default: + return -EINVAL; + } default: return 0; } @@ -1846,6 +1854,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 
0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: return prog->enforce_expected_attach_type && @@ -1918,6 +1927,10 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SYSCTL: ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -2001,6 +2014,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_SYSCTL: ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -2037,6 +2054,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e23128cb93a2..5d04f499c0ec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2190,6 +2190,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, env->seen_direct_write = true; return true; + + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + if (t == BPF_WRITE) + env->seen_direct_write = true; + + return true; + default: return false; } @@ -6238,6 +6245,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: break; default: return 0; diff --git a/net/core/filter.c b/net/core/filter.c index 0529b7ef022f..2a361c070a9a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5208,7 +5208,7 @@ BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) return (unsigned long)NULL; } -static const struct bpf_func_proto bpf_tcp_sock_proto = { +const struct bpf_func_proto bpf_tcp_sock_proto = { .func = bpf_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, diff --git a/net/socket.c b/net/socket.c index f938849dc11e..f4dad9c24336 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1868,6 +1868,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, static int __sys_setsockopt(int fd, int level, int optname, char __user *optval, int optlen) { + mm_segment_t oldfs = get_fs(); + char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -1880,6 +1882,22 @@ static int __sys_setsockopt(int fd, int level, int optname, if (err) goto out_put; + err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, + &optname, optval, &optlen, + &kernel_optval); + + if (err < 0) { + goto out_put; + } else if (err > 0) { + err = 0; + goto out_put; + } + + if (kernel_optval) { + set_fs(KERNEL_DS); + optval = (char __user __force *)kernel_optval; + } + if (level == SOL_SOCKET) err = sock_setsockopt(sock, level, optname, optval, @@ -1888,6 +1906,11 @@ static int __sys_setsockopt(int fd, int level, int optname, err = sock->ops->setsockopt(sock, level, optname, optval, optlen); + + if (kernel_optval) { + set_fs(oldfs); + kfree(kernel_optval); + } out_put: fput_light(sock->file, fput_needed); } @@ -1910,6 +1933,7 @@ static int __sys_getsockopt(int fd, int level, int optname, { int err, fput_needed; struct socket *sock; + int max_optlen; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { @@ -1917,6 +1941,8 @@ static int __sys_getsockopt(int fd, int level, int optname, if (err) goto out_put; + max_optlen = 
BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, @@ -1925,6 +1951,10 @@ static int __sys_getsockopt(int fd, int level, int optname, err = sock->ops->getsockopt(sock, level, optname, optval, optlen); + + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, + optval, optlen, + max_optlen, err); out_put: fput_light(sock->file, fput_needed); } From 8f6cb04d79fad5da66c910283bc62ba69c37dc0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: [PATCH 1284/1640] UPSTREAM: xskmap: Move non-standard list manipulation to helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper in list.h for the non-standard way of clearing a list that is used in xskmap. This makes it easier to reuse it in the other map types, and also makes sure this usage is not forgotten in any list refactorings in the future. Change-Id: I3717523d2a890fad4e026aedb5066e1cc75d4d5b Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann --- include/linux/list.h | 14 ++++++++++++++ kernel/bpf/xskmap.c | 3 +-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index d2c12ef7a4e3..d3d4b06691fb 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -106,6 +106,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next) WRITE_ONCE(prev->next, next); } +/* + * Delete a list entry and clear the 'prev' pointer. + * + * This is a special-purpose list clearing method used in the networking code + * for lists allocated as per-cpu, where we don't want to incur the extra + * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this + * needs to check the node 'prev' pointer instead of calling list_empty(). + */ +static inline void __list_del_clearprev(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->prev = NULL; +} + /** * list_del - deletes entry from list. * @entry: the element to delete from the list. diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index ef7338cebd18..9bb96ace9fa1 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -145,8 +145,7 @@ void __xsk_map_flush(struct bpf_map *map) list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { xsk_flush(xs); - __list_del(xs->flush_node.prev, xs->flush_node.next); - xs->flush_node.prev = NULL; + __list_del_clearprev(&xs->flush_node); } } From 27c97857f609edcafcc0805bbe403dcd5fd77921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: [PATCH 1285/1640] UPSTREAM: devmap/cpumap: Use flush list instead of bitmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The socket map uses a linked list instead of a bitmap to keep track of which entries to flush. Do the same for devmap and cpumap, as this means we don't have to care about the map index when enqueueing things into the map (and so we can cache the map lookup). 
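Both conversions follow the same two-step pattern; roughly (a sketch distilled from the cpumap hunks below, with locking and error paths elided):

	/* enqueue: a bulk queue joins the per-cpu flush list at most once;
	 * flush_node.prev == NULL is the "not currently queued" marker.
	 */
	if (!bq->flush_node.prev)
		list_add(&bq->flush_node, flush_list);

	/* flush: walk only the queues that actually hold frames, instead of
	 * scanning a bitmap across all map entries; bq_flush_to_queue() ends
	 * with __list_del_clearprev(), restoring the NULL-prev marker.
	 */
	list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
		bq_flush_to_queue(bq, true);
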
Change-Id: I0cf9483ddfbad1555c6314a4a3bd7d82c77333ff Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 105 ++++++++++++++++++++----------------------- kernel/bpf/devmap.c | 107 +++++++++++++++++++------------------------- net/core/filter.c | 2 - 3 files changed, 95 insertions(+), 119 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index d39b34e0384e..a367fc850393 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -32,14 +32,19 @@ /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is - * guaranteed that setting flush bit and flush operation happen on + * guaranteed that queueing the frame and the flush operation happen on * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() * which queue in bpf_cpu_map_entry contains packets. */ #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct bpf_cpu_map_entry; +struct bpf_cpu_map; + struct xdp_bulk_queue { void *q[CPU_MAP_BULK_SIZE]; + struct list_head flush_node; + struct bpf_cpu_map_entry *obj; unsigned int count; }; @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry { /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; + struct bpf_cpu_map *cmap; + /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; @@ -65,23 +72,17 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx); - -static u64 cpu_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); -} +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; + int ret, cpu; u64 cost; - int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); + cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -114,12 +115,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - /* A per cpu bitfield with a bit per possible CPU in map */ - cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), - __alignof__(unsigned long)); - if (!cmap->flush_needed) + cmap->flush_list = alloc_percpu(struct list_head); + if (!cmap->flush_list) goto free_charge; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); + /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), @@ -129,7 +131,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_percpu: - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); free_charge: 
bpf_map_charge_finish(&cmap->map.memory); free_cmap: @@ -334,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; - int numa, err; + struct xdp_bulk_queue *bq; + int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -349,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->bulkq) goto free_rcu; + for_each_possible_cpu(i) { + bq = per_cpu_ptr(rcpu->bulkq, i); + bq->obj = rcpu; + } + /* Alloc queue */ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); if (!rcpu->queue) @@ -405,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(rcpu, bq, false); + bq_flush_to_queue(bq, false); } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ @@ -488,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); if (!rcpu) return -ENOMEM; + rcpu->cmap = cmap; } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); @@ -514,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map) synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * list be empty on _all_ cpus. Because the above synchronize_rcu() + * ensures the map is disconnected from the program we can assume no new + * items will be added to the list. 
*/ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - while (!bitmap_empty(bitmap, cmap->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -538,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map) /* bq flush and cleanup happens after RCU graze-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -590,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) { + struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; struct ptr_ring *q; @@ -621,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, bq->count = 0; spin_unlock(&q->producer_lock); + __list_del_clearprev(&bq->flush_node); + /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); return 0; @@ -631,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { + struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(rcpu, bq, true); + bq_flush_to_queue(bq, true); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -646,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) * operation, when completing napi->poll call. */ bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -665,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - - __set_bit(bit, bitmap); -} - void __cpu_map_flush(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - u32 bit; + struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct xdp_bulk_queue *bq, *tmp; - /* The napi->poll softirq makes sure __cpu_map_insert_ctx() - * and __cpu_map_flush() happen on same CPU. Thus, the percpu - * bitmap indicate which percpu bulkq have packets. - */ - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if entry is removed by user space - * between xdp redirect and flush op. 
- */ - if (unlikely(!rcpu)) - continue; - - __clear_bit(bit, bitmap); - - /* Flush all frames in bulkq to real queue */ - bq = this_cpu_ptr(rcpu->bulkq); - bq_flush_to_queue(rcpu, bq, true); + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + bq_flush_to_queue(bq, true); /* If already running, costs spin_lock_irqsave + smb_mb */ - wake_up_process(rcpu->kthread); + wake_up_process(bq->obj->kthread); } } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 40e86a7e0ef0..a4dddc867cbf 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -17,9 +17,8 @@ * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using - * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed - * until all bits are cleared indicating outstanding flush operations have - * completed. + * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until + * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. As noted above the xchg() operation also keep the @@ -48,9 +47,13 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 +struct bpf_dtab_netdev; + struct xdp_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct list_head flush_node; struct net_device *dev_rx; + struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -65,23 +68,18 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; struct list_head list; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static u64 dev_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); -} - static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; + int err, cpu; u64 cost; - int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -99,7 +97,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += dev_map_bitmap_size(attr) * num_possible_cpus(); + cost += sizeof(struct list_head) * num_possible_cpus(); /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); @@ -108,28 +106,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) err = -ENOMEM; - /* A per cpu bitfield with a bit per possible net device */ - dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), - __alignof__(unsigned long), - GFP_KERNEL | __GFP_NOWARN); - if (!dtab->flush_needed) + dtab->flush_list = alloc_percpu(struct list_head); + if (!dtab->flush_list) goto free_charge; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_charge; + goto free_percpu; spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; + +free_percpu: + free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); free_dtab: - 
free_percpu(dtab->flush_needed); kfree(dtab); return ERR_PTR(err); } @@ -158,14 +158,14 @@ static void dev_map_free(struct bpf_map *map) rcu_barrier(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * list to empty on _all_ cpus. * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * from the program we can assume no new items will be added. */ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - while (!bitmap_empty(bitmap, dtab->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -181,7 +181,7 @@ static void dev_map_free(struct bpf_map *map) kfree(dev); } - free_percpu(dtab->flush_needed); + free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); kfree(dtab); } @@ -203,18 +203,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - - __set_bit(bit, bitmap); -} - -static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq, u32 flags, +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { + struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; int i; @@ -241,6 +233,7 @@ out: trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; + __list_del_clearprev(&bq->flush_node); return 0; error: /* If ndo_xdp_xmit fails with an errno, no frames have been @@ -263,31 +256,18 @@ error: * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the - * net device can be torn down. On devmap tear down we ensure the ctx bitmap - * is zeroed before completing to ensure all flush operations have completed. + * net device can be torn down. On devmap tear down we ensure the flush list + * is empty before completing to ensure all flush operations have completed. */ void __dev_map_flush(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - u32 bit; + struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if the dev entry is removed by user space - * between xdp redirect and flush op. 
- */ - if (unlikely(!dev)) - continue; - - bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); - - __clear_bit(bit, bitmap); - } + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) + bq_xmit_all(bq, XDP_XMIT_FLUSH, true); rcu_read_unlock(); } @@ -314,10 +294,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { + struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq, 0, true); + bq_xmit_all(bq, 0, true); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -327,6 +308,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, bq->dev_rx = dev_rx; bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -377,17 +362,12 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { struct xdp_bulk_queue *bq; - unsigned long *bitmap; - int cpu; rcu_read_lock(); for_each_online_cpu(cpu) { - bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); - __clear_bit(dev->bit, bitmap); - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); + bq_xmit_all(bq, XDP_XMIT_FLUSH, false); } rcu_read_unlock(); } @@ -434,8 +414,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, struct net *net = current->nsproxy->net_ns; gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; - u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; + struct xdp_bulk_queue *bq; + u32 i = *(u32 *)key; + int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -458,6 +440,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; } + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { free_percpu(dev->bulkq); diff --git a/net/core/filter.c b/net/core/filter.c index 2a361c070a9a..76fe0a507e5f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3157,7 +3157,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = dev_map_enqueue(dst, xdp, dev_rx); if (err) return err; - __dev_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_CPUMAP: { @@ -3166,7 +3165,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (err) return err; - __cpu_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_XSKMAP: { From 9749eb75892b957de40535e701e7462090ac5901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:35 +0200 Subject: [PATCH 1286/1640] UPSTREAM: devmap: Allow map lookups from eBPF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't currently allow lookups into a devmap from eBPF, because the map lookup returns a pointer directly to the dev->ifindex, which shouldn't be modifiable from eBPF. However, being able to do lookups in devmaps is useful to know (e.g.) whether forwarding to a specific interface is enabled. Currently, programs work around this by keeping a shadow map of another type which indicates whether a map index is valid. 
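As an illustrative sketch of the use-case this lifts (not part of the patch; the map name, sizes and section names are invented, and the usual bpf_helpers.h definitions are assumed):

  struct bpf_map_def SEC("maps") fwd_map = {
          .type        = BPF_MAP_TYPE_DEVMAP,
          .key_size    = sizeof(__u32),
          .value_size  = sizeof(__u32),
          .max_entries = 64,
  };

  SEC("xdp")
  int xdp_fwd(struct xdp_md *ctx)
  {
          __u32 key = ctx->ingress_ifindex;

          /* Lookup now permitted: a NULL result means no forwarding
           * entry is configured for this index, so no shadow map is
           * needed.
           */
          if (!bpf_map_lookup_elem(&fwd_map, &key))
                  return XDP_PASS;

          return bpf_redirect_map(&fwd_map, key, 0);
  }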
Since we now have a flag to make maps read-only from the eBPF side, we can simply lift the lookup restriction if we make sure this flag is always set. Change-Id: Id7355cc5b9bfd45fde3d43efbecc0b40949cde2d Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 5 +++++ kernel/bpf/verifier.c | 7 ++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a4dddc867cbf..d83cf8ccc872 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -89,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + /* Lookup returns a pointer straight to dev->ifindex, so make sure the + * verifier prevents writes from the BPF side + */ + attr->map_flags |= BPF_F_RDONLY_PROG; + dtab = kzalloc(sizeof(*dtab), GFP_USER); if (!dtab) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5d04f499c0ec..e6d35e4861e1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3388,12 +3388,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_get_local_storage) goto error; break; - /* devmap returns a pointer to a live net_device ifindex that we cannot - * allow to be modified from bpf side. So do not allow lookup elements - * for now. - */ case BPF_MAP_TYPE_DEVMAP: - if (func_id != BPF_FUNC_redirect_map) + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; /* Restrict bpf side of cpumap and xskmap, open when use-cases From 01b521b7d7df2fe3e248c5d0a1ae918dc53ccca1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 28 Jun 2019 09:24:09 -0700 Subject: [PATCH 1287/1640] UPSTREAM: bpf: fix precision tracking When equivalent state is found the current state needs to propagate precision marks. Otherwise the verifier will prune the search incorrectly. There is a price for correctness: before before broken fixed cnst spill precise precise bpf_lb-DLB_L3.o 1923 8128 1863 1898 bpf_lb-DLB_L4.o 3077 6707 2468 2666 bpf_lb-DUNKNOWN.o 1062 1062 544 544 bpf_lxc-DDROP_ALL.o 166729 380712 22629 36823 bpf_lxc-DUNKNOWN.o 174607 440652 28805 45325 bpf_netdev.o 8407 31904 6801 7002 bpf_overlay.o 5420 23569 4754 4858 bpf_lxc_jit.o 39389 359445 50925 69631 Overall precision tracking is still very effective. Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Reported-by: Lawrence Brakmo Change-Id: Ie4c693fe103d83ab4babdda3f92772df3230d076 Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Tested-by: Lawrence Brakmo Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 121 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 107 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e6d35e4861e1..16ae98318c51 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1658,16 +1658,18 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, } } -static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, + int spi) { struct bpf_verifier_state *st = env->cur_state; int first_idx = st->first_insn_idx; int last_idx = env->insn_idx; struct bpf_func_state *func; struct bpf_reg_state *reg; - u32 reg_mask = 1u << regno; - u64 stack_mask = 0; + u32 reg_mask = regno >= 0 ? 
1u << regno : 0; + u64 stack_mask = spi >= 0 ? 1ull << spi : 0; bool skip_first = true; + bool new_marks = false; int i, err; if (!env->allow_ptr_leaks) @@ -1675,18 +1677,43 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; func = st->frame[st->curframe]; - reg = &func->regs[regno]; - if (reg->type != SCALAR_VALUE) { - WARN_ONCE(1, "backtracing misuse"); - return -EFAULT; + if (regno >= 0) { + reg = &func->regs[regno]; + if (reg->type != SCALAR_VALUE) { + WARN_ONCE(1, "backtracing misuse"); + return -EFAULT; + } + if (!reg->precise) + new_marks = true; + else + reg_mask = 0; + reg->precise = true; } - if (reg->precise) - return 0; - func->regs[regno].precise = true; + while (spi >= 0) { + if (func->stack[spi].slot_type[0] != STACK_SPILL) { + stack_mask = 0; + break; + } + reg = &func->stack[spi].spilled_ptr; + if (reg->type != SCALAR_VALUE) { + stack_mask = 0; + break; + } + if (!reg->precise) + new_marks = true; + else + stack_mask = 0; + reg->precise = true; + break; + } + + if (!new_marks) + return 0; + if (!reg_mask && !stack_mask) + return 0; for (;;) { DECLARE_BITMAP(mask, 64); - bool new_marks = false; u32 history = st->jmp_history_cnt; if (env->log.level & BPF_LOG_LEVEL) @@ -1729,12 +1756,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) if (!st) break; + new_marks = false; func = st->frame[st->curframe]; bitmap_from_u64(mask, reg_mask); for_each_set_bit(i, mask, 32) { reg = &func->regs[i]; - if (reg->type != SCALAR_VALUE) + if (reg->type != SCALAR_VALUE) { + reg_mask &= ~(1u << i); continue; + } if (!reg->precise) new_marks = true; reg->precise = true; @@ -1755,11 +1785,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) return -EFAULT; } - if (func->stack[i].slot_type[0] != STACK_SPILL) + if (func->stack[i].slot_type[0] != STACK_SPILL) { + stack_mask &= ~(1ull << i); continue; + } reg = &func->stack[i].spilled_ptr; - if (reg->type != SCALAR_VALUE) + if (reg->type != SCALAR_VALUE) { + stack_mask &= ~(1ull << i); continue; + } if (!reg->precise) new_marks = true; reg->precise = true; @@ -1771,6 +1805,8 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) reg_mask, stack_mask); } + if (!reg_mask && !stack_mask) + break; if (!new_marks) break; @@ -1780,6 +1816,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; } +static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +{ + return __mark_chain_precision(env, regno, -1); +} + +static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) +{ + return __mark_chain_precision(env, -1, spi); +} static bool is_spillable_regtype(enum bpf_reg_type type) { @@ -7268,6 +7313,46 @@ static int propagate_liveness(struct bpf_verifier_env *env, return 0; } +/* find precise scalars in the previous equivalent state and + * propagate them into the current state + */ +static int propagate_precision(struct bpf_verifier_env *env, + const struct bpf_verifier_state *old) +{ + struct bpf_reg_state *state_reg; + struct bpf_func_state *state; + int i, err = 0; + + state = old->frame[old->curframe]; + state_reg = state->regs; + for (i = 0; i < BPF_REG_FP; i++, state_reg++) { + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "propagating r%d\n", i); + err = mark_chain_precision(env, i); + if (err < 0) + return err; + } + + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if 
(state->stack[i].slot_type[0] != STACK_SPILL) + continue; + state_reg = &state->stack[i].spilled_ptr; + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "propagating fp%d\n", + (-i - 1) * BPF_REG_SIZE); + err = mark_chain_precision_stack(env, i); + if (err < 0) + return err; + } + return 0; +} + static bool states_maybe_looping(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { @@ -7360,6 +7445,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * this state and will pop a new one. */ err = propagate_liveness(env, &sl->state, cur); + + /* if previous state reached the exit with precision and + * current state is equivalent to it (except precsion marks) + * the precision needs to be propagated back in + * the current state. + */ + err = err ? : push_jmp_history(env, cur); + err = err ? : propagate_precision(env, &sl->state); if (err) return err; return 1; From e16843179bb0ee7b487110dda678258a4c8853af Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 3 Jul 2019 16:26:30 +0800 Subject: [PATCH 1288/1640] UPSTREAM: bpf: cgroup: Fix build error without CONFIG_NET If CONFIG_NET is not set and CONFIG_CGROUP_BPF=y, gcc building fails: kernel/bpf/cgroup.o: In function `cg_sockopt_func_proto': cgroup.c:(.text+0x237e): undefined reference to `bpf_sk_storage_get_proto' cgroup.c:(.text+0x2394): undefined reference to `bpf_sk_storage_delete_proto' kernel/bpf/cgroup.o: In function `__cgroup_bpf_run_filter_getsockopt': (.text+0x2a1f): undefined reference to `lock_sock_nested' (.text+0x2ca2): undefined reference to `release_sock' kernel/bpf/cgroup.o: In function `__cgroup_bpf_run_filter_setsockopt': (.text+0x3006): undefined reference to `lock_sock_nested' (.text+0x32bb): undefined reference to `release_sock' Reported-by: Hulk Robot Suggested-by: Stanislav Fomichev Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Change-Id: I8f0d6905338b01c0823b39657c353d5601ec7836 Signed-off-by: YueHaibing Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 76fa0076f20d..0a00eaca6fae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -939,6 +939,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +#ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, enum bpf_attach_type attach_type) { @@ -1120,6 +1121,7 @@ out: return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); +#endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, size_t *lenp) @@ -1386,10 +1388,12 @@ static const struct bpf_func_proto * cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { +#ifdef CONFIG_NET case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; +#endif #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; From 364d0a1042811ec0be0994b0f187b4d87de2c8e9 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 27 Jun 2019 20:50:46 -0500 Subject: [PATCH 1289/1640] BACKPORT: objtool: Add support for C jump tables Objtool doesn't know how to read C jump tables, so it has to whitelist functions which use them, causing missing ORC unwinder data for such functions, e.g. ___bpf_prog_run(). 
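For reference, a minimal standalone sketch of what "C jump table" means here (GCC's computed-goto extension; the two opcodes are invented, but the dispatch shape mirrors interpreters like ___bpf_prog_run()):

  static int run(const unsigned char *ops)
  {
          static const void * const jumptable[] = {
                  [0] = &&op_halt,
                  [1] = &&op_inc,
          };
          int acc = 0;

  select_op:
          goto *jumptable[*ops];  /* indirect jump through the table */
  op_inc:
          acc++;
          ops++;
          goto select_op;
  op_halt:
          return acc;
  }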
C jump tables are very similar to GCC switch jump tables, which objtool already knows how to read. So adding support for C jump tables is easy. It just needs to be able to find the tables and distinguish them from other data. To allow the jump tables to be found, create an __annotate_jump_table macro which can be used to annotate them. The annotation is done by placing the jump table in an .rodata..c_jump_table section. The '.rodata' prefix ensures that the data will be placed in the rodata section by the vmlinux linker script. The double periods are part of an existing convention which distinguishes kernel sections from GCC sections. Change-Id: Ibea915c9a925fe8ff4f6bacaba957496320ad8cf Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Song Liu Cc: Kairui Song Cc: Steven Rostedt Cc: Borislav Petkov Cc: Alexei Starovoitov Cc: Daniel Borkmann Link: https://lkml.kernel.org/r/0ba2ca30442b16b97165992381ce643dc27b3d1a.1561685471.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 5 +++++ tools/objtool/check.c | 27 ++++++++++++++++++++------- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 1b7a3a625b62..66269827085c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -110,9 +110,14 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, ".pushsection .discard.unreachable\n\t" \ ".long 999b - .\n\t" \ ".popsection\n\t" + +/* Annotate a C jump table to allow objtool to follow the code flow */ +#define __annotate_jump_table __section(".rodata..c_jump_table") + #else #define annotate_reachable() #define annotate_unreachable() +#define __annotate_jump_table #endif #ifndef ASM_UNREACHABLE diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e93c061654a7..7e5b62bd4b31 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -30,6 +30,8 @@ #define FAKE_JUMP_OFFSET -1 +#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table" + struct alternative { struct list_head list; struct instruction *insn; @@ -950,9 +952,15 @@ static struct rela *find_switch_table(struct objtool_file *file, /* * Make sure the .rodata address isn't associated with a - * symbol. gcc jump tables are anonymous data. + * symbol. GCC jump tables are anonymous data. + * + * Also support C jump tables which are in the same format as + * switch jump tables. For objtool to recognize them, they + * need to be placed in the C_JUMP_TABLE_SECTION section. They + * have symbols associated with them. */ - if (find_symbol_containing(rodata_sec, table_offset)) + if (find_symbol_containing(rodata_sec, table_offset) && + strcmp(rodata_sec->name, C_JUMP_TABLE_SECTION)) continue; rodata_rela = find_rela_by_dest(rodata_sec, table_offset); @@ -1191,13 +1199,18 @@ static void mark_rodata(struct objtool_file *file) bool found = false; /* - * This searches for the .rodata section or multiple .rodata.func_name - * sections if -fdata-sections is being used. The .str.1.1 and .str.1.8 - * rodata sections are ignored as they don't contain jump tables. + * Search for the following rodata sections, each of which can + * potentially contain jump tables: + * + * - .rodata: can contain GCC switch tables + * - .rodata.: same, if -fdata-sections is being used + * - .rodata..c_jump_table: contains C annotated jump tables + * + * .rodata.str1.* sections are ignored; they don't contain jump tables. 
*/ for_each_sec(file, sec) { - if (!strncmp(sec->name, ".rodata", 7) && - !strstr(sec->name, ".str1.")) { + if ((!strncmp(sec->name, ".rodata", 7) && !strstr(sec->name, ".str1.")) || + !strcmp(sec->name, C_JUMP_TABLE_SECTION)) { sec->rodata = true; found = true; } From 00af12234e48cfb48c473d394234e9f3aa2260df Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 27 Jun 2019 20:50:47 -0500 Subject: [PATCH 1290/1640] UPSTREAM: bpf: Fix ORC unwinding in non-JIT BPF code Objtool previously ignored ___bpf_prog_run() because it didn't understand the jump table. This resulted in the ORC unwinder not being able to unwind through non-JIT BPF code. Now that objtool knows how to read jump tables, remove the whitelist and annotate the jump table so objtool can recognize it. Also add an additional "const" to the jump table definition to clarify that the text pointers are constant. Otherwise GCC sets the section writable flag and the assembler spits out warnings. Fixes: d15d356887e7 ("perf/x86: Make perf callchains work without CONFIG_FRAME_POINTER") Reported-by: Song Liu Change-Id: I88cff2b89c9100b00dc8c502db1cf4576b187f04 Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Alexei Starovoitov Cc: Peter Zijlstra Cc: Kairui Song Cc: Steven Rostedt Cc: Borislav Petkov Cc: Daniel Borkmann Link: https://lkml.kernel.org/r/881939122b88f32be4c374d248c09d7527a87e35.1561685471.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/bpf/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 338e4be37d96..60fc98a9a2f4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1322,7 +1322,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z - static const void *jumptable[256] = { + static const void * const jumptable[256] __annotate_jump_table = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), @@ -1595,7 +1595,6 @@ out: BUG_ON(1); return 0; } -STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ #define PROG_NAME(stack_size) __bpf_prog_run##stack_size #define DEFINE_BPF_PROG_RUN(stack_size) \ From aa072bf0a70aa8dd1a2d3703c8dff81aac301ddd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 8 Jul 2019 20:32:44 -0700 Subject: [PATCH 1291/1640] UPSTREAM: bpf: fix precision bit propagation for BPF_ST instructions When backtracking instructions to propagate precision bit for registers and stack slots, one class of instructions (BPF_ST) weren't handled causing extra stack slots to be propagated into parent state. Parent state might not have that much stack allocated, though, which causes warning on invalid stack slot usage. 
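For reference, a hedged illustration built with the kernel's insn-building macros (slot and value arbitrary) of the instruction class that was being skipped; it leaves a stack slot to mark but has no scalar source register:

  /* *(u32 *)(r10 - 8) = 42 -- immediate store, no src_reg involved */
  struct bpf_insn st = BPF_ST_MEM(BPF_W, BPF_REG_10, -8, 42);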
This patch adds handling of BPF_ST instructions: BPF_MEM | | BPF_ST: *(size *) (dst_reg + off) = imm32 Reported-by: syzbot+4da3ff23081bafe74fc2@syzkaller.appspotmail.com Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Cc: Alexei Starovoitov Change-Id: I8efd781282da0a6339937a4553a6c4f60061aa24 Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 16ae98318c51..2a8ae46f8838 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1518,9 +1518,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, return -EFAULT; } *stack_mask |= 1ull << spi; - } else if (class == BPF_STX) { + } else if (class == BPF_STX || class == BPF_ST) { if (*reg_mask & dreg) - /* stx shouldn't be using _scalar_ dst_reg + /* stx & st shouldn't be using _scalar_ dst_reg * to access memory. It means backtracking * encountered a case of pointer subtraction. */ @@ -1539,7 +1539,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (!(*stack_mask & (1ull << spi))) return 0; *stack_mask &= ~(1ull << spi); - *reg_mask |= sreg; + if (class == BPF_STX) + *reg_mask |= sreg; } else if (class == BPF_JMP || class == BPF_JMP32) { if (opcode == BPF_CALL) { if (insn->src_reg == BPF_PSEUDO_CALL) @@ -1568,10 +1569,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (mode == BPF_IND || mode == BPF_ABS) /* to be analyzed */ return -ENOTSUPP; - } else if (class == BPF_ST) { - if (*reg_mask & dreg) - /* likely pointer subtraction */ - return -ENOTSUPP; } return 0; } From 034e47e4e860baedd224f9ba2331356d1bf42f2d Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 11 Jul 2019 11:22:33 -0500 Subject: [PATCH 1292/1640] UPSTREAM: bpf: verifier: avoid fall-through warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to enabling -Wimplicit-fallthrough, this patch silences the following warning: kernel/bpf/verifier.c: In function ‘check_return_code’: kernel/bpf/verifier.c:6106:6: warning: this statement may fall through [-Wimplicit-fallthrough=] if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || ^ kernel/bpf/verifier.c:6109:2: note: here case BPF_PROG_TYPE_CGROUP_SKB: ^~~~ Warning level 3 was used: -Wimplicit-fallthrough=3 Notice that is much clearer to explicitly add breaks in each case statement (that actually contains some code), rather than letting the code to fall through. This patch is part of the ongoing efforts to enable -Wimplicit-fallthrough. Change-Id: I68fa160099a44c7f77794c865ae17017770bc408 Signed-off-by: Gustavo A. R. 
Silva Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2a8ae46f8838..40f1f7cdef71 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6275,11 +6275,13 @@ static int check_return_code(struct bpf_verifier_env *env) if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) range = tnum_range(1, 1); + break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { range = tnum_range(0, 3); enforce_attach_type_range = tnum_range(2, 3); } + break; case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: From 223de9ccfbc1e3ba6e4f147522f8e65c86e8b49a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jul 2019 10:25:55 -0700 Subject: [PATCH 1293/1640] UPSTREAM: bpf: fix BTF verifier size resolution logic BTF verifier has a size resolution bug which in some circumstances leads to invalid size resolution for, e.g., TYPEDEF modifier. This happens if we have [1] PTR -> [2] TYPEDEF -> [3] ARRAY, in which case due to being in pointer context ARRAY size won't be resolved (because for pointer it doesn't matter, so it's a sink in pointer context), but it will be permanently remembered as zero for TYPEDEF and TYPEDEF will be marked as RESOLVED. Eventually ARRAY size will be resolved correctly, but TYPEDEF resolved_size won't be updated anymore. This, subsequently, will lead to erroneous map creation failure, if that TYPEDEF is specified as either key or value, as key_size/value_size won't correspond to resolved size of TYPEDEF (kernel will believe it's zero). Note, that if BTF was ordered as [1] ARRAY <- [2] TYPEDEF <- [3] PTR, this won't be a problem, as by the time we get to TYPEDEF, ARRAY's size is already calculated and stored. This bug manifests itself in rejecting BTF-defined maps that use array typedef as a value type: typedef int array_t[16]; struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(value, array_t); /* i.e., array_t *value; */ } test_map SEC(".maps"); The fix consists on not relying on modifier's resolved_size and instead using modifier's resolved_id (type ID for "concrete" type to which modifier eventually resolves) and doing size determination for that resolved type. This allow to preserve existing "early DFS termination" logic for PTR or STRUCT_OR_ARRAY contexts, but still do correct size determination for modifier types. 
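To make the ordering concrete, an illustrative BTF layout (type ids invented) where the old code latched a stale size:

  [1] PTR     -> type_id 2   /* pointer context: size is a don't-care    */
  [2] TYPEDEF -> type_id 3   /* resolved_size wrongly remembered as 0    */
  [3] ARRAY   of 16 x int    /* real size (64 bytes) resolved only later */

With the fix, the typedef's size is recomputed from its resolved_id (the array) whenever it is needed, instead of trusting the cached resolved_size.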
Fixes: eb3f595dab40 ("bpf: btf: Validate type reference") Cc: Martin KaFai Lau Change-Id: Ib56341417bc357482b7e71ea1749fed3b77d0c7a Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 58ee928c07c8..cef69ea85bf1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1073,11 +1073,18 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, !btf_type_is_var(size_type))) return NULL; - size = btf->resolved_sizes[size_type_id]; size_type_id = btf->resolved_ids[size_type_id]; size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; + else if (btf_type_has_size(size_type)) + size = size_type->size; + else if (btf_type_is_array(size_type)) + size = btf->resolved_sizes[size_type_id]; + else if (btf_type_is_ptr(size_type)) + size = sizeof(void *); + else + return NULL; } *type_id = size_type_id; @@ -1602,7 +1609,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, const struct btf_type *next_type; u32 next_type_id = t->type; struct btf *btf = env->btf; - u32 next_type_size = 0; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { @@ -1620,7 +1626,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, * save us a few type-following when we use it later (e.g. in * pretty print). */ - if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + if (!btf_type_id_size(btf, &next_type_id, NULL)) { if (env_type_is_resolved(env, next_type_id)) next_type = btf_type_id_resolve(btf, &next_type_id); @@ -1633,7 +1639,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, } } - env_stack_pop_resolved(env, next_type_id, next_type_size); + env_stack_pop_resolved(env, next_type_id, 0); return 0; } @@ -1645,7 +1651,6 @@ static int btf_var_resolve(struct btf_verifier_env *env, const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; - u32 next_type_size; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { @@ -1675,12 +1680,12 @@ static int btf_var_resolve(struct btf_verifier_env *env, * forward types or similar that would resolve to size of * zero is allowed. */ - if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + if (!btf_type_id_size(btf, &next_type_id, NULL)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } - env_stack_pop_resolved(env, next_type_id, next_type_size); + env_stack_pop_resolved(env, next_type_id, 0); return 0; } From 85c9d7b82ca0f8c34473d65e5a9f4c0f9526597d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:45 -0500 Subject: [PATCH 1294/1640] BACKPORT: bpf: Disable GCC -fgcse optimization for ___bpf_prog_run() On x86-64, with CONFIG_RETPOLINE=n, GCC's "global common subexpression elimination" optimization results in ___bpf_prog_run()'s jumptable code changing from this: select_insn: jmp *jumptable(, %rax, 8) ... ALU64_ADD_X: ... jmp *jumptable(, %rax, 8) ALU_ADD_X: ... jmp *jumptable(, %rax, 8) to this: select_insn: mov jumptable, %r12 jmp *(%r12, %rax, 8) ... ALU64_ADD_X: ... jmp *(%r12, %rax, 8) ALU_ADD_X: ... jmp *(%r12, %rax, 8) The jumptable address is placed in a register once, at the beginning of the function. 
The function execution can then go through multiple indirect jumps which rely on that same register value. This has a few issues: 1) Objtool isn't smart enough to be able to track such a register value across multiple recursive indirect jumps through the jump table. 2) With CONFIG_RETPOLINE enabled, this optimization actually results in a small slowdown. I measured a ~4.7% slowdown in the test_bpf "tcpdump port 22" selftest. This slowdown is actually predicted by the GCC manual: Note: When compiling a program using computed gotos, a GCC extension, you may get better run-time performance if you disable the global common subexpression elimination pass by adding -fno-gcse to the command line. So just disable the optimization for this function. Fixes: e55a73251da3 ("bpf: Fix ORC unwinding in non-JIT BPF code") Reported-by: Randy Dunlap Change-Id: I71e856ff945796a6340df44ec48cd519a62bc5ac Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/30c3ca29ba037afcbd860a8672eef0021addf9fe.1563413318.git.jpoimboe@redhat.com --- include/linux/compiler-gcc.h | 2 ++ include/linux/compiler_types.h | 4 ++++ kernel/bpf/core.c | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index c9303b82a5f3..f5ec66a7b516 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -378,3 +378,5 @@ #if GCC_VERSION >= 50100 #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif + +#define __no_fgcse __attribute__((optimize("-fno-gcse"))) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 226f4ea0e57c..b31c17302737 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -288,6 +288,10 @@ struct ftrace_likely_data { #define __assume_aligned(a, ...) #endif +#ifndef __no_fgcse +# define __no_fgcse +#endif + /* Are two types/vars the same type (ignoring qualifiers)? */ #ifndef __same_type diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 60fc98a9a2f4..a66464954649 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1318,7 +1318,7 @@ bool bpf_opcode_in_insntable(u8 code) * * Decode and execute eBPF instructions. */ -static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z From d7943d8b5df585b8dd13cdee0d825cd1709f3548 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 1 Jul 2019 10:38:39 -0700 Subject: [PATCH 1295/1640] UPSTREAM: bpf: allow wide (u64) aligned stores for some fields of bpf_sock_addr Since commit cd17d7770578 ("bpf/tools: sync bpf.h") clang decided that it can do a single u64 store into user_ip6[2] instead of two separate u32 ones: # 17: (18) r2 = 0x100000000000000 # ; ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_2); # 19: (7b) *(u64 *)(r1 +16) = r2 # invalid bpf_context access off=16 size=8 >From the compiler point of view it does look like a correct thing to do, so let's support it on the kernel side. Credit to Andrii Nakryiko for a proper implementation of bpf_ctx_wide_store_ok. 
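For context, an illustrative source-level fragment of the kind of program behind the quoted assembly (section name and addresses invented; bpf_endian.h assumed); clang may legally merge the two adjacent 32-bit stores into the single aligned 64-bit store at offset 16:

  SEC("cgroup/connect6")
  int set_dst(struct bpf_sock_addr *ctx)
  {
          /* may be emitted as one "*(u64 *)(r1 +16) = ..." */
          ctx->user_ip6[2] = bpf_htonl(0);
          ctx->user_ip6[3] = bpf_htonl(1);
          return 1;
  }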
Cc: Andrii Nakryiko Cc: Yonghong Song Fixes: cd17d7770578 ("bpf/tools: sync bpf.h") Reported-by: kernel test robot Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Change-Id: I60bf910a9f10a870028e3a8484f8c6638aa62e43 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 6 ++++++ include/uapi/linux/bpf.h | 6 +++--- net/core/filter.c | 22 ++++++++++++++-------- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 279b24b22b96..d824f1a49fba 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -834,6 +834,12 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) return size <= size_default && (size & (size - 1)) == 0; } +#define bpf_ctx_wide_store_ok(off, size, type, field) \ + (size == sizeof(__u64) && \ + off >= offsetof(type, field) && \ + off + sizeof(__u64) <= offsetofend(type, field) && \ + off % sizeof(__u64) == 0) + #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) static inline void bpf_prog_lock_ro(struct bpf_prog *fp) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a76515e7c886..76c6c8f36329 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3089,7 +3089,7 @@ struct bpf_sock_addr { __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 user_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. + __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. * Stored in network byte order. */ __u32 user_port; /* Allows 4-byte read and write. @@ -3098,10 +3098,10 @@ struct bpf_sock_addr { __u32 family; /* Allows 4-byte read, but no write */ __u32 type; /* Allows 4-byte read, but no write */ __u32 protocol; /* Allows 4-byte read, but no write */ - __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write. + __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. + __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. * Stored in network byte order. */ }; diff --git a/net/core/filter.c b/net/core/filter.c index 76fe0a507e5f..c97ddb30119c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6309,6 +6309,16 @@ static bool sock_addr_is_valid_access(int off, int size, if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { + if (bpf_ctx_wide_store_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_store_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + if (size != size_default) return false; } @@ -7046,9 +7056,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. * - * It doesn't support SIZE argument though since narrow stores are not - * supported for now. - * * In addition it uses Temporary Field TF (member of struct S) as the 3rd * "register" since two registers available in convert_ctx_access are not * enough: we can't override neither SRC, since it contains value to store, nor @@ -7056,7 +7063,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, * instructions. But we need a temporary place to save pointer to nested * structure whose field we want to store to. 
*/ -#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \ +#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ do { \ int tmp_reg = BPF_REG_9; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ @@ -7067,8 +7074,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(S, TF)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ - *insn++ = BPF_STX_MEM( \ - BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \ + *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg, \ bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ target_size) \ + OFF); \ @@ -7080,8 +7086,8 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, TF) \ do { \ if (type == BPF_WRITE) { \ - SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \ - TF); \ + SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ + OFF, TF); \ } else { \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ S, NS, F, NF, SIZE, OFF); \ From 2463a60ed29c5e61e7a099c25ce0c52f60842d27 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Jul 2019 09:39:52 -0700 Subject: [PATCH 1296/1640] UPSTREAM: bpf: rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok Rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok to indicate that it can be used for both loads and stores. Cc: Yonghong Song Change-Id: I82b75e458dd073c2d1d5b1c42ac836f19560f5d1 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 2 +- net/core/filter.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index d824f1a49fba..441b921e559a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -834,7 +834,7 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) return size <= size_default && (size & (size - 1)) == 0; } -#define bpf_ctx_wide_store_ok(off, size, type, field) \ +#define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ off + sizeof(__u64) <= offsetofend(type, field) && \ diff --git a/net/core/filter.c b/net/core/filter.c index c97ddb30119c..ed56a8a065cf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6309,14 +6309,14 @@ static bool sock_addr_is_valid_access(int off, int size, if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { - if (bpf_ctx_wide_store_ok(off, size, - struct bpf_sock_addr, - user_ip6)) + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) return true; - if (bpf_ctx_wide_store_ok(off, size, - struct bpf_sock_addr, - msg_src_ip6)) + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) return true; if (size != size_default) From f6de037044907930827803704a455744ded2f6e5 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 19 Jul 2019 11:18:15 +0200 Subject: [PATCH 1297/1640] UPSTREAM: bpf: fix narrower loads on s390 The very first check in test_pkt_md_access is failing on s390, which happens because loading a part of a struct __sk_buff field produces an incorrect result. 
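A standalone userspace illustration (not from the patch) of why a partial load is endian-sensitive:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint32_t len = 0x11223344;
          /* narrow 1-byte load of the field's last byte */
          uint8_t byte3 = ((uint8_t *)&len)[3];

          /* equals (len >> 24) & 0xff on little-endian, but
           * (len >> 0) & 0xff on big-endian -- so the verifier's
           * replacement shift must depend on byte order
           */
          printf("byte3=0x%02x len>>24=0x%02x\n",
                 (unsigned)byte3, (unsigned)(len >> 24) & 0xff);
          return 0;
  }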
The preprocessed code of the check is: { __u8 tmp = *((volatile __u8 *)&skb->len + ((sizeof(skb->len) - sizeof(__u8)) / sizeof(__u8))); if (tmp != ((*(volatile __u32 *)&skb->len) & 0xFF)) return 2; }; clang generates the following code for it: 0: 71 21 00 03 00 00 00 00 r2 = *(u8 *)(r1 + 3) 1: 61 31 00 00 00 00 00 00 r3 = *(u32 *)(r1 + 0) 2: 57 30 00 00 00 00 00 ff r3 &= 255 3: 5d 23 00 1d 00 00 00 00 if r2 != r3 goto +29 Finally, verifier transforms it to: 0: (61) r2 = *(u32 *)(r1 +104) 1: (bc) w2 = w2 2: (74) w2 >>= 24 3: (bc) w2 = w2 4: (54) w2 &= 255 5: (bc) w2 = w2 The problem is that when verifier emits the code to replace a partial load of a struct __sk_buff field (*(u8 *)(r1 + 3)) with a full load of struct sk_buff field (*(u32 *)(r1 + 104)), an optional shift and a bitwise AND, it assumes that the machine is little endian and incorrectly decides to use a shift. Adjust shift count calculation to account for endianness. Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") Change-Id: I505ad06e8c3d57985b3a9a7d1f1570c33032605f Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 13 +++++++++++++ kernel/bpf/verifier.c | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 441b921e559a..477b6dab98ba 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -24,6 +24,7 @@ #include +#include #include #include @@ -834,6 +835,18 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) return size <= size_default && (size & (size - 1)) == 0; } +static inline u8 +bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default) +{ + u8 load_off = off & (size_default - 1); + +#ifdef __LITTLE_ENDIAN + return load_off * 8; +#else + return (size_default - (load_off + size)) * 8; +#endif +} + #define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 40f1f7cdef71..fe7fdbc225a8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8776,8 +8776,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { - u8 shift = (off & (size_default - 1)) * 8; - + u8 shift = bpf_ctx_narrow_load_shift(off, size, + size_default); if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, From b00a78c461561c58eca3929b77b8a7d8e60c2251 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 26 Jul 2019 18:06:53 +0200 Subject: [PATCH 1298/1640] UPSTREAM: xdp: Refactor devmap allocation code for reuse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The subsequent patch to add a new devmap sub-type can re-use much of the initialisation and allocation code, so refactor it into separate functions. 
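A simplified sketch of the resulting split (mirroring the hunks below; error paths abbreviated):

  /* attribute checks, memlock charging, flush_list and netdev_map
   * allocation -- the part a new sub-type can reuse
   */
  static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr);

  static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
  {
          struct bpf_dtab *dtab = kzalloc(sizeof(*dtab), GFP_USER);
          int err;

          if (!dtab)
                  return ERR_PTR(-ENOMEM);

          err = dev_map_init_map(dtab, attr);
          if (err) {
                  kfree(dtab);
                  return ERR_PTR(err);
          }

          /* only the top-level devmap type joins dev_map_list */
          return &dtab->map;
  }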
Change-Id: I533be13b02a1102240825d1bd029ac1fda5e0d73 Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 136 +++++++++++++++++++++++++++----------------- 1 file changed, 83 insertions(+), 53 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d83cf8ccc872..a0501266bdb8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -60,9 +60,9 @@ struct xdp_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; - unsigned int bit; struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; + unsigned int idx; /* keep track of map index for tracepoint */ }; struct bpf_dtab { @@ -75,28 +75,21 @@ struct bpf_dtab { static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - struct bpf_dtab *dtab; int err, cpu; u64 cost; - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); + return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ attr->map_flags |= BPF_F_RDONLY_PROG; - dtab = kzalloc(sizeof(*dtab), GFP_USER); - if (!dtab) - return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&dtab->map, attr); @@ -107,9 +100,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) - goto free_dtab; - - err = -ENOMEM; + return -EINVAL; dtab->flush_list = alloc_percpu(struct list_head); if (!dtab->flush_list) @@ -124,19 +115,38 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab->netdev_map) goto free_percpu; - spin_lock(&dev_map_lock); - list_add_tail_rcu(&dtab->list, &dev_map_list); - spin_unlock(&dev_map_lock); - - return &dtab->map; + return 0; free_percpu: free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); -free_dtab: - kfree(dtab); - return ERR_PTR(err); + return -ENOMEM; +} + +static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +{ + struct bpf_dtab *dtab; + int err; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + dtab = kzalloc(sizeof(*dtab), GFP_USER); + if (!dtab) + return ERR_PTR(-ENOMEM); + + err = dev_map_init_map(dtab, attr); + if (err) { + kfree(dtab); + return ERR_PTR(err); + } + + spin_lock(&dev_map_lock); + list_add_tail_rcu(&dtab->list, &dev_map_list); + spin_unlock(&dev_map_lock); + + return &dtab->map; } static void dev_map_free(struct bpf_map *map) @@ -235,7 +245,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, out: bq->count = 0; - trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, + trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); @@ -412,17 +422,52 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, + struct bpf_dtab *dtab, + u32 ifindex, + unsigned int idx) +{ + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 
+ struct bpf_dtab_netdev *dev; + struct xdp_bulk_queue *bq; + int cpu; + + dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); + if (!dev) + return ERR_PTR(-ENOMEM); + + dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), + sizeof(void *), gfp); + if (!dev->bulkq) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + + dev->dev = dev_get_by_index(net, ifindex); + if (!dev->dev) { + free_percpu(dev->bulkq); + kfree(dev); + return ERR_PTR(-EINVAL); + } + + dev->idx = idx; + dev->dtab = dtab; + + return dev; +} + +static int __dev_map_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct net *net = current->nsproxy->net_ns; - gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; u32 ifindex = *(u32 *)value; - struct xdp_bulk_queue *bq; u32 i = *(u32 *)key; - int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -434,31 +479,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); - if (!dev) - return -ENOMEM; - - dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), - sizeof(void *), gfp); - if (!dev->bulkq) { - kfree(dev); - return -ENOMEM; - } - - for_each_possible_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq->obj = dev; - } - - dev->dev = dev_get_by_index(net, ifindex); - if (!dev->dev) { - free_percpu(dev->bulkq); - kfree(dev); - return -EINVAL; - } - - dev->bit = i; - dev->dtab = dtab; + dev = __dev_map_alloc_node(net, dtab, ifindex, i); + if (IS_ERR(dev)) + return PTR_ERR(dev); } /* Use call_rcu() here to ensure rcu critical sections have completed @@ -472,6 +495,13 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } +static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __dev_map_update_elem(current->nsproxy->net_ns, + map, key, value, map_flags); +} + const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, From 9ad71092389b24f978fc861f677da58339a4b0b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 26 Jul 2019 18:06:52 +0200 Subject: [PATCH 1299/1640] UPSTREAM: include/bpf.h: Remove map_insert_ctx() stubs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we changed the device and CPU maps to use linked lists instead of bitmaps, we also removed the need for the map_insert_ctx() helpers to keep track of the bitmaps inside each map. However, it seems I forgot to remove the function definitions stubs, so remove those here. 
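For background, a hedged sketch (condensed from the earlier devmap/cpumap patches in this series; helper names shortened) of the list-based bookkeeping that made the insert_ctx() step unnecessary:

  static int bq_enqueue(struct xdp_bulk_queue *bq,
                        struct list_head *flush_list,
                        struct xdp_frame *xdpf)
  {
          if (bq->count == BULK_SIZE)
                  bq_flush(bq);   /* drains q[] and clears flush_node.prev */

          bq->q[bq->count++] = xdpf;

          /* first frame since the last flush: the bq queues itself on
           * the per-cpu flush list, so no separate insert_ctx() call
           * is needed
           */
          if (!bq->flush_node.prev)
                  list_add(&bq->flush_node, flush_list);

          return 0;
  }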
Change-Id: I2ed6b2a2583472a52da1056b9a7c1aeec7cba88f Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5c2a2fad90e8..90f2384fd721 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -726,7 +726,6 @@ struct xdp_buff; struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); -void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -734,7 +733,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); -void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); void __cpu_map_flush(struct bpf_map *map); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -819,10 +817,6 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, return NULL; } -static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index) -{ -} - static inline void __dev_map_flush(struct bpf_map *map) { } @@ -852,10 +846,6 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) return NULL; } -static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index) -{ -} - static inline void __cpu_map_flush(struct bpf_map *map) { } From 585a13c68974a5fefeb89e52bf5845588e6e67c8 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 3 Sep 2018 09:55:02 +0200 Subject: [PATCH 1300/1640] UPSTREAM: xdp: explicit inline __xdp_map_lookup_elem The compiler chooses to not-inline the function __xdp_map_lookup_elem, because it can see that it is used by both Generic-XDP and native-XDP do redirect calls (xdp_do_generic_redirect_map and xdp_do_redirect_map). The compiler cannot know that this is a bad choice, as it cannot know that a net device cannot run both XDP modes (Generic or Native) at the same time. Thus, mark this function inline, even-though we normally leave this up-to the compiler. 
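A standalone sketch of the pattern (names invented): a static helper with two callers that are never both active for the same device, where "inline" pushes GCC to duplicate the body into each caller instead of emitting a shared out-of-line copy:

  static inline int lookup(const int *tbl, unsigned int i)
  {
          return tbl[i];          /* duplicated into each caller */
  }

  int path_native(const int *tbl)  { return lookup(tbl, 0); }
  int path_generic(const int *tbl) { return lookup(tbl, 1); }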
Change-Id: Ifb79c59dac3d7eada01e26378811ff513a3c2ea3 Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index ed56a8a065cf..0c2a4d439454 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3203,7 +3203,7 @@ void xdp_do_flush_map(void) } EXPORT_SYMBOL_GPL(xdp_do_flush_map); -static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) +static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) { switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: @@ -3246,7 +3246,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, WRITE_ONCE(ri->map, NULL); fwd = __xdp_map_lookup_elem(map, index); - if (!fwd) { + if (unlikely(!fwd)) { err = -EINVAL; goto err; } @@ -3274,7 +3274,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, u32 index = ri->ifindex; int err; - if (map) + if (likely(map)) return xdp_do_redirect_map(dev, xdp, xdp_prog, map); fwd = dev_get_by_index_rcu(dev_net(dev), index); From 2468986fb5ebdaaea3118b2629fb983ce6c1fc0d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 3 Sep 2018 09:55:07 +0200 Subject: [PATCH 1301/1640] UPSTREAM: xdp: split code for map vs non-map redirect The compiler does an efficient job of inlining static C functions. Perf top clearly shows that almost everything gets inlined into the function call xdp_do_redirect. The function xdp_do_redirect end-up containing and interleaving the map and non-map redirect code. This is sub-optimal, as it would be strange for an XDP program to use both types of redirect in the same program. The two use-cases are separate, and interleaving the code just cause more instruction-cache pressure. I would like to stress (again) that the non-map variant bpf_redirect is very slow compared to the bpf_redirect_map variant, approx half the speed. Measured with driver i40e the difference is: - map redirect: 13,250,350 pps - non-map redirect: 7,491,425 pps For this reason, the function name of the non-map variant of redirect have been called xdp_do_redirect_slow. This hopefully gives a hint when using perf, that this is not the optimal XDP redirect operating mode. 
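For orientation, illustrative BPF-side triggers for the two paths (map definition and ifindex invented; bpf_helpers.h assumed):

  struct bpf_map_def SEC("maps") tx_map = {
          .type        = BPF_MAP_TYPE_DEVMAP,
          .key_size    = sizeof(__u32),
          .value_size  = sizeof(__u32),
          .max_entries = 4,
  };

  SEC("xdp")
  int redirect_fast(struct xdp_md *ctx)
  {
          return bpf_redirect_map(&tx_map, 0, 0); /* -> xdp_do_redirect_map */
  }

  SEC("xdp")
  int redirect_slow(struct xdp_md *ctx)
  {
          return bpf_redirect(3, 0);              /* -> xdp_do_redirect_slow */
  }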
Change-Id: I7643d1a09acd6e276280d2e3e110d6d8d798e4f7 Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 52 +++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 0c2a4d439454..93ce8a418d63 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3143,6 +3143,32 @@ static int __bpf_tx_xdp(struct net_device *dev, return 0; } +static noinline int +xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri) +{ + struct net_device *fwd; + u32 index = ri->ifindex; + int err; + + fwd = dev_get_by_index_rcu(dev_net(dev), index); + ri->ifindex = 0; + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = __bpf_tx_xdp(fwd, NULL, xdp, 0); + if (unlikely(err)) + goto err; + + _trace_xdp_redirect(dev, xdp_prog, index); + return 0; +err: + _trace_xdp_redirect_err(dev, xdp_prog, index, err); + return err; +} + static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_map *map, struct xdp_buff *xdp, @@ -3235,9 +3261,9 @@ void bpf_clear_redirect_map(struct bpf_map *map) } static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct bpf_map *map) + struct bpf_prog *xdp_prog, struct bpf_map *map, + struct bpf_redirect_info *ri) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->ifindex; void *fwd = NULL; int err; @@ -3270,29 +3296,11 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = READ_ONCE(ri->map); - struct net_device *fwd; - u32 index = ri->ifindex; - int err; if (likely(map)) - return xdp_do_redirect_map(dev, xdp, xdp_prog, map); + return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri); - fwd = dev_get_by_index_rcu(dev_net(dev), index); - ri->ifindex = 0; - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } - - err = __bpf_tx_xdp(fwd, NULL, xdp, 0); - if (unlikely(err)) - goto err; - - _trace_xdp_redirect(dev, xdp_prog, index); - return 0; -err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); - return err; + return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri); } EXPORT_SYMBOL_GPL(xdp_do_redirect); From 8834a27dfa9d523837a146559e95a335b36ef23c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: [PATCH 1302/1640] UPSTREAM: devmap: Rename ifindex member in bpf_redirect_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_redirect_info struct has an 'ifindex' member which was named back when the redirects could only target egress interfaces. Now that we can also redirect to sockets and CPUs, this is a bit misleading, so rename the member to tgt_index. Reorder the struct members so we can have 'tgt_index' and 'tgt_value' next to each other in a subsequent patch. 
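The resulting layout, taken from the filter.h hunk below, with room for 'tgt_value' to sit beside the renamed member in the follow-up:

  struct bpf_redirect_info {
          u32 flags;
          u32 tgt_index;                  /* was 'ifindex' */
          struct bpf_map *map;
          struct bpf_map *map_to_flush;
          u32 kern_flags;
  };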
Change-Id: I8d7b89a02c8dc2934cfce3437cff462d4a46d33e Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 2 +- net/core/filter.c | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 477b6dab98ba..40df1216b8fa 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -667,8 +667,8 @@ struct bpf_skb_data_end { }; struct bpf_redirect_info { - u32 ifindex; u32 flags; + u32 tgt_index; struct bpf_map *map; struct bpf_map *map_to_flush; u32 kern_flags; diff --git a/net/core/filter.c b/net/core/filter.c index 93ce8a418d63..8e7dc74879fe 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2136,8 +2136,8 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) if (unlikely(flags & ~(BPF_F_INGRESS))) return TC_ACT_SHOT; - ri->ifindex = ifindex; ri->flags = flags; + ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } @@ -2147,8 +2147,8 @@ int skb_do_redirect(struct sk_buff *skb) struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; - dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); - ri->ifindex = 0; + dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); + ri->tgt_index = 0; if (unlikely(!dev)) { kfree_skb(skb); return -EINVAL; @@ -3148,11 +3148,11 @@ xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri) { struct net_device *fwd; - u32 index = ri->ifindex; + u32 index = ri->tgt_index; int err; fwd = dev_get_by_index_rcu(dev_net(dev), index); - ri->ifindex = 0; + ri->tgt_index = 0; if (unlikely(!fwd)) { err = -EINVAL; goto err; @@ -3264,11 +3264,11 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, struct bpf_map *map, struct bpf_redirect_info *ri) { - u32 index = ri->ifindex; + u32 index = ri->tgt_index; void *fwd = NULL; int err; - ri->ifindex = 0; + ri->tgt_index = 0; WRITE_ONCE(ri->map, NULL); fwd = __xdp_map_lookup_elem(map, index); @@ -3311,11 +3311,11 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - u32 index = ri->ifindex; + u32 index = ri->tgt_index; void *fwd = NULL; int err = 0; - ri->ifindex = 0; + ri->tgt_index = 0; WRITE_ONCE(ri->map, NULL); fwd = __xdp_map_lookup_elem(map, index); @@ -3355,14 +3355,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->ifindex; + u32 index = ri->tgt_index; struct net_device *fwd; int err = 0; if (map) return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, map); - ri->ifindex = 0; + ri->tgt_index = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); if (unlikely(!fwd)) { err = -EINVAL; @@ -3390,8 +3390,8 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) if (unlikely(flags)) return XDP_ABORTED; - ri->ifindex = ifindex; ri->flags = flags; + ri->tgt_index = ifindex; WRITE_ONCE(ri->map, NULL); return XDP_REDIRECT; @@ -3413,8 +3413,8 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, if (unlikely(flags)) return XDP_ABORTED; - ri->ifindex = ifindex; ri->flags = flags; + ri->tgt_index = ifindex; WRITE_ONCE(ri->map, map); return XDP_REDIRECT; From b512dc33ebcae4472a1a50f4e3d472d16396f27c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: 
Mon, 3 Sep 2018 09:54:57 +0200 Subject: [PATCH 1303/1640] UPSTREAM: xdp: unlikely instrumentation for xdp map redirect The compiler-generated ASM code layout was suboptimal. It assumed map enqueue errors were the likely case, which they shouldn't be. It assumed that xdp_do_flush_map() was a likely case, due to maps changing between packets, which should be very unlikely. Change-Id: Id4d1f724dfceabb5288efa82f81ab54d85fb371b Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 8e7dc74879fe..6921f9bc9a84 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3181,7 +3181,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_dtab_netdev *dst = fwd; err = dev_map_enqueue(dst, xdp, dev_rx); - if (err) + if (unlikely(err)) return err; break; } @@ -3189,7 +3189,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_cpu_map_entry *rcpu = fwd; err = cpu_map_enqueue(rcpu, xdp, dev_rx); - if (err) + if (unlikely(err)) return err; break; } @@ -3276,7 +3276,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, err = -EINVAL; goto err; } - if (ri->map_to_flush && ri->map_to_flush != map) + if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) xdp_do_flush_map(); err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); From 5f26812983a0489bfb38c598faa8e50ea66b6b5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: [PATCH 1304/1640] UPSTREAM: bpf_xdp_redirect_map: Perform map lookup in eBPF helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_redirect_map() helper used by XDP programs doesn't return any indication of whether it can successfully redirect to the map index it was given. Instead, BPF programs have to track this themselves, leading to programs using duplicate maps to track which entries are populated in the devmap. This patch fixes this by moving the map lookup into the bpf_redirect_map() helper, which makes it possible to return failure to the eBPF program. The lower bits of the flags argument are used as the return code, which means that existing users who pass a '0' flag argument will get XDP_ABORTED. With this, a BPF program can check the return code from the helper call and react by, for instance, substituting a different redirect. This works for any type of map used for redirect.
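For example, an XDP program can now choose its own fallback action for unpopulated indexes. A hedged sketch (the map name and the bpf_map_def/SEC conventions are illustrative for this kernel era; header paths depend on the libbpf installation):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct bpf_map_def SEC("maps") tx_port = {
	.type        = BPF_MAP_TYPE_DEVMAP,
	.key_size    = sizeof(__u32),
	.value_size  = sizeof(__u32),
	.max_entries = 64,
};

SEC("xdp")
int xdp_redirect_or_pass(struct xdp_md *ctx)
{
	__u32 key = 0;	/* hypothetical egress slot */

	/* If key 0 is not populated, the helper now returns the lower
	 * bits of the flags argument, here XDP_PASS, so the packet
	 * continues up the stack instead of being aborted.
	 */
	return bpf_redirect_map(&tx_port, key, XDP_PASS);
}

char _license[] SEC("license") = "GPL";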
Change-Id: I5392fcb48de313d573b8d23e3c9f8e3a07bed5f1 Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/trace/events/xdp.h | 5 ++--- include/uapi/linux/bpf.h | 7 +++++-- net/core/filter.c | 32 ++++++++++++++++++-------------- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 40df1216b8fa..d822a1884e30 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -669,6 +669,7 @@ struct bpf_skb_data_end { struct bpf_redirect_info { u32 flags; u32 tgt_index; + void *tgt_value; struct bpf_map *map; struct bpf_map *map_to_flush; u32 kern_flags; diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 01389b9c3c6a..b28f6a2958d5 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -175,9 +175,8 @@ struct _bpf_dtab_netdev { #endif /* __DEVMAP_OBJ_TYPE */ #define devmap_ifindex(fwd, map) \ - (!fwd ? 0 : \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ - ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)) + ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 76c6c8f36329..4c78e2daf43d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1590,8 +1590,11 @@ union bpf_attr { * but this is only implemented for native XDP (with driver * support) as of this writing). * - * All values for *flags* are reserved for future usage, and must - * be left at zero. + * The lower two bits of *flags* are used as the return code if + * the map lookup fails. This is so that the return value can be + * one of the XDP program return codes up to XDP_TX, as chosen by + * the caller. Any higher bits in the *flags* argument must be + * unset. * * When used to redirect packets to net devices, this helper * provides a high performance increase over **bpf_redirect**\ (). 
diff --git a/net/core/filter.c b/net/core/filter.c index 6921f9bc9a84..4267f2bc5f88 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3265,17 +3265,13 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_redirect_info *ri) { u32 index = ri->tgt_index; - void *fwd = NULL; + void *fwd = ri->tgt_value; int err; ri->tgt_index = 0; + ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - fwd = __xdp_map_lookup_elem(map, index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) xdp_do_flush_map(); @@ -3312,18 +3308,13 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->tgt_index; - void *fwd = NULL; + void *fwd = ri->tgt_value; int err = 0; ri->tgt_index = 0; + ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - fwd = __xdp_map_lookup_elem(map, index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { struct bpf_dtab_netdev *dst = fwd; @@ -3392,6 +3383,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->flags = flags; ri->tgt_index = ifindex; + ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); return XDP_REDIRECT; @@ -3410,9 +3402,21 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - if (unlikely(flags)) + /* Lower bits of the flags are used as return code on lookup failure */ + if (unlikely(flags > XDP_TX)) return XDP_ABORTED; + ri->tgt_value = __xdp_map_lookup_elem(map, ifindex); + if (unlikely(!ri->tgt_value)) { + /* If the lookup fails we want to clear out the state in the + * redirect_info struct completely, so that if an eBPF program + * performs multiple lookups, the last one always takes + * precedence. + */ + WRITE_ONCE(ri->map, NULL); + return flags; + } + ri->flags = flags; ri->tgt_index = ifindex; WRITE_ONCE(ri->map, map); From 6f3bf2ccbe9f73e78c77ba22b5e3213cdaed131e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 26 Jul 2019 18:06:55 +0200 Subject: [PATCH 1305/1640] UPSTREAM: xdp: Add devmap_hash map type for looking up devices by hashed index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A common pattern when using xdp_redirect_map() is to create a device map where the lookup key is simply ifindex. Because device maps are arrays, this leaves holes in the map, and the map has to be sized to fit the largest ifindex, regardless of how many devices are actually needed in the map. This patch adds a second type of device map where the key is looked up using a hashmap, instead of being used as an array index. This allows maps to be densely packed, so they can be smaller.
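With the hash variant, the pattern described above becomes cheap: the program can use the ifindex itself as the key without sizing the map to the largest ifindex on the system. A short sketch reusing the conventions of the earlier example (names illustrative):

struct bpf_map_def SEC("maps") tx_by_ifindex = {
	.type        = BPF_MAP_TYPE_DEVMAP_HASH,
	.key_size    = sizeof(__u32),	/* ifindex */
	.value_size  = sizeof(__u32),	/* target device's ifindex */
	.max_entries = 8,	/* 8 devices, whatever their ifindex values */
};

SEC("xdp")
int xdp_redirect_hash(struct xdp_md *ctx)
{
	__u32 ifindex = 42;	/* hypothetical target */

	return bpf_redirect_map(&tx_by_ifindex, ifindex, 0);
}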
Change-Id: Ibf2e0d0722ca22b53349df8019115ac1c263a286 Signed-off-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++ include/linux/bpf_types.h | 1 + include/trace/events/xdp.h | 3 +- include/uapi/linux/bpf.h | 1 + kernel/bpf/devmap.c | 200 +++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 2 + net/core/filter.c | 9 +- 7 files changed, 220 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 90f2384fd721..39d0c0f923cb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -726,6 +726,7 @@ struct xdp_buff; struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_flush(struct bpf_map *map); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -817,6 +818,12 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, return NULL; } +static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, + u32 key) +{ + return NULL; +} + static inline void __dev_map_flush(struct bpf_map *map) { } diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index ced497a618c4..8ec5b18c401f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -60,6 +60,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) #if defined(CONFIG_BPF_STREAM_PARSER) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b28f6a2958d5..5233814e506f 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -175,7 +175,8 @@ struct _bpf_dtab_netdev { #endif /* __DEVMAP_OBJ_TYPE */ #define devmap_ifindex(fwd, map) \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((map->map_type == BPF_MAP_TYPE_DEVMAP || \ + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ? \ ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4c78e2daf43d..4a4ab5e0b46a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -134,6 +134,7 @@ enum bpf_map_type { BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_SK_STORAGE, + BPF_MAP_TYPE_DEVMAP_HASH, }; /* Note that tracing related programs such as diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a0501266bdb8..9af048a932b5 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -37,6 +37,12 @@ * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. + * + * The devmap_hash type is a map type which interprets keys as ifindexes and + * indexes these using a hashmap. This allows maps that use ifindex as key to be + * densely packed instead of having holes in the lookup array for unused + * ifindexes. The setup and packet enqueue/send code is shared between the two + * types of devmap; only the lookup and insertion is different. 
*/ #include #include @@ -59,6 +65,7 @@ struct xdp_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ + struct hlist_node index_hlist; struct bpf_dtab *dtab; struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; @@ -70,11 +77,30 @@ struct bpf_dtab { struct bpf_dtab_netdev **netdev_map; struct list_head __percpu *flush_list; struct list_head list; + + /* these are only used for DEVMAP_HASH type maps */ + struct hlist_head *dev_index_head; + spinlock_t index_lock; + unsigned int items; + u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); +static struct hlist_head *dev_map_create_hash(unsigned int entries) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < entries; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { int err, cpu; @@ -97,6 +123,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); cost += sizeof(struct list_head) * num_possible_cpus(); + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); + + if (!dtab->n_buckets) /* Overflow check */ + return -EINVAL; + cost += sizeof(struct hlist_head) * dtab->n_buckets; + } + /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) @@ -115,8 +149,18 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->netdev_map) goto free_percpu; + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + if (!dtab->dev_index_head) + goto free_map_area; + + spin_lock_init(&dtab->index_lock); + } + return 0; +free_map_area: + bpf_map_area_free(dtab->netdev_map); free_percpu: free_percpu(dtab->flush_list); free_charge: @@ -198,6 +242,7 @@ static void dev_map_free(struct bpf_map *map) free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); + kfree(dtab->dev_index_head); kfree(dtab); } @@ -218,6 +263,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, + int idx) +{ + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct hlist_head *head = dev_map_index_hash(dtab, key); + struct bpf_dtab_netdev *dev; + + hlist_for_each_entry_rcu(dev, head, index_hlist) + if (dev->idx == key) + return dev; + + return NULL; +} + +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 idx, *next = next_key; + struct bpf_dtab_netdev *dev, *next_dev; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first; + + idx = *(u32 *)key; + + dev = __dev_map_hash_lookup_elem(map, idx); + if (!dev) + goto find_first; + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), + struct bpf_dtab_netdev, index_hlist); + + if (next_dev) { + *next = next_dev->idx; + return 0; + } + + i = idx & (dtab->n_buckets - 1); + i++; + + find_first: + for (; i < dtab->n_buckets; i++) { + head = 
dev_map_index_hash(dtab, i); + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct bpf_dtab_netdev, + index_hlist); + if (next_dev) { + *next = next_dev->idx; + return 0; + } + } + + return -ENOENT; +} + static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { @@ -373,6 +482,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, + *(u32 *)key); + struct net_device *dev = obj ? obj->dev : NULL; + + return dev ? &dev->ifindex : NULL; +} + static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { @@ -422,6 +540,28 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *old_dev; + int k = *(u32 *)key; + unsigned long flags; + int ret = -ENOENT; + + spin_lock_irqsave(&dtab->index_lock, flags); + + old_dev = __dev_map_hash_lookup_elem(map, k); + if (old_dev) { + dtab->items--; + hlist_del_init_rcu(&old_dev->index_hlist); + call_rcu(&old_dev->rcu, __dev_map_entry_free); + ret = 0; + } + spin_unlock_irqrestore(&dtab->index_lock, flags); + + return ret; +} + static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, u32 ifindex, @@ -502,6 +642,56 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev, *old_dev; + u32 ifindex = *(u32 *)value; + u32 idx = *(u32 *)key; + unsigned long flags; + + if (unlikely(map_flags > BPF_EXIST || !ifindex)) + return -EINVAL; + + old_dev = __dev_map_hash_lookup_elem(map, idx); + if (old_dev && (map_flags & BPF_NOEXIST)) + return -EEXIST; + + dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + spin_lock_irqsave(&dtab->index_lock, flags); + + if (old_dev) { + hlist_del_rcu(&old_dev->index_hlist); + } else { + if (dtab->items >= dtab->map.max_entries) { + spin_unlock_irqrestore(&dtab->index_lock, flags); + call_rcu(&dev->rcu, __dev_map_entry_free); + return -E2BIG; + } + dtab->items++; + } + + hlist_add_head_rcu(&dev->index_hlist, + dev_map_index_hash(dtab, idx)); + spin_unlock_irqrestore(&dtab->index_lock, flags); + + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + + return 0; +} + +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __dev_map_hash_update_elem(current->nsproxy->net_ns, + map, key, value, map_flags); +} + const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -512,6 +702,16 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, }; +const struct bpf_map_ops dev_map_hash_ops = { + .map_alloc = dev_map_alloc, + .map_free = dev_map_free, + .map_get_next_key = dev_map_hash_get_next_key, + .map_lookup_elem = dev_map_hash_lookup_elem, + .map_update_elem = dev_map_hash_update_elem, + .map_delete_elem = dev_map_hash_delete_elem, + .map_check_btf = map_check_no_btf, +}; + static int 
dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fe7fdbc225a8..0f43f1245237 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3431,6 +3431,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: if (func_id != BPF_FUNC_redirect_map && func_id != BPF_FUNC_map_lookup_elem) goto error; @@ -3513,6 +3514,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_DEVMAP_HASH && map->map_type != BPF_MAP_TYPE_CPUMAP && map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; diff --git a/net/core/filter.c b/net/core/filter.c index 4267f2bc5f88..f3581df507b3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3177,7 +3177,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, int err; switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: { + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: { struct bpf_dtab_netdev *dst = fwd; err = dev_map_enqueue(dst, xdp, dev_rx); @@ -3214,6 +3215,7 @@ void xdp_do_flush_map(void) if (map) { switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: __dev_map_flush(map); break; case BPF_MAP_TYPE_CPUMAP: @@ -3234,6 +3236,8 @@ static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: return __dev_map_lookup_elem(map, index); + case BPF_MAP_TYPE_DEVMAP_HASH: + return __dev_map_hash_lookup_elem(map, index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); case BPF_MAP_TYPE_XSKMAP: @@ -3315,7 +3319,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, ri->tgt_value = NULL; WRITE_ONCE(ri->map, NULL); - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + if (map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { struct bpf_dtab_netdev *dst = fwd; err = dev_map_generic_redirect(dst, skb, xdp_prog); From 8ca901e8ffc6b08b3f7bc675720c69c4d6164d91 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 29 Jul 2019 14:51:10 -0700 Subject: [PATCH 1306/1640] UPSTREAM: bpf: always allocate at least 16 bytes for setsockopt hook Since we always allocate memory, allocate just a little bit more for the BPF program in case it needs to override user input with a bigger value. The canonical example is TCP_CONGESTION where the input string might be too small to override (nv -> bbr or cubic). 16 bytes are chosen to match the size of TCP_CA_NAME_MAX and can be extended in the future if needed.
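A hedged sketch of the kind of program this guarantee enables, modeled on the TCP_CONGESTION case above (SOL_TCP and TCP_CONGESTION values taken from the UAPI; error handling kept minimal):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define SOL_TCP		6
#define TCP_CONGESTION	13

SEC("cgroup/setsockopt")
int override_cc(struct bpf_sockopt *ctx)
{
	char cubic[] = "cubic";

	if (ctx->level != SOL_TCP || ctx->optname != TCP_CONGESTION)
		return 1;	/* not ours; let it through unchanged */

	/* The buffer is now at least 16 bytes (TCP_CA_NAME_MAX), so
	 * "cubic" fits even when userspace passed the shorter "nv".
	 */
	if (ctx->optval + sizeof(cubic) > ctx->optval_end)
		return 0;	/* reject on unexpected buffer size */

	__builtin_memcpy(ctx->optval, cubic, sizeof(cubic));
	ctx->optlen = sizeof(cubic);
	return 1;
}

char _license[] SEC("license") = "GPL";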
Change-Id: Ib700728cac279a57b4ad9e1c0af83b042ba8cb2e Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 0a00eaca6fae..6a6a154cfa7b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -964,7 +964,6 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) return -ENOMEM; ctx->optval_end = ctx->optval + max_optlen; - ctx->optlen = max_optlen; return 0; } @@ -984,7 +983,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, .level = *level, .optname = *optname, }; - int ret; + int ret, max_optlen; /* Opportunistic check to see whether we have any BPF program * attached to the hook so we don't waste time allocating @@ -994,10 +993,18 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) return 0; - ret = sockopt_alloc_buf(&ctx, *optlen); + /* Allocate a bit more than the initial user buffer for + * BPF program. The canonical use case is overriding + * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). + */ + max_optlen = max_t(int, 16, *optlen); + + ret = sockopt_alloc_buf(&ctx, max_optlen); if (ret) return ret; + ctx.optlen = *optlen; + if (copy_from_user(ctx.optval, optval, *optlen) != 0) { ret = -EFAULT; goto out; @@ -1016,7 +1023,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, if (ctx.optlen == -1) { /* optlen set to -1, bypass kernel */ ret = 1; - } else if (ctx.optlen > *optlen || ctx.optlen < -1) { + } else if (ctx.optlen > max_optlen || ctx.optlen < -1) { /* optlen is out of bounds */ ret = -EFAULT; } else { @@ -1063,6 +1070,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, if (ret) return ret; + ctx.optlen = max_optlen; + if (!retval) { /* If kernel getsockopt finished successfully, * copy whatever was returned to the user back From 20ca5959f2cc875751ef022d83702a7382c8fca3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 2 Apr 2019 09:49:50 -0700 Subject: [PATCH 1307/1640] BACKPORT: kbuild: add ability to generate BTF type info for vmlinux This patch adds a new config option to trigger generation of BTF type information from DWARF debuginfo for vmlinux and kernel modules through pahole, which in turn relies on libbpf for the btf_dedup() algorithm. The intent is to record compact type information of all types used inside the kernel, including all the structs/unions/typedefs/etc. This enables BPF's compile-once-run-everywhere ([0]) approach, in which tracing programs that inspect the kernel's internal data (e.g., struct task_struct) can be compiled on a system running some kernel version, but can then run on other kernel versions (and configurations) without recompilation, even if the layout of structs changed and/or some of the fields were added, removed, or renamed. This is only possible if the BPF loader can get kernel type info to adjust all the offsets correctly. This patch is a first step in this direction, making sure that BTF type info is part of the Linux kernel image, in a non-loadable ELF section. BTF deduplication ([1]) algorithm typically provides 100x savings compared to DWARF data, so the resulting .BTF section is not big; it is typically about 2MB in size.
[0] http://vger.kernel.org/lpc-bpf2018.html#session-2 [1] https://facebookmicrosites.github.io/bpf/blog/2018/11/14/btf-enhancement.html Cc: Masahiro Yamada Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Martin KaFai Lau Change-Id: Id935f6e3ac658d9a92d55acd39e8287c644c941d Signed-off-by: Andrii Nakryiko Acked-by: David S. Miller Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- Makefile | 3 ++- lib/Kconfig.debug | 8 ++++++++ scripts/link-vmlinux.sh | 21 ++++++++++++++++++++- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 6512f8b47dbe..935009b5736f 100644 --- a/Makefile +++ b/Makefile @@ -408,6 +408,7 @@ OBJCOPY = $(CROSS_COMPILE)objcopy OBJDUMP = $(CROSS_COMPILE)objdump READELF = $(CROSS_COMPILE)readelf endif +PAHOLE = pahole AWK = awk GENKSYMS = scripts/genksyms/genksyms INSTALLKERNEL := installkernel @@ -472,7 +473,7 @@ GCC_PLUGINS_CFLAGS := CLANG_FLAGS := export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE LD CC -export CPP AR NM STRIP OBJCOPY OBJDUMP READELF HOSTLDFLAGS HOST_LOADLIBES +export CPP AR NM STRIP OBJCOPY OBJDUMP PAHOLE READELF HOSTLDFLAGS HOST_LOADLIBES export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f00b214a9f6d..4109023e2953 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -208,6 +208,14 @@ config DEBUG_INFO_DWARF4 But it significantly improves the success of resolving variables in gdb on optimized code. +config DEBUG_INFO_BTF + bool "Generate BTF typeinfo" + depends on DEBUG_INFO + help + Generate deduplicated BTF type information from DWARF debug info. + Turning this on expects presence of pahole tool, which will convert + DWARF type info into equivalent deduplicated BTF type info. + config GDB_SCRIPTS bool "Provide GDB scripts for kernel debugging" depends on DEBUG_INFO diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index c98e0393978c..cdd794e3a0ec 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -40,7 +40,7 @@ set -e info() { if [ "${quiet}" != "silent_" ]; then - printf " %-7s %s\n" ${1} ${2} + printf " %-7s %s\n" "${1}" "${2}" fi } @@ -202,6 +202,21 @@ vmlinux_link() fi } +# generate .BTF typeinfo from DWARF debuginfo +gen_btf() +{ + local pahole_ver; + + pahole_ver=$(${PAHOLE} --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/') + if [ "${pahole_ver}" -lt "113" ]; then + info "BTF" "${1}: pahole version $(${PAHOLE} --version) is too old, need at least v1.13" + exit 0 + fi + + info "BTF" ${1} + LLVM_OBJCOPY=${OBJCOPY} ${PAHOLE} -J ${1} +} + # Create ${2} .o file with all symbols from the ${1} object file kallsyms() { @@ -437,6 +452,10 @@ fi info LD vmlinux vmlinux_link "${kallsymso}" vmlinux +if [ -n "${CONFIG_DEBUG_INFO_BTF}" ]; then + gen_btf vmlinux +fi + if [ -n "${CONFIG_BUILDTIME_EXTABLE_SORT}" ]; then info SORTEX vmlinux sortextable vmlinux From ca53734a93199ae5692dd3f1fa6fcde3859a7cfa Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 5 May 2019 17:10:33 -0700 Subject: [PATCH 1308/1640] UPSTREAM: kbuild: tolerate missing pahole when generating BTF When BTF generation is enabled through CONFIG_DEBUG_INFO_BTF, scripts/link-vmlinux.sh detects if pahole version is too old and gracefully continues build process, skipping BTF generation build step. But if pahole is not available, build will still fail. 
This patch adds a check for whether pahole exists at all and bails out gracefully if not. Cc: Alexei Starovoitov Reported-by: Yonghong Song Fixes: e83b9f55448a ("kbuild: add ability to generate BTF type info for vmlinux") Change-Id: I89e477f03aba8e16789b2a330469b2ac562df4d5 Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- scripts/link-vmlinux.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index cdd794e3a0ec..bba5480e5bcf 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -207,6 +207,11 @@ gen_btf() { local pahole_ver; + if ! [ -x "$(command -v ${PAHOLE})" ]; then + info "BTF" "${1}: pahole (${PAHOLE}) is not available" + return 0 + fi + pahole_ver=$(${PAHOLE} --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/') if [ "${pahole_ver}" -lt "113" ]; then info "BTF" "${1}: pahole version $(${PAHOLE} --version) is too old, need at least v1.13" From 45525d237d3caadb3ae2e393e69aa4cb2cf61f31 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Apr 2019 14:58:39 -0700 Subject: [PATCH 1309/1640] UPSTREAM: kbuild: handle old pahole more gracefully when generating BTF When CONFIG_DEBUG_INFO_BTF is enabled but the available version of pahole is too old to support BTF generation, the build script is supposed to emit a warning and proceed with the build. Due to using exit instead of return from a BASH function, the existing handling code prematurely exits with exit code 0, not completing some of the build steps. This patch fixes the issue by returning from the gen_btf() function only. Fixes: e83b9f55448a ("kbuild: add ability to generate BTF type info for vmlinux") Cc: Masahiro Yamada Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Martin KaFai Lau Change-Id: If817e5a03a739213cecde92b827550b1de86d296 Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- scripts/link-vmlinux.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index bba5480e5bcf..59506f652a0b 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -215,7 +215,7 @@ gen_btf() pahole_ver=$(${PAHOLE} --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/') if [ "${pahole_ver}" -lt "113" ]; then info "BTF" "${1}: pahole version $(${PAHOLE} --version) is too old, need at least v1.13" - exit 0 + return 0 fi info "BTF" ${1} From 2b94e3ab7033b89febd668cc6c051d777c295c4c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Aug 2019 11:39:47 -0700 Subject: [PATCH 1310/1640] BACKPORT: btf: expose BTF info through sysfs Make .BTF section allocated and expose its contents through sysfs. /sys/kernel/btf directory is created to contain all the BTFs present inside the kernel. Currently there is only the kernel's main BTF, represented as /sys/kernel/btf/kernel file. Once kernel modules' BTFs are supported, each module will expose its BTF as /sys/kernel/btf/ file. Current approach relies on a few pieces coming together: 1. pahole is used to take almost final vmlinux image (modulo .BTF and kallsyms) and generate .BTF section by converting DWARF info into BTF. This section is not allocated and not mapped to any segment, though, so is not yet accessible from inside kernel at runtime. 2.
objcopy dumps .BTF contents into binary file and subsequently convert binary file into linkable object file with automatically generated symbols _binary__btf_kernel_bin_start and _binary__btf_kernel_bin_end, pointing to start and end, respectively, of BTF raw data. 3. final vmlinux image is generated by linking this object file (and kallsyms, if necessary). sysfs_btf.c then creates /sys/kernel/btf/kernel file and exposes embedded BTF contents through it. This allows, e.g., libbpf and bpftool access BTF info at well-known location, without resorting to searching for vmlinux image on disk (location of which is not standardized and vmlinux image might not be even available in some scenarios, e.g., inside qemu during testing). Alternative approach using .incbin assembler directive to embed BTF contents directly was attempted but didn't work, because sysfs_proc.o is not re-compiled during link-vmlinux.sh stage. This is required, though, to update embedded BTF data (initially empty data is embedded, then pahole generates BTF info and we need to regenerate sysfs_btf.o with updated contents, but it's too late at that point). If BTF couldn't be generated due to missing or too old pahole, sysfs_btf.c handles that gracefully by detecting that _binary__btf_kernel_bin_start (weak symbol) is 0 and not creating /sys/kernel/btf at all. v2->v3: - added Documentation/ABI/testing/sysfs-kernel-btf (Greg K-H); - created proper kobject (btf_kobj) for btf directory (Greg K-H); - undo v2 change of reusing vmlinux, as it causes extra kallsyms pass due to initially missing __binary__btf_kernel_bin_{start/end} symbols; v1->v2: - allow kallsyms stage to re-use vmlinux generated by gen_btf(); Reviewed-by: Greg Kroah-Hartman Change-Id: Ife57356dd14b9ae0d2e3801eea7c602c6245952f Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- Documentation/ABI/testing/sysfs-kernel-btf | 17 +++++++ kernel/bpf/Makefile | 3 ++ kernel/bpf/sysfs_btf.c | 51 ++++++++++++++++++++ scripts/link-vmlinux.sh | 56 ++++++++++++++-------- 4 files changed, 106 insertions(+), 21 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-btf create mode 100644 kernel/bpf/sysfs_btf.c diff --git a/Documentation/ABI/testing/sysfs-kernel-btf b/Documentation/ABI/testing/sysfs-kernel-btf new file mode 100644 index 000000000000..5390f8001f96 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-btf @@ -0,0 +1,17 @@ +What: /sys/kernel/btf +Date: Aug 2019 +KernelVersion: 5.5 +Contact: bpf@vger.kernel.org +Description: + Contains BTF type information and related data for kernel and + kernel modules. + +What: /sys/kernel/btf/kernel +Date: Aug 2019 +KernelVersion: 5.5 +Contact: bpf@vger.kernel.org +Description: + Read-only binary attribute exposing kernel's own BTF type + information with description of all internal kernel types. See + Documentation/bpf/btf.rst for detailed description of format + itself. 
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 29d781061cd5..e1d9adb212f9 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -22,3 +22,6 @@ obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o endif +ifeq ($(CONFIG_SYSFS),y) +obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o +endif diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c new file mode 100644 index 000000000000..092e63b9758b --- /dev/null +++ b/kernel/bpf/sysfs_btf.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel BTF information for introspection and use by eBPF tools. + */ +#include +#include +#include +#include +#include + +/* See scripts/link-vmlinux.sh, gen_btf() func for details */ +extern char __weak _binary__btf_kernel_bin_start[]; +extern char __weak _binary__btf_kernel_bin_end[]; + +static ssize_t +btf_kernel_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + memcpy(buf, _binary__btf_kernel_bin_start + off, len); + return len; +} + +static struct bin_attribute bin_attr_btf_kernel __ro_after_init = { + .attr = { .name = "kernel", .mode = 0444, }, + .read = btf_kernel_read, +}; + +static struct kobject *btf_kobj; + +static int __init btf_kernel_init(void) +{ + int err; + + if (!_binary__btf_kernel_bin_start) + return 0; + + btf_kobj = kobject_create_and_add("btf", kernel_kobj); + if (IS_ERR(btf_kobj)) { + err = PTR_ERR(btf_kobj); + btf_kobj = NULL; + return err; + } + + bin_attr_btf_kernel.size = _binary__btf_kernel_bin_end - + _binary__btf_kernel_bin_start; + + return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_kernel); +} + +subsys_initcall(btf_kernel_init); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 59506f652a0b..30bd220ce970 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -142,8 +142,8 @@ recordmcount() } # Link of vmlinux -# ${1} - optional extra .o files -# ${2} - output file +# ${1} - output file +# ${@:2} - optional extra .o files vmlinux_link() { local lds="${objtree}/${KBUILD_LDS}" @@ -165,17 +165,17 @@ vmlinux_link() --start-group \ ${KBUILD_VMLINUX_LIBS} \ --end-group \ - ${1}" + ${@:2}" else objects="${KBUILD_VMLINUX_INIT} \ --start-group \ ${KBUILD_VMLINUX_MAIN} \ ${KBUILD_VMLINUX_LIBS} \ --end-group \ - ${1}" + ${@:2}" fi - ${ld} ${ldflags} -o ${2} -T ${lds} ${objects} + ${ld} ${ldflags} -o ${1} -T ${lds} ${objects} else if [ -n "${CONFIG_THIN_ARCHIVES}" ]; then objects="-Wl,--whole-archive \ @@ -184,17 +184,17 @@ vmlinux_link() -Wl,--start-group \ ${KBUILD_VMLINUX_LIBS} \ -Wl,--end-group \ - ${1}" + ${@:2}" else objects="${KBUILD_VMLINUX_INIT} \ -Wl,--start-group \ ${KBUILD_VMLINUX_MAIN} \ ${KBUILD_VMLINUX_LIBS} \ -Wl,--end-group \ - ${1}" + ${@:2}" fi - ${CC} ${CFLAGS_vmlinux} -o ${2} \ + ${CC} ${CFLAGS_vmlinux} -o ${1} \ -Wl,-T,${lds} \ ${objects} \ -lutil -lrt -lpthread @@ -203,23 +203,34 @@ vmlinux_link() } # generate .BTF typeinfo from DWARF debuginfo +# ${1} - vmlinux image +# ${2} - file to dump raw BTF data into gen_btf() { - local pahole_ver; + local pahole_ver + local bin_arch if ! 
[ -x "$(command -v ${PAHOLE})" ]; then info "BTF" "${1}: pahole (${PAHOLE}) is not available" - return 0 + return 1 fi pahole_ver=$(${PAHOLE} --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/') if [ "${pahole_ver}" -lt "113" ]; then info "BTF" "${1}: pahole version $(${PAHOLE} --version) is too old, need at least v1.13" - return 0 + return 1 fi - info "BTF" ${1} + info "BTF" ${2} + vmlinux_link ${1} LLVM_OBJCOPY=${OBJCOPY} ${PAHOLE} -J ${1} + + # dump .BTF section into raw binary file to link with final vmlinux + bin_arch=$(${OBJDUMP} -f ${1} | grep architecture | \ + cut -d, -f1 | cut -d' ' -f2) + ${OBJCOPY} --dump-section .BTF=.btf.kernel.bin ${1} 2>/dev/null + ${OBJCOPY} -I binary -O ${CONFIG_OUTPUT_FORMAT} -B ${bin_arch} \ + --rename-section .data=.BTF .btf.kernel.bin ${2} } # Create ${2} .o file with all symbols from the ${1} object file @@ -293,6 +304,7 @@ sortextable() # Delete output files in case of error cleanup() { + rm -f .btf.* rm -f .old_version rm -f .tmp_System.map rm -f .tmp_kallsyms* @@ -386,6 +398,13 @@ if [ ! -z ${RTIC_MPGEN+x} ]; then KBUILD_VMLINUX_LIBS+=$RTIC_MP_O fi +btf_kernel_bin_o="" +if [ -n "${CONFIG_DEBUG_INFO_BTF}" ]; then + if gen_btf .tmp_vmlinux.btf .btf.kernel.bin.o ; then + btf_kernel_bin_o=.btf.kernel.bin.o + fi +fi + kallsymso="" kallsyms_vmlinux="" if [ -n "${CONFIG_KALLSYMS}" ]; then @@ -417,11 +436,11 @@ if [ -n "${CONFIG_KALLSYMS}" ]; then kallsyms_vmlinux=.tmp_vmlinux2 # step 1 - vmlinux_link "" .tmp_vmlinux1 + vmlinux_link .tmp_vmlinux1 ${btf_kernel_bin_o} kallsyms .tmp_vmlinux1 .tmp_kallsyms1.o # step 2 - vmlinux_link .tmp_kallsyms1.o .tmp_vmlinux2 + vmlinux_link .tmp_vmlinux2 .tmp_kallsyms1.o ${btf_kernel_bin_o} kallsyms .tmp_vmlinux2 .tmp_kallsyms2.o # step 3 @@ -432,8 +451,7 @@ if [ -n "${CONFIG_KALLSYMS}" ]; then kallsymso=.tmp_kallsyms3.o kallsyms_vmlinux=.tmp_vmlinux3 - vmlinux_link .tmp_kallsyms2.o .tmp_vmlinux3 - + vmlinux_link .tmp_vmlinux3 .tmp_kallsyms2.o ${btf_kernel_bin_o} kallsyms .tmp_vmlinux3 .tmp_kallsyms3.o fi fi @@ -455,11 +473,7 @@ if [ ! -z ${RTIC_MP_O} ]; then fi info LD vmlinux -vmlinux_link "${kallsymso}" vmlinux - -if [ -n "${CONFIG_DEBUG_INFO_BTF}" ]; then - gen_btf vmlinux -fi +vmlinux_link vmlinux "${kallsymso}" "${btf_kernel_bin_o}" if [ -n "${CONFIG_BUILDTIME_EXTABLE_SORT}" ]; then info SORTEX vmlinux From 8efc044c468aa8ae7f73eb62c062843125b03b6f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 13 Aug 2019 11:54:42 -0700 Subject: [PATCH 1311/1640] UPSTREAM: btf: rename /sys/kernel/btf/kernel into /sys/kernel/btf/vmlinux Expose kernel's BTF under the name vmlinux to be more uniform with using kernel module names as file names in the future. Fixes: 341dfcf8d78e ("btf: expose BTF info through sysfs") Suggested-by: Daniel Borkmann Change-Id: I0c478e34bd67044a37b166b60c1de07025c3ec90 Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- Documentation/ABI/testing/sysfs-kernel-btf | 2 +- kernel/bpf/sysfs_btf.c | 30 +++++++++++----------- scripts/link-vmlinux.sh | 18 ++++++------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-btf b/Documentation/ABI/testing/sysfs-kernel-btf index 5390f8001f96..2c9744b2cd59 100644 --- a/Documentation/ABI/testing/sysfs-kernel-btf +++ b/Documentation/ABI/testing/sysfs-kernel-btf @@ -6,7 +6,7 @@ Description: Contains BTF type information and related data for kernel and kernel modules. 
-What: /sys/kernel/btf/kernel +What: /sys/kernel/btf/vmlinux Date: Aug 2019 KernelVersion: 5.5 Contact: bpf@vger.kernel.org diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 092e63b9758b..4659349fc795 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -9,30 +9,30 @@ #include /* See scripts/link-vmlinux.sh, gen_btf() func for details */ -extern char __weak _binary__btf_kernel_bin_start[]; -extern char __weak _binary__btf_kernel_bin_end[]; +extern char __weak _binary__btf_vmlinux_bin_start[]; +extern char __weak _binary__btf_vmlinux_bin_end[]; static ssize_t -btf_kernel_read(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t len) +btf_vmlinux_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) { - memcpy(buf, _binary__btf_kernel_bin_start + off, len); + memcpy(buf, _binary__btf_vmlinux_bin_start + off, len); return len; } -static struct bin_attribute bin_attr_btf_kernel __ro_after_init = { - .attr = { .name = "kernel", .mode = 0444, }, - .read = btf_kernel_read, +static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { + .attr = { .name = "vmlinux", .mode = 0444, }, + .read = btf_vmlinux_read, }; static struct kobject *btf_kobj; -static int __init btf_kernel_init(void) +static int __init btf_vmlinux_init(void) { int err; - if (!_binary__btf_kernel_bin_start) + if (!_binary__btf_vmlinux_bin_start) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); @@ -42,10 +42,10 @@ static int __init btf_kernel_init(void) return err; } - bin_attr_btf_kernel.size = _binary__btf_kernel_bin_end - - _binary__btf_kernel_bin_start; + bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end - + _binary__btf_vmlinux_bin_start; - return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_kernel); + return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); } -subsys_initcall(btf_kernel_init); +subsys_initcall(btf_vmlinux_init); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 30bd220ce970..371164c692d6 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -228,9 +228,9 @@ gen_btf() # dump .BTF section into raw binary file to link with final vmlinux bin_arch=$(${OBJDUMP} -f ${1} | grep architecture | \ cut -d, -f1 | cut -d' ' -f2) - ${OBJCOPY} --dump-section .BTF=.btf.kernel.bin ${1} 2>/dev/null + ${OBJCOPY} --dump-section .BTF=.btf.vmlinux.bin ${1} 2>/dev/null ${OBJCOPY} -I binary -O ${CONFIG_OUTPUT_FORMAT} -B ${bin_arch} \ - --rename-section .data=.BTF .btf.kernel.bin ${2} + --rename-section .data=.BTF .btf.vmlinux.bin ${2} } # Create ${2} .o file with all symbols from the ${1} object file @@ -398,10 +398,10 @@ if [ ! 
-z ${RTIC_MPGEN+x} ]; then KBUILD_VMLINUX_LIBS+=$RTIC_MP_O fi -btf_kernel_bin_o="" +btf_vmlinux_bin_o="" if [ -n "${CONFIG_DEBUG_INFO_BTF}" ]; then - if gen_btf .tmp_vmlinux.btf .btf.kernel.bin.o ; then - btf_kernel_bin_o=.btf.kernel.bin.o + if gen_btf .tmp_vmlinux.btf .btf.vmlinux.bin.o ; then + btf_vmlinux_bin_o=.btf.vmlinux.bin.o fi fi @@ -436,11 +436,11 @@ if [ -n "${CONFIG_KALLSYMS}" ]; then kallsyms_vmlinux=.tmp_vmlinux2 # step 1 - vmlinux_link .tmp_vmlinux1 ${btf_kernel_bin_o} + vmlinux_link .tmp_vmlinux1 ${btf_vmlinux_bin_o} kallsyms .tmp_vmlinux1 .tmp_kallsyms1.o # step 2 - vmlinux_link .tmp_vmlinux2 .tmp_kallsyms1.o ${btf_kernel_bin_o} + vmlinux_link .tmp_vmlinux2 .tmp_kallsyms1.o ${btf_vmlinux_bin_o} kallsyms .tmp_vmlinux2 .tmp_kallsyms2.o # step 3 @@ -451,7 +451,7 @@ if [ -n "${CONFIG_KALLSYMS}" ]; then kallsymso=.tmp_kallsyms3.o kallsyms_vmlinux=.tmp_vmlinux3 - vmlinux_link .tmp_vmlinux3 .tmp_kallsyms2.o ${btf_kernel_bin_o} + vmlinux_link .tmp_vmlinux3 .tmp_kallsyms2.o ${btf_vmlinux_bin_o} kallsyms .tmp_vmlinux3 .tmp_kallsyms3.o fi fi @@ -473,7 +473,7 @@ if [ ! -z ${RTIC_MP_O} ]; then fi info LD vmlinux -vmlinux_link vmlinux "${kallsymso}" "${btf_kernel_bin_o}" +vmlinux_link vmlinux "${kallsymso}" "${btf_vmlinux_bin_o}" if [ -n "${CONFIG_BUILDTIME_EXTABLE_SORT}" ]; then info SORTEX vmlinux From a5ddb39f668da791f9ac0d0475e216b4d767e83d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 16 Aug 2019 02:40:44 +0000 Subject: [PATCH 1312/1640] UPSTREAM: btf: fix return value check in btf_vmlinux_init() In case of error, the function kobject_create_and_add() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. Fixes: 341dfcf8d78e ("btf: expose BTF info through sysfs") Change-Id: I779408abc87bc9b6af65d644bc35f3daf3cf362b Signed-off-by: Wei Yongjun Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- kernel/bpf/sysfs_btf.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 4659349fc795..7ae5dddd1fe6 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -30,17 +30,12 @@ static struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { - int err; - if (!_binary__btf_vmlinux_bin_start) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); - if (IS_ERR(btf_kobj)) { - err = PTR_ERR(btf_kobj); - btf_kobj = NULL; - return err; - } + if (!btf_kobj) + return -ENOMEM; bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end - _binary__btf_vmlinux_bin_start; From 76b20d0e8c242aaf7325d1901eb6bf3f531225ef Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 14 Aug 2019 10:37:48 -0700 Subject: [PATCH 1313/1640] UPSTREAM: bpf: export bpf_map_inc_not_zero Rename existing bpf_map_inc_not_zero to __bpf_map_inc_not_zero to indicate that it's caller's responsibility to do proper locking. Create and export bpf_map_inc_not_zero wrapper that properly locks map_idr_lock. Will be used in the next commit to hold a map while cloning a socket. 
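The inc-not-zero idiom itself can be sketched in plain C (illustrative only; the kernel wrapper additionally takes map_idr_lock so the map cannot be freed while the refcount is inspected):

#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only if the object is still alive. A 0 -> 1
 * transition is never allowed: refcount 0 means teardown has begun.
 */
static bool ref_get_not_zero(atomic_int *refcnt)
{
	int old = atomic_load(refcnt);

	while (old != 0) {
		if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
			return true;	/* reference acquired */
	}
	return false;	/* object dying; caller must not use it */
}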
Cc: Martin KaFai Lau Cc: Yonghong Song Acked-by: Martin KaFai Lau Acked-by: Yonghong Song Change-Id: Ibab0863d5901f01840798d5c69bdf92fbf4df5df Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 39d0c0f923cb..5ca2ecca84ba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -660,6 +660,8 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); +struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map, + bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 731253ee9140..5ecdef31f5da 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -684,8 +684,8 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) } /* map_idr_lock should have been held */ -static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, - bool uref) +static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, + bool uref) { int refold; @@ -705,6 +705,16 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, return map; } +struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +{ + spin_lock_bh(&map_idr_lock); + map = __bpf_map_inc_not_zero(map, uref); + spin_unlock_bh(&map_idr_lock); + + return map; +} +EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); + int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -2183,7 +2193,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) spin_lock_bh(&map_idr_lock); map = idr_find(&map_idr, id); if (map) - map = bpf_map_inc_not_zero(map, true); + map = __bpf_map_inc_not_zero(map, true); else map = ERR_PTR(-ENOENT); spin_unlock_bh(&map_idr_lock); From 43f799f314de7f5278b3bf1138158cb6dfb63640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 28 Aug 2018 14:44:27 +0200 Subject: [PATCH 1314/1640] UPSTREAM: xsk: expose xdp_umem_get_{data,dma} to drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the xdp_umem_get_{data,dma} functions to include/net/xdp_sock.h, so that the upcoming zero-copy implementation in the Ethernet drivers can utilize them. Also, supply some dummy function implementations for CONFIG_XDP_SOCKETS=n configs. 
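The two helpers translate a 64-bit UMEM address into a page slot plus an in-page offset, as the diff below shows. A worked example with 4 KiB pages: addr 0x2345 selects pages[0x2345 >> 12], i.e. page 2, at offset 0x2345 & 0xfff = 0x345. A standalone illustration of the arithmetic:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	uint64_t addr = 0x2345;

	/* prints: page 2, offset 0x345 */
	printf("page %llu, offset 0x%llx\n",
	       (unsigned long long)(addr >> PAGE_SHIFT),
	       (unsigned long long)(addr & (PAGE_SIZE - 1)));
	return 0;
}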
Change-Id: Iee02524300cbbdbcca8deaff7804d6c179a21dd9 Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 43 ++++++++++++++++++++++++++++++++++++++++++ net/xdp/xdp_umem.h | 10 ---------- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index de1336dabe11..d6db6eecba77 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -78,6 +78,16 @@ void xsk_umem_discard_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len); void xsk_umem_consume_tx_done(struct xdp_umem *umem); + +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) +{ + return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); +} + +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1)); +} #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -97,6 +107,39 @@ static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return false; } + +static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +{ + return NULL; +} + +static inline void xsk_umem_discard_addr(struct xdp_umem *umem) +{ +} + +static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +{ +} + +static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, + u32 *len) +{ + return false; +} + +static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) +{ +} + +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) +{ + return NULL; +} + +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + return 0; +} #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index f11560334f88..c8be1ad3eb88 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -8,16 +8,6 @@ #include -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1)); -} - int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u32 queue_id, u16 flags); bool xdp_umem_validate_queues(struct xdp_umem *umem); From bc8df7ef11c18bd0b9aef71f93b36874fea566b3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 7 Sep 2018 10:18:46 +0200 Subject: [PATCH 1315/1640] UPSTREAM: net: xsk: add a simple buffer reuse queue XSK UMEM is strongly single producer single consumer so reuse of frames is challenging. Add a simple "stash" of FILL packets to reuse for drivers to optionally make use of. This is useful when driver has to free (ndo_stop) or resize a ring with an active AF_XDP ZC socket. 
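A hedged sketch of how a driver might use the stash around a ring teardown or resize (ring_size, umem, and the descriptor-walk helper are hypothetical driver-side names; the xsk_reuseq_* and xsk_umem_fq_reuse() calls are the ones added below):

/* Setup: install a reuse queue sized to the ring. */
struct xdp_umem_fq_reuse *reuseq;

reuseq = xsk_reuseq_prepare(ring_size);
if (!reuseq)
	return -ENOMEM;
xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));

/* Teardown: park each still-unused FILL address in the stash instead
 * of dropping it; later refills drain the stash first via the
 * xsk_umem_peek_addr_rq()/xsk_umem_discard_addr_rq() helpers.
 */
u64 addr;

while (driver_next_unused_addr(ring, &addr))	/* hypothetical */
	xsk_umem_fq_reuse(umem, addr);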
Change-Id: I3e2d50258479e9a6665417c303a9cbc7685fec99 Signed-off-by: Jakub Kicinski Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/net/xdp_sock.h | 69 ++++++++++++++++++++++++++++++++++++++++++ net/xdp/xdp_umem.c | 2 ++ net/xdp/xsk_queue.c | 55 +++++++++++++++++++++++++++++++++ net/xdp/xsk_queue.h | 3 ++ 4 files changed, 129 insertions(+) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index d6db6eecba77..bca2e412e278 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -26,6 +26,12 @@ struct xdp_umem_page { dma_addr_t dma; }; +struct xdp_umem_fq_reuse { + u32 nentries; + u32 length; + u64 handles[]; +}; + struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; @@ -40,6 +46,7 @@ struct xdp_umem { struct page **pgs; u32 npgs; struct net_device *dev; + struct xdp_umem_fq_reuse *fq_reuse; u16 queue_id; bool zc; spinlock_t xsk_list_lock; @@ -78,6 +85,10 @@ void xsk_umem_discard_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len); void xsk_umem_consume_tx_done(struct xdp_umem *umem); +struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); +struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, + struct xdp_umem_fq_reuse *newq); +void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { @@ -88,6 +99,35 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) { return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1)); } + +/* Reuse-queue aware version of FILL queue helpers */ +static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (!rq->length) + return xsk_umem_peek_addr(umem, addr); + + *addr = rq->handles[rq->length - 1]; + return addr; +} + +static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (!rq->length) + xsk_umem_discard_addr(umem); + else + rq->length--; +} + +static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + rq->handles[rq->length++] = addr; +} #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -131,6 +171,21 @@ static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) { } +static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) +{ + return NULL; +} + +static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( + struct xdp_umem *umem, + struct xdp_umem_fq_reuse *newq) +{ + return NULL; +} +static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) +{ +} + static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { return NULL; @@ -140,6 +195,20 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) { return 0; } + +static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +{ + return NULL; +} + +static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem) +{ +} + +static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) +{ +} + #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index b87e63cb55be..0c8247a7543d 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -165,6 +165,8 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } + xsk_reuseq_destroy(umem); 
+ xdp_umem_unpin_pages(umem); kfree(umem->pages); diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 6c32e92e98fc..f7a0f1f0756f 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -3,7 +3,9 @@ * Copyright(c) 2018 Intel Corporation. */ +#include #include +#include #include "xsk_queue.h" @@ -61,3 +63,56 @@ void xskq_destroy(struct xsk_queue *q) page_frag_free(q->ring); kfree(q); } + +struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) +{ + struct xdp_umem_fq_reuse *newq; + + /* Check for overflow */ + if (nentries > (u32)roundup_pow_of_two(nentries)) + return NULL; + nentries = roundup_pow_of_two(nentries); + + newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL); + if (!newq) + return NULL; + memset(newq, 0, offsetof(typeof(*newq), handles)); + + newq->nentries = nentries; + return newq; +} +EXPORT_SYMBOL_GPL(xsk_reuseq_prepare); + +struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, + struct xdp_umem_fq_reuse *newq) +{ + struct xdp_umem_fq_reuse *oldq = umem->fq_reuse; + + if (!oldq) { + umem->fq_reuse = newq; + return NULL; + } + + if (newq->nentries < oldq->length) + return newq; + + memcpy(newq->handles, oldq->handles, + array_size(oldq->length, sizeof(u64))); + newq->length = oldq->length; + + umem->fq_reuse = newq; + return oldq; +} +EXPORT_SYMBOL_GPL(xsk_reuseq_swap); + +void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) +{ + kvfree(rq); +} +EXPORT_SYMBOL_GPL(xsk_reuseq_free); + +void xsk_reuseq_destroy(struct xdp_umem *umem) +{ + xsk_reuseq_free(umem->fq_reuse); + umem->fq_reuse = NULL; +} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index cf7cbb5dd918..62cf850fdad3 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -260,4 +260,7 @@ void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); +/* Executed by the core when the entire UMEM gets freed */ +void xsk_reuseq_destroy(struct xdp_umem *umem); + #endif /* _LINUX_XSK_QUEUE_H */ From c96739f8d30249ba831101ec3ce3d6e05353a9cd Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 1 Oct 2018 14:51:34 +0200 Subject: [PATCH 1316/1640] UPSTREAM: xsk: fix bug when trying to use both copy and zero-copy on one queue id Previously, the xsk code did not record which umem was bound to a specific queue id. This was not required if all drivers were zero-copy enabled as this had to be recorded in the driver anyway. So if a user tried to bind two umems to the same queue, the driver would say no. But if copy-mode was first enabled and then zero-copy mode (or the reverse order), we mistakenly enabled both of them on the same umem leading to buggy behavior. The main culprit for this is that we did not store the association of umem to queue id in the copy case and only relied on the driver reporting this. As this relation was not stored in the driver for copy mode (it does not rely on the AF_XDP NDOs), this obviously could not work. This patch fixes the problem by always recording the umem to queue id relationship in the netdev_queue and netdev_rx_queue structs. This way we always know what kind of umem has been bound to a queue id and can act appropriately at bind time. 
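The registry idea can be sketched as a minimal userspace model; everything below (struct umem, the NQUEUES arrays, the helper names) is invented for illustration and only mirrors the intent of xdp_reg_umem_at_qid()/xdp_get_umem_from_qid(), not the kernel code itself:

#include <errno.h>
#include <stdio.h>

#define NQUEUES 4 /* stand-in for real_num_rx_queues/real_num_tx_queues */

struct umem { int id; };

/* the kernel records the pointer in both netdev_rx_queue and netdev_queue */
static struct umem *rxq[NQUEUES], *txq[NQUEUES];

static struct umem *get_umem_from_qid(int qid)
{
        if (rxq[qid])
                return rxq[qid];
        return txq[qid];
}

static int assign_umem_dev(struct umem *u, int qid)
{
        if (get_umem_from_qid(qid))
                return -EBUSY; /* some umem, copy or zero-copy, is already bound */
        rxq[qid] = u;
        txq[qid] = u;
        return 0;
}

int main(void)
{
        struct umem copy_mode = { 1 }, zc_mode = { 2 };

        printf("first bind:  %d\n", assign_umem_dev(&copy_mode, 0)); /* 0 */
        printf("second bind: %d\n", assign_umem_dev(&zc_mode, 0)); /* -EBUSY */
        return 0;
}

Because the registration is recorded independently of the driver, the second bind fails with -EBUSY no matter which mode came first.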
Change-Id: Ife8bf67e8b0c9650db67b1d0ec734f093d80b49f Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 94 +++++++++++++++++++++++++++++++--------------- net/xdp/xdp_umem.h | 2 +- net/xdp/xsk.c | 7 ---- 3 files changed, 65 insertions(+), 38 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 0c8247a7543d..d48e29645045 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -43,27 +43,47 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) spin_unlock_irqrestore(&umem->xsk_list_lock, flags); } -int xdp_umem_query(struct net_device *dev, u16 queue_id) +/* The umem is stored both in the _rx struct and the _tx struct as we do + * not know if the device has more tx queues than rx, or the opposite. + * This might also change during run time. + */ +static void xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem, + u16 queue_id) { - struct netdev_bpf bpf; + if (queue_id < dev->real_num_rx_queues) + dev->_rx[queue_id].umem = umem; + if (queue_id < dev->real_num_tx_queues) + dev->_tx[queue_id].umem = umem; +} - ASSERT_RTNL(); +static struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, + u16 queue_id) +{ + if (queue_id < dev->real_num_rx_queues) + return dev->_rx[queue_id].umem; + if (queue_id < dev->real_num_tx_queues) + return dev->_tx[queue_id].umem; - memset(&bpf, 0, sizeof(bpf)); - bpf.command = XDP_QUERY_XSK_UMEM; - bpf.xsk.queue_id = queue_id; + return NULL; +} - if (!dev->netdev_ops->ndo_bpf) - return 0; - return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem; +static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id) +{ + /* Zero out the entry independent on how many queues are configured + * at this point in time, as it might be used in the future. + */ + if (queue_id < dev->num_rx_queues) + dev->_rx[queue_id].umem = NULL; + if (queue_id < dev->num_tx_queues) + dev->_tx[queue_id].umem = NULL; } int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, - u32 queue_id, u16 flags) + u16 queue_id, u16 flags) { bool force_zc, force_copy; struct netdev_bpf bpf; - int err; + int err = 0; force_zc = flags & XDP_ZEROCOPY; force_copy = flags & XDP_COPY; @@ -71,19 +91,23 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, if (force_zc && force_copy) return -EINVAL; - if (force_copy) - return 0; - - if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit) - return force_zc ? -EOPNOTSUPP : 0; /* fail or fallback */ - - bpf.command = XDP_QUERY_XSK_UMEM; - rtnl_lock(); - err = xdp_umem_query(dev, queue_id); - if (err) { - err = err < 0 ? -EOPNOTSUPP : -EBUSY; - goto err_rtnl_unlock; + if (xdp_get_umem_from_qid(dev, queue_id)) { + err = -EBUSY; + goto out_rtnl_unlock; + } + + xdp_reg_umem_at_qid(dev, umem, queue_id); + umem->dev = dev; + umem->queue_id = queue_id; + if (force_copy) + /* For copy-mode, we are done. 
*/ + goto out_rtnl_unlock; + + if (!dev->netdev_ops->ndo_bpf || + !dev->netdev_ops->ndo_xsk_async_xmit) { + err = -EOPNOTSUPP; + goto err_unreg_umem; } bpf.command = XDP_SETUP_XSK_UMEM; @@ -92,18 +116,20 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, err = dev->netdev_ops->ndo_bpf(dev, &bpf); if (err) - goto err_rtnl_unlock; + goto err_unreg_umem; rtnl_unlock(); dev_hold(dev); - umem->dev = dev; - umem->queue_id = queue_id; umem->zc = true; return 0; -err_rtnl_unlock: +err_unreg_umem: + xdp_clear_umem_at_qid(dev, queue_id); + if (!force_zc) + err = 0; /* fallback to copy mode */ +out_rtnl_unlock: rtnl_unlock(); - return force_zc ? err : 0; /* fail or fallback */ + return err; } static void xdp_umem_clear_dev(struct xdp_umem *umem) @@ -111,7 +137,7 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) struct netdev_bpf bpf; int err; - if (umem->dev) { + if (umem->zc) { bpf.command = XDP_SETUP_XSK_UMEM; bpf.xsk.umem = NULL; bpf.xsk.queue_id = umem->queue_id; @@ -122,9 +148,17 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) if (err) WARN(1, "failed to disable umem!\n"); + } + if (umem->dev) { + rtnl_lock(); + xdp_clear_umem_at_qid(umem->dev, umem->queue_id); + rtnl_unlock(); + } + + if (umem->zc) { dev_put(umem->dev); - umem->dev = NULL; + umem->zc = false; } } diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index c8be1ad3eb88..27603227601b 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -9,7 +9,7 @@ #include int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, - u32 queue_id, u16 flags); + u16 queue_id, u16 flags); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index d5a9c43930de..b43da805df28 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -408,13 +408,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } qid = sxdp->sxdp_queue_id; - - if ((xs->rx && qid >= dev->real_num_rx_queues) || - (xs->tx && qid >= dev->real_num_tx_queues)) { - err = -EINVAL; - goto out_unlock; - } - flags = sxdp->sxdp_flags; if (flags & XDP_SHARED_UMEM) { From 1dfd6f37cffc505baec10ff2f8ee84becffb62fa Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 7 Jun 2019 20:27:32 +0300 Subject: [PATCH 1317/1640] UPSTREAM: xdp: check device pointer before clearing We should not call 'ndo_bpf()' or 'dev_put()' with NULL argument. 
Fixes: c9b47cc1fabc ("xsk: fix bug when trying to use both copy and zero-copy on one queue id") Change-Id: Ie1ad7967be0679390b73d14578d9c48160dd21ac Signed-off-by: Ilya Maximets Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index d48e29645045..9d9159fc99fe 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -137,6 +137,9 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) struct netdev_bpf bpf; int err; + if (!umem->dev) + return; + if (umem->zc) { bpf.command = XDP_SETUP_XSK_UMEM; bpf.xsk.umem = NULL; @@ -150,11 +153,9 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) WARN(1, "failed to disable umem!\n"); } - if (umem->dev) { - rtnl_lock(); - xdp_clear_umem_at_qid(umem->dev, umem->queue_id); - rtnl_unlock(); - } + rtnl_lock(); + xdp_clear_umem_at_qid(umem->dev, umem->queue_id); + rtnl_unlock(); if (umem->zc) { dev_put(umem->dev); From b9108883b60b2ab01a5b7659dd86ff664fc89b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 24 Jan 2019 19:59:37 +0100 Subject: [PATCH 1318/1640] UPSTREAM: net: xsk: track AF_XDP sockets on a per-netns list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track each AF_XDP socket in a per-netns list. This will be used later by the sock_diag interface for querying sockets from userspace. Change-Id: I320755cc56130becf6f253cf307a6d8173a45935 Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/net_namespace.h | 4 ++++ include/net/netns/xdp.h | 13 +++++++++++++ net/xdp/xsk.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) create mode 100644 include/net/netns/xdp.h diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 276bd1c3e0c7..be308b7fa615 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -152,6 +153,9 @@ struct net { #endif #if IS_ENABLED(CONFIG_CAN) struct netns_can can; +#endif +#ifdef CONFIG_XDP_SOCKETS + struct netns_xdp xdp; #endif struct sock *diag_nlsk; atomic_t fnhe_genid; diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h new file mode 100644 index 000000000000..e5734261ba0a --- /dev/null +++ b/include/net/netns/xdp.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NETNS_XDP_H__ +#define __NETNS_XDP_H__ + +#include +#include + +struct netns_xdp { + struct mutex lock; + struct hlist_head list; +}; + +#endif /* __NETNS_XDP_H__ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b43da805df28..df6338e06d55 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -333,6 +333,10 @@ static int xsk_release(struct socket *sock) net = sock_net(sk); + mutex_lock(&net->xdp.lock); + sk_del_node_init_rcu(sk); + mutex_unlock(&net->xdp.lock); + local_bh_disable(); sock_prot_inuse_add(net, sk->sk_prot, -1); local_bh_enable(); @@ -749,6 +753,10 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, mutex_init(&xs->mutex); spin_lock_init(&xs->tx_completion_lock); + mutex_lock(&net->xdp.lock); + sk_add_node_rcu(sk, &net->xdp.list); + mutex_unlock(&net->xdp.lock); + local_bh_disable(); sock_prot_inuse_add(net, &xsk_proto, 1); local_bh_enable(); @@ -762,6 +770,23 @@ static const struct net_proto_family xsk_family_ops = { .owner = THIS_MODULE, }; +static int __net_init xsk_net_init(struct net *net) +{ + 
mutex_init(&net->xdp.lock); + INIT_HLIST_HEAD(&net->xdp.list); + return 0; +} + +static void __net_exit xsk_net_exit(struct net *net) +{ + WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); +} + +static struct pernet_operations xsk_net_ops = { + .init = xsk_net_init, + .exit = xsk_net_exit, +}; + static int __init xsk_init(void) { int err; @@ -774,8 +799,13 @@ static int __init xsk_init(void) if (err) goto out_proto; + err = register_pernet_subsys(&xsk_net_ops); + if (err) + goto out_sk; return 0; +out_sk: + sock_unregister(PF_XDP); out_proto: proto_unregister(&xsk_proto); out: From f4fa55ec6a223773803e400c6d111ba51e9dca6d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 28 Jun 2019 11:04:07 +0300 Subject: [PATCH 1319/1640] UPSTREAM: xdp: fix hang while unregistering device bound to xdp socket A device bound to an XDP socket will not reach a zero refcount until the userspace application closes it. This leads to a hang inside 'netdev_wait_allrefs()' if unregistering of the device is requested: # ip link del p1 < hang on recvmsg on netlink socket > # ps -x | grep ip 5126 pts/0 D+ 0:00 ip link del p1 # journalctl -b Jun 05 07:19:16 kernel: unregister_netdevice: waiting for p1 to become free. Usage count = 1 Jun 05 07:19:27 kernel: unregister_netdevice: waiting for p1 to become free. Usage count = 1 ... Fix that by implementing a NETDEV_UNREGISTER event notification handler to properly clean up all the resources and unref the device. This should also allow socket killing via the ss(8) utility. Fixes: 965a99098443 ("xsk: add support for bind for Rx") Change-Id: I4a3f20a82a3e936f4567949c92793a1cdc8f7dcc Signed-off-by: Ilya Maximets Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 5 +++ net/xdp/xdp_umem.c | 10 ++--- net/xdp/xdp_umem.h | 1 + net/xdp/xsk.c | 87 ++++++++++++++++++++++++++++++++++++------ 4 files changed, 87 insertions(+), 16 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index bca2e412e278..889c43138b09 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -62,6 +62,11 @@ struct xdp_sock { struct list_head flush_node; u16 queue_id; bool zc; + enum { + XSK_READY = 0, + XSK_BOUND, + XSK_UNBOUND, + } state; /* Protects multiple processes in the control path */ struct mutex mutex; struct xsk_queue *tx ____cacheline_aligned_in_smp; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 9d9159fc99fe..e183f8c1ba34 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -132,11 +132,13 @@ out_rtnl_unlock: return err; } -static void xdp_umem_clear_dev(struct xdp_umem *umem) +void xdp_umem_clear_dev(struct xdp_umem *umem) { struct netdev_bpf bpf; int err; + ASSERT_RTNL(); + if (!umem->dev) return; @@ -145,17 +147,13 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) bpf.xsk.umem = NULL; bpf.xsk.queue_id = umem->queue_id; - rtnl_lock(); err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf); - rtnl_unlock(); if (err) WARN(1, "failed to disable umem!\n"); } - rtnl_lock(); xdp_clear_umem_at_qid(umem->dev, umem->queue_id); - rtnl_unlock(); if (umem->zc) { dev_put(umem->dev); @@ -188,7 +186,9 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem) static void xdp_umem_release(struct xdp_umem *umem) { + rtnl_lock(); xdp_umem_clear_dev(umem); + rtnl_unlock(); if (umem->fq) { xskq_destroy(umem->fq); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 27603227601b..a63a9fb251f5 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -10,6 +10,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device
*dev, u16 queue_id, u16 flags); +void xdp_umem_clear_dev(struct xdp_umem *umem); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index df6338e06d55..4b26cbd87462 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -322,6 +322,22 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue, return 0; } +static void xsk_unbind_dev(struct xdp_sock *xs) +{ + struct net_device *dev = xs->dev; + + if (!dev || xs->state != XSK_BOUND) + return; + + xs->state = XSK_UNBOUND; + + /* Wait for driver to stop using the xdp socket. */ + xdp_del_sk_umem(xs->umem, xs); + xs->dev = NULL; + synchronize_net(); + dev_put(dev); +} + static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -341,15 +357,7 @@ static int xsk_release(struct socket *sock) sock_prot_inuse_add(net, sk->sk_prot, -1); local_bh_enable(); - if (xs->dev) { - struct net_device *dev = xs->dev; - - /* Wait for driver to stop using the xdp socket. */ - xdp_del_sk_umem(xs->umem, xs); - xs->dev = NULL; - synchronize_net(); - dev_put(dev); - } + xsk_unbind_dev(xs); xskq_destroy(xs->rx); xskq_destroy(xs->tx); @@ -395,7 +403,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return -EINVAL; mutex_lock(&xs->mutex); - if (xs->dev) { + if (xs->state != XSK_READY) { err = -EBUSY; goto out_release; } @@ -474,6 +482,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) out_unlock: if (err) dev_put(dev); + else + xs->state = XSK_BOUND; out_release: mutex_unlock(&xs->mutex); return err; @@ -502,6 +512,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); + if (xs->state != XSK_READY) { + mutex_unlock(&xs->mutex); + return -EBUSY; + } q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; err = xsk_init_queue(entries, q, false); mutex_unlock(&xs->mutex); @@ -518,7 +532,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); - if (xs->umem) { + if (xs->state != XSK_READY || xs->umem) { mutex_unlock(&xs->mutex); return -EBUSY; } @@ -547,6 +561,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; mutex_lock(&xs->mutex); + if (xs->state != XSK_READY) { + mutex_unlock(&xs->mutex); + return -EBUSY; + } if (!xs->umem) { mutex_unlock(&xs->mutex); return -EINVAL; @@ -648,6 +666,9 @@ static int xsk_mmap(struct file *file, struct socket *sock, unsigned long pfn; struct page *qpg; + if (xs->state != XSK_READY) + return -EBUSY; + if (offset == XDP_PGOFF_RX_RING) { q = READ_ONCE(xs->rx); } else if (offset == XDP_PGOFF_TX_RING) { @@ -679,6 +700,38 @@ static int xsk_mmap(struct file *file, struct socket *sock, size, vma->vm_page_prot); } +static int xsk_notifier(struct notifier_block *this, + unsigned long msg, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct sock *sk; + + switch (msg) { + case NETDEV_UNREGISTER: + mutex_lock(&net->xdp.lock); + sk_for_each(sk, &net->xdp.list) { + struct xdp_sock *xs = xdp_sk(sk); + + mutex_lock(&xs->mutex); + if (xs->dev == dev) { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + xsk_unbind_dev(xs); + + /* Clear device references in umem. 
*/ + xdp_umem_clear_dev(xs->umem); + } + mutex_unlock(&xs->mutex); + } + mutex_unlock(&net->xdp.lock); + break; + } + return NOTIFY_DONE; +} + static struct proto xsk_proto = { .name = "XDP", .owner = THIS_MODULE, @@ -750,6 +803,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, sock_set_flag(sk, SOCK_RCU_FREE); xs = xdp_sk(sk); + xs->state = XSK_READY; mutex_init(&xs->mutex); spin_lock_init(&xs->tx_completion_lock); @@ -770,6 +824,10 @@ static const struct net_proto_family xsk_family_ops = { .owner = THIS_MODULE, }; +static struct notifier_block xsk_netdev_notifier = { + .notifier_call = xsk_notifier, +}; + static int __net_init xsk_net_init(struct net *net) { mutex_init(&net->xdp.lock); @@ -802,8 +860,15 @@ static int __init xsk_init(void) err = register_pernet_subsys(&xsk_net_ops); if (err) goto out_sk; + + err = register_netdevice_notifier(&xsk_netdev_notifier); + if (err) + goto out_pernet; + return 0; +out_pernet: + unregister_pernet_subsys(&xsk_net_ops); out_sk: sock_unregister(PF_XDP); out_proto: From f8e1d88e6336ac74c82546c8ad723e788692c88e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 15 Aug 2019 11:30:13 +0200 Subject: [PATCH 1320/1640] UPSTREAM: xsk: remove AF_XDP socket from map when the socket is released MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an AF_XDP socket is released/closed, the XSKMAP still holds a reference to the socket in a "released" state. The socket will still use the netdev queue resource, and block newly created sockets from attaching to that queue, but no user application can access the fill/complete/rx/tx queues. As a result, all applications need to explicitly clear the map entry from the old "zombie state" socket. This should be done automatically. In this patch, the socket tracks, and holds a reference to, the maps it resides in. When the socket is released, it will remove itself from all maps. Suggested-by: Bruce Richardson Change-Id: Ia29bf4d4f238790383fb9bc11250936f1fcd7bee Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 18 ++++++ kernel/bpf/xskmap.c | 125 ++++++++++++++++++++++++++++++++++------- net/xdp/xsk.c | 50 +++++++++++++++++ 3 files changed, 173 insertions(+), 20 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 889c43138b09..5cbd10b3b931 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -53,6 +53,16 @@ struct xdp_umem { struct list_head xsk_list; }; +/* Nodes are linked in the struct xdp_sock map_list field, and used to + * track which maps a certain socket reside in.
+ */ +struct xsk_map; +struct xsk_map_node { + struct list_head node; + struct xsk_map *map; + struct xdp_sock **map_entry; +}; + struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; @@ -76,6 +86,9 @@ struct xdp_sock { */ spinlock_t tx_completion_lock; u64 rx_dropped; + struct list_head map_list; + /* Protects map_list */ + spinlock_t map_list_lock; }; struct xdp_buff; @@ -95,6 +108,11 @@ struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, struct xdp_umem_fq_reuse *newq); void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock **map_entry); +int xsk_map_inc(struct xsk_map *map); +void xsk_map_put(struct xsk_map *map); + static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 9bb96ace9fa1..16031d489173 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -13,8 +13,71 @@ struct xsk_map { struct bpf_map map; struct xdp_sock **xsk_map; struct list_head __percpu *flush_list; + spinlock_t lock; /* Synchronize map updates */ }; +int xsk_map_inc(struct xsk_map *map) +{ + struct bpf_map *m = &map->map; + + m = bpf_map_inc(m, false); + return IS_ERR(m) ? PTR_ERR(m) : 0; +} + +void xsk_map_put(struct xsk_map *map) +{ + bpf_map_put(&map->map); +} + +static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *node; + int err; + + node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return NULL; + + err = xsk_map_inc(map); + if (err) { + kfree(node); + return ERR_PTR(err); + } + + node->map = map; + node->map_entry = map_entry; + return node; +} + +static void xsk_map_node_free(struct xsk_map_node *node) +{ + xsk_map_put(node->map); + kfree(node); +} + +static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) +{ + spin_lock_bh(&xs->map_list_lock); + list_add_tail(&node->node, &xs->map_list); + spin_unlock_bh(&xs->map_list_lock); +} + +static void xsk_map_sock_delete(struct xdp_sock *xs, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *n, *tmp; + + spin_lock_bh(&xs->map_list_lock); + list_for_each_entry_safe(n, tmp, &xs->map_list, node) { + if (map_entry == n->map_entry) { + list_del(&n->node); + xsk_map_node_free(n); + } + } + spin_unlock_bh(&xs->map_list_lock); +} + static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct xsk_map *m; @@ -34,6 +97,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&m->map, attr); + spin_lock_init(&m->lock); cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); cost += sizeof(struct list_head) * num_possible_cpus(); @@ -71,21 +135,9 @@ free_m: static void xsk_map_free(struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); - int i; bpf_clear_redirect_map(map); synchronize_net(); - - for (i = 0; i < map->max_entries; i++) { - struct xdp_sock *xs; - - xs = m->xsk_map[i]; - if (!xs) - continue; - - sock_put((struct sock *)xs); - } - free_percpu(m->flush_list); bpf_map_area_free(m->xsk_map); kfree(m); @@ -164,8 +216,9 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs, *old_xs, **map_entry; u32 i = *(u32 *)key, fd = *(u32 *)value; - struct 
xdp_sock *xs, *old_xs; + struct xsk_map_node *node; struct socket *sock; int err; @@ -192,32 +245,64 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, return -EOPNOTSUPP; } - sock_hold(sock->sk); + map_entry = &m->xsk_map[i]; + node = xsk_map_node_alloc(m, map_entry); + if (IS_ERR(node)) { + sockfd_put(sock); + return PTR_ERR(node); + } - old_xs = xchg(&m->xsk_map[i], xs); + spin_lock_bh(&m->lock); + old_xs = READ_ONCE(*map_entry); + if (old_xs == xs) { + err = 0; + goto out; + } + xsk_map_sock_add(xs, node); + WRITE_ONCE(*map_entry, xs); if (old_xs) - sock_put((struct sock *)old_xs); - + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); sockfd_put(sock); return 0; + +out: + spin_unlock_bh(&m->lock); + sockfd_put(sock); + xsk_map_node_free(node); + return err; } static int xsk_map_delete_elem(struct bpf_map *map, void *key) { struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs; + struct xdp_sock *old_xs, **map_entry; int k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; - old_xs = xchg(&m->xsk_map[k], NULL); + spin_lock_bh(&m->lock); + map_entry = &m->xsk_map[k]; + old_xs = xchg(map_entry, NULL); if (old_xs) - sock_put((struct sock *)old_xs); + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); return 0; } +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock **map_entry) +{ + spin_lock_bh(&map->lock); + if (READ_ONCE(*map_entry) == xs) { + WRITE_ONCE(*map_entry, NULL); + xsk_map_sock_delete(xs, map_entry); + } + spin_unlock_bh(&map->lock); +} + const struct bpf_map_ops xsk_map_ops = { .map_alloc = xsk_map_alloc, .map_free = xsk_map_free, diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4b26cbd87462..11ac829532c2 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -338,6 +338,52 @@ static void xsk_unbind_dev(struct xdp_sock *xs) dev_put(dev); } +static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, + struct xdp_sock ***map_entry) +{ + struct xsk_map *map = NULL; + struct xsk_map_node *node; + + *map_entry = NULL; + + spin_lock_bh(&xs->map_list_lock); + node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, + node); + if (node) { + WARN_ON(xsk_map_inc(node->map)); + map = node->map; + *map_entry = node->map_entry; + } + spin_unlock_bh(&xs->map_list_lock); + return map; +} + +static void xsk_delete_from_maps(struct xdp_sock *xs) +{ + /* This function removes the current XDP socket from all the + * maps it resides in. We need to take extra care here, due to + * the two locks involved. Each map has a lock synchronizing + * updates to the entries, and each socket has a lock that + * synchronizes access to the list of maps (map_list). For + * deadlock avoidance the locks need to be taken in the order + * "map lock"->"socket map list lock". We start off by + * accessing the socket map list, and take a reference to the + * map to guarantee existence between the + * xsk_get_map_list_entry() and xsk_map_try_sock_delete() + * calls. Then we ask the map to remove the socket, which + * tries to remove the socket from the map. Note that there + * might be updates to the map between + * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). 
+ */ + struct xdp_sock **map_entry = NULL; + struct xsk_map *map; + + while ((map = xsk_get_map_list_entry(xs, &map_entry))) { + xsk_map_try_sock_delete(map, xs, map_entry); + xsk_map_put(map); + } +} + static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -357,6 +403,7 @@ static int xsk_release(struct socket *sock) sock_prot_inuse_add(net, sk->sk_prot, -1); local_bh_enable(); + xsk_delete_from_maps(xs); xsk_unbind_dev(xs); xskq_destroy(xs->rx); @@ -807,6 +854,9 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, mutex_init(&xs->mutex); spin_lock_init(&xs->tx_completion_lock); + INIT_LIST_HEAD(&xs->map_list); + spin_lock_init(&xs->map_list_lock); + mutex_lock(&net->xdp.lock); sk_add_node_rcu(sk, &net->xdp.list); mutex_unlock(&net->xdp.lock); From 692eec818f2e9786bbda1f3771d77609af6800eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 15 Aug 2019 11:30:14 +0200 Subject: [PATCH 1321/1640] UPSTREAM: xsk: support BPF_EXIST and BPF_NOEXIST flags in XSKMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XSKMAP did not honor the BPF_EXIST/BPF_NOEXIST flags when updating an entry. This patch addresses that. Change-Id: I405bec641507812949781a95965821abc3aa50d5 Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 16031d489173..4cc28e226398 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -226,8 +226,6 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, return -EINVAL; if (unlikely(i >= m->map.max_entries)) return -E2BIG; - if (unlikely(map_flags == BPF_NOEXIST)) - return -EEXIST; sock = sockfd_lookup(fd, &err); if (!sock) @@ -257,6 +255,12 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, if (old_xs == xs) { err = 0; goto out; + } else if (old_xs && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto out; + } else if (!old_xs && map_flags == BPF_EXIST) { + err = -ENOENT; + goto out; } xsk_map_sock_add(xs, node); WRITE_ONCE(*map_entry, xs); From 4805e65d0e76574eb3dcd639a6469db99c2d2973 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 20 Aug 2019 01:36:52 +0000 Subject: [PATCH 1322/1640] UPSTREAM: bpf: Use PTR_ERR_OR_ZERO in xsk_map_inc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR. Change-Id: Ic6bce529eb778a4724219c63122da3f9d9f900bf Signed-off-by: YueHaibing Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 4cc28e226398..942c662e2eed 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -21,7 +21,7 @@ int xsk_map_inc(struct xsk_map *map) struct bpf_map *m = &map->map; m = bpf_map_inc(m, false); - return IS_ERR(m) ? PTR_ERR(m) : 0; + return PTR_ERR_OR_ZERO(m); } void xsk_map_put(struct xsk_map *map) From 1d8ef5c833fb8abed3e3d96e33298607aee51a10 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 20 Aug 2019 14:53:46 +0100 Subject: [PATCH 1323/1640] UPSTREAM: bpf: add BTF ids in procfs for file descriptors to BTF objects Implement the show_fdinfo hook for BTF FDs file operations, and make it print the id of the BTF object. 
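For illustration, the new field can be read back from userspace roughly as follows; btf_id_from_fd() is a hypothetical helper that assumes a BTF file descriptor obtained elsewhere (e.g. via BPF_BTF_GET_FD_BY_ID), with error handling trimmed:

#include <stdio.h>

static int btf_id_from_fd(int btf_fd)
{
        char path[64], line[128];
        int id = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", btf_fd);
        f = fopen(path, "r");
        if (!f)
                return -1;
        while (fgets(line, sizeof(line), f))
                if (sscanf(line, "btf_id: %d", &id) == 1) /* "btf_id:\t%u\n" */
                        break;
        fclose(f);
        return id;
}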
This allows for a quick retrieval of the BTF id from its FD; or it can help understanding what type of object (BTF) the file descriptor points to. v2: - Do not expose data_size, only btf_id, in FD info. Change-Id: I6bbb7904dfa7e51aa530d7061bb8e8c7c0bc0ace Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index cef69ea85bf1..999698827bcf 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3376,6 +3376,15 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); } +#ifdef CONFIG_PROC_FS +static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct btf *btf = filp->private_data; + + seq_printf(m, "btf_id:\t%u\n", btf->id); +} +#endif + static int btf_release(struct inode *inode, struct file *filp) { btf_put(filp->private_data); @@ -3383,6 +3392,9 @@ static int btf_release(struct inode *inode, struct file *filp) } const struct file_operations btf_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_btf_show_fdinfo, +#endif .release = btf_release, }; From 5d3bcd570c9227b51a6192aa84ce25e649c0c483 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 20 Aug 2019 10:31:50 +0100 Subject: [PATCH 1324/1640] UPSTREAM: bpf: add new BPF_BTF_GET_NEXT_ID syscall command Add a new command for the bpf() system call: BPF_BTF_GET_NEXT_ID is used to cycle through all BTF objects loaded on the system. The motivation is to be able to inspect (list) all BTF objects presents on the system. Change-Id: I9b766d4c70048ff2f6c910f61820cb85f874dfdd Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 4 ++-- kernel/bpf/syscall.c | 4 ++++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5ca2ecca84ba..73189b5519c4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -27,6 +27,9 @@ struct seq_file; struct btf; struct btf_type; +extern struct idr btf_idr; +extern spinlock_t btf_idr_lock; + /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4a4ab5e0b46a..79afe576eccb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -106,6 +106,7 @@ enum bpf_cmd { BPF_TASK_FD_QUERY, BPF_MAP_LOOKUP_AND_DELETE_ELEM, BPF_MAP_FREEZE, + BPF_BTF_GET_NEXT_ID, }; enum bpf_map_type { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 999698827bcf..0eb18f0cf28a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -195,8 +195,8 @@ i < btf_type_vlen(struct_type); \ i++, member++) -static DEFINE_IDR(btf_idr); -static DEFINE_SPINLOCK(btf_idr_lock); +DEFINE_IDR(btf_idr); +DEFINE_SPINLOCK(btf_idr_lock); struct btf { void *data; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5ecdef31f5da..f184c0655644 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2899,6 +2899,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &map_idr, &map_idr_lock); break; + case BPF_BTF_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &btf_idr, &btf_idr_lock); + break; case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); 
break; From 47b6c0f9336847576b87f18116e3757702e52be9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 21 Aug 2019 14:07:10 -0700 Subject: [PATCH 1325/1640] UPSTREAM: bpf: fix precision tracking in presence of bpf2bpf calls While adding extra tests for precision tracking and extra infra to adjust verifier heuristics the existing test "calls: cross frame pruning - liveness propagation" started to fail. The root cause is the same as described in verifer.c comment: * Also if parent's curframe > frame where backtracking started, * the verifier need to mark registers in both frames, otherwise callees * may incorrectly prune callers. This is similar to * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") * For now backtracking falls back into conservative marking. Turned out though that returning -ENOTSUPP from backtrack_insn() and doing mark_all_scalars_precise() in the current parentage chain is not enough. Depending on how is_state_visited() heuristic is creating parentage chain it's possible that callee will incorrectly prune caller. Fix the issue by setting precise=true earlier and more aggressively. Before this fix the precision tracking _within_ functions that don't do bpf2bpf calls would still work. Whereas now precision tracking is completely disabled when bpf2bpf calls are present anywhere in the program. No difference in cilium tests (they don't have bpf2bpf calls). No difference in test_progs though some of them have bpf2bpf calls, but precision tracking wasn't effective there. Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Change-Id: I35ba067de27b668af3b489cde91111ad8a15dbea Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0f43f1245237..1d2b7fada296 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -984,9 +984,6 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) reg->smax_value = S64_MAX; reg->umin_value = 0; reg->umax_value = U64_MAX; - - /* constant backtracking is enabled for root only for now */ - reg->precise = capable(CAP_SYS_ADMIN) ? false : true; } /* Mark a register as having a completely unknown (scalar) value. */ @@ -1013,7 +1010,11 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, __mark_reg_not_init(regs + regno); return; } - __mark_reg_unknown(regs + regno); + regs += regno; + __mark_reg_unknown(regs); + /* constant backtracking is enabled for root without bpf2bpf calls */ + regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? + true : false; } static void __mark_reg_not_init(struct bpf_reg_state *reg) From d519f712cf45dfd81516b7942a2ae934148293d8 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 22 Aug 2019 00:53:58 +0530 Subject: [PATCH 1326/1640] UPSTREAM: bpf: handle 32-bit zext during constant blinding Since BPF constant blinding is performed after the verifier pass, the ALU32 instructions inserted for doubleword immediate loads don't have a corresponding zext instruction. This is causing a kernel oops on powerpc and can be reproduced by running 'test_cgroup_storage' with bpf_jit_harden=2. Fix this by emitting BPF_ZEXT during constant blinding if prog->aux->verifier_zext is set. Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result") Reported-by: Michael Ellerman Change-Id: Ic30fd43232e4a67d43a3ad66168e172e9096221c Signed-off-by: Naveen N. 
Rao Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a66464954649..b801454680f7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -916,7 +916,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, - struct bpf_insn *to_buff) + struct bpf_insn *to_buff, + bool emit_zext) { struct bpf_insn *to = to_buff; u32 imm_rnd = get_random_int(); @@ -1028,6 +1029,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + if (emit_zext) + *to++ = BPF_ZEXT_REG(BPF_REG_AX); *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); break; @@ -1111,7 +1114,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) insn[1].code == 0) memcpy(aux, insn, sizeof(aux)); - rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); + rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, + clone->aux->verifier_zext); if (!rewritten) continue; From ea8f084316128c03812d880e093540eaf026aa73 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 22 Aug 2019 22:52:12 -0700 Subject: [PATCH 1327/1640] UPSTREAM: bpf: introduce verifier internal test flag Introduce BPF_F_TEST_STATE_FREQ flag to stress test parentage chain and state pruning. Change-Id: I89a8f21d4436a181045b56779a2213bd8b5d071b Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + include/uapi/linux/bpf.h | 3 +++ kernel/bpf/syscall.c | 1 + kernel/bpf/verifier.c | 5 ++++- 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e462b2420a44..dbb95b500518 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -366,6 +366,7 @@ struct bpf_verifier_env { struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ + bool test_state_freq; /* test verifier with different pruning frequency */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79afe576eccb..640ddc081bc6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -285,6 +285,9 @@ enum bpf_attach_type { */ #define BPF_F_TEST_RND_HI32 (1U << 2) +/* The verifier internal test flag. 
Behavior is undefined */ +#define BPF_F_TEST_STATE_FREQ (1U << 3) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f184c0655644..3732a7c5a77a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1630,6 +1630,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | + BPF_F_TEST_STATE_FREQ | BPF_F_TEST_RND_HI32)) return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1d2b7fada296..e41c650fd8a6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7380,7 +7380,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - bool add_new_state = false; + bool add_new_state = env->test_state_freq ? true : false; cur->last_insn_idx = env->prev_insn_idx; if (!env->insn_aux_data[insn_idx].prune_point) @@ -9426,6 +9426,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->allow_ptr_leaks = is_priv; + if (is_priv) + env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; From d8b9399789ae50a156da402f2fae8b1a37356dc7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Sep 2019 15:16:17 -0700 Subject: [PATCH 1328/1640] UPSTREAM: bpf: fix precision tracking of stack slots The problem can be seen in the following two tests: 0: (bf) r3 = r10 1: (55) if r3 != 0x7b goto pc+0 2: (7a) *(u64 *)(r3 -8) = 0 3: (79) r4 = *(u64 *)(r10 -8) .. 0: (85) call bpf_get_prandom_u32#7 1: (bf) r3 = r10 2: (55) if r3 != 0x7b goto pc+0 3: (7b) *(u64 *)(r3 -8) = r0 4: (79) r4 = *(u64 *)(r10 -8) When backtracking need to mark R4 it will mark slot fp-8. But ST or STX into fp-8 could belong to the same block of instructions. When backtracing is done the parent state may have fp-8 slot as "unallocated stack". Which will cause verifier to warn and incorrectly reject such programs. Writes into stack via non-R10 register are rare. llvm always generates canonical stack spill/fill. For such pathological case fall back to conservative precision tracking instead of rejecting. Reported-by: syzbot+c8d66267fd2b5955287e@syzkaller.appspotmail.com Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Change-Id: Ib6760a10d327c38cea22e2474011b17e9e0e10a0 Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e41c650fd8a6..39396f156f98 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1771,16 +1771,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bitmap_from_u64(mask, stack_mask); for_each_set_bit(i, mask, 64) { if (i >= func->allocated_stack / BPF_REG_SIZE) { - /* This can happen if backtracking - * is propagating stack precision where - * caller has larger stack frame - * than callee, but backtrack_insn() should - * have returned -ENOTSUPP. + /* the sequence of instructions: + * 2: (bf) r3 = r10 + * 3: (7b) *(u64 *)(r3 -8) = r0 + * 4: (79) r4 = *(u64 *)(r10 -8) + * doesn't contain jmps. It's backtracked + * as a single block. 
+ * During backtracking insn 3 is not recognized as + * stack access, so at the end of backtracking + * stack slot fp-8 is still marked in stack_mask. + * However the parent state may not have accessed + * fp-8 and it's "unallocated" stack space. + * In such case fallback to conservative. */ - verbose(env, "BUG spi %d stack_size %d\n", - i, func->allocated_stack); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; + mark_all_scalars_precise(env, st); + return 0; } if (func->stack[i].slot_type[0] != STACK_SPILL) { From d1d05e1b3b258bfebf1751852a9e18ec8a2c722a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Sun, 8 Sep 2019 09:20:16 +0100 Subject: [PATCH 1329/1640] UPSTREAM: xdp: Fix race in dev_map_hash_update_elem() when replacing element MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzbot found a crash in dev_map_hash_update_elem(), when replacing an element with a new one. Jesper correctly identified the cause of the crash as a race condition between the initial lookup in the map (which is done before taking the lock), and the removal of the old element. Rather than just add a second lookup into the hashmap after taking the lock, fix this by reworking the function logic to take the lock before the initial lookup. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-and-tested-by: syzbot+4e7a85b1432052e8d6f8@syzkaller.appspotmail.com Change-Id: I4147dd30f9972d9f99511c6b86530c69956fdcd4 Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 9af048a932b5..d27f3b60ff6d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -650,19 +650,22 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, u32 ifindex = *(u32 *)value; u32 idx = *(u32 *)key; unsigned long flags; + int err = -EEXIST; if (unlikely(map_flags > BPF_EXIST || !ifindex)) return -EINVAL; + spin_lock_irqsave(&dtab->index_lock, flags); + old_dev = __dev_map_hash_lookup_elem(map, idx); if (old_dev && (map_flags & BPF_NOEXIST)) - return -EEXIST; + goto out_err; dev = __dev_map_alloc_node(net, dtab, ifindex, idx); - if (IS_ERR(dev)) - return PTR_ERR(dev); - - spin_lock_irqsave(&dtab->index_lock, flags); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out_err; + } if (old_dev) { hlist_del_rcu(&old_dev->index_hlist); @@ -683,6 +686,10 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; + +out_err: + spin_unlock_irqrestore(&dtab->index_lock, flags); + return err; } static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, From 54b32e231fee2ec669bb4d71d5d460cdc053fa41 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 16 Aug 2019 12:53:00 +0200 Subject: [PATCH 1330/1640] UPSTREAM: bpf: fix accessing bpf_sysctl.file_pos on s390 "ctx:file_pos sysctl:read write ok" fails on s390 with "Read value != nux". This is because verifier rewrites a complete 32-bit bpf_sysctl.file_pos update to a partial update of the first 32 bits of 64-bit *bpf_sysctl_kern.ppos, which is not correct on big-endian systems. Fix by using an offset on big-endian systems. Ditto for bpf_sysctl.file_pos reads. 
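To see why the offset is needed, consider this standalone demo (hypothetical userspace code, not the kernel fix itself): a 4-byte store at byte offset 0 of an 8-byte field lands on the most significant half on big-endian machines, so the store has to be placed at size_default - (access_off + size) instead:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        uint64_t file_pos = 0; /* plays the role of the 64-bit *ppos */
        uint32_t val = 3;      /* the narrow 32-bit access */
        int big_endian = *(const unsigned char *)&(uint16_t){ 1 } == 0;
        /* bpf_ctx_narrow_access_offset(0, 4, 8): 0 on LE, 8 - (0 + 4) = 4 on BE */
        size_t off = big_endian ? sizeof(file_pos) - sizeof(val) : 0;

        memcpy((char *)&file_pos + off, &val, sizeof(val));
        printf("file_pos = %llu\n", (unsigned long long)file_pos); /* 3 either way */
        return 0;
}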
Currently the test does not detect a problem there, since it expects to see 0, which it gets with high probability in error cases, so change it to seek to offset 3 and expect 3 in bpf_sysctl.file_pos. Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx") Change-Id: Ic856da03ff633f0e18310365c879de8812c34ed2 Signed-off-by: Ilya Leoshkevich Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20190816105300.49035-1-iii@linux.ibm.com/ --- include/linux/filter.h | 8 ++++---- kernel/bpf/cgroup.c | 10 ++++++++-- kernel/bpf/verifier.c | 4 ++-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index d822a1884e30..6dbc9794939a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -837,14 +837,14 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) } static inline u8 -bpf_ctx_narrow_load_shift(u32 off, u32 size, u32 size_default) +bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default) { - u8 load_off = off & (size_default - 1); + u8 access_off = off & (size_default - 1); #ifdef __LITTLE_ENDIAN - return load_off * 8; + return access_off; #else - return (size_default - (load_off + size)) * 8; + return size_default - (access_off + size); #endif } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6a6a154cfa7b..ddd8addcdb5c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1334,6 +1334,7 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + u32 read_size; switch (si->off) { case offsetof(struct bpf_sysctl, write): @@ -1365,7 +1366,9 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, ppos)); *insn++ = BPF_STX_MEM( - BPF_SIZEOF(u32), treg, si->src_reg, 0); + BPF_SIZEOF(u32), treg, si->src_reg, + bpf_ctx_narrow_access_offset( + 0, sizeof(u32), sizeof(loff_t))); *insn++ = BPF_LDX_MEM( BPF_DW, treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, tmp_reg)); @@ -1374,8 +1377,11 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), si->dst_reg, si->src_reg, offsetof(struct bpf_sysctl_kern, ppos)); + read_size = bpf_size_to_bytes(BPF_SIZE(si->code)); *insn++ = BPF_LDX_MEM( - BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); + BPF_SIZE(si->code), si->dst_reg, si->dst_reg, + bpf_ctx_narrow_access_offset( + 0, read_size, sizeof(loff_t))); } *target_size = sizeof(u32); break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 39396f156f98..791f798dbd18 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8784,8 +8784,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { - u8 shift = bpf_ctx_narrow_load_shift(off, size, - size_default); + u8 shift = bpf_ctx_narrow_access_offset( + off, size, size_default) * 8; if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, From 938b233d2da4c242b687f45ba78afdc928dded46 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 17 Sep 2019 10:45:37 -0700 Subject: [PATCH 1331/1640] UPSTREAM: bpf: fix BTF verification of enums vmlinux BTF has enums that are 8 byte and 1 byte in size. 2 byte enum is a valid construct as well. Fix BTF enum verification to accept those sizes. 
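A minimal userspace model of the relaxed check (sketch only; the helper names are made up): any power-of-two size up to 8 bytes now passes, instead of exactly sizeof(int):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_power_of_2(uint32_t n)
{
        return n && !(n & (n - 1));
}

/* mirrors the new condition: reject if (t->size > 8 || !is_power_of_2(t->size)) */
static bool btf_enum_size_ok(uint32_t size)
{
        return size <= 8 && is_power_of_2(size); /* accepts 1, 2, 4 and 8 */
}

int main(void)
{
        for (uint32_t size = 0; size <= 9; size++)
                printf("size %u: %s\n", size,
                       btf_enum_size_ok(size) ? "ok" : "reject");
        return 0;
}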
Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Change-Id: I810d95de3b0f21554ab3c69bceebf3c99d7a095a Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0eb18f0cf28a..18e257060958 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2377,9 +2377,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (t->size != sizeof(int)) { - btf_verifier_log_type(env, t, "Expected size:%zu", - sizeof(int)); + if (t->size > 8 || !is_power_of_2(t->size)) { + btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; } From a0357a124ce7179b56382eae7bd5cd4c44ad7557 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Tue, 24 Sep 2019 09:25:21 -0700 Subject: [PATCH 1332/1640] UPSTREAM: bpf/xskmap: Return ERR_PTR for failure case instead of NULL. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When kzalloc() failed, NULL was returned to the caller, which tested the pointer with IS_ERR(), which didn't match, so the pointer was used later, resulting in a NULL dereference. Return ERR_PTR(-ENOMEM) instead of NULL. Reported-by: syzbot+491c1b7565ba9069ecae@syzkaller.appspotmail.com Fixes: 0402acd683c6 ("xsk: remove AF_XDP socket from map when the socket is released") Change-Id: I84cde3fa3a4af63f86866eb15824c48a0ed2732c Signed-off-by: Jonathan Lemon Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 942c662e2eed..82a1ffe15dfa 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -37,7 +37,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); if (!node) - return NULL; + return ERR_PTR(-ENOMEM); err = xsk_map_inc(map); if (err) { From 316e2ef928b5f126c6df1bf9cda70755bc276279 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 25 Sep 2019 10:38:35 +0100 Subject: [PATCH 1333/1640] UPSTREAM: bpf: Clean up indentation issue in BTF kflag processing There is a statement that is indented one level too deeply, remove the extraneous tab. Change-Id: I0f6d671e63e5c81829a4839b8c81bf51cb51becb Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20190925093835.19515-1-colin.king@canonical.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 18e257060958..a28bbec8c59f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2332,7 +2332,7 @@ static int btf_enum_check_kflag_member(struct btf_verifier_env *env, if (BITS_PER_BYTE_MASKED(struct_bits_off)) { btf_verifier_log_member(env, struct_type, member, "Member is not byte aligned"); - return -EINVAL; + return -EINVAL; } nr_bits = int_bitsize; From c6b24ec0a67452b8b868f7a6cc649b836cac1883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 17 Oct 2019 12:57:02 +0200 Subject: [PATCH 1334/1640] UPSTREAM: xdp: Prevent overflow in devmap_hash cost calculation for 32-bit builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tetsuo pointed out that without an explicit cast, the cost calculation for devmap_hash type maps could overflow on 32-bit builds. This adds the missing cast. 
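The arithmetic can be reproduced in a standalone demo (hypothetical values; struct hlist_head is a single pointer, so 4 bytes on a 32-bit build):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* model a 32-bit build, where both operands are 32 bits wide */
        uint32_t head_size = 4;          /* sizeof(struct hlist_head) on 32-bit */
        uint32_t n_buckets = 0x40000000; /* 2^30 buckets */
        uint64_t wrong = head_size * n_buckets;           /* 32-bit multiply wraps to 0 */
        uint64_t right = (uint64_t)head_size * n_buckets; /* 4 GiB, as intended */

        printf("without cast: %llu\nwith cast:    %llu\n",
               (unsigned long long)wrong, (unsigned long long)right);
        return 0;
}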
Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Tetsuo Handa Change-Id: I0f5d1f7135c6c28e2066a5361e3f02e3aa0e8c54 Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191017105702.2807093-1-toke@redhat.com --- kernel/bpf/devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d27f3b60ff6d..c0a48f336997 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -128,7 +128,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; - cost += sizeof(struct hlist_head) * dtab->n_buckets; + cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; } /* if map size is larger than memlock limit, reject it */ From 88b9281f41bb281431a2cc8ae92c2570601386a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Sat, 19 Oct 2019 13:19:31 +0200 Subject: [PATCH 1335/1640] UPSTREAM: xdp: Handle device unregister for devmap_hash map type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems I forgot to add handling of devmap_hash type maps to the device unregister hook for devmaps. This omission causes devices to not be properly released, which causes hangs. Fix this by adding the missing handler. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Tetsuo Handa Change-Id: Iacf0b52b815a168ebbc6533ee7d5ee76c1e17085 Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191019111931.2981954-1-toke@redhat.com --- kernel/bpf/devmap.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index c0a48f336997..3867864cdc2f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -719,6 +719,32 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_check_btf = map_check_no_btf, }; +static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, + struct net_device *netdev) +{ + unsigned long flags; + u32 i; + + spin_lock_irqsave(&dtab->index_lock, flags); + for (i = 0; i < dtab->n_buckets; i++) { + struct bpf_dtab_netdev *dev; + struct hlist_head *head; + struct hlist_node *next; + + head = dev_map_index_hash(dtab, i); + + hlist_for_each_entry_safe(dev, next, head, index_hlist) { + if (netdev != dev->dev) + continue; + + dtab->items--; + hlist_del_rcu(&dev->index_hlist); + call_rcu(&dev->rcu, __dev_map_entry_free); + } + } + spin_unlock_irqrestore(&dtab->index_lock, flags); +} + static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { @@ -735,6 +761,11 @@ static int dev_map_notification(struct notifier_block *notifier, */ rcu_read_lock(); list_for_each_entry_rcu(dtab, &dev_map_list, list) { + if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dev_map_hash_remove_netdev(dtab, netdev); + continue; + } + for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; From 7e64a395ad816b22caf0318740bf1d7528eb70bd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 22 Oct 2019 15:57:23 +0200 Subject: [PATCH 1336/1640] UPSTREAM: bpf: Fix use after free in subprog's jited symbol removal syzkaller managed to trigger the following crash: [...] 
BUG: unable to handle page fault for address: ffffc90001923030 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD aa551067 P4D aa551067 PUD aa552067 PMD a572b067 PTE 80000000a1173163 Oops: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 7982 Comm: syz-executor912 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:bpf_jit_binary_hdr include/linux/filter.h:787 [inline] RIP: 0010:bpf_get_prog_addr_region kernel/bpf/core.c:531 [inline] RIP: 0010:bpf_tree_comp kernel/bpf/core.c:600 [inline] RIP: 0010:__lt_find include/linux/rbtree_latch.h:115 [inline] RIP: 0010:latch_tree_find include/linux/rbtree_latch.h:208 [inline] RIP: 0010:bpf_prog_kallsyms_find kernel/bpf/core.c:674 [inline] RIP: 0010:is_bpf_text_address+0x184/0x3b0 kernel/bpf/core.c:709 [...] Call Trace: kernel_text_address kernel/extable.c:147 [inline] __kernel_text_address+0x9a/0x110 kernel/extable.c:102 unwind_get_return_address+0x4c/0x90 arch/x86/kernel/unwind_frame.c:19 arch_stack_walk+0x98/0xe0 arch/x86/kernel/stacktrace.c:26 stack_trace_save+0xb6/0x150 kernel/stacktrace.c:123 save_stack mm/kasan/common.c:69 [inline] set_track mm/kasan/common.c:77 [inline] __kasan_kmalloc+0x11c/0x1b0 mm/kasan/common.c:510 kasan_slab_alloc+0xf/0x20 mm/kasan/common.c:518 slab_post_alloc_hook mm/slab.h:584 [inline] slab_alloc mm/slab.c:3319 [inline] kmem_cache_alloc+0x1f5/0x2e0 mm/slab.c:3483 getname_flags+0xba/0x640 fs/namei.c:138 getname+0x19/0x20 fs/namei.c:209 do_sys_open+0x261/0x560 fs/open.c:1091 __do_sys_open fs/open.c:1115 [inline] __se_sys_open fs/open.c:1110 [inline] __x64_sys_open+0x87/0x90 fs/open.c:1110 do_syscall_64+0xf7/0x1c0 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe [...] After further debugging it turns out that we walk kallsyms while in parallel we tear down a BPF program which contains subprograms that have been JITed though the program itself has not been fully exposed and is eventually bailing out with error. The bpf_prog_kallsyms_del_subprogs() in bpf_prog_load()'s error path removes the symbols, however, bpf_prog_free() tears down the JIT memory too early via scheduled work. Instead, it needs to properly respect RCU grace period as the kallsyms walk for BPF is under RCU. Fix it by refactoring __bpf_prog_put()'s tear down and reuse it in our error path where we defer final destruction when we have subprogs in the program. 
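The shape of the fix is the standard RCU deferral rule: memory that a concurrent RCU reader (here, the kallsyms walker) may still dereference must only be freed after a grace period. A generic sketch of the pattern, with illustrative names rather than the kernel's:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
	struct rcu_head rcu;
	/* payload that readers may touch under rcu_read_lock() */
};

static void obj_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct obj, rcu));
}

static void obj_release(struct obj *o, bool published)
{
	/* Published to readers: defer the free past a grace period.
	 * Never published: freeing immediately is safe.
	 */
	if (published)
		call_rcu(&o->rcu, obj_free_rcu);
	else
		obj_free_rcu(&o->rcu);
}

The __bpf_prog_put_noref(prog, prog->aux->func_cnt) call in the error path below follows the same logic: with subprogs the symbols were already exposed under kallsyms, so destruction must be deferred.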
Fixes: 7d1982b4e335 ("bpf: fix panic in prog load calls cleanup") Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Reported-by: syzbot+710043c5d1d5b5013bc7@syzkaller.appspotmail.com Change-Id: I86752b5e3ec1ec2660ebfed0e820c9ba596290ca Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Tested-by: syzbot+710043c5d1d5b5013bc7@syzkaller.appspotmail.com Link: https://lore.kernel.org/bpf/55f6367324c2d7e9583fa9ccf5385dcbba0d7a6e.1571752452.git.daniel@iogearbox.net --- include/linux/filter.h | 1 - kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 31 ++++++++++++++++++++----------- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 6dbc9794939a..8ccd55009556 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1180,7 +1180,6 @@ static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) } #endif /* CONFIG_BPF_JIT */ -void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp); void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); #define BPF_ANC BIT(15) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b801454680f7..2c0e98af4b0c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -514,7 +514,7 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); } -void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) +static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3732a7c5a77a..dac59f9bd1b3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1334,17 +1334,25 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) bpf_prog_free(aux->prog); } +static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) +{ + bpf_prog_kallsyms_del_all(prog); + btf_put(prog->aux->btf); + kvfree(prog->aux->func_info); + bpf_prog_free_linfo(prog); + + if (deferred) + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + else + __bpf_prog_put_rcu(&prog->aux->rcu); +} + static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); - bpf_prog_kallsyms_del_all(prog); - btf_put(prog->aux->btf); - kvfree(prog->aux->func_info); - bpf_prog_free_linfo(prog); - - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + __bpf_prog_put_noref(prog, true); } } @@ -1741,11 +1749,12 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return err; free_used_maps: - bpf_prog_free_linfo(prog); - kvfree(prog->aux->func_info); - btf_put(prog->aux->btf); - bpf_prog_kallsyms_del_subprogs(prog); - free_used_maps(prog->aux); + /* In case we have subprogs, we need to wait for a grace + * period before we can tear down JIT memory since symbols + * are already exposed under kallsyms. + */ + __bpf_prog_put_noref(prog, prog->aux->func_cnt); + return err; free_prog: bpf_prog_uncharge_memlock(prog); free_prog_sec: From a4e34d118b6efdc884c4cdc7b054acc8556f94e5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 22 Oct 2019 23:30:38 +0200 Subject: [PATCH 1337/1640] UPSTREAM: bpf: Fix use after free in bpf_get_prog_name There is one more problematic case I noticed while recently fixing BPF kallsyms handling in cd7455f1013e ("bpf: Fix use after free in subprog's jited symbol removal") and that is bpf_get_prog_name(). 
If BTF has been attached to the prog, then we may be able to fetch the function signature type id in kallsyms through prog->aux->func_info[prog->aux->func_idx].type_id. However, while the BTF object itself is torn down via RCU callback, the prog's aux->func_info is immediately freed via kvfree(prog->aux->func_info) once the prog's refcount either hit zero or when subprograms were already exposed via kallsyms and we hit the error path added in 5482e9a93c83 ("bpf: Fix memleak in aux->func_info and aux->btf"). This violates RCU as well since kallsyms could be walked in parallel where we could access aux->func_info. Hence, defer kvfree() to after RCU grace period. Looking at ba64e7d85252 ("bpf: btf: support proper non-jit func info") there is no reason/dependency where we couldn't defer the kvfree(aux->func_info) into the RCU callback. Fixes: 5482e9a93c83 ("bpf: Fix memleak in aux->func_info and aux->btf") Fixes: ba64e7d85252 ("bpf: btf: support proper non-jit func info") Change-Id: I86245f360898beac01f56277eba6ec5925dafe99 Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/875f2906a7c1a0691f2d567b4d8e4ea2739b1e88.1571779205.git.daniel@iogearbox.net --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dac59f9bd1b3..675b3a010705 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1328,6 +1328,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); + kvfree(aux->func_info); free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); security_bpf_prog_free(aux); @@ -1338,7 +1339,6 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) { bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); - kvfree(prog->aux->func_info); bpf_prog_free_linfo(prog); if (deferred) From ad6f5fa429433ee9b9d750f7a93fa6aef39e7a2b Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 28 Oct 2019 13:29:02 +0100 Subject: [PATCH 1338/1640] UPSTREAM: bpf: Allow narrow loads of bpf_sysctl fields with offset > 0 "ctx:file_pos sysctl:read read ok narrow" works on s390 by accident: it reads the wrong byte, which happens to have the expected value of 0. Improve the test by seeking to the 4th byte and expecting 4 instead of 0. This makes the latent problem apparent: the test attempts to read the first byte of bpf_sysctl.file_pos, assuming this is the least-significant byte, which is not the case on big-endian machines: a non-zero offset is needed. The point of the test is to verify narrow loads, so we cannot cheat our way out by simply using BPF_W. The existence of the test means that such loads have to be supported, most likely because llvm can generate them. Fix the test by adding a big-endian variant, which uses an offset to access the least-significant byte of bpf_sysctl.file_pos. This reveals the final problem: verifier rejects accesses to bpf_sysctl fields with offset > 0. Such accesses are already allowed for a wide range of structs: __sk_buff, bpf_sock_addr and sk_msg_md to name a few. Extend this support to bpf_sysctl by using bpf_ctx_range instead of offsetof when matching field offsets. 
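The reason bpf_ctx_range() fixes this is that it expands to a GCC case range covering every byte of the member rather than only its first byte; roughly (from include/linux/filter.h, quoted from memory, so treat it as a sketch):

#define bpf_ctx_range(TYPE, MEMBER)					\
	offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1

With that, case bpf_ctx_range(struct bpf_sysctl, file_pos): matches offsets 4 through 7, which is exactly what a big-endian narrow load of the least-significant byte of file_pos needs.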
Fixes: 7b146cebe30c ("bpf: Sysctl hook") Fixes: e1550bfe0de4 ("bpf: Add file_pos field to bpf_sysctl ctx") Fixes: 9a1027e52535 ("selftests/bpf: Test file_pos field in bpf_sysctl ctx") Change-Id: I9d676202db8b8d766e70327ecf8e42f9ecc1627e Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Andrey Ignatov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191028122902.9763-1-iii@linux.ibm.com --- kernel/bpf/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ddd8addcdb5c..a3eaf08e7dd3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1311,12 +1311,12 @@ static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, return false; switch (off) { - case offsetof(struct bpf_sysctl, write): + case bpf_ctx_range(struct bpf_sysctl, write): if (type != BPF_READ) return false; bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); - case offsetof(struct bpf_sysctl, file_pos): + case bpf_ctx_range(struct bpf_sysctl, file_pos): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); From b45b8e57b471eb28fcbad3926519e1f2a16e4a71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 29 Oct 2019 16:43:07 +0100 Subject: [PATCH 1339/1640] UPSTREAM: bpf: Change size to u64 for bpf_map_{area_alloc, charge_init}() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions bpf_map_area_alloc() and bpf_map_charge_init() prior to this commit passed the size parameter as size_t. In this commit this is changed to u64. All users of these functions avoid size_t overflows on 32-bit systems by explicitly using u64 when calculating the allocation size and memory charge cost. However, since the result was narrowed to size_t when passing size and cost to the functions, the overflow handling was in vain. Instead of changing all call sites to size_t and handling overflow at the call site, the parameter is changed to u64 and checked in the functions above.
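A compact illustration of the narrowing problem (user-space sketch with hypothetical function names):

#include <stdint.h>
#include <stdlib.h>

/* Old shape: a u64 computed by the caller is silently truncated at the
 * call boundary on 32-bit, so 0x100000000 arrives here as 0 and no
 * check inside the function can detect it. */
static void *area_alloc_old(size_t size)
{
	return malloc(size);
}

/* New shape: accept the full u64 and reject anything that cannot be
 * represented as size_t before narrowing. */
static void *area_alloc_new(uint64_t size)
{
	if (size >= SIZE_MAX)
		return NULL;
	return malloc((size_t)size);
}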
Fixes: d407bd25a204 ("bpf: don't trigger OOM killer under pressure with map alloc") Fixes: c85d69135a91 ("bpf: move memory size checks to bpf_map_charge_init()") Change-Id: I0a5905e6c86972f8af4f2e0d492cbe39b1a73164 Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20191029154307.23053-1-bjorn.topel@gmail.com --- include/linux/bpf.h | 4 ++-- kernel/bpf/syscall.c | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 73189b5519c4..2466c7b39852 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -669,11 +669,11 @@ void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); -int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size); +int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size); void bpf_map_charge_finish(struct bpf_map_memory *mem); void bpf_map_charge_move(struct bpf_map_memory *dst, struct bpf_map_memory *src); -void *bpf_map_area_alloc(size_t size, int numa_node); +void *bpf_map_area_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 675b3a010705..9f6bee48c066 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -127,7 +127,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -void *bpf_map_area_alloc(size_t size, int numa_node) +void *bpf_map_area_alloc(u64 size, int numa_node) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, @@ -142,6 +142,9 @@ void *bpf_map_area_alloc(size_t size, int numa_node) const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; void *area; + if (size >= SIZE_MAX) + return NULL; + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, numa_node); @@ -198,7 +201,7 @@ static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) atomic_long_sub(pages, &user->locked_vm); } -int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size) +int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size) { u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; struct user_struct *user; From a2aa76be05ea78412a912ea7861fd6935b302d93 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Nov 2019 09:06:50 -0800 Subject: [PATCH 1340/1640] UPSTREAM: bpf: Provide better register bounds after jmp32 instructions [ Upstream commit 581738a681b6faae5725c2555439189ca81c0f1f ] With latest llvm (trunk https://github.com/llvm/llvm-project), test_progs, which has +alu32 enabled, failed for strobemeta.o. The verifier output looks like below with edit to replace large decimal numbers with hex ones. 193: (85) call bpf_probe_read_user_str#114 R0=inv(id=0) 194: (26) if w0 > 0x1 goto pc+4 R0_w=inv(id=0,umax_value=0xffffffff00000001) 195: (6b) *(u16 *)(r7 +80) = r0 196: (bc) w6 = w0 R6_w=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) 197: (67) r6 <<= 32 R6_w=inv(id=0,smax_value=0x7fffffff00000000,umax_value=0xffffffff00000000, var_off=(0x0; 0xffffffff00000000)) 198: (77) r6 >>= 32 R6=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 
201: (79) r8 = *(u64 *)(r10 -416) R8_w=map_value(id=0,off=40,ks=4,vs=13872,imm=0) 202: (0f) r8 += r6 R8_w=map_value(id=0,off=40,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) 203: (07) r8 += 9696 R8_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 255: (bf) r1 = r8 R1_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 257: (85) call bpf_probe_read_user_str#114 R1 unbounded memory access, make sure to bounds check any array access into a map The value range for register r6 at insn 198 should really be just 0/1. The umax_value=0xffffffff caused later verification failure. After jmp instructions, the current verifier already tries to use the just-obtained information to get a better register range. The current mechanism is for 64-bit registers only. This patch implements tightening of the range for 32-bit sub-registers after jmp32 instructions. With the patch, we have the below ranges for the above code sequence: 193: (85) call bpf_probe_read_user_str#114 R0=inv(id=0) 194: (26) if w0 > 0x1 goto pc+4 R0_w=inv(id=0,smax_value=0x7fffffff00000001,umax_value=0xffffffff00000001, var_off=(0x0; 0xffffffff00000001)) 195: (6b) *(u16 *)(r7 +80) = r0 196: (bc) w6 = w0 R6_w=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0x1)) 197: (67) r6 <<= 32 R6_w=inv(id=0,umax_value=0x100000000,var_off=(0x0; 0x100000000)) 198: (77) r6 >>= 32 R6=inv(id=0,umax_value=1,var_off=(0x0; 0x1)) ... 201: (79) r8 = *(u64 *)(r10 -416) R8_w=map_value(id=0,off=40,ks=4,vs=13872,imm=0) 202: (0f) r8 += r6 R8_w=map_value(id=0,off=40,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) 203: (07) r8 += 9696 R8_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) ... 255: (bf) r1 = r8 R1_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) ... 257: (85) call bpf_probe_read_user_str#114 ... At insn 194, the register R0 has better var_off.mask and smax_value. In particular, the var_off.mask ensures that the later lshift and rshift maintain a proper value range. Suggested-by: Alexei Starovoitov Change-Id: I180c54554377fc904f68516cc6b3d35731c25c34 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121170650.449030-1-yhs@fb.com Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 791f798dbd18..26c819e4a758 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -977,6 +977,17 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->umax_value)); } +static void __reg_bound_offset32(struct bpf_reg_state *reg) +{ + u64 mask = 0xffffFFFF; + struct tnum range = tnum_range(reg->umin_value & mask, + reg->umax_value & mask); + struct tnum lo32 = tnum_cast(reg->var_off, 4); + struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32); + + reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range)); +} + /* Reset the min/max bounds of a register */ static void __mark_reg_unbounded(struct bpf_reg_state *reg) { @@ -5587,6 +5598,10 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); + if (is_jmp32) { + __reg_bound_offset32(false_reg); + __reg_bound_offset32(true_reg); + } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g.
if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. @@ -5696,6 +5711,10 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); + if (is_jmp32) { + __reg_bound_offset32(false_reg); + __reg_bound_offset32(true_reg); + } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. From 2e6fba97eef7079dda830267867b3542549f17ef Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 22 Dec 2019 23:37:40 +0100 Subject: [PATCH 1341/1640] UPSTREAM: bpf: Fix precision tracking for unbounded scalars commit f54c7898ed1c3c9331376c0337a5049c38f66497 upstream. Anatoly has been fuzzing with kBdysch harness and reported a hang in one of the outcomes. Upon closer analysis, it turns out that precise scalar value tracking is missing a few precision markings for unknown scalars: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = 0 1: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (35) if r0 >= 0xf72e goto pc+0 --> only follow fallthrough 2: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 2: (35) if r0 >= 0x80fe0000 goto pc+0 --> only follow fallthrough 3: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 3: (14) w0 -= -536870912 4: R0_w=invP536870912 R1=ctx(id=0,off=0,imm=0) R10=fp0 4: (0f) r1 += r0 5: R0_w=invP536870912 R1_w=inv(id=0) R10=fp0 5: (55) if r1 != 0x104c1500 goto pc+0 --> push other branch for later analysis R0_w=invP536870912 R1_w=inv273421568 R10=fp0 6: R0_w=invP536870912 R1_w=inv273421568 R10=fp0 6: (b7) r0 = 0 7: R0=invP0 R1=inv273421568 R10=fp0 7: (76) if w1 s>= 0xffffff00 goto pc+3 --> only follow goto 11: R0=invP0 R1=inv273421568 R10=fp0 11: (95) exit 6: R0_w=invP536870912 R1_w=inv(id=0) R10=fp0 6: (b7) r0 = 0 propagating r0 7: safe processed 11 insns [...] In the analysis of the second path coming after the successful exit above, the path is being pruned at line 7. Pruning analysis found that both r0 are precise P0 and both R1 are non-precise scalars and, given the prior path with R1 as a non-precise scalar succeeded, this one is therefore safe as well. However, the problem is that given the condition at insn 7 in the first run, we only followed the goto and didn't push the other branch for later analysis; we've never walked the few insns in there and therefore dead-code sanitation rewrites it as goto pc-1, causing the hang depending on the skb address hitting these conditions. The issue is that R1 should have been marked as precise as well such that pruning enforces a range check and concludes that the new R1 is not in the range of the old R1. In insn 4, we mark R1 (skb) as an unknown scalar via __mark_reg_unbounded() but not mark_reg_unbounded() and therefore regs->precise remains as false. Back in b5dc0163d8fd ("bpf: precise scalar_value tracking"), this was not the case since marking out of __mark_reg_unbounded() had this covered as well. Once both are set as precise at insn 4 as they should have been, we conclude that given R1 was in the prior fall-through path 0x104c1500 and now is completely unknown, the check at insn 7 concludes that we need to continue walking.
Analysis after the fix: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = 0 1: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (35) if r0 >= 0xf72e goto pc+0 2: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 2: (35) if r0 >= 0x80fe0000 goto pc+0 3: R0_w=invP0 R1=ctx(id=0,off=0,imm=0) R10=fp0 3: (14) w0 -= -536870912 4: R0_w=invP536870912 R1=ctx(id=0,off=0,imm=0) R10=fp0 4: (0f) r1 += r0 5: R0_w=invP536870912 R1_w=invP(id=0) R10=fp0 5: (55) if r1 != 0x104c1500 goto pc+0 R0_w=invP536870912 R1_w=invP273421568 R10=fp0 6: R0_w=invP536870912 R1_w=invP273421568 R10=fp0 6: (b7) r0 = 0 7: R0=invP0 R1=invP273421568 R10=fp0 7: (76) if w1 s>= 0xffffff00 goto pc+3 11: R0=invP0 R1=invP273421568 R10=fp0 11: (95) exit 6: R0_w=invP536870912 R1_w=invP(id=0) R10=fp0 6: (b7) r0 = 0 7: R0_w=invP0 R1_w=invP(id=0) R10=fp0 7: (76) if w1 s>= 0xffffff00 goto pc+3 R0_w=invP0 R1_w=invP(id=0) R10=fp0 8: R0_w=invP0 R1_w=invP(id=0) R10=fp0 8: (a5) if r0 < 0x2007002a goto pc+0 9: R0_w=invP0 R1_w=invP(id=0) R10=fp0 9: (57) r0 &= -16316416 10: R0_w=invP0 R1_w=invP(id=0) R10=fp0 10: (a6) if w0 < 0x1201 goto pc+0 11: R0_w=invP0 R1_w=invP(id=0) R10=fp0 11: (95) exit 11: R0=invP0 R1=invP(id=0) R10=fp0 11: (95) exit processed 16 insns [...] Fixes: 6754172c208d ("bpf: fix precision tracking in presence of bpf2bpf calls") Reported-by: Anatoly Trosinenko Change-Id: I27f6d37d128aedf8d91e9cad542d8e565aeea494 Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191222223740.25297-1-daniel@iogearbox.net Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 26c819e4a758..d3c8df51e320 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -851,7 +851,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void __mark_reg_not_init(struct bpf_reg_state *reg); +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); /* Mark the unknown part of a register (variable offset or scalar value) as * known to have the value @imm. @@ -889,7 +890,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); return; } __mark_reg_known_zero(regs + regno); @@ -998,7 +999,8 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) } /* Mark a register as having a completely unknown (scalar) value. */ -static void __mark_reg_unknown(struct bpf_reg_state *reg) +static void __mark_reg_unknown(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { /* * Clear type, id, off, and union(map_ptr, range) and @@ -1008,6 +1010,8 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; + reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? 
+ true : false; __mark_reg_unbounded(reg); } @@ -1018,19 +1022,16 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, verbose(env, "mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs except FP */ for (regno = 0; regno < BPF_REG_FP; regno++) - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); return; } - regs += regno; - __mark_reg_unknown(regs); - /* constant backtracking is enabled for root without bpf2bpf calls */ - regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? - true : false; + __mark_reg_unknown(env, regs + regno); } -static void __mark_reg_not_init(struct bpf_reg_state *reg) +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { - __mark_reg_unknown(reg); + __mark_reg_unknown(env, reg); reg->type = NOT_INIT; } @@ -1041,10 +1042,10 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, verbose(env, "mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs except FP */ for (regno = 0; regno < BPF_REG_FP; regno++) - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); return; } - __mark_reg_not_init(regs + regno); + __mark_reg_not_init(env, regs + regno); } #define DEF_NOT_SUBREG (0) @@ -3041,7 +3042,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } if (state->stack[spi].slot_type[0] == STACK_SPILL && state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { - __mark_reg_unknown(&state->stack[spi].spilled_ptr); + __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) state->stack[spi].slot_type[j] = STACK_MISC; goto mark; @@ -3680,7 +3681,7 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, if (!reg) continue; if (reg_is_pkt_pointer_any(reg)) - __mark_reg_unknown(reg); + __mark_reg_unknown(env, reg); } } @@ -3708,7 +3709,7 @@ static void release_reg_references(struct bpf_verifier_env *env, if (!reg) continue; if (reg->ref_obj_id == ref_obj_id) - __mark_reg_unknown(reg); + __mark_reg_unknown(env, reg); } } @@ -4527,7 +4528,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* Taint dst register if offset had invalid bounds derived from * e.g. dead branches. */ - __mark_reg_unknown(dst_reg); + __mark_reg_unknown(env, dst_reg); return 0; } @@ -4759,13 +4760,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Taint dst register if offset had invalid bounds derived from * e.g. dead branches. 
*/ - __mark_reg_unknown(dst_reg); + __mark_reg_unknown(env, dst_reg); return 0; } if (!src_known && opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { - __mark_reg_unknown(dst_reg); + __mark_reg_unknown(env, dst_reg); return 0; } @@ -6911,7 +6912,7 @@ static void clean_func_state(struct bpf_verifier_env *env, /* since the register is unused, clear its state * to make further comparison simpler */ - __mark_reg_not_init(&st->regs[i]); + __mark_reg_not_init(env, &st->regs[i]); } for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { @@ -6919,7 +6920,7 @@ static void clean_func_state(struct bpf_verifier_env *env, /* liveness must not touch this stack slot anymore */ st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; if (!(live & REG_LIVE_READ)) { - __mark_reg_not_init(&st->stack[i].spilled_ptr); + __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; } From 0daeeafabacfb80f45d8d84547a2ed96fe54a028 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 6 Jan 2020 22:51:57 +0100 Subject: [PATCH 1342/1640] UPSTREAM: bpf: Fix passing modified ctx to ld/abs/ind instruction commit 6d4f151acf9a4f6fab09b615f246c717ddedcf0c upstream. Anatoly has been fuzzing with kBdysch harness and reported a KASAN slab oob in one of the outcomes: [...] [ 77.359642] BUG: KASAN: slab-out-of-bounds in bpf_skb_load_helper_8_no_cache+0x71/0x130 [ 77.360463] Read of size 4 at addr ffff8880679bac68 by task bpf/406 [ 77.361119] [ 77.361289] CPU: 2 PID: 406 Comm: bpf Not tainted 5.5.0-rc2-xfstests-00157-g2187f215eba #1 [ 77.362134] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 [ 77.362984] Call Trace: [ 77.363249] dump_stack+0x97/0xe0 [ 77.363603] print_address_description.constprop.0+0x1d/0x220 [ 77.364251] ? bpf_skb_load_helper_8_no_cache+0x71/0x130 [ 77.365030] ? bpf_skb_load_helper_8_no_cache+0x71/0x130 [ 77.365860] __kasan_report.cold+0x37/0x7b [ 77.366365] ? bpf_skb_load_helper_8_no_cache+0x71/0x130 [ 77.366940] kasan_report+0xe/0x20 [ 77.367295] bpf_skb_load_helper_8_no_cache+0x71/0x130 [ 77.367821] ? bpf_skb_load_helper_8+0xf0/0xf0 [ 77.368278] ? mark_lock+0xa3/0x9b0 [ 77.368641] ? kvm_sched_clock_read+0x14/0x30 [ 77.369096] ? sched_clock+0x5/0x10 [ 77.369460] ? sched_clock_cpu+0x18/0x110 [ 77.369876] ? bpf_skb_load_helper_8+0xf0/0xf0 [ 77.370330] ___bpf_prog_run+0x16c0/0x28f0 [ 77.370755] __bpf_prog_run32+0x83/0xc0 [ 77.371153] ? __bpf_prog_run64+0xc0/0xc0 [ 77.371568] ? match_held_lock+0x1b/0x230 [ 77.371984] ? rcu_read_lock_held+0xa1/0xb0 [ 77.372416] ? rcu_is_watching+0x34/0x50 [ 77.372826] sk_filter_trim_cap+0x17c/0x4d0 [ 77.373259] ? sock_kzfree_s+0x40/0x40 [ 77.373648] ? __get_filter+0x150/0x150 [ 77.374059] ? skb_copy_datagram_from_iter+0x80/0x280 [ 77.374581] ? do_raw_spin_unlock+0xa5/0x140 [ 77.375025] unix_dgram_sendmsg+0x33a/0xa70 [ 77.375459] ? do_raw_spin_lock+0x1d0/0x1d0 [ 77.375893] ? unix_peer_get+0xa0/0xa0 [ 77.376287] ? __fget_light+0xa4/0xf0 [ 77.376670] __sys_sendto+0x265/0x280 [ 77.377056] ? __ia32_sys_getpeername+0x50/0x50 [ 77.377523] ? lock_downgrade+0x350/0x350 [ 77.377940] ? __sys_setsockopt+0x2a6/0x2c0 [ 77.378374] ? sock_read_iter+0x240/0x240 [ 77.378789] ? __sys_socketpair+0x22a/0x300 [ 77.379221] ? __ia32_sys_socket+0x50/0x50 [ 77.379649] ? mark_held_locks+0x1d/0x90 [ 77.380059] ? 
trace_hardirqs_on_thunk+0x1a/0x1c [ 77.380536] __x64_sys_sendto+0x74/0x90 [ 77.380938] do_syscall_64+0x68/0x2a0 [ 77.381324] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 77.381878] RIP: 0033:0x44c070 [...] After further debugging, it turns out that while for other helper functions we disallow passing a modified ctx, the special case of the ld/abs/ind instruction, which has similar semantics (except r6 being the ctx argument), is missing such a check. A modified ctx is impossible here, as bpf_skb_load_helper_8_no_cache() and others expect skb fields in their original position; hence, add check_ctx_reg() to reject any modified ctx. The issue was first introduced back in f1174f77b50c ("bpf/verifier: rework value tracking"). Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Reported-by: Anatoly Trosinenko Change-Id: I4987e827ad1a96832161635e53543ce7b72d2c58 Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200106215157.3553-1-daniel@iogearbox.net Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d3c8df51e320..2b1ed8ca5e80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6259,7 +6259,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (regs[BPF_REG_6].type != PTR_TO_CTX) { + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; From 014f114306537d6288b52cdbf8a4e54502d8d697 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 27 Dec 2019 13:50:34 -0800 Subject: [PATCH 1343/1640] UPSTREAM: bpf: cgroup: prevent out-of-order release of cgroup bpf commit e10360f815ca6367357b2c2cfef17fc663e50f7b upstream. Before commit 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself") cgroup bpf structures were released with corresponding cgroup structures. It guaranteed the hierarchical order of destruction: children were always first. It preserved attached programs from being released before their propagated copies. But with cgroup auto-detachment there are no such guarantees anymore: cgroup bpf is released as soon as the cgroup is offline and there are no live associated sockets. It means that an attached program can be detached and released, while its propagated copy is still living in the cgroup subtree. This will obviously lead to a use-after-free bug. To reproduce the issue the following script can be used: #!/bin/bash CGROOT=/sys/fs/cgroup mkdir -p ${CGROOT}/A ${CGROOT}/B ${CGROOT}/A/C sleep 1 ./test_cgrp2_attach ${CGROOT}/A egress & A_PID=$! ./test_cgrp2_attach ${CGROOT}/B egress & B_PID=$! echo $$ > ${CGROOT}/A/C/cgroup.procs iperf -s & S_PID=$! iperf -c localhost -t 100 & C_PID=$! sleep 1 echo $$ > ${CGROOT}/B/cgroup.procs echo ${S_PID} > ${CGROOT}/B/cgroup.procs echo ${C_PID} > ${CGROOT}/B/cgroup.procs sleep 1 rmdir ${CGROOT}/A/C rmdir ${CGROOT}/A sleep 1 kill -9 ${S_PID} ${C_PID} ${A_PID} ${B_PID} On the unpatched kernel the following stacktrace can be obtained: [ 33.619799] BUG: unable to handle page fault for address: ffffbdb4801ab002 [ 33.620677] #PF: supervisor read access in kernel mode [ 33.621293] #PF: error_code(0x0000) - not-present page [ 33.622754] Oops: 0000 [#1] SMP NOPTI [ 33.623202] CPU: 0 PID: 601 Comm: iperf Not tainted 5.5.0-rc2+ #23 [ 33.625545] RIP: 0010:__cgroup_bpf_run_filter_skb+0x29f/0x3d0 [ 33.635809] Call Trace: [ 33.636118] ?
__cgroup_bpf_run_filter_skb+0x2bf/0x3d0 [ 33.636728] ? __switch_to_asm+0x40/0x70 [ 33.637196] ip_finish_output+0x68/0xa0 [ 33.637654] ip_output+0x76/0xf0 [ 33.638046] ? __ip_finish_output+0x1c0/0x1c0 [ 33.638576] __ip_queue_xmit+0x157/0x410 [ 33.639049] __tcp_transmit_skb+0x535/0xaf0 [ 33.639557] tcp_write_xmit+0x378/0x1190 [ 33.640049] ? _copy_from_iter_full+0x8d/0x260 [ 33.640592] tcp_sendmsg_locked+0x2a2/0xdc0 [ 33.641098] ? sock_has_perm+0x10/0xa0 [ 33.641574] tcp_sendmsg+0x28/0x40 [ 33.641985] sock_sendmsg+0x57/0x60 [ 33.642411] sock_write_iter+0x97/0x100 [ 33.642876] new_sync_write+0x1b6/0x1d0 [ 33.643339] vfs_write+0xb6/0x1a0 [ 33.643752] ksys_write+0xa7/0xe0 [ 33.644156] do_syscall_64+0x5b/0x1b0 [ 33.644605] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix this by grabbing a reference to the bpf structure of each ancestor on the initialization of the cgroup bpf structure, and dropping the reference at the end of releasing the cgroup bpf structure. This will restore the hierarchical order of cgroup bpf releasing, without adding any operations on hot paths. Thanks to Josef Bacik for the debugging and the initial analysis of the problem. Fixes: 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself") Reported-by: Josef Bacik Change-Id: Idfe8765813b61ee1d0eb5c8ce60d03444b8c79cb Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/cgroup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a3eaf08e7dd3..8bd69062fbe5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -35,8 +35,8 @@ void cgroup_bpf_offline(struct cgroup *cgrp) */ static void cgroup_bpf_release(struct work_struct *work) { - struct cgroup *cgrp = container_of(work, struct cgroup, - bpf.release_work); + struct cgroup *p, *cgrp = container_of(work, struct cgroup, + bpf.release_work); enum bpf_cgroup_storage_type stype; struct bpf_prog_array *old_array; unsigned int type; @@ -65,6 +65,9 @@ static void cgroup_bpf_release(struct work_struct *work) mutex_unlock(&cgroup_mutex); + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_put(p); + percpu_ref_exit(&cgrp->bpf.refcnt); cgroup_put(cgrp); } @@ -199,6 +202,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) */ #define NR ARRAY_SIZE(cgrp->bpf.effective) struct bpf_prog_array *arrays[NR] = {}; + struct cgroup *p; int ret, i; ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, @@ -206,6 +210,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) if (ret) return ret; + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_get(p); + for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); From 47691bd07410454a9cd8b1d975ec28e5b5d1052c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 21 Nov 2019 14:36:12 +0100 Subject: [PATCH 1344/1640] UPSTREAM: xdp: Fix cleanup on map free for devmap_hash map type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 071cdecec57fb5d5df78e6a12114ad7bccea5b0e ] Tetsuo pointed out that it was not only the device unregister hook that was broken for devmap_hash types, it was also cleanup on map free. So better fix this as well. While we're at it, there's no reason to allocate the netdev_map array for DEVMAP_HASH, so skip that and adjust the cost accordingly. 
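Freeing a bucketed table has to use the _safe list iterator because the loop body unlinks the entry it is standing on. Distilled to its core (a sketch with generic names; struct entry is assumed to embed a struct hlist_node called node), the DEVMAP_HASH branch of the free path below does:

static void table_free(struct hlist_head *buckets, unsigned int n_buckets)
{
	unsigned int i;

	for (i = 0; i < n_buckets; i++) {
		struct entry *e;
		struct hlist_node *next;

		hlist_for_each_entry_safe(e, next, &buckets[i], node) {
			hlist_del_rcu(&e->node);
			kfree(e);	/* no readers remain at map-free time */
		}
	}
}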
Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Tetsuo Handa Change-Id: I97e1b4a58dfc9c8038b0c51f36602cfc52f0f8b8 Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20191121133612.430414-1-toke@redhat.com Signed-off-by: Sasha Levin --- kernel/bpf/devmap.c | 74 ++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3867864cdc2f..3d3d61b5985b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -74,7 +74,7 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; - struct bpf_dtab_netdev **netdev_map; + struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ struct list_head __percpu *flush_list; struct list_head list; @@ -101,6 +101,12 @@ static struct hlist_head *dev_map_create_hash(unsigned int entries) return hash; } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, + int idx) +{ + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { int err, cpu; @@ -120,8 +126,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); /* make sure page count doesn't overflow */ - cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += sizeof(struct list_head) * num_possible_cpus(); + cost = (u64) sizeof(struct list_head) * num_possible_cpus(); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); @@ -129,6 +134,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; + } else { + cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); } /* if map size is larger than memlock limit, reject it */ @@ -143,24 +150,22 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) for_each_possible_cpu(cpu) INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); - dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * - sizeof(struct bpf_dtab_netdev *), - dtab->map.numa_node); - if (!dtab->netdev_map) - goto free_percpu; - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); if (!dtab->dev_index_head) - goto free_map_area; + goto free_percpu; spin_lock_init(&dtab->index_lock); + } else { + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + sizeof(struct bpf_dtab_netdev *), + dtab->map.numa_node); + if (!dtab->netdev_map) + goto free_percpu; } return 0; -free_map_area: - bpf_map_area_free(dtab->netdev_map); free_percpu: free_percpu(dtab->flush_list); free_charge: @@ -228,21 +233,40 @@ static void dev_map_free(struct bpf_map *map) cond_resched(); } - for (i = 0; i < dtab->map.max_entries; i++) { - struct bpf_dtab_netdev *dev; + if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + for (i = 0; i < dtab->n_buckets; i++) { + struct bpf_dtab_netdev *dev; + struct hlist_head *head; + struct hlist_node *next; - dev = dtab->netdev_map[i]; - if (!dev) - continue; + head = dev_map_index_hash(dtab, i); - free_percpu(dev->bulkq); - dev_put(dev->dev); - kfree(dev); + hlist_for_each_entry_safe(dev, next, head, index_hlist) { + hlist_del_rcu(&dev->index_hlist); + 
free_percpu(dev->bulkq); + dev_put(dev->dev); + kfree(dev); + } + } + + kfree(dtab->dev_index_head); + } else { + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev) + continue; + + free_percpu(dev->bulkq); + dev_put(dev->dev); + kfree(dev); + } + + bpf_map_area_free(dtab->netdev_map); } free_percpu(dtab->flush_list); - bpf_map_area_free(dtab->netdev_map); - kfree(dtab->dev_index_head); kfree(dtab); } @@ -263,12 +287,6 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, - int idx) -{ - return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; -} - struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); From b4a0c90b58c5060f6ff9cb1939209b453556276a Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 16 Jul 2019 18:12:22 -0400 Subject: [PATCH 1345/1640] BACKPORT: rcu: Add support for consolidated-RCU reader checking This commit adds RCU-reader checks to list_for_each_entry_rcu() and hlist_for_each_entry_rcu(). These checks are optional, and are indicated by a lockdep expression passed to a new optional argument to these two macros. If this optional lockdep expression is omitted, these two macros act as before, checking for an RCU read-side critical section. Change-Id: I102aad99348529184666a56bf3bf4458ae63520c Signed-off-by: Joel Fernandes (Google) [ paulmck: Update to eliminate return within macro and update comment. ] Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 32 +++++++++++-- include/linux/rcupdate.h | 7 +++ kernel/rcu/Kconfig.debug | 11 +++++ kernel/rcu/update.c | 100 ++++++++++++++++++++++++--------------- 4 files changed, 108 insertions(+), 42 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 127f534fec94..4ac3cd23539d 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -40,6 +40,24 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) +/* + * Check during list traversal that we are within an RCU reader + */ + +#define check_arg_count_one(dummy) + +#ifdef CONFIG_PROVE_RCU_LIST +#define __list_check_rcu(dummy, cond, extra...) \ + ({ \ + check_arg_count_one(extra); \ + RCU_LOCKDEP_WARN(!cond && !rcu_read_lock_any_held(), \ + "RCU-list traversed in non-reader section!"); \ + }) +#else +#define __list_check_rcu(dummy, cond, extra...) \ + ({ check_arg_count_one(extra); }) +#endif + /* * Insert a new entry between two known consecutive entries. * @@ -343,14 +361,16 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ -#define list_for_each_entry_rcu(pos, head, member) \ - for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \ - &pos->member != (head); \ +#define list_for_each_entry_rcu(pos, head, member, cond...) 
\ + for (__list_check_rcu(dummy, ## cond, 0), \ + pos = list_entry_rcu((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** @@ -588,13 +608,15 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ -#define hlist_for_each_entry_rcu(pos, head, member) \ - for (pos = hlist_entry_safe (rcu_dereference_raw(hlist_first_rcu(head)),\ +#define hlist_for_each_entry_rcu(pos, head, member, cond...) \ + for (__list_check_rcu(dummy, ## cond, 0), \ + pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8d570190e9b4..690762ed71a4 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -255,6 +255,7 @@ int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); int rcu_read_lock_sched_held(void); +int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -275,6 +276,12 @@ static inline int rcu_read_lock_sched_held(void) { return !preemptible(); } + +static inline int rcu_read_lock_any_held(void) +{ + return !preemptible(); +} + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index e1a49c6e58ea..542e52f6cd86 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -7,6 +7,17 @@ menu "RCU Debugging" config PROVE_RCU def_bool PROVE_LOCKING +config PROVE_RCU_LIST + bool "RCU list lockdep debugging" + depends on PROVE_RCU && RCU_EXPERT + default n + help + Enable RCU lockdep checking for list usages. By default it is + turned off since there are several list RCU users that still + need to be converted to pass a lockdep expression. To prevent + false-positive splats, we keep it default disabled but once all + users are converted, we can remove this config option. + config TORTURE_TEST tristate default n diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 7a577bd989a4..497d45b3923c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -72,9 +72,15 @@ module_param(rcu_normal_after_boot, int, 0); #ifdef CONFIG_DEBUG_LOCK_ALLOC /** - * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? + * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? + * @ret: Best guess answer if lockdep cannot be relied on * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an + * Returns true if lockdep must be ignored, in which case *ret contains + * the best guess described below. Otherwise returns false, in which + * case *ret tells the caller nothing and the caller should instead + * consult lockdep. + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. 
Note that disabling @@ -86,35 +92,45 @@ module_param(rcu_normal_after_boot, int, 0); * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. * - * Note that if the CPU is in the idle loop from an RCU point of - * view (ie: that we are in the section between rcu_idle_enter() and - * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU - * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs - * that are in such a section, considering these as in extended quiescent - * state, so such a CPU is effectively never in an RCU read-side critical - * section regardless of what RCU primitives it invokes. This state of - * affairs is required --- we need to keep an RCU-free window in idle - * where the CPU may possibly enter into low power mode. This way we can - * notice an extended quiescent state to other CPUs that started a grace - * period. Otherwise we would delay any grace period as long as we run in - * the idle task. + * Note that if the CPU is in the idle loop from an RCU point of view (ie: + * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) + * then rcu_read_lock_held() sets *ret to false even if the CPU did an + * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are + * in such a section, considering these as in extended quiescent state, + * so such a CPU is effectively never in an RCU read-side critical section + * regardless of what RCU primitives it invokes. This state of affairs is + * required --- we need to keep an RCU-free window in idle where the CPU may + * possibly enter into low power mode. This way we can notice an extended + * quiescent state to other CPUs that started a grace period. Otherwise + * we would delay any grace period as long as we run in the idle task. * - * Similarly, we avoid claiming an SRCU read lock held if the current + * Similarly, we avoid claiming an RCU read lock held if the current * CPU is offline. 
*/ +static bool rcu_read_lock_held_common(bool *ret) +{ + if (!debug_lockdep_rcu_enabled()) { + *ret = 1; + return true; + } + if (!rcu_is_watching()) { + *ret = 0; + return true; + } + if (!rcu_lockdep_current_cpu_online()) { + *ret = 0; + return true; + } + return false; +} + int rcu_read_lock_sched_held(void) { - int lockdep_opinion = 0; + bool ret; - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; - if (debug_locks) - lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || !preemptible(); + if (rcu_read_lock_held_common(&ret)) + return ret; + return lock_is_held(&rcu_sched_lock_map) || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif @@ -323,12 +339,10 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); */ int rcu_read_lock_held(void) { - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; return lock_is_held(&rcu_lock_map); } EXPORT_SYMBOL_GPL(rcu_read_lock_held); @@ -350,16 +364,28 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held); */ int rcu_read_lock_bh_held(void) { - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); +int rcu_read_lock_any_held(void) +{ + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; + if (lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_sched_lock_map)) + return 1; + return !preemptible(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_any_held); + #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** From 9b3ccb3744715ae804da4c5bcf2f30f7bc801320 Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Thu, 23 Jan 2020 17:34:38 +0530 Subject: [PATCH 1346/1640] UPSTREAM: bpf, devmap: Pass lockdep expression to RCU lists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 485ec2ea9cf556e9c120e07961b7b459d776a115 upstream. head is traversed using hlist_for_each_entry_rcu outside an RCU read-side critical section but under the protection of dtab->index_lock. Hence, add corresponding lockdep expression to silence false-positive lockdep warnings, and harden RCU lists. 
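With the optional lockdep argument introduced by the previous patch, an update-side traversal can name the lock that makes it safe instead of requiring rcu_read_lock(); the general shape (sketch with generic names):

unsigned long flags;
struct entry *e;

spin_lock_irqsave(&tab->lock, flags);
hlist_for_each_entry_rcu(e, head, node,
			 lockdep_is_held(&tab->lock)) {
	/* writers are excluded by tab->lock, so the walk is safe and
	 * lockdep no longer warns about the missing RCU read lock */
}
spin_unlock_irqrestore(&tab->lock, flags);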
Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Change-Id: I5a0e2708268f71f51e4b2b3bd8c3e7449eefac26 Signed-off-by: Amol Grover Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200123120437.26506-1-frextrite@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/devmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3d3d61b5985b..b4b6b77f309c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -293,7 +293,8 @@ struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) struct hlist_head *head = dev_map_index_hash(dtab, key); struct bpf_dtab_netdev *dev; - hlist_for_each_entry_rcu(dev, head, index_hlist) + hlist_for_each_entry_rcu(dev, head, index_hlist, + lockdep_is_held(&dtab->index_lock)) if (dev->idx == key) return dev; From 489544cbe9a684767268bf3273fdb2a51da2e891 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Mar 2020 15:40:17 -0700 Subject: [PATCH 1347/1640] UPSTREAM: bpf: Fix cgroup ref leak in cgroup_bpf_inherit on out-of-memory commit 1d8006abaab4cb90f81add86e8d1bf9411add05a upstream. There is no compensating cgroup_bpf_put() for each ancestor cgroup in cgroup_bpf_inherit(). If compute_effective_progs returns error, those cgroups won't be freed ever. Fix it by putting them in cleanup code path. Fixes: e10360f815ca ("bpf: cgroup: prevent out-of-order release of cgroup bpf") Change-Id: I54d73707a063b1703b45e781d1d5c8bc546994bf Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Roman Gushchin Link: https://lore.kernel.org/bpf/20200309224017.1063297-1-andriin@fb.com Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8bd69062fbe5..28c97aee6d8f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -228,6 +228,9 @@ cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_put(p); + percpu_ref_exit(&cgrp->bpf.refcnt); return -ENOMEM; From 51facbd8386d951eeb27c19b1c8936a021c96918 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Mar 2020 15:27:55 -0700 Subject: [PATCH 1348/1640] UPSTREAM: bpf: Initialize storage pointers to NULL to prevent freeing garbage pointer commit 62039c30c19dcab96621e074aeeb90da7100def7 upstream. Local storage array isn't initialized, so if cgroup storage allocation fails for BPF_CGROUP_STORAGE_SHARED, error handling code will attempt to free uninitialized pointer for BPF_CGROUP_STORAGE_PERCPU storage type. Avoid this by always initializing storage pointers to NULLs. 
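The bug pattern, reduced to a few lines (user-space sketch; NTYPES stands in for MAX_BPF_CGROUP_STORAGE_TYPE):

#include <stdlib.h>

#define NTYPES 2

void broken(void)
{
	void *storage[NTYPES];		/* uninitialized stack garbage */

	storage[0] = malloc(16);
	if (!storage[0])
		goto cleanup;		/* storage[1] was never assigned */
	storage[1] = malloc(16);
cleanup:
	for (int i = 0; i < NTYPES; i++)
		free(storage[i]);	/* may pass a garbage pointer to free() */
}

Initializing the array, as the patch does, turns the early-exit case into a harmless free(NULL) for every unassigned slot.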
Fixes: 8bad74f9840f ("bpf: extend cgroup bpf core to allow multiple cgroup storage types") Change-Id: I4b9ef87c24e62f5160b81f29050687005af0444e Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200309222756.1018737-1-andriin@fb.com Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 28c97aee6d8f..869e2e1860e8 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -303,8 +303,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], - *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; enum bpf_cgroup_storage_type stype; struct bpf_prog_list *pl; bool pl_was_allocated; From 5b1c5b9725cef95dc5d9b43fed468f92a739752e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 30 Mar 2020 18:03:22 +0200 Subject: [PATCH 1349/1640] UPSTREAM: bpf: Undo incorrect __reg_bound_offset32 handling commit f2d67fec0b43edce8c416101cdc52e71145b5fef upstream. Anatoly has been fuzzing with kBdysch harness and reported a hang in one of the outcomes: 0: (b7) r0 = 808464432 1: (7f) r0 >>= r0 2: (14) w0 -= 808464432 3: (07) r0 += 808464432 4: (b7) r1 = 808464432 5: (de) if w1 s<= w0 goto pc+0 R0_w=invP(id=0,umin_value=808464432,umax_value=5103431727,var_off=(0x30303020;0x10000001f)) R1_w=invP808464432 R10=fp0 6: (07) r0 += -2144337872 7: (14) w0 -= -1607454672 8: (25) if r0 > 0x30303030 goto pc+0 R0_w=invP(id=0,umin_value=271581184,umax_value=271581311,var_off=(0x10300000;0x7f)) R1_w=invP808464432 R10=fp0 9: (76) if w0 s>= 0x303030 goto pc+2 12: (95) exit from 8 to 9: safe from 5 to 6: R0_w=invP(id=0,umin_value=808464432,umax_value=5103431727,var_off=(0x30303020;0x10000001f)) R1_w=invP808464432 R10=fp0 6: (07) r0 += -2144337872 7: (14) w0 -= -1607454672 8: (25) if r0 > 0x30303030 goto pc+0 R0_w=invP(id=0,umin_value=271581184,umax_value=271581311,var_off=(0x10300000;0x7f)) R1_w=invP808464432 R10=fp0 9: safe from 8 to 9: safe verification time 589 usec stack depth 0 processed 17 insns (limit 1000000) [...] The underlying program was xlated as follows: # bpftool p d x i 9 0: (b7) r0 = 808464432 1: (7f) r0 >>= r0 2: (14) w0 -= 808464432 3: (07) r0 += 808464432 4: (b7) r1 = 808464432 5: (de) if w1 s<= w0 goto pc+0 6: (07) r0 += -2144337872 7: (14) w0 -= -1607454672 8: (25) if r0 > 0x30303030 goto pc+0 9: (76) if w0 s>= 0x303030 goto pc+2 10: (05) goto pc-1 11: (05) goto pc-1 12: (95) exit The verifier rewrote original instructions it recognized as dead code with 'goto pc-1', but reality differs from verifier simulation in that we're actually able to trigger a hang due to hitting the 'goto pc-1' instructions. Taking different examples to make the issue more obvious: in this example we're probing bounds on a completely unknown scalar variable in r1: [...] 
5: R0_w=inv1 R1_w=inv(id=0) R10=fp0 5: (18) r2 = 0x4000000000 7: R0_w=inv1 R1_w=inv(id=0) R2_w=inv274877906944 R10=fp0 7: (18) r3 = 0x2000000000 9: R0_w=inv1 R1_w=inv(id=0) R2_w=inv274877906944 R3_w=inv137438953472 R10=fp0 9: (18) r4 = 0x400 11: R0_w=inv1 R1_w=inv(id=0) R2_w=inv274877906944 R3_w=inv137438953472 R4_w=inv1024 R10=fp0 11: (18) r5 = 0x200 13: R0_w=inv1 R1_w=inv(id=0) R2_w=inv274877906944 R3_w=inv137438953472 R4_w=inv1024 R5_w=inv512 R10=fp0 13: (2d) if r1 > r2 goto pc+4 R0_w=inv1 R1_w=inv(id=0,umax_value=274877906944,var_off=(0x0; 0x7fffffffff)) R2_w=inv274877906944 R3_w=inv137438953472 R4_w=inv1024 R5_w=inv512 R10=fp0 14: R0_w=inv1 R1_w=inv(id=0,umax_value=274877906944,var_off=(0x0; 0x7fffffffff)) R2_w=inv274877906944 R3_w=inv137438953472 R4_w=inv1024 R5_w=inv512 R10=fp0 14: (ad) if r1 < r3 goto pc+3 R0_w=inv1 R1_w=inv(id=0,umin_value=137438953472,umax_value=274877906944,var_off=(0x0; 0x7fffffffff)) R2_w=inv274877906944 R3_w=inv137438953472 R4_w=inv1024 R5_w=inv512 R10=fp0 15: R0=inv1 R1=inv(id=0,umin_value=137438953472,umax_value=274877906944,var_off=(0x0; 0x7fffffffff)) R2=inv274877906944 R3=inv137438953472 R4=inv1024 R5=inv512 R10=fp0 15: (2e) if w1 > w4 goto pc+2 R0=inv1 R1=inv(id=0,umin_value=137438953472,umax_value=274877906944,var_off=(0x0; 0x7f00000000)) R2=inv274877906944 R3=inv137438953472 R4=inv1024 R5=inv512 R10=fp0 16: R0=inv1 R1=inv(id=0,umin_value=137438953472,umax_value=274877906944,var_off=(0x0; 0x7f00000000)) R2=inv274877906944 R3=inv137438953472 R4=inv1024 R5=inv512 R10=fp0 16: (ae) if w1 < w5 goto pc+1 R0=inv1 R1=inv(id=0,umin_value=137438953472,umax_value=274877906944,var_off=(0x0; 0x7f00000000)) R2=inv274877906944 R3=inv137438953472 R4=inv1024 R5=inv512 R10=fp0 [...] We're first probing lower/upper bounds via jmp64, later we do a similar check via jmp32 and examine the resulting var_off there. After fall-through in insn 14, we get the following bounded r1 with 0x7fffffffff unknown marked bits in the variable section. Thus, after knowing r1 <= 0x4000000000 and r1 >= 0x2000000000: max: 0b100000000000000000000000000000000000000 / 0x4000000000 var: 0b111111111111111111111111111111111111111 / 0x7fffffffff min: 0b010000000000000000000000000000000000000 / 0x2000000000 Now, in insn 15 and 16, we perform a similar probe with lower/upper bounds in jmp32. Thus, after knowing r1 <= 0x4000000000 and r1 >= 0x2000000000 and w1 <= 0x400 and w1 >= 0x200: max: 0b100000000000000000000000000000000000000 / 0x4000000000 var: 0b111111100000000000000000000000000000000 / 0x7f00000000 min: 0b010000000000000000000000000000000000000 / 0x2000000000 The lower/upper bounds haven't changed since they have high bits set in u64 space and the jmp32 tests can only refine bounds in the low bits. However, for the var part the expectation would have been 0x7f000007ff or something less precise up to 0x7fffffffff. A outcome of 0x7f00000000 is not correct since it would contradict the earlier probed bounds where we know that the result should have been in [0x200,0x400] in u32 space. Therefore, tests with such info will lead to wrong verifier assumptions later on like falsely predicting conditional jumps to be always taken, etc. 
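A quick stand-alone check of that contradiction (illustrative user-space code, not from the kernel): with var_off = (0x0; 0x7f00000000), none of the unknown bits fall within the low 32 bits, so every admissible value has w1 == 0, which is incompatible with the jmp32-derived constraint 0x200 <= w1 <= 0x400.

  #include <stdio.h>

  int main(void)
  {
	unsigned long long unknown = 0x7f00000000ULL;	/* var_off mask */
	unsigned long long lo32 = 0xffffffffULL;

	/* no unknown bit in the low half -> w1 is forced to zero */
	printf("low 32 bits forced to zero: %s\n",
	       (unknown & lo32) == 0 ? "yes" : "no");
	return 0;
  }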
The issue here is that __reg_bound_offset32()'s implementation from commit 581738a681b6 ("bpf: Provide better register bounds after jmp32 instructions") makes an incorrect range assumption:

  static void __reg_bound_offset32(struct bpf_reg_state *reg)
  {
	u64 mask = 0xffffFFFF;
	struct tnum range = tnum_range(reg->umin_value & mask,
				       reg->umax_value & mask);
	struct tnum lo32 = tnum_cast(reg->var_off, 4);
	struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32);

	reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range));
  }

In the above walk-through example, __reg_bound_offset32() as-is chose a range after masking with 0xffffffff of [0x0,0x0] since umin:0x2000000000 and umax:0x4000000000, and therefore the lo32 part was clamped to 0x0 as well. However, for the umin:0x2000000000 and umax:0x4000000000 range above we'd end up with an actual possible interval of [0x0,0xffffffff] in u32 space instead.

In case of the original reproducer, the situation looked as follows at insn 5 for r0:

[...]
5: R0_w=invP(id=0,umin_value=808464432,umax_value=5103431727,var_off=(0x0; 0x1ffffffff)) R1_w=invP808464432 R10=fp0
   0x30303030 0x13030302f
5: (de) if w1 s<= w0 goto pc+0
   R0_w=invP(id=0,umin_value=808464432,umax_value=5103431727,var_off=(0x30303020; 0x10000001f)) R1_w=invP808464432 R10=fp0
   0x30303030 0x13030302f
[...]

After the fall-through, we similarly forced the var_off result into the wrong range [0x30303030,0x3030302f], suggesting later on that the fixed bits must only be 0x30303020 with 0x10000001f unknowns, whereas such an assumption can only be made when both bounds in the hi32 range match.

Originally, I was thinking to fix this by moving reg into a temp reg and using the proper coerce_reg_to_size() helper on the temp reg, from which we could then derive the range tnum for the later intersection:

  static void __reg_bound_offset32(struct bpf_reg_state *reg)
  {
	struct bpf_reg_state tmp = *reg;
	struct tnum lo32, hi32, range;

	coerce_reg_to_size(&tmp, 4);
	range = tnum_range(tmp.umin_value, tmp.umax_value);
	lo32 = tnum_cast(reg->var_off, 4);
	hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32);
	reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range));
  }

In the case of the concrete example, this gives us a more conservative unknown section. Thus, after knowing r1 <= 0x4000000000 and r1 >= 0x2000000000 and w1 <= 0x400 and w1 >= 0x200:

  max: 0b100000000000000000000000000000000000000 / 0x4000000000
  var: 0b111111111111111111111111111111111111111 / 0x7fffffffff
  min: 0b010000000000000000000000000000000000000 / 0x2000000000

However, the new __reg_bound_offset32() above has no effect on refining the knowledge of the register contents. Meaning, if the bounds in the hi32 range mismatch, we get the identity function: the range then spans [0x0,0xffffffff], and we cast var_off into lo32 only to binary-or it with hi32 again afterwards. Likewise, if the bounds in the hi32 range match, then we mask both bounds with 0xffffffff and use the resulting umin/umax for the range to later intersect the lo32 with it. However, the previously called __reg_bound_offset() already did exactly such an intersection on the full reg, so we would only repeat the same operation on the lo32 part twice.

Given this has no effect and the original commit had false assumptions, this patch reverts the code entirely, which is also more straightforward for stable trees: apparently 581738a681b6 got auto-selected by Sasha's ML system and misclassified as a fix, so it got sucked into v5.4 where it should never have landed.
A revert is low-risk also from a user PoV since it requires a recent kernel and llc to opt-into -mcpu=v3 BPF CPU to generate jmp32 instructions. A proper bounds refinement would need a significantly more complex approach which is currently being worked, but no stable material [0]. Hence revert is best option for stable. After the revert, the original reported program gets rejected as follows: 1: (7f) r0 >>= r0 2: (14) w0 -= 808464432 3: (07) r0 += 808464432 4: (b7) r1 = 808464432 5: (de) if w1 s<= w0 goto pc+0 R0_w=invP(id=0,umin_value=808464432,umax_value=5103431727,var_off=(0x0; 0x1ffffffff)) R1_w=invP808464432 R10=fp0 6: (07) r0 += -2144337872 7: (14) w0 -= -1607454672 8: (25) if r0 > 0x30303030 goto pc+0 R0_w=invP(id=0,umax_value=808464432,var_off=(0x0; 0x3fffffff)) R1_w=invP808464432 R10=fp0 9: (76) if w0 s>= 0x303030 goto pc+2 R0=invP(id=0,umax_value=3158063,var_off=(0x0; 0x3fffff)) R1=invP808464432 R10=fp0 10: (30) r0 = *(u8 *)skb[808464432] BPF_LD_[ABS|IND] uses reserved fields processed 11 insns (limit 1000000) [...] [0] https://lore.kernel.org/bpf/158507130343.15666.8018068546764556975.stgit@john-Precision-5820-Tower/T/ Fixes: 581738a681b6 ("bpf: Provide better register bounds after jmp32 instructions") Reported-by: Anatoly Trosinenko Change-Id: I9dda74f58be79a042bdfc656e03b33f59d4c7578 Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200330160324.15259-2-daniel@iogearbox.net Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2b1ed8ca5e80..7ec6af2f5326 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -978,17 +978,6 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->umax_value)); } -static void __reg_bound_offset32(struct bpf_reg_state *reg) -{ - u64 mask = 0xffffFFFF; - struct tnum range = tnum_range(reg->umin_value & mask, - reg->umax_value & mask); - struct tnum lo32 = tnum_cast(reg->var_off, 4); - struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32); - - reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range)); -} - /* Reset the min/max bounds of a register */ static void __mark_reg_unbounded(struct bpf_reg_state *reg) { @@ -5599,10 +5588,6 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); - if (is_jmp32) { - __reg_bound_offset32(false_reg); - __reg_bound_offset32(true_reg); - } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. @@ -5712,10 +5697,6 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); - if (is_jmp32) { - __reg_bound_offset32(false_reg); - __reg_bound_offset32(true_reg); - } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. 
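Both the reverted helper and the follow-up fix below lean on the verifier's tnum ("tracked number") encoding, where value holds the known bits and mask the unknown ones. A self-contained sketch of tnum_range()'s semantics, reconstructed here purely for illustration:

  #include <stdint.h>
  #include <stdio.h>

  struct tnum { uint64_t value; uint64_t mask; };

  static struct tnum tnum_range(uint64_t min, uint64_t max)
  {
	uint64_t chi = min ^ max, delta;
	int bits = chi ? 64 - __builtin_clzll(chi) : 0;	/* fls64(chi) */

	if (bits > 63)
		return (struct tnum){ 0, ~0ULL };	/* fully unknown */
	delta = (1ULL << bits) - 1;	/* bits below the highest differing one */
	return (struct tnum){ min & ~delta, delta };
  }

  int main(void)
  {
	struct tnum t = tnum_range(0x200, 0x400);

	/* prints value=0 mask=0x7ff: bits 0-10 unknown, the rest known 0 */
	printf("value=%#llx mask=%#llx\n",
	       (unsigned long long)t.value, (unsigned long long)t.mask);
	return 0;
  }

For the walk-through above, tnum_range(0x200, 0x400) yields (0x0; 0x7ff), which is why the expected jmp32 refinement was 0x7f000007ff rather than the bogus 0x7f00000000.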
From 9bbe16c549f77acd784305c4e58b0bac486710c5 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 30 Mar 2020 18:03:23 +0200 Subject: [PATCH 1350/1640] UPSTREAM: bpf: Fix tnum constraints for 32-bit comparisons [ Upstream commit 604dca5e3af1db98bd123b7bfc02b017af99e3a0 ] The BPF verifier tried to track values based on 32-bit comparisons by (ab)using the tnum state via 581738a681b6 ("bpf: Provide better register bounds after jmp32 instructions"). The idea is that after a check like this: if ((u32)r0 > 3) exit We can't meaningfully constrain the arithmetic-range-based tracking, but we can update the tnum state to (value=0,mask=0xffff'ffff'0000'0003). However, the implementation from 581738a681b6 didn't compute the tnum constraint based on the fixed operand, but instead derives it from the arithmetic-range-based tracking. This means that after the following sequence of operations: if (r0 >= 0x1'0000'0001) exit if ((u32)r0 > 7) exit The verifier assumed that the lower half of r0 is in the range (0, 0) and apply the tnum constraint (value=0,mask=0xffff'ffff'0000'0000) thus causing the overall tnum to be (value=0,mask=0x1'0000'0000), which was incorrect. Provide a fixed implementation. Fixes: 581738a681b6 ("bpf: Provide better register bounds after jmp32 instructions") Change-Id: Ie6c6a7b8f95738a3ebf0bcacf74f4d1426391c72 Signed-off-by: Jann Horn Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200330160324.15259-3-daniel@iogearbox.net Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 108 ++++++++++++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 36 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7ec6af2f5326..140e38afb75f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5472,6 +5472,70 @@ static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) reg->smax_value <= 0 && reg->smin_value >= S32_MIN); } +/* Constrain the possible values of @reg with unsigned upper bound @bound. + * If @is_exclusive, @bound is an exclusive limit, otherwise it is inclusive. + * If @is_jmp32, @bound is a 32-bit value that only constrains the low 32 bits + * of @reg. + */ +static void set_upper_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32, + bool is_exclusive) +{ + if (is_exclusive) { + /* There are no values for `reg` that make `reg<0` true. */ + if (bound == 0) + return; + bound--; + } + if (is_jmp32) { + /* Constrain the register's value in the tnum representation. + * For 64-bit comparisons this happens later in + * __reg_bound_offset(), but for 32-bit comparisons, we can be + * more precise than what can be derived from the updated + * numeric bounds. + */ + struct tnum t = tnum_range(0, bound); + + t.mask |= ~0xffffffffULL; /* upper half is unknown */ + reg->var_off = tnum_intersect(reg->var_off, t); + + /* Compute the 64-bit bound from the 32-bit bound. */ + bound += gen_hi_max(reg->var_off); + } + reg->umax_value = min(reg->umax_value, bound); +} + +/* Constrain the possible values of @reg with unsigned lower bound @bound. + * If @is_exclusive, @bound is an exclusive limit, otherwise it is inclusive. + * If @is_jmp32, @bound is a 32-bit value that only constrains the low 32 bits + * of @reg. + */ +static void set_lower_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32, + bool is_exclusive) +{ + if (is_exclusive) { + /* There are no values for `reg` that make `reg>MAX` true. */ + if (bound == (is_jmp32 ? 
U32_MAX : U64_MAX)) + return; + bound++; + } + if (is_jmp32) { + /* Constrain the register's value in the tnum representation. + * For 64-bit comparisons this happens later in + * __reg_bound_offset(), but for 32-bit comparisons, we can be + * more precise than what can be derived from the updated + * numeric bounds. + */ + struct tnum t = tnum_range(bound, U32_MAX); + + t.mask |= ~0xffffffffULL; /* upper half is unknown */ + reg->var_off = tnum_intersect(reg->var_off, t); + + /* Compute the 64-bit bound from the 32-bit bound. */ + bound += gen_hi_min(reg->var_off); + } + reg->umin_value = max(reg->umin_value, bound); +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. @@ -5527,15 +5591,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JGE: case BPF_JGT: { - u64 false_umax = opcode == BPF_JGT ? val : val - 1; - u64 true_umin = opcode == BPF_JGT ? val + 1 : val; - - if (is_jmp32) { - false_umax += gen_hi_max(false_reg->var_off); - true_umin += gen_hi_min(true_reg->var_off); - } - false_reg->umax_value = min(false_reg->umax_value, false_umax); - true_reg->umin_value = max(true_reg->umin_value, true_umin); + set_upper_bound(false_reg, val, is_jmp32, opcode == BPF_JGE); + set_lower_bound(true_reg, val, is_jmp32, opcode == BPF_JGT); break; } case BPF_JSGE: @@ -5556,15 +5613,8 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JLE: case BPF_JLT: { - u64 false_umin = opcode == BPF_JLT ? val : val + 1; - u64 true_umax = opcode == BPF_JLT ? val - 1 : val; - - if (is_jmp32) { - false_umin += gen_hi_min(false_reg->var_off); - true_umax += gen_hi_max(true_reg->var_off); - } - false_reg->umin_value = max(false_reg->umin_value, false_umin); - true_reg->umax_value = min(true_reg->umax_value, true_umax); + set_lower_bound(false_reg, val, is_jmp32, opcode == BPF_JLE); + set_upper_bound(true_reg, val, is_jmp32, opcode == BPF_JLT); break; } case BPF_JSLE: @@ -5639,15 +5689,8 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JGE: case BPF_JGT: { - u64 false_umin = opcode == BPF_JGT ? val : val + 1; - u64 true_umax = opcode == BPF_JGT ? val - 1 : val; - - if (is_jmp32) { - false_umin += gen_hi_min(false_reg->var_off); - true_umax += gen_hi_max(true_reg->var_off); - } - false_reg->umin_value = max(false_reg->umin_value, false_umin); - true_reg->umax_value = min(true_reg->umax_value, true_umax); + set_lower_bound(false_reg, val, is_jmp32, opcode == BPF_JGE); + set_upper_bound(true_reg, val, is_jmp32, opcode == BPF_JGT); break; } case BPF_JSGE: @@ -5665,15 +5708,8 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JLE: case BPF_JLT: { - u64 false_umax = opcode == BPF_JLT ? val : val - 1; - u64 true_umin = opcode == BPF_JLT ? 
val + 1 : val; - - if (is_jmp32) { - false_umax += gen_hi_max(false_reg->var_off); - true_umin += gen_hi_min(true_reg->var_off); - } - false_reg->umax_value = min(false_reg->umax_value, false_umax); - true_reg->umin_value = max(true_reg->umin_value, true_umin); + set_upper_bound(false_reg, val, is_jmp32, opcode == BPF_JLE); + set_lower_bound(true_reg, val, is_jmp32, opcode == BPF_JLT); break; } case BPF_JSLE: From 659a019ab5417f8ed92e34108dadcdb5c4c2cf31 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 17 Apr 2020 02:00:06 +0200 Subject: [PATCH 1351/1640] UPSTREAM: bpf: Forbid XADD on spilled pointers for unprivileged users commit 6e7e63cbb023976d828cdb22422606bf77baa8a9 upstream. When check_xadd() verifies an XADD operation on a pointer to a stack slot containing a spilled pointer, check_stack_read() verifies that the read, which is part of XADD, is valid. However, since the placeholder value -1 is passed as `value_regno`, check_stack_read() can only return a binary decision and can't return the type of the value that was read. The intent here is to verify whether the value read from the stack slot may be used as a SCALAR_VALUE; but since check_stack_read() doesn't check the type, and the type information is lost when check_stack_read() returns, this is not enforced, and a malicious user can abuse XADD to leak spilled kernel pointers. Fix it by letting check_stack_read() verify that the value is usable as a SCALAR_VALUE if no type information is passed to the caller. To be able to use __is_pointer_value() in check_stack_read(), move it up. Fix up the expected unprivileged error message for a BPF selftest that, until now, assumed that unprivileged users can use XADD on stack-spilled pointers. This also gives us a test for the behavior introduced in this patch for free. In theory, this could also be fixed by forbidding XADD on stack spills entirely, since XADD is a locked operation (for operations on memory with concurrency) and there can't be any concurrency on the BPF stack; but Alexei has said that he wants to keep XADD on stack slots working to avoid changes to the test suite [1]. 
The following BPF program demonstrates how to leak a BPF map pointer as an unprivileged user using this bug: // r7 = map_pointer BPF_LD_MAP_FD(BPF_REG_7, small_map), // r8 = launder(map_pointer) BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_7, -8), BPF_MOV64_IMM(BPF_REG_1, 0), ((struct bpf_insn) { .code = BPF_STX | BPF_DW | BPF_XADD, .dst_reg = BPF_REG_FP, .src_reg = BPF_REG_1, .off = -8 }), BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_FP, -8), // store r8 into map BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_7), BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP), BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -4), BPF_ST_MEM(BPF_W, BPF_REG_ARG2, 0, 0), BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_8, 0), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN() [1] https://lore.kernel.org/bpf/20200416211116.qxqcza5vo2ddnkdq@ast-mbp.dhcp.thefacebook.com/ Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Change-Id: Id5e3abfb68744aab78d7d7609c1b6e88e68993ea Signed-off-by: Jann Horn Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200417000007.10734-1-jannh@google.com Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 140e38afb75f..470bf7303b1c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1866,6 +1866,15 @@ static bool register_is_const(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); } +static bool __is_pointer_value(bool allow_ptr_leaks, + const struct bpf_reg_state *reg) +{ + if (allow_ptr_leaks) + return false; + + return reg->type != SCALAR_VALUE; +} + static void save_register_state(struct bpf_func_state *state, int spi, struct bpf_reg_state *reg) { @@ -2032,6 +2041,16 @@ static int check_stack_read(struct bpf_verifier_env *env, * which resets stack/reg liveness for state transitions */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; + } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { + /* If value_regno==-1, the caller is asking us whether + * it is acceptable to use this value as a SCALAR_VALUE + * (e.g. for XADD). + * We must not allow unprivileged callers to do that + * with spilled pointers. + */ + verbose(env, "leaking pointer from stack off %d\n", + off); + return -EACCES; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { @@ -2392,15 +2411,6 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, return -EACCES; } -static bool __is_pointer_value(bool allow_ptr_leaks, - const struct bpf_reg_state *reg) -{ - if (allow_ptr_leaks) - return false; - - return reg->type != SCALAR_VALUE; -} - static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) { return cur_regs(env) + regno; From 7f7182774485e9ca19ed02275b8a5fcb86f4a3a9 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 30 Apr 2020 08:18:51 +0000 Subject: [PATCH 1352/1640] UPSTREAM: bpf: Fix error return code in map_lookup_and_delete_elem() [ Upstream commit 7f645462ca01d01abb94d75e6768c8b3ed3a188b ] Fix to return negative error code -EFAULT from the copy_to_user() error handling case instead of 0, as done elsewhere in this function. 
Fixes: bd513cd08f10 ("bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall") Change-Id: I72827ff32b4603f5922cf60abbc9dff3d6dd4d88 Signed-off-by: Wei Yongjun Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200430081851.166996-1-weiyongjun1@huawei.com Signed-off-by: Sasha Levin --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9f6bee48c066..64ac10a591f0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1148,8 +1148,10 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) if (err) goto free_value; - if (copy_to_user(uvalue, value, value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) { + err = -EFAULT; goto free_value; + } err = 0; From fe2ea4170e91c3960b23889f917c2fd4151264fd Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 18 Mar 2020 15:27:46 -0700 Subject: [PATCH 1353/1640] BACKPORT: bpf: Support llvm-objcopy for vmlinux BTF commit 90ceddcb495008ac8ba7a3dce297841efcd7d584 upstream. Simplify gen_btf logic to make it work with llvm-objcopy. The existing 'file format' and 'architecture' parsing logic is brittle and does not work with llvm-objcopy/llvm-objdump. 'file format' output of llvm-objdump>=11 will match GNU objdump, but 'architecture' (bfdarch) may not. .BTF in .tmp_vmlinux.btf is non-SHF_ALLOC. Add the SHF_ALLOC flag because it is part of vmlinux image used for introspection. C code can reference the section via linker script defined __start_BTF and __stop_BTF. This fixes a small problem that previous .BTF had the SHF_WRITE flag (objcopy -I binary -O elf* synthesized .data). Additionally, `objcopy -I binary` synthesized symbols _binary__btf_vmlinux_bin_start and _binary__btf_vmlinux_bin_stop (not used elsewhere) are replaced with more commonplace __start_BTF and __stop_BTF. Add 2>/dev/null because GNU objcopy (but not llvm-objcopy) warns "empty loadable segment detected at vaddr=0xffffffff81000000, is this intentional?" We use a dd command to change the e_type field in the ELF header from ET_EXEC to ET_REL so that lld will accept .btf.vmlinux.bin.o. Accepting ET_EXEC as an input file is an extremely rare GNU ld feature that lld does not intend to support, because this is error-prone. The output section description .BTF in include/asm-generic/vmlinux.lds.h avoids potential subtle orphan section placement issues and suppresses --orphan-handling=warn warnings. 
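The single-byte dd write works because of the fixed ELF header layout; a hedged spot-check, assuming a little-endian target such as x86:

  #include <assert.h>
  #include <elf.h>
  #include <stddef.h>

  int main(void)
  {
	/* e_ident occupies the first 16 bytes, so the 16-bit e_type
	 * field starts at byte 16; writing the single byte 0x01 there
	 * turns ET_EXEC (2) into ET_REL (1) on a little-endian target,
	 * which is all the dd invocation above does.
	 */
	assert(offsetof(Elf64_Ehdr, e_type) == 16);
	assert(ET_REL == 1 && ET_EXEC == 2);
	return 0;
  }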
Fixes: df786c9b9476 ("bpf: Force .BTF section start to zero when dumping from vmlinux") Fixes: cb0cc635c7a9 ("powerpc: Include .BTF section") Reported-by: Nathan Chancellor Change-Id: I145b50cfe3e0e1e034fb750cf55642905382a06c Signed-off-by: Fangrui Song Signed-off-by: Daniel Borkmann Tested-by: Stanislav Fomichev Tested-by: Andrii Nakryiko Reviewed-by: Stanislav Fomichev Reviewed-by: Kees Cook Acked-by: Andrii Nakryiko Acked-by: Michael Ellerman (powerpc) Link: https://github.com/ClangBuiltLinux/linux/issues/871 Link: https://lore.kernel.org/bpf/20200318222746.173648-1-maskray@google.com Signed-off-by: Maria Teguiani Tested-by: Matthias Maennich Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/vmlinux.lds.S | 6 ------ include/asm-generic/vmlinux.lds.h | 22 +++++++++++++++++++--- kernel/bpf/sysfs_btf.c | 11 +++++------ scripts/link-vmlinux.sh | 17 ++++++++++------- 4 files changed, 34 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7346875ac61d..ca3d8e121663 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -332,12 +332,6 @@ SECTIONS *(.branch_lt) } -#ifdef CONFIG_DEBUG_INFO_BTF - .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { - *(.BTF) - } -#endif - .opd : AT(ADDR(.opd) - LOAD_OFFSET) { *(.opd) } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 851da9de20fe..c12fef18ae55 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -504,10 +504,12 @@ VMLINUX_SYMBOL(__start___modver) = .; \ KEEP(*(__modver)) \ VMLINUX_SYMBOL(__stop___modver) = .; \ - . = ALIGN((align)); \ - VMLINUX_SYMBOL(__end_rodata) = .; \ } \ - . = ALIGN((align)); + \ + BTF \ + \ + . = ALIGN((align)); \ + VMLINUX_SYMBOL(__end_rodata) = .; /* RODATA & RO_DATA provided for backward compatibility. 
* All archs are supposed to use RO_DATA() */ @@ -609,6 +611,20 @@ VMLINUX_SYMBOL(__stop___ex_table) = .; \ } +/* + * .BTF + */ +#ifdef CONFIG_DEBUG_INFO_BTF +#define BTF \ + .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \ + __start_BTF = .; \ + *(.BTF) \ + __stop_BTF = .; \ + } +#else +#define BTF +#endif + /* * Init task */ diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 7ae5dddd1fe6..3b495773de5a 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -9,15 +9,15 @@ #include /* See scripts/link-vmlinux.sh, gen_btf() func for details */ -extern char __weak _binary__btf_vmlinux_bin_start[]; -extern char __weak _binary__btf_vmlinux_bin_end[]; +extern char __weak __start_BTF[]; +extern char __weak __stop_BTF[]; static ssize_t btf_vmlinux_read(struct file *file, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { - memcpy(buf, _binary__btf_vmlinux_bin_start + off, len); + memcpy(buf, __start_BTF + off, len); return len; } @@ -30,15 +30,14 @@ static struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { - if (!_binary__btf_vmlinux_bin_start) + if (!__start_BTF) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); if (!btf_kobj) return -ENOMEM; - bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end - - _binary__btf_vmlinux_bin_start; + bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); } diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 371164c692d6..fe62d2fc881b 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -208,7 +208,6 @@ vmlinux_link() gen_btf() { local pahole_ver - local bin_arch if ! [ -x "$(command -v ${PAHOLE})" ]; then info "BTF" "${1}: pahole (${PAHOLE}) is not available" @@ -225,12 +224,16 @@ gen_btf() vmlinux_link ${1} LLVM_OBJCOPY=${OBJCOPY} ${PAHOLE} -J ${1} - # dump .BTF section into raw binary file to link with final vmlinux - bin_arch=$(${OBJDUMP} -f ${1} | grep architecture | \ - cut -d, -f1 | cut -d' ' -f2) - ${OBJCOPY} --dump-section .BTF=.btf.vmlinux.bin ${1} 2>/dev/null - ${OBJCOPY} -I binary -O ${CONFIG_OUTPUT_FORMAT} -B ${bin_arch} \ - --rename-section .data=.BTF .btf.vmlinux.bin ${2} + # Create ${2} which contains just .BTF section but no symbols. Add + # SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all + # deletes all symbols including __start_BTF and __stop_BTF, which will + # be redefined in the linker script. Add 2>/dev/null to suppress GNU + # objcopy warnings: "empty loadable segment detected at ..." + ${OBJCOPY} --only-section=.BTF --set-section-flags .BTF=alloc,readonly \ + --strip-all ${1} ${2} 2>/dev/null + # Change e_type to ET_REL so that it can be used to link final vmlinux. + # Unlike GNU ld, lld does not allow an ET_EXEC input. + printf '\1' | dd of=${2} conv=notrunc bs=1 seek=16 status=none } # Create ${2} .o file with all symbols from the ${1} object file From e05057708cc4cf8de8f0cb540b690229af55cdc8 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 27 May 2020 18:56:59 +0000 Subject: [PATCH 1354/1640] UPSTREAM: bpf: Fix map permissions check [ Upstream commit 1ea0f9120c8ce105ca181b070561df5cbd6bc049 ] The map_lookup_and_delete_elem() function should check for both FMODE_CAN_WRITE and FMODE_CAN_READ permissions because it returns a map element to user space. 
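From user space the rule can be exercised roughly as follows (hypothetical sketch using libbpf; exact return/errno conventions depend on the libbpf version):

  #include <bpf/bpf.h>
  #include <errno.h>

  /* With a map fd opened via BPF_F_RDONLY, lookup-and-delete must now
   * fail with EPERM: the operation reads an element back to user
   * space and deletes it, so it needs both permissions.
   */
  static int expect_eperm(int rdonly_map_fd, void *key, void *value)
  {
	int err = bpf_map_lookup_and_delete_elem(rdonly_map_fd, key, value);

	return (err && errno == EPERM) ? 0 : -1;
  }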
Fixes: bd513cd08f10 ("bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall") Change-Id: I5f610b367a07d3cf203ea0270573b27791831ad0 Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200527185700.14658-5-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/syscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 64ac10a591f0..de4e47d1a31c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1120,7 +1120,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || + !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } From a5eb936703f49253b2d47e476a2329612d4f8632 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 16 Jun 2020 16:28:29 +0200 Subject: [PATCH 1355/1640] UPSTREAM: devmap: Use bpf_map_area_alloc() for allocating hash buckets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 99c51064fb06146b3d494b745c947e438a10aaa7 ] Syzkaller discovered that creating a hash of type devmap_hash with a large number of entries can hit the memory allocator limit for allocating contiguous memory regions. There's really no reason to use kmalloc_array() directly in the devmap code, so just switch it to the existing bpf_map_area_alloc() function that is used elsewhere. Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Xiumei Mu Change-Id: Ie563f711cae0b2b3fead2ae1430439e15e5c964c Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200616142829.114173-1-toke@redhat.com Signed-off-by: Sasha Levin --- kernel/bpf/devmap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index b4b6b77f309c..6684696fa457 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -88,12 +88,13 @@ struct bpf_dtab { static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static struct hlist_head *dev_map_create_hash(unsigned int entries) +static struct hlist_head *dev_map_create_hash(unsigned int entries, + int numa_node) { int i; struct hlist_head *hash; - hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); @@ -151,7 +152,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, + dtab->map.numa_node); if (!dtab->dev_index_head) goto free_percpu; @@ -249,7 +251,7 @@ static void dev_map_free(struct bpf_map *map) } } - kfree(dtab->dev_index_head); + bpf_map_area_free(dtab->dev_index_head); } else { for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; From 602e0c348965561844992fcb7fa5e4ac8d69d748 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 16 Jun 2020 18:04:14 -0700 Subject: [PATCH 1356/1640] UPSTREAM: bpf: Don't return EINVAL 
from {get,set}sockopt when optlen > PAGE_SIZE [ Upstream commit d8fe449a9c51a37d844ab607e14e2f5c657d3cf2 ] Attaching to these hooks can break iptables because its optval is usually quite big, or at least bigger than the current PAGE_SIZE limit. David also mentioned some SCTP options can be big (around 256k). For such optvals we expose only the first PAGE_SIZE bytes to the BPF program. BPF program has two options: 1. Set ctx->optlen to 0 to indicate that the BPF's optval should be ignored and the kernel should use original userspace value. 2. Set ctx->optlen to something that's smaller than the PAGE_SIZE. v5: * use ctx->optlen == 0 with trimmed buffer (Alexei Starovoitov) * update the docs accordingly v4: * use temporary buffer to avoid optval == optval_end == NULL; this removes the corner case in the verifier that might assume non-zero PTR_TO_PACKET/PTR_TO_PACKET_END. v3: * don't increase the limit, bypass the argument v2: * proper comments formatting (Jakub Kicinski) Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Change-Id: I5cc4abaea99d89a7133727589a736222f9ef2478 Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: David Laight Link: https://lore.kernel.org/bpf/20200617010416.93086-1-sdf@google.com Signed-off-by: Sasha Levin --- kernel/bpf/cgroup.c | 53 ++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 869e2e1860e8..b701af27a779 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -966,16 +966,23 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) { - if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) + if (unlikely(max_optlen < 0)) return -EINVAL; + if (unlikely(max_optlen > PAGE_SIZE)) { + /* We don't expose optvals that are greater than PAGE_SIZE + * to the BPF program. + */ + max_optlen = PAGE_SIZE; + } + ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; ctx->optval_end = ctx->optval + max_optlen; - return 0; + return max_optlen; } static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) @@ -1009,13 +1016,13 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ max_optlen = max_t(int, 16, *optlen); - ret = sockopt_alloc_buf(&ctx, max_optlen); - if (ret) - return ret; + max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + if (max_optlen < 0) + return max_optlen; ctx.optlen = *optlen; - if (copy_from_user(ctx.optval, optval, *optlen) != 0) { + if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) { ret = -EFAULT; goto out; } @@ -1043,8 +1050,14 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, /* export any potential modifications */ *level = ctx.level; *optname = ctx.optname; - *optlen = ctx.optlen; - *kernel_optval = ctx.optval; + + /* optlen == 0 from BPF indicates that we should + * use original userspace data. 
+ */ + if (ctx.optlen != 0) { + *optlen = ctx.optlen; + *kernel_optval = ctx.optval; + } } out: @@ -1076,12 +1089,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) return retval; - ret = sockopt_alloc_buf(&ctx, max_optlen); - if (ret) - return ret; - ctx.optlen = max_optlen; + max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + if (max_optlen < 0) + return max_optlen; + if (!retval) { /* If kernel getsockopt finished successfully, * copy whatever was returned to the user back @@ -1095,10 +1108,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - if (ctx.optlen > max_optlen) - ctx.optlen = max_optlen; - - if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { + if (copy_from_user(ctx.optval, optval, + min(ctx.optlen, max_optlen)) != 0) { ret = -EFAULT; goto out; } @@ -1127,10 +1138,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - if (copy_to_user(optval, ctx.optval, ctx.optlen) || - put_user(ctx.optlen, optlen)) { - ret = -EFAULT; - goto out; + if (ctx.optlen != 0) { + if (copy_to_user(optval, ctx.optval, ctx.optlen) || + put_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } } ret = ctx.retval; From 778ba90f7c9c82a0854c0cdaeae7fa336a12f38c Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:28 +0100 Subject: [PATCH 1357/1640] UPSTREAM: bpf: sockmap: Require attach_bpf_fd when detaching a program commit bb0de3131f4c60a9bf976681e0fe4d1e55c7a821 upstream. The sockmap code currently ignores the value of attach_bpf_fd when detaching a program. This is contrary to the usual behaviour of checking that attach_bpf_fd represents the currently attached program. Ensure that attach_bpf_fd is indeed the currently attached program. It turns out that all sockmap selftests already do this, which indicates that this is unlikely to cause breakage. 
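From the caller's side the new contract looks roughly like this (hypothetical usage sketch; bpf_prog_detach2() passes the program fd as attach_bpf_fd and the map fd as target_fd):

  #include <bpf/bpf.h>

  /* Detach must now name the program it expects to be attached; a
   * stale or wrong fd fails with -ENOENT instead of silently clearing
   * whatever program happened to be attached.
   */
  static int detach_parser(int sock_map_fd, int prog_fd)
  {
	return bpf_prog_detach2(prog_fd, sock_map_fd,
				BPF_SK_SKB_STREAM_PARSER);
  }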
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Change-Id: I7140e8d681828ce099593f24db90d1b4af3a03dc Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-5-lmb@cloudflare.com Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf.h | 13 +++++++++-- include/linux/skmsg.h | 13 +++++++++++ kernel/bpf/syscall.c | 4 ++-- net/core/sock_map.c | 50 ++++++++++++++++++++++++++++++++++++++----- 4 files changed, 71 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2466c7b39852..e7117c5a49dd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -961,11 +961,14 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_BPF_STREAM_PARSER) -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which); +int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); #else static inline int sock_map_prog_update(struct bpf_map *map, - struct bpf_prog *prog, u32 which) + struct bpf_prog *prog, + struct bpf_prog *old, u32 which) { return -EOPNOTSUPP; } @@ -975,6 +978,12 @@ static inline int sock_map_get_from_fd(const union bpf_attr *attr, { return -EINVAL; } + +static inline int sock_map_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) +{ + return -EOPNOTSUPP; +} #endif #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 95678103c4a0..ce7aeeed62da 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -361,6 +361,19 @@ static inline void psock_set_prog(struct bpf_prog **pprog, bpf_prog_put(prog); } +static inline int psock_replace_prog(struct bpf_prog **pprog, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + if (cmpxchg(pprog, old, prog) != old) + return -ENOENT; + + if (old) + bpf_prog_put(old); + + return 0; +} + static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index de4e47d1a31c..5fcdc882c314 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2029,10 +2029,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sock_map_get_from_fd(attr, NULL); + return sock_map_prog_detach(attr, BPF_PROG_TYPE_SK_MSG); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sock_map_get_from_fd(attr, NULL); + return sock_map_prog_detach(attr, BPF_PROG_TYPE_SK_SKB); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_FLOW_DISSECTOR: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index b476d119de7e..2a16e4d77040 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -71,7 +71,42 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, attr->attach_type); + ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + fdput(f); + return ret; +} + +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + u32 ufd = attr->target_fd; + struct bpf_prog *prog; + struct bpf_map *map; + struct fd f; + int ret; + + if 
(attr->attach_flags) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + prog = bpf_prog_get(attr->attach_bpf_fd); + if (IS_ERR(prog)) { + ret = PTR_ERR(prog); + goto put_map; + } + + if (prog->type != ptype) { + ret = -EINVAL; + goto put_prog; + } + + ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); +put_prog: + bpf_prog_put(prog); +put_map: fdput(f); return ret; } @@ -958,27 +993,32 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) } int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - u32 which) + struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **pprog; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - psock_set_prog(&progs->msg_parser, prog); + pprog = &progs->msg_parser; break; case BPF_SK_SKB_STREAM_PARSER: - psock_set_prog(&progs->skb_parser, prog); + pprog = &progs->skb_parser; break; case BPF_SK_SKB_STREAM_VERDICT: - psock_set_prog(&progs->skb_verdict, prog); + pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } + if (old) + return psock_replace_prog(pprog, prog, old); + + psock_set_prog(pprog, prog); return 0; } From 0bf27479be5a054886e5690d03c58be26dbb750e Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Sat, 19 Sep 2020 22:01:33 -0700 Subject: [PATCH 1358/1640] UPSTREAM: bpf: Fix sysfs export of empty BTF section commit e23bb04b0c938588eae41b7f4712b722290ed2b8 upstream. If BTF data is missing or removed from the ELF section it is still exported via sysfs as a zero-length file: root@OpenWrt:/# ls -l /sys/kernel/btf/vmlinux -r--r--r-- 1 root root 0 Jul 18 02:59 /sys/kernel/btf/vmlinux Moreover, reads from this file succeed and leak kernel data: root@OpenWrt:/# hexdump -C /sys/kernel/btf/vmlinux|head -10 000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| * 000cc0 00 00 00 00 00 00 00 00 00 00 00 00 80 83 b0 80 |................| 000cd0 00 10 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| 000ce0 00 00 00 00 00 00 00 00 00 00 00 00 57 ac 6e 9d |............W.n.| 000cf0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| * 002650 00 00 00 00 00 00 00 10 00 00 00 01 00 00 00 01 |................| 002660 80 82 9a c4 80 85 97 80 81 a9 51 68 00 00 00 02 |..........Qh....| 002670 80 25 44 dc 80 85 97 80 81 a9 50 24 81 ab c4 60 |.%D.......P$...`| This situation was first observed with kernel 5.4.x, cross-compiled for a MIPS target system. Fix by adding a sanity-check for export of zero-length data sections. 
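The two cases the init path has to tell apart, sketched for illustration (the weak symbols come from the linker script introduced earlier in this series):

  /* Illustrative sketch, not the diff itself:
   *
   *   .BTF absent entirely:   __start_BTF == NULL (weak, unresolved)
   *   .BTF present but empty: __start_BTF != NULL, yet
   *                           __stop_BTF - __start_BTF == 0
   *
   * Only the first case was handled before this fix, which is how an
   * empty section still produced the zero-length sysfs file above.
   */
  extern char __attribute__((weak)) __start_BTF[];
  extern char __attribute__((weak)) __stop_BTF[];

  static int btf_section_usable(void)
  {
	return __start_BTF && (__stop_BTF - __start_BTF) > 0;
  }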
Fixes: 341dfcf8d78e ("btf: expose BTF info through sysfs") Change-Id: I301a5c4b01a6f01e6071cd7c70d43a0c9a1a9c22 Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b38db205a66238f70823039a8c531535864eaac5.1600417359.git.Tony.Ambardar@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/sysfs_btf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 3b495773de5a..11b3380887fa 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -30,15 +30,15 @@ static struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { - if (!__start_BTF) + bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; + + if (!__start_BTF || bin_attr_btf_vmlinux.size == 0) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); if (!btf_kobj) return -ENOMEM; - bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; - return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); } From 8b8555883ae8d8853f831c88dfdd8f4cd03c6393 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:07 +0200 Subject: [PATCH 1359/1640] UPSTREAM: bpf: Limit caller's stack depth 256 for subprogs with tailcalls [ Upstream commit 7f6e4312e15a5c370e84eaa685879b6bdcc717e4 ] Protect against potential stack overflow that might happen when bpf2bpf calls get combined with tailcalls. Limit the caller's stack depth for such case down to 256 so that the worst case scenario would result in 8k stack size (32 which is tailcall limit * 256 = 8k). Suggested-by: Alexei Starovoitov Change-Id: Icb537d911f666421c2196a8411f66aa6f63fd087 Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index dbb95b500518..e0b52351127d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -353,6 +353,7 @@ struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ + bool has_tail_call; }; /* single container for all structs diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 470bf7303b1c..e41250821dd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1160,6 +1160,10 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; + if (code == (BPF_JMP | BPF_CALL) && + insn[i].imm == BPF_FUNC_tail_call && + insn[i].src_reg != BPF_PSEUDO_CALL) + subprog[cur_subprog].has_tail_call = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) @@ -2588,6 +2592,31 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) int ret_prog[MAX_CALL_FRAMES]; process_func: + /* protect against potential stack overflow that might happen when + * bpf2bpf calls get combined with tailcalls. Limit the caller's stack + * depth for such case down to 256 so that the worst case scenario + * would result in 8k stack size (32 which is tailcall limit * 256 = + * 8k). 
+ * + * To get the idea what might happen, see an example: + * func1 -> sub rsp, 128 + * subfunc1 -> sub rsp, 256 + * tailcall1 -> add rsp, 256 + * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320) + * subfunc2 -> sub rsp, 64 + * subfunc22 -> sub rsp, 128 + * tailcall2 -> add rsp, 128 + * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416) + * + * tailcall will unwind the current stack frame but it will not get rid + * of caller's stack as shown on the example above. + */ + if (idx && subprog[idx].has_tail_call && depth >= 256) { + verbose(env, + "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n", + depth); + return -EACCES; + } /* round up to 32-bytes, since this is granularity * of interpreter stack size */ From 97aa66d030cd5e115964a943196dd23759fa78fe Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 8 Sep 2020 10:57:02 -0700 Subject: [PATCH 1360/1640] UPSTREAM: bpf: Permit map_ptr arithmetic with opcode add and offset 0 [ Upstream commit 7c6967326267bd5c0dded0a99541357d70dd11ac ] Commit 41c48f3a98231 ("bpf: Support access to bpf map fields") added support to access map fields with CORE support. For example, struct bpf_map { __u32 max_entries; } __attribute__((preserve_access_index)); struct bpf_array { struct bpf_map map; __u32 elem_size; } __attribute__((preserve_access_index)); struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 4); __type(key, __u32); __type(value, __u32); } m_array SEC(".maps"); SEC("cgroup_skb/egress") int cg_skb(void *ctx) { struct bpf_array *array = (struct bpf_array *)&m_array; /* .. array->map.max_entries .. */ } In kernel, bpf_htab has similar structure, struct bpf_htab { struct bpf_map map; ... } In the above cg_skb(), to access array->map.max_entries, with CORE, the clang will generate two builtin's. base = &m_array; /* access array.map */ map_addr = __builtin_preserve_struct_access_info(base, 0, 0); /* access array.map.max_entries */ max_entries_addr = __builtin_preserve_struct_access_info(map_addr, 0, 0); max_entries = *max_entries_addr; In the current llvm, if two builtin's are in the same function or in the same function after inlining, the compiler is smart enough to chain them together and generates like below: base = &m_array; max_entries = *(base + reloc_offset); /* reloc_offset = 0 in this case */ and we are fine. But if we force no inlining for one of functions in test_map_ptr() selftest, e.g., check_default(), the above two __builtin_preserve_* will be in two different functions. In this case, we will have code like: func check_hash(): reloc_offset_map = 0; base = &m_array; map_base = base + reloc_offset_map; check_default(map_base, ...) func check_default(map_base, ...): max_entries = *(map_base + reloc_offset_max_entries); In kernel, map_ptr (CONST_PTR_TO_MAP) does not allow any arithmetic. The above "map_base = base + reloc_offset_map" will trigger a verifier failure. 
; VERIFY(check_default(&hash->map, map)); 0: (18) r7 = 0xffffb4fe8018a004 2: (b4) w1 = 110 3: (63) *(u32 *)(r7 +0) = r1 R1_w=invP110 R7_w=map_value(id=0,off=4,ks=4,vs=8,imm=0) R10=fp0 ; VERIFY_TYPE(BPF_MAP_TYPE_HASH, check_hash); 4: (18) r1 = 0xffffb4fe8018a000 6: (b4) w2 = 1 7: (63) *(u32 *)(r1 +0) = r2 R1_w=map_value(id=0,off=0,ks=4,vs=8,imm=0) R2_w=invP1 R7_w=map_value(id=0,off=4,ks=4,vs=8,imm=0) R10=fp0 8: (b7) r2 = 0 9: (18) r8 = 0xffff90bcb500c000 11: (18) r1 = 0xffff90bcb500c000 13: (0f) r1 += r2 R1 pointer arithmetic on map_ptr prohibited To fix the issue, let us permit map_ptr + 0 arithmetic which will result in exactly the same map_ptr. Change-Id: Ie648e6af57102fe751ac2d4f638a2e347334de38 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200908175702.2463625-1-yhs@fb.com Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e41250821dd6..d101467b62a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4574,6 +4574,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst, reg_type_str[ptr_reg->type]); return -EACCES; case CONST_PTR_TO_MAP: + /* smin_val represents the known value */ + if (known && smin_val == 0 && opcode == BPF_ADD) + break; + /* fall-through */ case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: From 1d35587505501f1cbb41ffdb5c4a2c0d4c6aef33 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 28 Oct 2020 18:15:05 +0100 Subject: [PATCH 1361/1640] BACKPORT: bpf: Don't rely on GCC __attribute__((optimize)) to disable GCSE [ Upstream commit 080b6f40763565f65ebb9540219c71ce885cf568 ] Commit 3193c0836 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") introduced a __no_fgcse macro that expands to a function scope __attribute__((optimize("-fno-gcse"))), to disable a GCC specific optimization that was causing trouble on x86 builds, and was not expected to have any positive effect in the first place. However, as the GCC manual documents, __attribute__((optimize)) is not for production use, and results in all other optimization options to be forgotten for the function in question. This can cause all kinds of trouble, but in one particular reported case, it causes -fno-asynchronous-unwind-tables to be disregarded, resulting in .eh_frame info to be emitted for the function. This reverts commit 3193c0836, and instead, it disables the -fgcse optimization for the entire source file, but only when building for X86 using GCC with CONFIG_BPF_JIT_ALWAYS_ON disabled. Note that the original commit states that CONFIG_RETPOLINE=n triggers the issue, whereas CONFIG_RETPOLINE=y performs better without the optimization, so it is kept disabled in both cases. 
Fixes: 3193c0836f20 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") Change-Id: I513d26e71b1c3928ed5cb9530ce2afa392aa7038 Signed-off-by: Ard Biesheuvel Signed-off-by: Alexei Starovoitov Tested-by: Geert Uytterhoeven Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/lkml/CAMuHMdUg0WJHEcq6to0-eODpXPOywLot6UD2=GFHpzoj_hCoBQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20201028171506.15682-2-ardb@kernel.org Signed-off-by: Sasha Levin --- include/linux/compiler-gcc.h | 2 -- include/linux/compiler_types.h | 4 ---- kernel/bpf/Makefile | 6 +++++- kernel/bpf/core.c | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index f5ec66a7b516..c9303b82a5f3 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -378,5 +378,3 @@ #if GCC_VERSION >= 50100 #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif - -#define __no_fgcse __attribute__((optimize("-fno-gcse"))) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index b31c17302737..226f4ea0e57c 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -288,10 +288,6 @@ struct ftrace_likely_data { #define __assume_aligned(a, ...) #endif -#ifndef __no_fgcse -# define __no_fgcse -#endif - /* Are two types/vars the same type (ignoring qualifiers)? */ #ifndef __same_type diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1d9adb212f9..b0d78bc0b197 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o -CFLAGS_core.o += $(call cc-disable-warning, override-init) +ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y) +# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details +cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse +endif +CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2c0e98af4b0c..1e65de9b8367 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1322,7 +1322,7 @@ bool bpf_opcode_in_insntable(u8 code) * * Decode and execute eBPF instructions. */ -static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z From 849f7daf5f54dc233d90eeeec629dd834c19fff9 Mon Sep 17 00:00:00 2001 From: David Verbeiren Date: Wed, 4 Nov 2020 12:23:32 +0100 Subject: [PATCH 1362/1640] UPSTREAM: bpf: Zero-fill re-used per-cpu map element [ Upstream commit d3bec0138bfbe58606fc1d6f57a4cdc1a20218db ] Zero-fill element values for all other cpus than current, just as when not using prealloc. This is the only way the bpf program can ensure known initial values for all cpus ('onallcpus' cannot be set when coming from the bpf program). The scenario is: bpf program inserts some elements in a per-cpu map, then deletes some (or userspace does). When later adding new elements using bpf_map_update_elem(), the bpf program can only set the value of the new elements for the current cpu. When prealloc is enabled, previously deleted elements are re-used. 
Without the fix, values for other cpus remain whatever they were when the re-used entry was previously freed. A selftest is added to validate correct operation in above scenario as well as in case of LRU per-cpu map element re-use. Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Change-Id: I51fa789725ab02ee5eaf1c04bd8519135accf6a7 Signed-off-by: David Verbeiren Signed-off-by: Alexei Starovoitov Acked-by: Matthieu Baerts Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201104112332.15191-1-david.verbeiren@tessares.net Signed-off-by: Sasha Levin --- kernel/bpf/hashtab.c | 30 ++- .../selftests/bpf/prog_tests/map_init.c | 214 ++++++++++++++++++ tools/testing/selftests/bpf/test_map_init.c | 33 +++ 3 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/map_init.c create mode 100644 tools/testing/selftests/bpf/test_map_init.c diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 87579912f656..0d14a2a11463 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -716,6 +716,32 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + /* When using prealloc and not setting the initial value on all cpus, + * zero-fill element values for other cpus (just as what happens when + * not using prealloc). Otherwise, bpf program has no way to ensure + * known initial values for cpus other than current one + * (onallcpus=false always when coming from bpf prog). + */ + if (htab_is_prealloc(htab) && !onallcpus) { + u32 size = round_up(htab->map.value_size, 8); + int current_cpu = raw_smp_processor_id(); + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu == current_cpu) + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, + size); + else + memset(per_cpu_ptr(pptr, cpu), 0, size); + } + } else { + pcpu_copy_value(htab, pptr, value, onallcpus); + } +} + static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && @@ -786,7 +812,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - pcpu_copy_value(htab, pptr, value, onallcpus); + pcpu_init_value(htab, pptr, value, onallcpus); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); @@ -1082,7 +1108,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { - pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/map_init.c b/tools/testing/selftests/bpf/prog_tests/map_init.c new file mode 100644 index 000000000000..14a31109dd0e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/map_init.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Tessares SA */ + +#include +#include "test_map_init.skel.h" + +#define TEST_VALUE 0x1234 +#define FILL_VALUE 0xdeadbeef + +static int nr_cpus; +static int duration; + +typedef unsigned long long map_key_t; +typedef unsigned long long map_value_t; +typedef struct { + map_value_t v; /* padding */ +} __bpf_percpu_val_align pcpu_map_value_t; + + +static int map_populate(int map_fd, int num) +{ + pcpu_map_value_t value[nr_cpus]; + int i, err; + map_key_t key; + + for (i = 0; i < 
nr_cpus; i++) + bpf_percpu(value, i) = FILL_VALUE; + + for (key = 1; key <= num; key++) { + err = bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + return -1; + } + + return 0; +} + +static struct test_map_init *setup(enum bpf_map_type map_type, int map_sz, + int *map_fd, int populate) +{ + struct test_map_init *skel; + int err; + + skel = test_map_init__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return NULL; + + err = bpf_map__set_type(skel->maps.hashmap1, map_type); + if (!ASSERT_OK(err, "bpf_map__set_type")) + goto error; + + err = bpf_map__set_max_entries(skel->maps.hashmap1, map_sz); + if (!ASSERT_OK(err, "bpf_map__set_max_entries")) + goto error; + + err = test_map_init__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto error; + + *map_fd = bpf_map__fd(skel->maps.hashmap1); + if (CHECK(*map_fd < 0, "bpf_map__fd", "failed\n")) + goto error; + + err = map_populate(*map_fd, populate); + if (!ASSERT_OK(err, "map_populate")) + goto error_map; + + return skel; + +error_map: + close(*map_fd); +error: + test_map_init__destroy(skel); + return NULL; +} + +/* executes bpf program that updates map with key, value */ +static int prog_run_insert_elem(struct test_map_init *skel, map_key_t key, + map_value_t value) +{ + struct test_map_init__bss *bss; + + bss = skel->bss; + + bss->inKey = key; + bss->inValue = value; + bss->inPid = getpid(); + + if (!ASSERT_OK(test_map_init__attach(skel), "skel_attach")) + return -1; + + /* Let tracepoint trigger */ + syscall(__NR_getpgid); + + test_map_init__detach(skel); + + return 0; +} + +static int check_values_one_cpu(pcpu_map_value_t *value, map_value_t expected) +{ + int i, nzCnt = 0; + map_value_t val; + + for (i = 0; i < nr_cpus; i++) { + val = bpf_percpu(value, i); + if (val) { + if (CHECK(val != expected, "map value", + "unexpected for cpu %d: 0x%llx\n", i, val)) + return -1; + nzCnt++; + } + } + + if (CHECK(nzCnt != 1, "map value", "set for %d CPUs instead of 1!\n", + nzCnt)) + return -1; + + return 0; +} + +/* Add key=1 elem with values set for all CPUs + * Delete elem key=1 + * Run bpf prog that inserts new key=1 elem with value=0x1234 + * (bpf prog can only set value for current CPU) + * Lookup Key=1 and check value is as expected for all CPUs: + * value set by bpf prog for one CPU, 0 for all others + */ +static void test_pcpu_map_init(void) +{ + pcpu_map_value_t value[nr_cpus]; + struct test_map_init *skel; + int map_fd, err; + map_key_t key; + + /* max 1 elem in map so insertion is forced to reuse freed entry */ + skel = setup(BPF_MAP_TYPE_PERCPU_HASH, 1, &map_fd, 1); + if (!ASSERT_OK_PTR(skel, "prog_setup")) + return; + + /* delete element so the entry can be re-used*/ + key = 1; + err = bpf_map_delete_elem(map_fd, &key); + if (!ASSERT_OK(err, "bpf_map_delete_elem")) + goto cleanup; + + /* run bpf prog that inserts new elem, re-using the slot just freed */ + err = prog_run_insert_elem(skel, key, TEST_VALUE); + if (!ASSERT_OK(err, "prog_run_insert_elem")) + goto cleanup; + + /* check that key=1 was re-created by bpf prog */ + err = bpf_map_lookup_elem(map_fd, &key, value); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto cleanup; + + /* and has expected values */ + check_values_one_cpu(value, TEST_VALUE); + +cleanup: + test_map_init__destroy(skel); +} + +/* Add key=1 and key=2 elems with values set for all CPUs + * Run bpf prog that inserts new key=3 elem + * (only for current cpu; other cpus should have initial value = 0) + * Lookup Key=1 and check value is as expected for all 
CPUs + */ +static void test_pcpu_lru_map_init(void) +{ + pcpu_map_value_t value[nr_cpus]; + struct test_map_init *skel; + int map_fd, err; + map_key_t key; + + /* Set up LRU map with 2 elements, values filled for all CPUs. + * With these 2 elements, the LRU map is full + */ + skel = setup(BPF_MAP_TYPE_LRU_PERCPU_HASH, 2, &map_fd, 2); + if (!ASSERT_OK_PTR(skel, "prog_setup")) + return; + + /* run bpf prog that inserts new key=3 element, re-using LRU slot */ + key = 3; + err = prog_run_insert_elem(skel, key, TEST_VALUE); + if (!ASSERT_OK(err, "prog_run_insert_elem")) + goto cleanup; + + /* check that key=3 replaced one of earlier elements */ + err = bpf_map_lookup_elem(map_fd, &key, value); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto cleanup; + + /* and has expected values */ + check_values_one_cpu(value, TEST_VALUE); + +cleanup: + test_map_init__destroy(skel); +} + +void test_map_init(void) +{ + nr_cpus = bpf_num_possible_cpus(); + if (nr_cpus <= 1) { + printf("%s:SKIP: >1 cpu needed for this test\n", __func__); + test__skip(); + return; + } + + if (test__start_subtest("pcpu_map_init")) + test_pcpu_map_init(); + if (test__start_subtest("pcpu_lru_map_init")) + test_pcpu_lru_map_init(); +} diff --git a/tools/testing/selftests/bpf/test_map_init.c b/tools/testing/selftests/bpf/test_map_init.c new file mode 100644 index 000000000000..c89d28ead673 --- /dev/null +++ b/tools/testing/selftests/bpf/test_map_init.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Tessares SA */ + +#include "vmlinux.h" +#include + +__u64 inKey = 0; +__u64 inValue = 0; +__u32 inPid = 0; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 2); + __type(key, __u64); + __type(value, __u64); +} hashmap1 SEC(".maps"); + + +SEC("tp/syscalls/sys_enter_getpgid") +int sysenter_getpgid(const void *ctx) +{ + /* Just do it for once, when called from our own test prog. This + * ensures the map value is only updated for a single CPU. + */ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + + if (cur_pid == inPid) + bpf_map_update_elem(&hashmap1, &inKey, &inValue, BPF_NOEXIST); + + return 0; +} + +char _license[] SEC("license") = "GPL"; From fc16a3a52476787b47810404e84139ea663f6993 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 12 Jan 2021 08:28:29 -0800 Subject: [PATCH 1363/1640] UPSTREAM: bpf: Don't leak memory in bpf getsockopt when optlen == 0 commit 4be34f3d0731b38a1b24566b37fbb39500aaf3a2 upstream. optlen == 0 indicates that the kernel should ignore BPF buffer and use the original one from the user. We, however, forget to free the temporary buffer that we've allocated for BPF. 
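For illustration, the fixed exit path hands the temporary buffer to the caller only when the BPF program actually produced data (ctx.optlen != 0) and frees it on every other path, so exactly one of "export" or "free" happens per return. A rough userspace model of that control flow (struct and function names are invented for this sketch, not the kernel's):

  #include <stdlib.h>

  struct sockopt_ctx {
  	char *optval;	/* temporary buffer allocated for the BPF prog */
  	int optlen;	/* 0 means "use the caller's original buffer" */
  };

  static int run_setsockopt_filter(struct sockopt_ctx *ctx, char **kernel_optval)
  {
  	if (ctx->optlen != 0) {
  		*kernel_optval = ctx->optval;	/* export and don't free */
  		return 0;
  	}
  	free(ctx->optval);	/* optlen == 0: buffer unused, free it here */
  	ctx->optval = NULL;
  	return 0;
  }

  int main(void)
  {
  	struct sockopt_ctx ctx = { .optval = malloc(64), .optlen = 0 };
  	char *exported = NULL;

  	return run_setsockopt_filter(&ctx, &exported);	/* frees, exports nothing */
  }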
Fixes: d8fe449a9c51 ("bpf: Don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE") Reported-by: Martin KaFai Lau Change-Id: Ibfbbdb6704b7db7d61da298365c2b61e5133d9a0 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210112162829.775079-1-sdf@google.com Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b701af27a779..5a8b4dfdb141 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1057,12 +1057,13 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, if (ctx.optlen != 0) { *optlen = ctx.optlen; *kernel_optval = ctx.optval; + /* export and don't free sockopt buf */ + return 0; } } out: - if (ret) - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx); return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); From 8be886316d84426e51c4460e1ef08460a3a5771a Mon Sep 17 00:00:00 2001 From: Mircea Cirjaliu Date: Tue, 19 Jan 2021 21:53:18 +0100 Subject: [PATCH 1364/1640] UPSTREAM: bpf: Fix helper bpf_map_peek_elem_proto pointing to wrong callback commit 301a33d51880619d0c5a581b5a48d3a5248fa84b upstream. I assume this was obtained by copy/paste. Point it to bpf_map_peek_elem() instead of bpf_map_pop_elem(). In practice it may have been less likely hit when under JIT given shielded via 84430d4232c3 ("bpf, verifier: avoid retpoline for map push/pop/peek operation"). Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Change-Id: Id8f3a14a726ce269d4c9a9e09f2f13defb0ba9bb Signed-off-by: Mircea Cirjaliu Signed-off-by: Daniel Borkmann Cc: Mauricio Vasquez Link: https://lore.kernel.org/bpf/AM7PR02MB6082663DFDCCE8DA7A6DD6B1BBA30@AM7PR02MB6082.eurprd02.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d15fc6dcb6d6..e178f185c3b3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -105,7 +105,7 @@ BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) } const struct bpf_func_proto bpf_map_peek_elem_proto = { - .func = bpf_map_pop_elem, + .func = bpf_map_peek_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, From eaad47de99f6ae7c5431a277c995719f4e4e914b Mon Sep 17 00:00:00 2001 From: Loris Reiff Date: Fri, 22 Jan 2021 17:42:31 +0100 Subject: [PATCH 1365/1640] UPSTREAM: bpf, cgroup: Fix optlen WARN_ON_ONCE toctou [ Upstream commit bb8b81e396f7afbe7c50d789e2107512274d2a35 ] A toctou issue in `__cgroup_bpf_run_filter_getsockopt` can trigger a WARN_ON_ONCE in a check of `copy_from_user`. `*optlen` is checked to be non-negative in the individual getsockopt functions beforehand. Changing `*optlen` in a race to a negative value will result in a `copy_from_user(ctx.optval, optval, ctx.optlen)` with `ctx.optlen` being a negative integer. 
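Why that trips the warning: copy_from_user() takes an unsigned length, so a negative int converts into an enormous copy size. A minimal standalone demonstration (plain userspace C, not kernel code):

  #include <stdio.h>

  int main(void)
  {
  	int optlen = -1;	/* value a racing writer could store */
  	unsigned long n = (unsigned long)optlen;	/* what the copy would see */

  	printf("copy length would be %lu\n", n);	/* 18446744073709551615 on LP64 */
  	return 0;
  }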
Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Change-Id: I3f4acf42bcd20f31e1891fab787acc64550946c0 Signed-off-by: Loris Reiff Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20210122164232.61770-1-loris.reiff@liblor.ch Signed-off-by: Sasha Levin --- kernel/bpf/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 5a8b4dfdb141..5b2413eb79db 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1109,6 +1109,11 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } + if (ctx.optlen < 0) { + ret = -EFAULT; + goto out; + } + if (copy_from_user(ctx.optval, optval, min(ctx.optlen, max_optlen)) != 0) { ret = -EFAULT; From de59450917d6c31f04895f79c90e00841aa7a9d3 Mon Sep 17 00:00:00 2001 From: Loris Reiff Date: Fri, 22 Jan 2021 17:42:32 +0100 Subject: [PATCH 1366/1640] UPSTREAM: bpf, cgroup: Fix problematic bounds check [ Upstream commit f4a2da755a7e1f5d845c52aee71336cee289935a ] Since ctx.optlen is signed, a larger value than max_value could be passed, as it is later on used as unsigned, which causes a WARN_ON_ONCE in the copy_to_user. Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Change-Id: I8dc191e6bd0f5310f32cf9e16c45e7c0acf1210c Signed-off-by: Loris Reiff Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20210122164232.61770-2-loris.reiff@liblor.ch Signed-off-by: Sasha Levin --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 5b2413eb79db..c2f0aa818b7a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1131,7 +1131,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - if (ctx.optlen > max_optlen) { + if (ctx.optlen > max_optlen || ctx.optlen < 0) { ret = -EFAULT; goto out; } From 66985e71c0a765d8408c78655d9860335ff193dd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Jun 2021 11:25:11 +0200 Subject: [PATCH 1367/1640] UPSTREAM: bpf: Fix up register-based shifts in interpreter to silence KUBSAN [ Upstream commit 28131e9d933339a92f78e7ab6429f4aaaa07061c ] syzbot reported a shift-out-of-bounds that KUBSAN observed in the interpreter: [...] 
UBSAN: shift-out-of-bounds in kernel/bpf/core.c:1420:2 shift exponent 255 is too large for 64-bit type 'long long unsigned int' CPU: 1 PID: 11097 Comm: syz-executor.4 Not tainted 5.12.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x141/0x1d7 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:327 ___bpf_prog_run.cold+0x19/0x56c kernel/bpf/core.c:1420 __bpf_prog_run32+0x8f/0xd0 kernel/bpf/core.c:1735 bpf_dispatcher_nop_func include/linux/bpf.h:644 [inline] bpf_prog_run_pin_on_cpu include/linux/filter.h:624 [inline] bpf_prog_run_clear_cb include/linux/filter.h:755 [inline] run_filter+0x1a1/0x470 net/packet/af_packet.c:2031 packet_rcv+0x313/0x13e0 net/packet/af_packet.c:2104 dev_queue_xmit_nit+0x7c2/0xa90 net/core/dev.c:2387 xmit_one net/core/dev.c:3588 [inline] dev_hard_start_xmit+0xad/0x920 net/core/dev.c:3609 __dev_queue_xmit+0x2121/0x2e00 net/core/dev.c:4182 __bpf_tx_skb net/core/filter.c:2116 [inline] __bpf_redirect_no_mac net/core/filter.c:2141 [inline] __bpf_redirect+0x548/0xc80 net/core/filter.c:2164 ____bpf_clone_redirect net/core/filter.c:2448 [inline] bpf_clone_redirect+0x2ae/0x420 net/core/filter.c:2420 ___bpf_prog_run+0x34e1/0x77d0 kernel/bpf/core.c:1523 __bpf_prog_run512+0x99/0xe0 kernel/bpf/core.c:1737 bpf_dispatcher_nop_func include/linux/bpf.h:644 [inline] bpf_test_run+0x3ed/0xc50 net/bpf/test_run.c:50 bpf_prog_test_run_skb+0xabc/0x1c50 net/bpf/test_run.c:582 bpf_prog_test_run kernel/bpf/syscall.c:3127 [inline] __do_sys_bpf+0x1ea9/0x4f00 kernel/bpf/syscall.c:4406 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xae [...] Generally speaking, KUBSAN reports from the kernel should be fixed. However, in case of BPF, this particular report caused concerns since the large shift is not wrong from BPF point of view, just undefined. In the verifier, K-based shifts that are >= {64,32} (depending on the bitwidth of the instruction) are already rejected. The register-based cases were not given their content might not be known at verification time. Ideas such as verifier instruction rewrite with an additional AND instruction for the source register were brought up, but regularly rejected due to the additional runtime overhead they incur. As Edward Cree rightly put it: Shifts by more than insn bitness are legal in the BPF ISA; they are implementation-defined behaviour [of the underlying architecture], rather than UB, and have been made legal for performance reasons. Each of the JIT backends compiles the BPF shift operations to machine instructions which produce implementation-defined results in such a case; the resulting contents of the register may be arbitrary but program behaviour as a whole remains defined. Guard checks in the fast path (i.e. affecting JITted code) will thus not be accepted. The case of division by zero is not truly analogous here, as division instructions on many of the JIT-targeted architectures will raise a machine exception / fault on division by zero, whereas (to the best of my knowledge) none will do so on an out-of-bounds shift. Given the KUBSAN report only affects the BPF interpreter, but not JITs, one solution is to add the ANDs with 63 or 31 into ___bpf_prog_run(). 
That would make the shifts defined, and thus shuts up KUBSAN, and the compiler would optimize out the AND on any CPU that interprets the shift amounts modulo the width anyway (e.g., confirmed from disassembly that on x86-64 and arm64 the generated interpreter code is the same before and after this fix). The BPF interpreter is slow path, and most likely compiled out anyway as distros select BPF_JIT_ALWAYS_ON to avoid speculative execution of BPF instructions by the interpreter. Given the main argument was to avoid sacrificing performance, the fact that the AND is optimized away from compiler for mainstream archs helps as well as a solution moving forward. Also add a comment on LSH/RSH/ARSH translation for JIT authors to provide guidance when they see the ___bpf_prog_run() interpreter code and use it as a model for a new JIT backend. Reported-by: syzbot+bed360704c521841c85d@syzkaller.appspotmail.com Reported-by: Kurt Manucredo Change-Id: I958b8c82128b9c42d3b2a7beb2ea8a6fffbdb011 Signed-off-by: Eric Biggers Co-developed-by: Eric Biggers Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Tested-by: syzbot+bed360704c521841c85d@syzkaller.appspotmail.com Cc: Edward Cree Link: https://lore.kernel.org/bpf/0000000000008f912605bd30d5d7@google.com Link: https://lore.kernel.org/bpf/bac16d8d-c174-bdc4-91bd-bfa62b410190@gmail.com Signed-off-by: Sasha Levin --- kernel/bpf/core.c | 61 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1e65de9b8367..3b3154988b15 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1346,29 +1346,54 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) select_insn: goto *jumptable[insn->code]; - /* ALU */ -#define ALU(OPCODE, OP) \ - ALU64_##OPCODE##_X: \ - DST = DST OP SRC; \ - CONT; \ - ALU_##OPCODE##_X: \ - DST = (u32) DST OP (u32) SRC; \ - CONT; \ - ALU64_##OPCODE##_K: \ - DST = DST OP IMM; \ - CONT; \ - ALU_##OPCODE##_K: \ - DST = (u32) DST OP (u32) IMM; \ + /* Explicitly mask the register-based shift amounts with 63 or 31 + * to avoid undefined behavior. Normally this won't affect the + * generated code, for example, in case of native 64 bit archs such + * as x86-64 or arm64, the compiler is optimizing the AND away for + * the interpreter. In case of JITs, each of the JIT backends compiles + * the BPF shift operations to machine instructions which produce + * implementation-defined results in such a case; the resulting + * contents of the register may be arbitrary, but program behaviour + * as a whole remains defined. In other words, in case of JIT backends, + * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation. 
+ */ + /* ALU (shifts) */ +#define SHT(OPCODE, OP) \ + ALU64_##OPCODE##_X: \ + DST = DST OP (SRC & 63); \ + CONT; \ + ALU_##OPCODE##_X: \ + DST = (u32) DST OP ((u32) SRC & 31); \ + CONT; \ + ALU64_##OPCODE##_K: \ + DST = DST OP IMM; \ + CONT; \ + ALU_##OPCODE##_K: \ + DST = (u32) DST OP (u32) IMM; \ + CONT; + /* ALU (rest) */ +#define ALU(OPCODE, OP) \ + ALU64_##OPCODE##_X: \ + DST = DST OP SRC; \ + CONT; \ + ALU_##OPCODE##_X: \ + DST = (u32) DST OP (u32) SRC; \ + CONT; \ + ALU64_##OPCODE##_K: \ + DST = DST OP IMM; \ + CONT; \ + ALU_##OPCODE##_K: \ + DST = (u32) DST OP (u32) IMM; \ CONT; - ALU(ADD, +) ALU(SUB, -) ALU(AND, &) ALU(OR, |) - ALU(LSH, <<) - ALU(RSH, >>) ALU(XOR, ^) ALU(MUL, *) + SHT(LSH, <<) + SHT(RSH, >>) +#undef SHT #undef ALU ALU_NEG: DST = (u32) -DST; @@ -1393,13 +1418,13 @@ select_insn: insn++; CONT; ALU_ARSH_X: - DST = (u64) (u32) (((s32) DST) >> SRC); + DST = (u64) (u32) (((s32) DST) >> (SRC & 31)); CONT; ALU_ARSH_K: DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: - (*(s64 *) &DST) >>= SRC; + (*(s64 *) &DST) >>= (SRC & 63); CONT; ALU64_ARSH_K: (*(s64 *) &DST) >>= IMM; From 12c28934e9c8d4542a630f405c18f20ecb6f9353 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 5 Aug 2021 18:53:40 +0300 Subject: [PATCH 1368/1640] UPSTREAM: bpf: Fix leakage under speculation on mispredicted branches commit 9183671af6dbf60a1219371d4ed73e23f43b49db upstream The verifier only enumerates valid control-flow paths and skips paths that are unreachable in the non-speculative domain. And so it can miss issues under speculative execution on mispredicted branches. For example, a type confusion has been demonstrated with the following crafted program: // r0 = pointer to a map array entry // r6 = pointer to readable stack slot // r9 = scalar controlled by attacker 1: r0 = *(u64 *)(r0) // cache miss 2: if r0 != 0x0 goto line 4 3: r6 = r9 4: if r0 != 0x1 goto line 6 5: r9 = *(u8 *)(r6) 6: // leak r9 Since line 3 runs iff r0 == 0 and line 5 runs iff r0 == 1, the verifier concludes that the pointer dereference on line 5 is safe. But: if the attacker trains both the branches to fall-through, such that the following is speculatively executed ... r6 = r9 r9 = *(u8 *)(r6) // leak r9 ... then the program will dereference an attacker-controlled value and could leak its content under speculative execution via side-channel. This requires to mistrain the branch predictor, which can be rather tricky, because the branches are mutually exclusive. However such training can be done at congruent addresses in user space using different branches that are not mutually exclusive. That is, by training branches in user space ... A: if r0 != 0x0 goto line C B: ... C: if r0 != 0x0 goto line D D: ... ... such that addresses A and C collide to the same CPU branch prediction entries in the PHT (pattern history table) as those of the BPF program's lines 2 and 4, respectively. A non-privileged attacker could simply brute force such collisions in the PHT until observing the attack succeeding. Alternative methods to mistrain the branch predictor are also possible that avoid brute forcing the collisions in the PHT. A reliable attack has been demonstrated, for example, using the following crafted program: // r0 = pointer to a [control] map array entry // r7 = *(u64 *)(r0 + 0), training/attack phase // r8 = *(u64 *)(r0 + 8), oob address // [...] 
// r0 = pointer to a [data] map array entry 1: if r7 == 0x3 goto line 3 2: r8 = r0 // crafted sequence of conditional jumps to separate the conditional // branch in line 193 from the current execution flow 3: if r0 != 0x0 goto line 5 4: if r0 == 0x0 goto exit 5: if r0 != 0x0 goto line 7 6: if r0 == 0x0 goto exit [...] 187: if r0 != 0x0 goto line 189 188: if r0 == 0x0 goto exit // load any slowly-loaded value (due to cache miss in phase 3) ... 189: r3 = *(u64 *)(r0 + 0x1200) // ... and turn it into known zero for verifier, while preserving slowly- // loaded dependency when executing: 190: r3 &= 1 191: r3 &= 2 // speculatively bypassed phase dependency 192: r7 += r3 193: if r7 == 0x3 goto exit 194: r4 = *(u8 *)(r8 + 0) // leak r4 As can be seen, in training phase (phase != 0x3), the condition in line 1 turns into false and therefore r8 with the oob address is overridden with the valid map value address, which in line 194 we can read out without issues. However, in attack phase, line 2 is skipped, and due to the cache miss in line 189 where the map value is (zeroed and later) added to the phase register, the condition in line 193 takes the fall-through path due to prior branch predictor training, where under speculation, it'll load the byte at oob address r8 (unknown scalar type at that point) which could then be leaked via side-channel. One way to mitigate these is to 'branch off' an unreachable path, meaning, the current verification path keeps following the is_branch_taken() path and we push the other branch to the verification stack. Given this is unreachable from the non-speculative domain, this branch's vstate is explicitly marked as speculative. This is needed for two reasons: i) if this path is solely seen from speculative execution, then we later on still want the dead code elimination to kick in in order to sanitize these instructions with jmp-1s, and ii) to ensure that paths walked in the non-speculative domain are not pruned from earlier walks of paths walked in the speculative domain. Additionally, for robustness, we mark the registers which have been part of the conditional as unknown in the speculative path given there should be no assumptions made on their content. The fix in here mitigates type confusion attacks described earlier due to i) all code paths in the BPF program being explored and ii) existing verifier logic already ensuring that given memory access instruction references one specific data structure. An alternative to this fix that has also been looked at in this scope was to mark aux->alu_state at the jump instruction with a BPF_JMP_TAKEN state as well as direction encoding (always-goto, always-fallthrough, unknown), such that mixing of different always-* directions themselves as well as mixing of always-* with unknown directions would cause a program rejection by the verifier, e.g. programs with constructs like 'if ([...]) { x = 0; } else { x = 1; }' with subsequent 'if (x == 1) { [...] }'. For unprivileged, this would result in only single direction always-* taken paths, and unknown taken paths being allowed, such that the former could be patched from a conditional jump to an unconditional jump (ja). Compared to this approach here, it would have two downsides: i) valid programs that otherwise are not performing any pointer arithmetic, etc, would potentially be rejected/broken, and ii) we are required to turn off path pruning for unprivileged, where both can be avoided in this work through pushing the invalid branch to the verification stack. 
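Conceptually, the mitigation changes branch handling from "follow only the statically known edge" to "follow it, but also queue the dead edge, marked speculative". A toy worklist sketch of that shape (all names invented; this is not the verifier's actual code):

  #include <stdbool.h>
  #include <stdio.h>

  struct branch_state {
  	int insn_idx;
  	bool speculative;	/* reachable only under misprediction */
  };

  #define STACK_MAX 64
  static struct branch_state stack[STACK_MAX];
  static int top;

  static int push_state(int insn_idx, bool speculative)
  {
  	if (top >= STACK_MAX)
  		return -1;
  	stack[top].insn_idx = insn_idx;
  	stack[top].speculative = speculative;
  	top++;
  	return 0;
  }

  /* pred == 1: branch statically known taken; pred == 0: known not taken.
   * Pre-fix, the dead edge was skipped entirely; post-fix it is still
   * queued, flagged speculative so pruning and dead code elimination
   * treat it accordingly. */
  static int check_cond_jmp(int pred, int taken_idx, int fallthrough_idx)
  {
  	if (pred == 1)
  		return push_state(fallthrough_idx, true) ? -1 : taken_idx;
  	if (pred == 0)
  		return push_state(taken_idx, true) ? -1 : fallthrough_idx;
  	return fallthrough_idx;	/* unknown: explore both normally (elided) */
  }

  int main(void)
  {
  	int next = check_cond_jmp(1, 10, 3);

  	printf("continue at %d, %d speculative state(s) queued\n", next, top);
  	return 0;
  }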
The issue was originally discovered by Adam and Ofek, and later independently discovered and reported as a result of Benedict and Piotr's research work. Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: Adam Morrison Reported-by: Ofek Kirzner Reported-by: Benedict Schlueter Reported-by: Piotr Krysiuk Change-Id: I7d61318c086cf38abba8dcd3de17232909017e45 Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Reviewed-by: Benedict Schlueter Reviewed-by: Piotr Krysiuk Acked-by: Alexei Starovoitov [OP: use allow_ptr_leaks instead of bypass_spec_v1] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d101467b62a2..727e2d58f3f6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6084,6 +6084,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; } + if (pred == 1) { /* Only follow the goto, ignore fall-through. If needed, push * the fall-through branch for simulation under speculative From aa5450b04b8299050a090626bdab9e4aa080c19b Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 12 Aug 2021 17:18:10 +0200 Subject: [PATCH 1369/1640] UPSTREAM: bpf: Clear zext_dst of dead insns [ Upstream commit 45c709f8c71b525b51988e782febe84ce933e7e0 ] "access skb fields ok" verifier test fails on s390 with the "verifier bug. zext_dst is set, but no reg is defined" message. The first insns of the test prog are ... 0: 61 01 00 00 00 00 00 00 ldxw %r0,[%r1+0] 8: 35 00 00 01 00 00 00 00 jge %r0,0,1 10: 61 01 00 08 00 00 00 00 ldxw %r0,[%r1+8] ... and the 3rd one is dead (this does not look intentional to me, but this is a separate topic). sanitize_dead_code() converts dead insns into "ja -1", but keeps zext_dst. When opt_subreg_zext_lo32_rnd_hi32() tries to parse such an insn, it sees this discrepancy and bails. This problem can be seen only with JITs whose bpf_jit_needs_zext() returns true. Fix by clearing dead insns' zext_dst. The commits that contributed to this problem are: 1. 5aa5bd14c5f8 ("bpf: add initial suite for selftests"), which introduced the test with the dead code. 2. 5327ed3d44b7 ("bpf: verifier: mark verified-insn with sub-register zext flag"), which introduced the zext_dst flag. 3. 83a2881903f3 ("bpf: Account for BPF_FETCH in insn_has_def32()"), which introduced the sanity check. 4. 9183671af6db ("bpf: Fix leakage under speculation on mispredicted branches"), which bisect points to. It's best to fix this on stable branches that contain the second one, since that's the point where the inconsistency was introduced.
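The invariant the one-liner restores is that per-instruction metadata must describe the instruction actually present after patching. A toy, compilable model of the pass (types and opcode values are placeholders, not the kernel's):

  #include <stdbool.h>
  #include <string.h>

  struct insn { unsigned char code; };
  struct insn_aux { bool seen; bool zext_dst; };

  static void sanitize_dead_code(struct insn *insns, struct insn_aux *aux, int len)
  {
  	const struct insn trap = { .code = 0x05 };	/* stand-in for "ja -1" */
  	int i;

  	for (i = 0; i < len; i++) {
  		if (aux[i].seen)
  			continue;
  		memcpy(&insns[i], &trap, sizeof(trap));
  		aux[i].zext_dst = false;	/* the fix: drop the stale marking */
  	}
  }

  int main(void)
  {
  	struct insn prog[3] = { {0x61}, {0x35}, {0x61} };
  	struct insn_aux aux[3] = { {true, false}, {true, false}, {false, true} };

  	sanitize_dead_code(prog, aux, 3);
  	return aux[2].zext_dst;	/* 0: dead insn no longer claims a zext dst */
  }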
Fixes: 5327ed3d44b7 ("bpf: verifier: mark verified-insn with sub-register zext flag") Change-Id: Id3236e80a86c50f8d2dcaf51a9fa1dbaf2bfeba8 Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210812151811.184086-2-iii@linux.ibm.com Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 727e2d58f3f6..33b2aa392b0c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8554,6 +8554,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) if (aux_data[i].seen) continue; memcpy(insn + i, &trap, sizeof(trap)); + aux_data[i].zext_dst = false; } } From 5bd819f636189d09e8d5051ee9684295cce1a775 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 9 Oct 2019 13:14:57 -0700 Subject: [PATCH 1370/1640] UPSTREAM: bpf: Track contents of read-only maps as scalars commit a23740ec43ba022dbfd139d0fe3eff193216272b upstream. Maps that are read-only both from BPF program side and user space side have their contents constant, so verifier can track referenced values precisely and use that knowledge for dead code elimination, branch pruning, etc. This patch teaches BPF verifier how to do this. Change-Id: I912162fa3b1a8a91bb6b8a1a67ebb5be8b94d5a7 Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191009201458.2679171-2-andriin@fb.com Signed-off-by: Rafael David Tinoco Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 33b2aa392b0c..b72a2c6fe8ed 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2754,6 +2754,41 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) reg->smax_value = reg->umax_value; } +static bool bpf_map_is_rdonly(const struct bpf_map *map) +{ + return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen; +} + +static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) +{ + void *ptr; + u64 addr; + int err; + + err = map->ops->map_direct_value_addr(map, &addr, off); + if (err) + return err; + ptr = (void *)addr + off; + + switch (size) { + case sizeof(u8): + *val = (u64)*(u8 *)ptr; + break; + case sizeof(u16): + *val = (u64)*(u16 *)ptr; + break; + case sizeof(u32): + *val = (u64)*(u32 *)ptr; + break; + case sizeof(u64): + *val = *(u64 *)ptr; + break; + default: + return -EINVAL; + } + return 0; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -2791,9 +2826,27 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; err = check_map_access(env, regno, off, size, false); - if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); + if (!err && t == BPF_READ && value_regno >= 0) { + struct bpf_map *map = reg->map_ptr; + /* if map is read-only, track its contents as scalars */ + if (tnum_is_const(reg->var_off) && + bpf_map_is_rdonly(map) && + map->ops->map_direct_value_addr) { + int map_off = off + reg->var_off.value; + u64 val = 0; + + err = bpf_map_direct_read(map, map_off, size, + &val); + if (err) + return err; + + regs[value_regno].type = SCALAR_VALUE; + __mark_reg_known(®s[value_regno], val); + 
} else { + mark_reg_unknown(env, regs, value_regno); + } + } } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; From 3ee9b7b9188941cff11418150c65017428c6d925 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 11 Oct 2019 10:20:53 -0700 Subject: [PATCH 1371/1640] UPSTREAM: bpf: Fix cast to pointer from integer of different size warning commit 2dedd7d2165565bafa89718eaadfc5d1a7865f66 upstream. Fix "warning: cast to pointer from integer of different size" when casting u64 addr to void *. Fixes: a23740ec43ba ("bpf: Track contents of read-only maps as scalars") Reported-by: kbuild test robot Change-Id: I7c71910dbe1d03c65a54c7adb8544b86a233a0ae Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191011172053.2980619-1-andriin@fb.com Cc: Rafael David Tinoco Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b72a2c6fe8ed..d95245a05bb9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2768,7 +2768,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) err = map->ops->map_direct_value_addr(map, &addr, off); if (err) return err; - ptr = (void *)addr + off; + ptr = (void *)(long)addr + off; switch (size) { case sizeof(u8): From dfdd2f1553d71be2a4fc79438eeb3add36d4fd0b Mon Sep 17 00:00:00 2001 From: He Fengqing Date: Wed, 14 Jul 2021 10:18:15 +0000 Subject: [PATCH 1372/1640] UPSTREAM: bpf: Fix potential memleak and UAF in the verifier. [ Upstream commit 75f0fc7b48ad45a2e5736bcf8de26c8872fe8695 ] In bpf_patch_insn_data(), we first use bpf_patch_insn_single() to insert new instructions, then use adjust_insn_aux_data() to adjust insn_aux_data. If the old env->prog does not have enough room for the newly inserted instructions, we use bpf_prog_realloc to construct new_prog and free the old env->prog. There are two errors here. First, if adjust_insn_aux_data() returns -ENOMEM, we should free the new_prog. Second, if adjust_insn_aux_data() returns -ENOMEM, bpf_patch_insn_data() will return NULL, and env->prog has been freed in bpf_prog_realloc, but we will use it in bpf_check(). So in this patch, we make adjust_insn_aux_data() never fail. In bpf_patch_insn_data(), we first pre-allocate memory for the new insn_aux_data, then call bpf_patch_insn_single() to insert the new instructions, and finally call adjust_insn_aux_data() to adjust insn_aux_data. Fixes: 8041902dae52 ("bpf: adjust insn_aux_data when patching insns") Change-Id: I8d3ed855632d114847e58309b3c84fd2abe20684 Signed-off-by: He Fengqing Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210714101815.164322-1-hefengqing@huawei.com Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d95245a05bb9..307b9a25070e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8369,10 +8369,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) * insni[off, off + cnt).
Adjust corresponding insn_aux_data by copying * [0, off) and [off, end) to new locations, so the patched range stays zero */ -static int adjust_insn_aux_data(struct bpf_verifier_env *env, - struct bpf_prog *new_prog, u32 off, u32 cnt) +static void adjust_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_insn_aux_data *new_data, + struct bpf_prog *new_prog, u32 off, u32 cnt) { - struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + struct bpf_insn_aux_data *old_data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; bool old_seen = old_data[off].seen; u32 prog_len; @@ -8385,12 +8386,9 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); if (cnt == 1) - return 0; + return; prog_len = new_prog->len; - new_data = vzalloc(array_size(prog_len, - sizeof(struct bpf_insn_aux_data))); - if (!new_data) - return -ENOMEM; + memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); @@ -8401,7 +8399,6 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, } env->insn_aux_data = new_data; vfree(old_data); - return 0; } static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) @@ -8422,6 +8419,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of const struct bpf_insn *patch, u32 len) { struct bpf_prog *new_prog; + struct bpf_insn_aux_data *new_data = NULL; + + if (len > 1) { + new_data = vzalloc(array_size(env->prog->len + len - 1, + sizeof(struct bpf_insn_aux_data))); + if (!new_data) + return NULL; + } new_prog = bpf_patch_insn_single(env->prog, off, patch, len); if (IS_ERR(new_prog)) { @@ -8429,10 +8434,10 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of verbose(env, "insn %d cannot be patched due to 16-bit range\n", env->insn_aux_data[off].orig_idx); + vfree(new_data); return NULL; } - if (adjust_insn_aux_data(env, new_prog, off, len)) - return NULL; + adjust_insn_aux_data(env, new_data, new_prog, off, len); adjust_subprog_starts(env, off, len); return new_prog; } From 9d54b7044100e537d17aabbd4c8eaed26edb096a Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 20 Aug 2021 09:39:35 -0700 Subject: [PATCH 1373/1640] UPSTREAM: bpf: Fix possible out of bound write in narrow load handling [ Upstream commit d7af7e497f0308bc97809cc48b58e8e0f13887e1 ] Fix a verifier bug found by smatch static checker in [0]. This problem has never been seen in prod to my best knowledge. Fixing it still seems to be a good idea since it's hard to say for sure whether it's possible or not to have a scenario where a combination of convert_ctx_access() and a narrow load would lead to an out of bound write. When narrow load is handled, one or two new instructions are added to insn_buf array, but before it was only checked that cnt >= ARRAY_SIZE(insn_buf) And it's safe to add a new instruction to insn_buf[cnt++] only once. The second try will lead to out of bound write. And this is what can happen if `shift` is set. Fix it by making sure that if the BPF_RSH instruction has to be added in addition to BPF_AND then there is enough space for two more instructions in insn_buf. 
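This is the usual "reserve before you append" pattern: when a single capacity check can be followed by two emits, it must cover both slots. A standalone sketch of the guard (buffer size and opcode values are placeholders, not real BPF encodings):

  #include <stdio.h>

  #define INSN_BUF_SIZE 16

  static int emit_narrow_load_fixup(unsigned int *buf, unsigned int cnt,
  				  unsigned int shift)
  {
  	/* one free slot is already guaranteed by an earlier cnt check;
  	 * if a shift insn is also needed, verify a second slot exists */
  	if (shift && cnt + 1 >= INSN_BUF_SIZE)
  		return -1;	/* would write past the end of buf */
  	if (shift)
  		buf[cnt++] = 0xA1;	/* stand-in for the BPF_RSH insn */
  	buf[cnt++] = 0xA2;	/* stand-in for the BPF_AND insn */
  	return (int)cnt;
  }

  int main(void)
  {
  	unsigned int buf[INSN_BUF_SIZE];

  	printf("%d\n", emit_narrow_load_fixup(buf, INSN_BUF_SIZE - 1, 8));	/* -1 */
  	printf("%d\n", emit_narrow_load_fixup(buf, INSN_BUF_SIZE - 1, 0));	/* 16 */
  	return 0;
  }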
The full report [0] is below: kernel/bpf/verifier.c:12304 convert_ctx_accesses() warn: offset 'cnt' incremented past end of array kernel/bpf/verifier.c:12311 convert_ctx_accesses() warn: offset 'cnt' incremented past end of array kernel/bpf/verifier.c 12282 12283 insn->off = off & ~(size_default - 1); 12284 insn->code = BPF_LDX | BPF_MEM | size_code; 12285 } 12286 12287 target_size = 0; 12288 cnt = convert_ctx_access(type, insn, insn_buf, env->prog, 12289 &target_size); 12290 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Bounds check. 12291 (ctx_field_size && !target_size)) { 12292 verbose(env, "bpf verifier is misconfigured\n"); 12293 return -EINVAL; 12294 } 12295 12296 if (is_narrower_load && size < target_size) { 12297 u8 shift = bpf_ctx_narrow_access_offset( 12298 off, size, size_default) * 8; 12299 if (ctx_field_size <= 4) { 12300 if (shift) 12301 insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, ^^^^^ increment beyond end of array 12302 insn->dst_reg, 12303 shift); --> 12304 insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, ^^^^^ out of bounds write 12305 (1 << size * 8) - 1); 12306 } else { 12307 if (shift) 12308 insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, 12309 insn->dst_reg, 12310 shift); 12311 insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, ^^^^^^^^^^^^^^^ Same. 12312 (1ULL << size * 8) - 1); 12313 } 12314 } 12315 12316 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); 12317 if (!new_prog) 12318 return -ENOMEM; 12319 12320 delta += cnt - 1; 12321 12322 /* keep walking new program and skip insns we just inserted */ 12323 env->prog = new_prog; 12324 insn = new_prog->insnsi + i + delta; 12325 } 12326 12327 return 0; 12328 } [0] https://lore.kernel.org/bpf/20210817050843.GA21456@kili/ v1->v2: - clarify that problem was only seen by static checker but not in prod; Fixes: 46f53a65d2de ("bpf: Allow narrow loads with offset > 0") Reported-by: Dan Carpenter Change-Id: Id8894114d33aa310cf6496d22cdc78cc094064ce Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210820163935.1902398-1-rdna@fb.com Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 307b9a25070e..6215abeed24b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8926,6 +8926,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (is_narrower_load && size < target_size) { u8 shift = bpf_ctx_narrow_access_offset( off, size, size_default) * 8; + if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier narrow ctx load misconfigured\n"); + return -EINVAL; + } if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, From 53b526dce9d6c74cbb07652a166504a0081c3016 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Sat, 11 Sep 2021 08:55:57 +0800 Subject: [PATCH 1374/1640] UPSTREAM: bpf: Add oversize check before call kvcalloc() [ Upstream commit 0e6491b559704da720f6da09dd0a52c4df44c514 ] Commit 7661809d493b ("mm: don't allow oversized kvmalloc() calls") add the oversize check. 
When the allocation is larger than what kmalloc() supports, the following warning triggered: WARNING: CPU: 0 PID: 8408 at mm/util.c:597 kvmalloc_node+0x108/0x110 mm/util.c:597 Modules linked in: CPU: 0 PID: 8408 Comm: syz-executor221 Not tainted 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:kvmalloc_node+0x108/0x110 mm/util.c:597 Call Trace: kvmalloc include/linux/mm.h:806 [inline] kvmalloc_array include/linux/mm.h:824 [inline] kvcalloc include/linux/mm.h:829 [inline] check_btf_line kernel/bpf/verifier.c:9925 [inline] check_btf_info kernel/bpf/verifier.c:10049 [inline] bpf_check+0xd634/0x150d0 kernel/bpf/verifier.c:13759 bpf_prog_load kernel/bpf/syscall.c:2301 [inline] __sys_bpf+0x11181/0x126e0 kernel/bpf/syscall.c:4587 __do_sys_bpf kernel/bpf/syscall.c:4691 [inline] __se_sys_bpf kernel/bpf/syscall.c:4689 [inline] __x64_sys_bpf+0x78/0x90 kernel/bpf/syscall.c:4689 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: syzbot+f3e749d4c662818ae439@syzkaller.appspotmail.com Change-Id: I02c9b3896c41b5f1fa54deb7e8a88c4067a39d51 Signed-off-by: Bixuan Cui Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210911005557.45518-1-cuibixuan@huawei.com Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6215abeed24b..ae4e8ce7fa4f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6840,6 +6840,8 @@ static int check_btf_line(struct bpf_verifier_env *env, nr_linfo = attr->line_info_cnt; if (!nr_linfo) return 0; + if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info)) + return -EINVAL; rec_size = attr->line_info_rec_size; if (rec_size < MIN_BPF_LINEINFO_SIZE || From a4023c38a8a145769af1288330b74be9030d7317 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 13 Jun 2021 21:34:39 +0700 Subject: [PATCH 1375/1640] UPSTREAM: bpf: Fix integer overflow in argument calculation for bpf_map_area_alloc commit 7dd5d437c258bbf4cc15b35229e5208b87b8b4e0 upstream. In 32-bit architecture, the result of sizeof() is a 32-bit integer so the expression becomes the multiplication between 2 32-bit integer which can potentially leads to integer overflow. As a result, bpf_map_area_alloc() allocates less memory than needed. Fix this by casting 1 operand to u64. 
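The wrap is easy to demonstrate: C multiplies two 32-bit unsigned operands in 32 bits regardless of what the result is assigned to, so one operand must be widened before the multiply. A minimal standalone example (values chosen so the 32-bit product wraps exactly to zero):

  #include <stdio.h>

  int main(void)
  {
  	unsigned int entries = 0x20000000;	/* 2^29 map entries */
  	unsigned int elem = 8;	/* e.g. sizeof(*hash) on a 32-bit target */

  	unsigned int wrapped = entries * elem;	/* 32-bit multiply: wraps to 0 */
  	unsigned long long ok = (unsigned long long)entries * elem;	/* widened first */

  	printf("wrapped=%u widened=%llu\n", wrapped, ok);	/* wrapped=0 widened=4294967296 */
  	return 0;
  }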
Fixes: 0d2c4f964050 ("bpf: Eliminate rlimit-based memory accounting for sockmap and sockhash maps") Fixes: 99c51064fb06 ("devmap: Use bpf_map_area_alloc() for allocating hash buckets") Fixes: 546ac1ffb70d ("bpf: add devmap, a map for storing net device references") Change-Id: I2b30a9d20d66c67f312e6c24743bcb6e208ef622 Signed-off-by: Bui Quang Minh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210613143440.71975-1-minhquangbui99@gmail.com Signed-off-by: Connor O'Brien Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/devmap.c | 4 ++-- net/core/sock_map.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 6684696fa457..4b2819b0a05a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -94,7 +94,7 @@ static struct hlist_head *dev_map_create_hash(unsigned int entries, int i; struct hlist_head *hash; - hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node); + hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); @@ -159,7 +159,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) spin_lock_init(&dtab->index_lock); } else { - dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 2a16e4d77040..d6b1073778f5 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -48,7 +48,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (err) goto free_stab; - stab->sks = bpf_map_area_alloc(stab->map.max_entries * + stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (stab->sks) From 1082b45db805703ce0bbd52446808a27b0b3fb07 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 7 Apr 2022 21:04:23 +0800 Subject: [PATCH 1376/1640] UPSTREAM: bpf: Fix excessive memory allocation in stack_map_alloc() [ Upstream commit b45043192b3e481304062938a6561da2ceea46a6 ] The 'n_buckets * (value_size + sizeof(struct stack_map_bucket))' part of the allocated memory for 'smap' is never used after the memlock accounting was removed, thus get rid of it. [ Note, Daniel: Commit b936ca643ade ("bpf: rework memlock-based memory accounting for maps") moved `cost += n_buckets * (value_size + sizeof(struct stack_map_bucket))` up and therefore before the bpf_map_area_alloc() allocation, sigh. In a later step commit c85d69135a91 ("bpf: move memory size checks to bpf_map_charge_init()"), and the overflow checks of `cost >= U32_MAX - PAGE_SIZE` moved into bpf_map_charge_init(). And then 370868107bf6 ("bpf: Eliminate rlimit-based memory accounting for stackmap maps") finally removed the bpf_map_charge_init(). Anyway, the original code did the allocation same way as /after/ this fix. 
] Fixes: b936ca643ade ("bpf: rework memlock-based memory accounting for maps") Change-Id: I4a8febd929f09ff4e328ca099e1c47894c92d12a Signed-off-by: Yuntao Wang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220407130423.798386-1-ytcoode@gmail.com Signed-off-by: Sasha Levin --- kernel/bpf/stackmap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 1e7667ad3712..3d4dd9574cf5 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -123,7 +123,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) n_buckets = roundup_pow_of_two(attr->max_entries); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); err = bpf_map_charge_init(&mem, cost); if (err) return ERR_PTR(err); From eac489c95578b267bd1a23518d103f5c84a745b9 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Tue, 14 Jun 2022 22:26:22 +0800 Subject: [PATCH 1377/1640] UPSTREAM: bpf: Fix incorrect memory charge cost calculation in stack_map_alloc() commit b45043192b3e481304062938a6561da2ceea46a6 upstream. This is a backport of the original upstream patch for 5.4/5.10. The original upstream patch has been applied to 5.4/5.10 branches, which simply removed the line: cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); This is correct for upstream branch but incorrect for 5.4/5.10 branches, as the 5.4/5.10 branches do not have the commit 370868107bf6 ("bpf: Eliminate rlimit-based memory accounting for stackmap maps"), so the bpf_map_charge_init() function has not been removed. Currently the bpf_map_charge_init() function in 5.4/5.10 branches takes a wrong memory charge cost, the attr->max_entries * (sizeof(struct stack_map_bucket) + (u64)value_size)) part is missing, let's fix it. Cc: # 5.4.y Cc: # 5.10.y Change-Id: I91bcb932cab87a23f16a85db2e2f9269b5be8638 Signed-off-by: Yuntao Wang Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/stackmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3d4dd9574cf5..6f842f740752 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -123,7 +123,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) n_buckets = roundup_pow_of_two(attr->max_entries); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - err = bpf_map_charge_init(&mem, cost); + err = bpf_map_charge_init(&mem, cost + attr->max_entries * + (sizeof(struct stack_map_bucket) + (u64)value_size)); if (err) return ERR_PTR(err); From 67bf25e1021f20c4a1cb544110f4deb97708d6fb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 4 Nov 2022 09:36:44 -0700 Subject: [PATCH 1378/1640] UPSTREAM: bpf: propagate precision in ALU/ALU64 operations [ Upstream commit a3b666bfa9c9edc05bca62a87abafe0936bd7f97 ] When processing ALU/ALU64 operations (apart from BPF_MOV, which is handled correctly already; and BPF_NEG and BPF_END are special and don't have source register), if destination register is already marked precise, this causes problem with potentially missing precision tracking for the source register. E.g., when we have r1 >>= r5 and r1 is marked precise, but r5 isn't, this will lead to r5 staying as imprecise. This is due to the precision backtracking logic stopping early when it sees r1 is already marked precise. If r1 wasn't precise, we'd keep backtracking and would add r5 to the set of registers that need to be marked precise. 
So there is a discrepancy here which can lead to invalid and incompatible states matched due to lack of precision marking on r5. If r1 wasn't precise, precision backtracking would correctly mark both r1 and r5 as precise. This is simple to fix, though. During the forward instruction simulation pass, for arithmetic operations of `scalar = scalar` form (where is ALU or ALU64 operations), if destination register is already precise, mark source register as precise. This applies only when both involved registers are SCALARs. `ptr += scalar` and `scalar += ptr` cases are already handled correctly. This does have (negative) effect on some selftest programs and few Cilium programs. ~/baseline-tmp-results.csv are veristat results with this patch, while ~/baseline-results.csv is without it. See post scriptum for instructions on how to make Cilium programs testable with veristat. Correctness has a price. $ ./veristat -C -e file,prog,insns,states ~/baseline-results.csv ~/baseline-tmp-results.csv | grep -v '+0' File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) ----------------------- -------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- bpf_cubic.bpf.linked1.o bpf_cubic_cong_avoid 997 1700 +703 (+70.51%) 62 90 +28 (+45.16%) test_l4lb.bpf.linked1.o balancer_ingress 4559 5469 +910 (+19.96%) 118 126 +8 (+6.78%) ----------------------- -------------------- --------------- --------------- ------------------ ---------------- ---------------- ------------------- $ ./veristat -C -e file,prog,verdict,insns,states ~/baseline-results-cilium.csv ~/baseline-tmp-results-cilium.csv | grep -v '+0' File Program Total insns (A) Total insns (B) Total insns (DIFF) Total states (A) Total states (B) Total states (DIFF) ------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- ------------------- bpf_host.o tail_nodeport_nat_ingress_ipv6 4448 5261 +813 (+18.28%) 234 247 +13 (+5.56%) bpf_host.o tail_nodeport_nat_ipv6_egress 3396 3446 +50 (+1.47%) 201 203 +2 (+1.00%) bpf_lxc.o tail_nodeport_nat_ingress_ipv6 4448 5261 +813 (+18.28%) 234 247 +13 (+5.56%) bpf_overlay.o tail_nodeport_nat_ingress_ipv6 4448 5261 +813 (+18.28%) 234 247 +13 (+5.56%) bpf_xdp.o tail_lb_ipv4 71736 73442 +1706 (+2.38%) 4295 4370 +75 (+1.75%) ------------- ------------------------------ --------------- --------------- ------------------ ---------------- ---------------- ------------------- P.S. To make Cilium ([0]) programs libbpf-compatible and thus veristat-loadable, apply changes from topmost commit in [1], which does minimal changes to Cilium source code, mostly around SEC() annotations and BPF map definitions. 
[0] https://github.com/cilium/cilium/ [1] https://github.com/anakryiko/cilium/commits/libbpf-friendliness Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Change-Id: Ic4f608f1521c19c7bbb764d6d82dd7c05bf9b55b Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221104163649.121784-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ae4e8ce7fa4f..487bc9d86f51 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5140,6 +5140,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, return err; return adjust_ptr_min_max_vals(env, insn, dst_reg, src_reg); + } else if (dst_reg->precise) { + /* if dst_reg is precise, src_reg should be precise as well */ + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; } } else { /* Pretend the src is a reg with a known value, since we only From dea9d5c18e8a93eb240d576338f535307a549f1a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 22 Nov 2022 19:54:22 -0800 Subject: [PATCH 1379/1640] UPSTREAM: bpf: Prevent decl_tag from being referenced in func_proto arg [ Upstream commit f17472d4599697d701aa239b4c475a506bccfd19 ] Syzkaller managed to hit another decl_tag issue: btf_func_proto_check kernel/bpf/btf.c:4506 [inline] btf_check_all_types kernel/bpf/btf.c:4734 [inline] btf_parse_type_sec+0x1175/0x1980 kernel/bpf/btf.c:4763 btf_parse kernel/bpf/btf.c:5042 [inline] btf_new_fd+0x65a/0xb00 kernel/bpf/btf.c:6709 bpf_btf_load+0x6f/0x90 kernel/bpf/syscall.c:4342 __sys_bpf+0x50a/0x6c0 kernel/bpf/syscall.c:5034 __do_sys_bpf kernel/bpf/syscall.c:5093 [inline] __se_sys_bpf kernel/bpf/syscall.c:5091 [inline] __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:5091 do_syscall_64+0x54/0x70 arch/x86/entry/common.c:48 This seems similar to commit ea68376c8bed ("bpf: prevent decl_tag from being referenced in func_proto") but for the argument. 
Reported-by: syzbot+8dd0551dda6020944c5d@syzkaller.appspotmail.com Change-Id: I4188f3477ec73dfe991fd1a3ef997f9b29d3fcb6 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221123035422.872531-2-sdf@google.com Signed-off-by: Sasha Levin --- kernel/bpf/btf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a28bbec8c59f..8fd65a0eb7f3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2849,6 +2849,11 @@ static int btf_func_proto_check(struct btf_verifier_env *env, break; } + if (btf_type_is_resolve_source_only(arg_type)) { + btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + return -EINVAL; + } + if (args[i].name_off && (!btf_name_offset_valid(btf, args[i].name_off) || !btf_name_valid_identifier(btf, args[i].name_off))) { From dffe1cbc9c72e1f778f2ea96527d17a7d08f8774 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 6 Mar 2023 11:21:37 +0000 Subject: [PATCH 1380/1640] UPSTREAM: btf: fix resolving BTF_KIND_VAR after ARRAY, STRUCT, UNION, PTR [ Upstream commit 9b459804ff9973e173fabafba2a1319f771e85fa ] btf_datasec_resolve contains a bug that causes the following BTF to fail loading: [1] DATASEC a size=2 vlen=2 type_id=4 offset=0 size=1 type_id=7 offset=1 size=1 [2] INT (anon) size=1 bits_offset=0 nr_bits=8 encoding=(none) [3] PTR (anon) type_id=2 [4] VAR a type_id=3 linkage=0 [5] INT (anon) size=1 bits_offset=0 nr_bits=8 encoding=(none) [6] TYPEDEF td type_id=5 [7] VAR b type_id=6 linkage=0 This error message is printed during btf_check_all_types: [1] DATASEC a size=2 vlen=2 type_id=7 offset=1 size=1 Invalid type By tracing btf_*_resolve we can pinpoint the problem: btf_datasec_resolve(depth: 1, type_id: 1, mode: RESOLVE_TBD) = 0 btf_var_resolve(depth: 2, type_id: 4, mode: RESOLVE_TBD) = 0 btf_ptr_resolve(depth: 3, type_id: 3, mode: RESOLVE_PTR) = 0 btf_var_resolve(depth: 2, type_id: 4, mode: RESOLVE_PTR) = 0 btf_datasec_resolve(depth: 1, type_id: 1, mode: RESOLVE_PTR) = -22 The last invocation of btf_datasec_resolve should invoke btf_var_resolve by means of env_stack_push, instead it returns EINVAL. The reason is that env_stack_push is never executed for the second VAR. if (!env_type_is_resolve_sink(env, var_type) && !env_type_is_resolved(env, var_type_id)) { env_stack_set_next_member(env, i + 1); return env_stack_push(env, var_type, var_type_id); } env_type_is_resolve_sink() changes its behaviour based on resolve_mode. For RESOLVE_PTR, we can simplify the if condition to the following: (btf_type_is_modifier() || btf_type_is_ptr) && !env_type_is_resolved() Since we're dealing with a VAR the clause evaluates to false. This is not sufficient to trigger the bug however. The log output and EINVAL are only generated if btf_type_id_size() fails. if (!btf_type_id_size(btf, &type_id, &type_size)) { btf_verifier_log_vsi(env, v->t, vsi, "Invalid type"); return -EINVAL; } Most types are sized, so for example a VAR referring to an INT is not a problem. The bug is only triggered if a VAR points at a modifier. Since we skipped btf_var_resolve that modifier was also never resolved, which means that btf_resolved_type_id returns 0 aka VOID for the modifier. This in turn causes btf_type_id_size to return NULL, triggering EINVAL. 
To summarise, the following conditions are necessary: - VAR pointing at PTR, STRUCT, UNION or ARRAY - Followed by a VAR pointing at TYPEDEF, VOLATILE, CONST, RESTRICT or TYPE_TAG The fix is to reset resolve_mode to RESOLVE_TBD before attempting to resolve a VAR from a DATASEC. Fixes: 1dc92851849c ("bpf: kernel side support for BTF Var and DataSec") Change-Id: Icebff92466a13dce44c87d95acc8967233fc5bd0 Signed-off-by: Lorenz Bauer Link: https://lore.kernel.org/r/20230306112138.155352-2-lmb@isovalent.com Signed-off-by: Martin KaFai Lau Signed-off-by: Sasha Levin --- kernel/bpf/btf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8fd65a0eb7f3..5189bc5ebd89 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2719,6 +2719,7 @@ static int btf_datasec_resolve(struct btf_verifier_env *env, struct btf *btf = env->btf; u16 i; + env->resolve_mode = RESOLVE_TBD; for_each_vsi_from(i, v->next_member, v->t, vsi) { u32 var_type_id = vsi->type, type_id, type_size = 0; const struct btf_type *var_type = btf_type_by_id(env->btf, From dc6440ab5e21e783b1a80838ccea52dbb5204a26 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 11 Apr 2023 15:24:13 +0000 Subject: [PATCH 1381/1640] UPSTREAM: bpf: Fix incorrect verifier pruning due to missing register precision taints [ Upstream commit 71b547f561247897a0a14f3082730156c0533fed ] Juan Jose et al reported an issue found via fuzzing where the verifier's pruning logic prematurely marks a program path as safe. Consider the following program: 0: (b7) r6 = 1024 1: (b7) r7 = 0 2: (b7) r8 = 0 3: (b7) r9 = -2147483648 4: (97) r6 %= 1025 5: (05) goto pc+0 6: (bd) if r6 <= r9 goto pc+2 7: (97) r6 %= 1 8: (b7) r9 = 0 9: (bd) if r6 <= r9 goto pc+1 10: (b7) r6 = 0 11: (b7) r0 = 0 12: (63) *(u32 *)(r10 -4) = r0 13: (18) r4 = 0xffff888103693400 // map_ptr(ks=4,vs=48) 15: (bf) r1 = r4 16: (bf) r2 = r10 17: (07) r2 += -4 18: (85) call bpf_map_lookup_elem#1 19: (55) if r0 != 0x0 goto pc+1 20: (95) exit 21: (77) r6 >>= 10 22: (27) r6 *= 8192 23: (bf) r1 = r0 24: (0f) r0 += r6 25: (79) r3 = *(u64 *)(r0 +0) 26: (7b) *(u64 *)(r1 +0) = r3 27: (95) exit The verifier treats this as safe, leading to oob read/write access due to an incorrect verifier conclusion: func#0 @0 0: R1=ctx(off=0,imm=0) R10=fp0 0: (b7) r6 = 1024 ; R6_w=1024 1: (b7) r7 = 0 ; R7_w=0 2: (b7) r8 = 0 ; R8_w=0 3: (b7) r9 = -2147483648 ; R9_w=-2147483648 4: (97) r6 %= 1025 ; R6_w=scalar() 5: (05) goto pc+0 6: (bd) if r6 <= r9 goto pc+2 ; R6_w=scalar(umin=18446744071562067969,var_off=(0xffffffff00000000; 0xffffffff)) R9_w=-2147483648 7: (97) r6 %= 1 ; R6_w=scalar() 8: (b7) r9 = 0 ; R9=0 9: (bd) if r6 <= r9 goto pc+1 ; R6=scalar(umin=1) R9=0 10: (b7) r6 = 0 ; R6_w=0 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 9 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 13: (18) r4 = 0xffff8ad3886c2a00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 ; R0=map_value_or_null(id=1,off=0,ks=4,vs=48,imm=0) 19: (55) if r0 != 0x0 goto pc+1 ; R0=0 20: (95) exit from 19 to 21: R0=map_value(off=0,ks=4,vs=48,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 fp-8=mmmm???? 
21: (77) r6 >>= 10 ; R6_w=0 22: (27) r6 *= 8192 ; R6_w=0 23: (bf) r1 = r0 ; R0=map_value(off=0,ks=4,vs=48,imm=0) R1_w=map_value(off=0,ks=4,vs=48,imm=0) 24: (0f) r0 += r6 last_idx 24 first_idx 19 regs=40 stack=0 before 23: (bf) r1 = r0 regs=40 stack=0 before 22: (27) r6 *= 8192 regs=40 stack=0 before 21: (77) r6 >>= 10 regs=40 stack=0 before 19: (55) if r0 != 0x0 goto pc+1 parent didn't have regs=40 stack=0 marks: R0_rw=map_value_or_null(id=1,off=0,ks=4,vs=48,imm=0) R6_rw=P0 R7=0 R8=0 R9=0 R10=fp0 fp-8=mmmm???? last_idx 18 first_idx 9 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff8ad3886c2a00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 regs=40 stack=0 before 10: (b7) r6 = 0 25: (79) r3 = *(u64 *)(r0 +0) ; R0_w=map_value(off=0,ks=4,vs=48,imm=0) R3_w=scalar() 26: (7b) *(u64 *)(r1 +0) = r3 ; R1_w=map_value(off=0,ks=4,vs=48,imm=0) R3_w=scalar() 27: (95) exit from 9 to 11: R1=ctx(off=0,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 11 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 13: (18) r4 = 0xffff8ad3886c2a00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 frame 0: propagating r6 last_idx 19 first_idx 11 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff8ad3886c2a00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 parent didn't have regs=40 stack=0 marks: R1=ctx(off=0,imm=0) R6_r=P0 R7=0 R8=0 R9=0 R10=fp0 last_idx 9 first_idx 9 regs=40 stack=0 before 9: (bd) if r6 <= r9 goto pc+1 parent didn't have regs=40 stack=0 marks: R1=ctx(off=0,imm=0) R6_rw=Pscalar() R7_w=0 R8_w=0 R9_rw=0 R10=fp0 last_idx 8 first_idx 0 regs=40 stack=0 before 8: (b7) r9 = 0 regs=40 stack=0 before 7: (97) r6 %= 1 regs=40 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=40 stack=0 before 5: (05) goto pc+0 regs=40 stack=0 before 4: (97) r6 %= 1025 regs=40 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 19: safe frame 0: propagating r6 last_idx 9 first_idx 0 regs=40 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=40 stack=0 before 5: (05) goto pc+0 regs=40 stack=0 before 4: (97) r6 %= 1025 regs=40 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 from 6 to 9: safe verification time 110 usec stack depth 4 processed 36 insns (limit 1000000) max_states_per_insn 0 total_states 3 peak_states 3 mark_read 2 The verifier considers this program as safe by mistakenly pruning unsafe code paths. In the above func#0, code lines 0-10 are of interest. In line 0-3 registers r6 to r9 are initialized with known scalar values. In line 4 the register r6 is reset to an unknown scalar given the verifier does not track modulo operations. 
Due to this, the verifier can also not determine precisely which branches in lines 6 and 9 are taken, therefore it needs to explore them both. As can be seen, the verifier starts by exploring the false/fall-through paths first. The 'from 19 to 21' path has both r6=0 and r9=0 and the pointer arithmetic on r0 += r6 is therefore considered safe. Given the arithmetic, r6 is correctly marked for precision tracking, and backtracking kicks in, walking the current path back to the point where r6 was set to 0 in the fall-through branch. Next, the pruning logic pops the path 'from 9 to 11' from the stack. Also here, the state of the registers is the same, that is, r6=0 and r9=0, so that at line 19 the path can be pruned as it is considered safe. It is interesting to note that the conditional in line 9 turned r6 into a more precise state, that is, in the fall-through path at the beginning of line 10, it is R6=scalar(umin=1), and in the branch-taken path (which is analyzed here) at the beginning of line 11, r6 turned into a known const r6=0, as r9=0 prior to that and therefore (unsigned) r6 <= 0 concludes that r6 must be 0 (**): [...] ; R6_w=scalar() 9: (bd) if r6 <= r9 goto pc+1 ; R6=scalar(umin=1) R9=0 [...] from 9 to 11: R1=ctx(off=0,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 [...] The next path is 'from 6 to 9'. The verifier considers the old and current state equivalent, and therefore prunes the search incorrectly. Looking into the two states which are being compared by the pruning logic at line 9, the old state consists of R6_rwD=Pscalar() R9_rwD=0 R10=fp0 and the new state consists of R1=ctx(off=0,imm=0) R6_w=scalar(umax=18446744071562067968) R7_w=0 R8_w=0 R9_w=-2147483648 R10=fp0. While r6 had the reg->precise flag correctly set in the old state, r9 did not. The two r6 states are considered equivalent given the old one is a superset of the current, more precise one; however, r9's actual values (0 vs 0x80000000) mismatch. Given the old r9 did not have the reg->precise flag set, the verifier does not consider the register as contributing to the precision state of r6, and therefore it considered both r9 states as equivalent. However, for this specific pruned path (which is also the actual path taken at runtime), register r6 will be 0x400 and r9 0x80000000 when reaching line 21, thus oob-accessing the map. The purpose of precision tracking is to initially mark registers (including spilled ones) as imprecise to help the verifier's pruning logic find equivalent states it can then prune if they don't contribute to the program's safety aspects. For example, if registers are used for pointer arithmetic or to pass constant length to a helper, then the verifier sets the reg->precise flag and backtracks the BPF program instruction sequence and chain of verifier states to ensure that the given register or stack slot, including their dependencies, are marked as precisely tracked scalars. This also includes any other registers and slots that contribute to a tracked state of the given registers/stack slot. This backtracking relies on recorded jmp_history and is able to traverse the entire chain of parent states. This process ends only when all the necessary registers/slots and their transitive dependencies are marked as precise. backtrack_insn() is called from the current instruction up to the first instruction, and its purpose is to compute a bitmask of registers and stack slots that need precision tracking in the parent's verifier state.
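(A note on reading these logs: the regs= value in the backtracking lines is a hexadecimal bitmask of registers still needing precision, so regs=40 is bit 6, i.e. r6 alone, while regs=240 in the fixed output further below adds bit 9, i.e. r6 and r9 together.)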
For example, if a current instruction is r6 = r7, then r6 needs precision after this instruction and r7 needs precision before this instruction, that is, in the parent state. Hence for the latter r7 is marked and r6 unmarked. For the class of jmp/jmp32 instructions, backtrack_insn() today only looks at call and exit instructions and for all other conditionals the masks remain as-is. However, in the given situation register r6 has a dependency on r9 (as described above in **), so also that one needs to be marked for precision tracking. In other words, if an imprecise register influences a precise one, then the imprecise register should also be marked precise. Meaning, in the parent state both dest and src register need to be tracked for precision and therefore the marking must be more conservative by setting reg->precise flag for both. The precision propagation needs to cover both for the conditional: if the src reg was marked but not the dst reg and vice versa. After the fix the program is correctly rejected: func#0 @0 0: R1=ctx(off=0,imm=0) R10=fp0 0: (b7) r6 = 1024 ; R6_w=1024 1: (b7) r7 = 0 ; R7_w=0 2: (b7) r8 = 0 ; R8_w=0 3: (b7) r9 = -2147483648 ; R9_w=-2147483648 4: (97) r6 %= 1025 ; R6_w=scalar() 5: (05) goto pc+0 6: (bd) if r6 <= r9 goto pc+2 ; R6_w=scalar(umin=18446744071562067969,var_off=(0xffffffff80000000; 0x7fffffff),u32_min=-2147483648) R9_w=-2147483648 7: (97) r6 %= 1 ; R6_w=scalar() 8: (b7) r9 = 0 ; R9=0 9: (bd) if r6 <= r9 goto pc+1 ; R6=scalar(umin=1) R9=0 10: (b7) r6 = 0 ; R6_w=0 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 9 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 13: (18) r4 = 0xffff9290dc5bfe00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 ; R0=map_value_or_null(id=1,off=0,ks=4,vs=48,imm=0) 19: (55) if r0 != 0x0 goto pc+1 ; R0=0 20: (95) exit from 19 to 21: R0=map_value(off=0,ks=4,vs=48,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 fp-8=mmmm???? 21: (77) r6 >>= 10 ; R6_w=0 22: (27) r6 *= 8192 ; R6_w=0 23: (bf) r1 = r0 ; R0=map_value(off=0,ks=4,vs=48,imm=0) R1_w=map_value(off=0,ks=4,vs=48,imm=0) 24: (0f) r0 += r6 last_idx 24 first_idx 19 regs=40 stack=0 before 23: (bf) r1 = r0 regs=40 stack=0 before 22: (27) r6 *= 8192 regs=40 stack=0 before 21: (77) r6 >>= 10 regs=40 stack=0 before 19: (55) if r0 != 0x0 goto pc+1 parent didn't have regs=40 stack=0 marks: R0_rw=map_value_or_null(id=1,off=0,ks=4,vs=48,imm=0) R6_rw=P0 R7=0 R8=0 R9=0 R10=fp0 fp-8=mmmm???? last_idx 18 first_idx 9 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff9290dc5bfe00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 regs=40 stack=0 before 10: (b7) r6 = 0 25: (79) r3 = *(u64 *)(r0 +0) ; R0_w=map_value(off=0,ks=4,vs=48,imm=0) R3_w=scalar() 26: (7b) *(u64 *)(r1 +0) = r3 ; R1_w=map_value(off=0,ks=4,vs=48,imm=0) R3_w=scalar() 27: (95) exit from 9 to 11: R1=ctx(off=0,imm=0) R6=0 R7=0 R8=0 R9=0 R10=fp0 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 11 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 
13: (18) r4 = 0xffff9290dc5bfe00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 frame 0: propagating r6 last_idx 19 first_idx 11 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff9290dc5bfe00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 parent didn't have regs=40 stack=0 marks: R1=ctx(off=0,imm=0) R6_r=P0 R7=0 R8=0 R9=0 R10=fp0 last_idx 9 first_idx 9 regs=40 stack=0 before 9: (bd) if r6 <= r9 goto pc+1 parent didn't have regs=240 stack=0 marks: R1=ctx(off=0,imm=0) R6_rw=Pscalar() R7_w=0 R8_w=0 R9_rw=P0 R10=fp0 last_idx 8 first_idx 0 regs=240 stack=0 before 8: (b7) r9 = 0 regs=40 stack=0 before 7: (97) r6 %= 1 regs=40 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=240 stack=0 before 5: (05) goto pc+0 regs=240 stack=0 before 4: (97) r6 %= 1025 regs=240 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 19: safe from 6 to 9: R1=ctx(off=0,imm=0) R6_w=scalar(umax=18446744071562067968) R7_w=0 R8_w=0 R9_w=-2147483648 R10=fp0 9: (bd) if r6 <= r9 goto pc+1 last_idx 9 first_idx 0 regs=40 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=240 stack=0 before 5: (05) goto pc+0 regs=240 stack=0 before 4: (97) r6 %= 1025 regs=240 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 last_idx 9 first_idx 0 regs=200 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=240 stack=0 before 5: (05) goto pc+0 regs=240 stack=0 before 4: (97) r6 %= 1025 regs=240 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 11: R6=scalar(umax=18446744071562067968) R9=-2147483648 11: (b7) r0 = 0 ; R0_w=0 12: (63) *(u32 *)(r10 -4) = r0 last_idx 12 first_idx 11 regs=1 stack=0 before 11: (b7) r0 = 0 13: R0_w=0 R10=fp0 fp-8=0000???? 13: (18) r4 = 0xffff9290dc5bfe00 ; R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 15: (bf) r1 = r4 ; R1_w=map_ptr(off=0,ks=4,vs=48,imm=0) R4_w=map_ptr(off=0,ks=4,vs=48,imm=0) 16: (bf) r2 = r10 ; R2_w=fp0 R10=fp0 17: (07) r2 += -4 ; R2_w=fp-4 18: (85) call bpf_map_lookup_elem#1 ; R0_w=map_value_or_null(id=3,off=0,ks=4,vs=48,imm=0) 19: (55) if r0 != 0x0 goto pc+1 ; R0_w=0 20: (95) exit from 19 to 21: R0=map_value(off=0,ks=4,vs=48,imm=0) R6=scalar(umax=18446744071562067968) R7=0 R8=0 R9=-2147483648 R10=fp0 fp-8=mmmm???? 21: (77) r6 >>= 10 ; R6_w=scalar(umax=18014398507384832,var_off=(0x0; 0x3fffffffffffff)) 22: (27) r6 *= 8192 ; R6_w=scalar(smax=9223372036854767616,umax=18446744073709543424,var_off=(0x0; 0xffffffffffffe000),s32_max=2147475456,u32_max=-8192) 23: (bf) r1 = r0 ; R0=map_value(off=0,ks=4,vs=48,imm=0) R1_w=map_value(off=0,ks=4,vs=48,imm=0) 24: (0f) r0 += r6 last_idx 24 first_idx 21 regs=40 stack=0 before 23: (bf) r1 = r0 regs=40 stack=0 before 22: (27) r6 *= 8192 regs=40 stack=0 before 21: (77) r6 >>= 10 parent didn't have regs=40 stack=0 marks: R0_rw=map_value(off=0,ks=4,vs=48,imm=0) R6_r=Pscalar(umax=18446744071562067968) R7=0 R8=0 R9=-2147483648 R10=fp0 fp-8=mmmm???? 
last_idx 19 first_idx 11 regs=40 stack=0 before 19: (55) if r0 != 0x0 goto pc+1 regs=40 stack=0 before 18: (85) call bpf_map_lookup_elem#1 regs=40 stack=0 before 17: (07) r2 += -4 regs=40 stack=0 before 16: (bf) r2 = r10 regs=40 stack=0 before 15: (bf) r1 = r4 regs=40 stack=0 before 13: (18) r4 = 0xffff9290dc5bfe00 regs=40 stack=0 before 12: (63) *(u32 *)(r10 -4) = r0 regs=40 stack=0 before 11: (b7) r0 = 0 parent didn't have regs=40 stack=0 marks: R1=ctx(off=0,imm=0) R6_rw=Pscalar(umax=18446744071562067968) R7_w=0 R8_w=0 R9_w=-2147483648 R10=fp0 last_idx 9 first_idx 0 regs=40 stack=0 before 9: (bd) if r6 <= r9 goto pc+1 regs=240 stack=0 before 6: (bd) if r6 <= r9 goto pc+2 regs=240 stack=0 before 5: (05) goto pc+0 regs=240 stack=0 before 4: (97) r6 %= 1025 regs=240 stack=0 before 3: (b7) r9 = -2147483648 regs=40 stack=0 before 2: (b7) r8 = 0 regs=40 stack=0 before 1: (b7) r7 = 0 regs=40 stack=0 before 0: (b7) r6 = 1024 math between map_value pointer and register with unbounded min value is not allowed verification time 886 usec stack depth 4 processed 49 insns (limit 1000000) max_states_per_insn 1 total_states 5 peak_states 5 mark_read 2 Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Reported-by: Juan Jose Lopez Jaimez Reported-by: Meador Inge Reported-by: Simon Scannell Reported-by: Nenad Stojanovski Change-Id: I34c0e4663b2eca081efbad9733a64d1c274afcd2 Signed-off-by: Daniel Borkmann Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Reviewed-by: John Fastabend Reviewed-by: Juan Jose Lopez Jaimez Reviewed-by: Meador Inge Reviewed-by: Simon Scannell Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 487bc9d86f51..1e02cca432d3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1563,6 +1563,21 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, } } else if (opcode == BPF_EXIT) { return -ENOTSUPP; + } else if (BPF_SRC(insn->code) == BPF_X) { + if (!(*reg_mask & (dreg | sreg))) + return 0; + /* dreg sreg + * Both dreg and sreg need precision before + * this insn. If only sreg was marked precise + * before it would be equally necessary to + * propagate it to dreg. + */ + *reg_mask |= (sreg | dreg); + /* else dreg K + * Only dreg still needs precision before + * this insn, so for the K-based conditional + * there is nothing new to be marked. + */ } } else if (class == BPF_LD) { if (!(*reg_mask & dreg)) From 885de023177865377fa8879a8d426a55bdbe37e8 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 18 Apr 2023 15:53:38 -0700 Subject: [PATCH 1382/1640] UPSTREAM: bpf: Don't EFAULT for getsockopt with optval=NULL [ Upstream commit 00e74ae0863827d944e36e56a4ce1e77e50edb91 ] Some socket options do getsockopt with optval=NULL to estimate the size of the final buffer (which is returned via optlen). This breaks BPF getsockopt assumptions about permitted optval buffer size. Let's enforce these assumptions only when non-NULL optval is provided. 
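As an illustration of the userspace pattern being accommodated (a sketch only; the option names are placeholders, not a specific option known to use this convention):

	socklen_t len = 0;
	void *buf;

	/* Probe: optval == NULL, so the kernel only reports the
	 * required buffer size back through optlen.
	 */
	if (getsockopt(fd, SOL_EXAMPLE, EXAMPLE_OPT, NULL, &len) == 0) {
		buf = malloc(len);
		/* Fetch the actual data with a properly sized buffer. */
		getsockopt(fd, SOL_EXAMPLE, EXAMPLE_OPT, buf, &len);
	}

Before this fix, a cgroup BPF getsockopt program returning a ctx.optlen larger than the zero-sized buffer would make the probing call fail with EFAULT.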
Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Reported-by: Martin KaFai Lau Change-Id: I8011633cb75a070da1ef3fc388a4e7c0ced63dff Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/ZD7Js4fj5YyI2oLd@google.com/T/#mb68daf700f87a9244a15d01d00c3f0e5b08f49f7 Link: https://lore.kernel.org/bpf/20230418225343.553806-2-sdf@google.com Signed-off-by: Sasha Levin --- kernel/bpf/cgroup.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c2f0aa818b7a..08c1246c758e 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1131,7 +1131,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - if (ctx.optlen > max_optlen || ctx.optlen < 0) { + if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) { ret = -EFAULT; goto out; } @@ -1145,8 +1145,11 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } if (ctx.optlen != 0) { - if (copy_to_user(optval, ctx.optval, ctx.optlen) || - put_user(ctx.optlen, optlen)) { + if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) { + ret = -EFAULT; + goto out; + } + if (put_user(ctx.optlen, optlen)) { ret = -EFAULT; goto out; } From 73508a9098c39e07376896d6d0a3e5f310795a7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3%B8iland-J=C3%B8rgensen?= Date: Mon, 11 Sep 2023 15:28:14 +0200 Subject: [PATCH 1383/1640] UPSTREAM: bpf: Avoid deadlock when using queue and stack maps from NMI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit a34a9f1a19afe9c60ca0ea61dfeee63a1c2baac8 ] Syzbot discovered that the queue and stack maps can deadlock if they are being used from a BPF program that can be called from NMI context (such as one that is attached to a perf HW counter event). To fix this, add an in_nmi() check and use raw_spin_trylock() in NMI context, erroring out if grabbing the lock fails.
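The deadlock window, sketched as a hypothetical single-CPU trace (not taken from an actual splat):

	bpf_map_push_elem()                        /* task/softirq context */
	  raw_spin_lock_irqsave(&qs->lock, flags)  /* lock acquired */
	    <NMI: perf hardware counter overflow>
	      BPF prog attached to the perf event
	        bpf_map_push_elem()                /* same queue/stack map */
	          raw_spin_lock_irqsave(&qs->lock, flags)  /* spins forever */

Disabling interrupts does not mask NMIs, so the trylock fallback is the only safe option there; the diff applies it to all three lock sites.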
Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Reported-by: Hsin-Wei Hung Tested-by: Hsin-Wei Hung Co-developed-by: Hsin-Wei Hung Change-Id: Ieb5f50f1a50a86bcf069840ebfcdaa73e7cc196a Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20230911132815.717240-1-toke@redhat.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/queue_stack_maps.c | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index f697647ceb54..26ba7cb01136 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -118,7 +118,12 @@ static int __queue_map_get(struct bpf_map *map, void *value, bool delete) int err = 0; void *ptr; - raw_spin_lock_irqsave(&qs->lock, flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, flags); + } if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -148,7 +153,12 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete) void *ptr; u32 index; - raw_spin_lock_irqsave(&qs->lock, flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, flags); + } if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -213,7 +223,12 @@ static int queue_stack_map_push_elem(struct bpf_map *map, void *value, if (flags & BPF_NOEXIST || flags > BPF_EXIST) return -EINVAL; - raw_spin_lock_irqsave(&qs->lock, irq_flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, irq_flags); + } if (queue_stack_map_is_full(qs)) { if (!replace) { From 96291cae70e7c27749b61de57fe0b0b4750ae67b Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Thu, 2 Nov 2023 13:39:03 +0800 Subject: [PATCH 1384/1640] UPSTREAM: bpf: Fix precision tracking for BPF_ALU | BPF_TO_BE | BPF_END MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 291d044fd51f8484066300ee42afecf8c8db7b3a upstream. BPF_END and BPF_NEG have a different specification for the source bit in the opcode compared to other ALU/ALU64 instructions, where it is either reserved or used to specify the byte swap endianness. In both cases the source bit does not encode source operand location, and src_reg is a reserved field. backtrack_insn() currently does not differentiate BPF_END and BPF_NEG from other ALU/ALU64 instructions, which leads to r0 being incorrectly marked as precise when processing BPF_ALU | BPF_TO_BE | BPF_END instructions. This commit teaches backtrack_insn() to correctly mark precision for this case. While precision tracking of BPF_NEG and the other BPF_END instructions is correct and does not need fixing, this commit opts to process all BPF_NEG and BPF_END instructions within the same if-clause to better align with the current convention used in the verifier (e.g. check_alu_op).
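For reference, a sketch of how such an instruction is built with the uapi macros (illustrative, not taken from the patch):

	struct bpf_insn insn = {
		.code    = BPF_ALU | BPF_END | BPF_TO_BE,
		.dst_reg = BPF_REG_0,
		.src_reg = 0,	/* reserved, carries no operand */
		.imm     = 64,	/* swap width in bits: 16, 32 or 64 */
	};

BPF_TO_BE occupies the same bit as BPF_X, so the old backtracking code took the register-source branch and marked the always-zero src_reg field, i.e. r0, as precise.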
Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Cc: stable@vger.kernel.org Reported-by: Mohamed Mahmoud Closes: https://lore.kernel.org/r/87jzrrwptf.fsf@toke.dk Tested-by: Toke Høiland-Jørgensen Tested-by: Tao Lyu Acked-by: Eduard Zingerman Change-Id: Ic022a507af3d630b8094329a48b0a0f837e4a87e Signed-off-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231102053913.12004-2-shung-hsi.yu@suse.com Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1e02cca432d3..8afbb1626773 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1469,7 +1469,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (class == BPF_ALU || class == BPF_ALU64) { if (!(*reg_mask & dreg)) return 0; - if (opcode == BPF_MOV) { + if (opcode == BPF_END || opcode == BPF_NEG) { + /* sreg is reserved and unused + * dreg still need precision before this insn + */ + return 0; + } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { /* dreg = sreg * dreg needs precision after this insn From b7581026513b761703ac25acbb96128bd51f569e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 5 Mar 2024 10:20:09 +0000 Subject: [PATCH 1385/1640] UPSTREAM: Revert "bpf: Add map and need_defer parameters to .map_fd_put_ptr()" This reverts commit eb6f68ec92ab60b0540ebf64fe851e99d846e086 which is commit 20c20bd11a0702ce4dc9300c3da58acf551d9725 upstream. It breaks the Android kernel abi and can be brought back in the future in an abi-safe way if it is really needed. Bug: 161946584 Change-Id: I4611eed3677738ab29469733e2b4f6734ef3d605 Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf.h | 6 +----- kernel/bpf/arraymap.c | 12 +++++------- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/map_in_map.h | 2 +- 5 files changed, 11 insertions(+), 17 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e7117c5a49dd..f9424c86dfb5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -52,11 +52,7 @@ struct bpf_map_ops { /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); - /* If need_defer is true, the implementation should guarantee that - * the to-be-put element is still alive before the bpf program, which - * may manipulate it, exists. 
- */ - void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); + void (*map_fd_put_ptr)(void *ptr); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index af90c4498e80..fa46e7a9bf3b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -545,7 +545,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, old_ptr = xchg(array->ptrs + index, new_ptr); if (old_ptr) - map->ops->map_fd_put_ptr(map, old_ptr, true); + map->ops->map_fd_put_ptr(old_ptr); return 0; } @@ -561,7 +561,7 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) old_ptr = xchg(array->ptrs + index, NULL); if (old_ptr) { - map->ops->map_fd_put_ptr(map, old_ptr, true); + map->ops->map_fd_put_ptr(old_ptr); return 0; } else { return -ENOENT; @@ -585,9 +585,8 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, return prog; } -static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) +static void prog_fd_array_put_ptr(void *ptr) { - /* bpf_prog is freed after one RCU or tasks trace grace period */ bpf_prog_put(ptr); } @@ -698,9 +697,8 @@ err_out: return ee; } -static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) +static void perf_event_fd_array_put_ptr(void *ptr) { - /* bpf_perf_event is freed after one RCU grace period */ bpf_event_entry_free_rcu(ptr); } @@ -741,7 +739,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map, return cgroup_get_from_fd(fd); } -static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) +static void cgroup_fd_array_put_ptr(void *ptr) { /* cgroup_put free cgrp after a rcu grace period */ cgroup_put(ptr); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0d14a2a11463..ddcf0d46228c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -681,7 +681,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) if (map->ops->map_fd_put_ptr) { ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(map, ptr, true); + map->ops->map_fd_put_ptr(ptr); } } @@ -1433,7 +1433,7 @@ static void fd_htab_map_free(struct bpf_map *map) hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { void *ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(map, ptr, false); + map->ops->map_fd_put_ptr(ptr); } } @@ -1474,7 +1474,7 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, ret = htab_map_update_elem(map, key, &ptr, map_flags); if (ret) - map->ops->map_fd_put_ptr(map, ptr, false); + map->ops->map_fd_put_ptr(ptr); return ret; } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 830c3187828b..3dff41403583 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -109,7 +109,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, return inner_map; } -void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) +void bpf_map_fd_put_ptr(void *ptr) { /* ptr->ops->map_free() has to go through one * rcu grace period by itself. 
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 1e652a7bf60e..6183db9ec08c 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -18,7 +18,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1); void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); -void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer); +void bpf_map_fd_put_ptr(void *ptr); u32 bpf_map_fd_sys_lookup_elem(void *ptr); #endif From 5134210a0f6334532796eff9bb2033c9c1427d7e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:08 -0700 Subject: [PATCH 1386/1640] UPSTREAM: bpf: Factor out bpf_spin_lock into helpers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c1b3fed319d32a721d4b9c17afaeb430444ff773 ] Move ____bpf_spin_lock/unlock into helpers to make it more clear that quadruple underscore bpf_spin_lock/unlock are irqsave/restore variants. Change-Id: I755be22d5462b96c52d471912e32da4f8e8c6eaa Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-3-alexei.starovoitov@gmail.com Stable-dep-of: 178c54666f9c ("bpf: Mark bpf_spin_{lock,unlock}() helpers with notrace correctly") Signed-off-by: Sasha Levin --- kernel/bpf/helpers.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e178f185c3b3..93811cf99103 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -274,13 +274,18 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) static DEFINE_PER_CPU(unsigned long, irqsave_flags); -notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock) { unsigned long flags; local_irq_save(flags); __bpf_spin_lock(lock); __this_cpu_write(irqsave_flags, flags); +} + +notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +{ + __bpf_spin_lock_irqsave(lock); return 0; } @@ -291,13 +296,18 @@ const struct bpf_func_proto bpf_spin_lock_proto = { .arg1_type = ARG_PTR_TO_SPIN_LOCK, }; -notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) { unsigned long flags; flags = __this_cpu_read(irqsave_flags); __bpf_spin_unlock(lock); local_irq_restore(flags); +} + +notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +{ + __bpf_spin_unlock_irqrestore(lock); return 0; } @@ -318,9 +328,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, else lock = dst + map->spin_lock_off; preempt_disable(); - ____bpf_spin_lock(lock); + __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); - ____bpf_spin_unlock(lock); + __bpf_spin_unlock_irqrestore(lock); preempt_enable(); } From c37e054253063b12c2a678aaffbd2d10391a9239 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:56 -0700 Subject: [PATCH 1387/1640] UPSTREAM: bpf: Add typecast to bpf helpers to help BTF generation [ Upstream commit 7c6a469e3416fa23568c2395a3faa7dd6e376dcb ] When pahole converts dwarf to btf it emits only used types. Wrap existing bpf helper functions into typedef and use it in typecast to make gcc emits this type into dwarf. Then pahole will convert it to btf. 
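As an illustrative expansion for a hypothetical one-argument helper (simplified; the real BPF_CALL_x macro, shown in the diff below, generates the equivalent of this):

	static __always_inline u64 ____bpf_foo(struct bpf_map *map);
	typedef u64 (*btf_bpf_foo)(struct bpf_map *map);

	u64 bpf_foo(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
	{
		/* The cast through the typedef is a no-op at runtime, but
		 * it forces gcc to keep btf_bpf_foo in dwarf, from which
		 * pahole can recover the helper's real argument types.
		 */
		return ((btf_bpf_foo)____bpf_foo)((struct bpf_map *)(unsigned long)r1);
	}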
The "btf_#name_of_helper" types will be used to figure out types of arguments of bpf helpers. The generated code before and after is the same. Only dwarf and btf sections are different. Change-Id: I32b0ac123e3527a4d46df74e80515abeb056155a Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-3-ast@kernel.org Stable-dep-of: 178c54666f9c ("bpf: Mark bpf_spin_{lock,unlock}() helpers with notrace correctly") Signed-off-by: Sasha Levin --- include/linux/filter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 8ccd55009556..a39f5bd53395 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -493,10 +493,11 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) #define BPF_CALL_x(x, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ + typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ - return ____##name(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ + return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) From a6b55da3c2a7e1952dbf8e806b4ef651fb1c4575 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 6 Feb 2024 23:01:02 -0800 Subject: [PATCH 1388/1640] UPSTREAM: bpf: Mark bpf_spin_{lock,unlock}() helpers with notrace correctly [ Upstream commit 178c54666f9c4d2f49f2ea661d0c11b52f0ed190 ] Currently tracing is supposed not to allow for bpf_spin_{lock,unlock}() helper calls. This is to prevent deadlock for the following cases: - there is a prog (prog-A) calling bpf_spin_{lock,unlock}(). - there is a tracing program (prog-B), e.g., fentry, attached to bpf_spin_lock() and/or bpf_spin_unlock(). - prog-B calls bpf_spin_{lock,unlock}(). For such a case, when prog-A calls bpf_spin_{lock,unlock}(), a deadlock will happen. The related source codes are below in kernel/bpf/helpers.c: notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) notrace is supposed to prevent fentry prog from attaching to bpf_spin_{lock,unlock}(). But actually this is not the case and fentry prog can successfully attached to bpf_spin_lock(). Siddharth Chintamaneni reported the issue in [1]. The following is the macro definition for above BPF_CALL_1: #define BPF_CALL_x(x, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) #define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) The notrace attribute is actually applied to the static always_inline function ____bpf_spin_{lock,unlock}(). 
The actual callback function bpf_spin_{lock,unlock}() is not marked with notrace, hence allowing fentry prog to attach to the two helpers, and this may cause the above-mentioned deadlock. Siddharth Chintamaneni actually has a reproducer in [2]. To fix the issue, a new macro NOTRACE_BPF_CALL_1 is introduced which will add the notrace attribute to the original function instead of the hidden always_inline function, and this fixes the problem. [1] https://lore.kernel.org/bpf/CAE5sdEigPnoGrzN8WU7Tx-h-iFuMZgW06qp0KHWtpvoXxf1OAQ@mail.gmail.com/ [2] https://lore.kernel.org/bpf/CAE5sdEg6yUc_Jz50AnUXEEUh6O73yQ1Z6NV2srJnef0ZrQkZew@mail.gmail.com/ Fixes: d83525ca62cf ("bpf: introduce bpf_spin_lock") Change-Id: Ibde6d996b4c5f0b331377a2dc0cdbfce2528ac35 Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240207070102.335167-1-yonghong.song@linux.dev Signed-off-by: Sasha Levin --- include/linux/filter.h | 21 ++++++++++++++--------- kernel/bpf/helpers.c | 4 ++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index a39f5bd53395..68787eadfe22 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -490,24 +490,27 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ u64, __ur_3, u64, __ur_4, u64, __ur_5) -#define BPF_CALL_x(x, name, ...) \ +#define BPF_CALL_x(x, attr, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ - u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ - u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ + attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ + attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) -#define BPF_CALL_0(name, ...) BPF_CALL_x(0, name, __VA_ARGS__) -#define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) -#define BPF_CALL_2(name, ...) BPF_CALL_x(2, name, __VA_ARGS__) -#define BPF_CALL_3(name, ...) BPF_CALL_x(3, name, __VA_ARGS__) -#define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) -#define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) +#define __NOATTR +#define BPF_CALL_0(name, ...) BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_1(name, ...) BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_2(name, ...) BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_3(name, ...) BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_4(name, ...) BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_5(name, ...) BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__) + +#define NOTRACE_BPF_CALL_1(name, ...) BPF_CALL_x(1, notrace, name, __VA_ARGS__) #define bpf_ctx_range(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ...
offsetofend(TYPE, MEMBER) - 1 diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 93811cf99103..8c3c6323d2ed 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -283,7 +283,7 @@ static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock) __this_cpu_write(irqsave_flags, flags); } -notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) { __bpf_spin_lock_irqsave(lock); return 0; @@ -305,7 +305,7 @@ static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) local_irq_restore(flags); } -notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) { __bpf_spin_unlock_irqrestore(lock); return 0; From 8f5aad1351b404531862e643054e410ef9c453db Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Tue, 19 Mar 2024 13:44:34 -0700 Subject: [PATCH 1389/1640] UPSTREAM: rcu: add a helper to report consolidated flavor QS [ Upstream commit 1a77557d48cff187a169c2aec01c0dd78a5e7e50 ] When under heavy load, network processing can run CPU-bound for many tens of seconds. Even in preemptible kernels (non-RT kernel), this can block RCU Tasks grace periods, which can cause trace-event removal to take more than a minute, which is unacceptably long. This commit therefore creates a new helper function that passes through both RCU and RCU-Tasks quiescent states every 100 milliseconds. This hard-coded value suffices for current workloads. Suggested-by: Paul E. McKenney Reviewed-by: Jesper Dangaard Brouer Change-Id: I1e87aea8c7101bcc6da23808d0fe084b00e88537 Signed-off-by: Yan Zhai Reviewed-by: Paul E. McKenney Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/90431d46ee112d2b0af04dbfe936faaca11810a5.1710877680.git.yan@cloudflare.com Signed-off-by: Jakub Kicinski Stable-dep-of: 00bf63122459 ("bpf: report RCU QS in cpumap kthread") Signed-off-by: Sasha Levin --- include/linux/rcupdate.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 690762ed71a4..ff9e13d16e28 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -198,6 +198,37 @@ do { \ rcu_note_voluntary_context_switch(current); \ } while (0) +/** + * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states + * @old_ts: jiffies at start of processing. + * + * This helper is for long-running softirq handlers, such as NAPI threads in + * networking. The caller should initialize the variable passed in as @old_ts + * at the beginning of the softirq handler. When invoked frequently, this macro + * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will + * provide both RCU and RCU-Tasks quiescent states. Note that this macro + * modifies its old_ts argument. + * + * Because regions of code that have disabled softirq act as RCU read-side + * critical sections, this macro should be invoked with softirq (and + * preemption) enabled. + * + * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would + * have more chance to invoke schedule() calls and provide necessary quiescent + * states. As a contrast, calling cond_resched() only won't achieve the same + * effect because cond_resched() does not provide RCU-Tasks quiescent states. 
+ */ +#define rcu_softirq_qs_periodic(old_ts) \ +do { \ + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \ + time_after(jiffies, (old_ts) + HZ / 10)) { \ + preempt_disable(); \ + rcu_softirq_qs(); \ + preempt_enable(); \ + (old_ts) = jiffies; \ + } \ +} while (0) + /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. From f3d194bb4f828927e5eb7a0c7cd9746990082b1c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jun 2018 12:50:01 -0700 Subject: [PATCH 1390/1640] BACKPORT: rcu: Defer reporting RCU-preempt quiescent states when disabled This commit defers reporting of RCU-preempt quiescent states at rcu_read_unlock_special() time when any of interrupts, softirq, or preemption are disabled. These deferred quiescent states are reported at a later RCU_SOFTIRQ, context switch, idle entry, or CPU-hotplug offline operation. Of course, if another RCU read-side critical section has started in the meantime, the reporting of the quiescent state will be further deferred. This also means that disabling preemption, interrupts, and/or softirqs will act as an RCU-preempt read-side critical section. This is enforced by checking preempt_count() as needed. Some special cases must be handled on an ad-hoc basis, for example, context switch is a quiescent state even though both the scheduler and do_exit() disable preemption. In these cases, additional calls to rcu_preempt_deferred_qs() override the preemption disabling. Similar logic overrides disabled interrupts in rcu_preempt_check_callbacks() because in this case the quiescent state happened just before the corresponding scheduling-clock interrupt. In theory, this change lifts a long-standing restriction that required that if interrupts were disabled across a call to rcu_read_unlock() that the matching rcu_read_lock() also be contained within that interrupts-disabled region of code. Because the reporting of the corresponding RCU-preempt quiescent state is now deferred until after interrupts have been enabled, it is no longer possible for this situation to result in deadlocks involving the scheduler's runqueue and priority-inheritance locks. This may allow some code simplification that might reduce interrupt latency a bit. Unfortunately, in practice this would also defer deboosting a low-priority task that had been subjected to RCU priority boosting, so real-time-response considerations might well force this restriction to remain in place. Because RCU-preempt grace periods are now blocked not only by RCU read-side critical sections, but also by disabling of interrupts, preemption, and softirqs, it will be possible to eliminate RCU-bh and RCU-sched in favor of RCU-preempt in CONFIG_PREEMPT=y kernels. This may require some additional plumbing to provide the network denial-of-service guarantees that have been traditionally provided by RCU-bh. Once these are in place, CONFIG_PREEMPT=n kernels will be able to fold RCU-bh into RCU-sched. This would mean that all kernels would have but one flavor of RCU, which would open the door to significant code cleanup. Moving to a single flavor of RCU would also have the beneficial effect of reducing the NOCB kthreads by at least a factor of two. Signed-off-by: Paul E. McKenney [ paulmck: Apply rcu_read_unlock_special() preempt_count() feedback from Joel Fernandes. ] [ paulmck: Adjust rcu_eqs_enter() call to rcu_preempt_deferred_qs() in response to bug reports from kbuild test robot. 
] [ paulmck: Fix bug located by kbuild test robot involving recursion via rcu_preempt_deferred_qs(). ] Change-Id: If76ec913be9db64e7d1e70f408407229f5291af2 --- .../RCU/Design/Requirements/Requirements.html | 50 +++--- include/linux/rcutiny.h | 5 + kernel/rcu/tree.c | 9 ++ kernel/rcu/tree.h | 3 + kernel/rcu/tree_exp.h | 71 +++++++-- kernel/rcu/tree_plugin.h | 144 +++++++++++++----- 6 files changed, 205 insertions(+), 77 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 62e847bcdcdd..8ccbce4e2f76 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2393,30 +2393,9 @@ when invoked from a CPU-hotplug notifier.

RCU depends on the scheduler, and the scheduler uses RCU to protect some of its data structures. -This means the scheduler is forbidden from acquiring -the runqueue locks and the priority-inheritance locks -in the middle of an outermost RCU read-side critical section unless either -(1) it releases them before exiting that same -RCU read-side critical section, or -(2) interrupts are disabled across -that entire RCU read-side critical section. -This same prohibition also applies (recursively!) to any lock that is acquired -while holding any lock to which this prohibition applies. -Adhering to this rule prevents preemptible RCU from invoking -rcu_read_unlock_special() while either runqueue or -priority-inheritance locks are held, thus avoiding deadlock. - -

-Prior to v4.4, it was only necessary to disable preemption across -RCU read-side critical sections that acquired scheduler locks. -In v4.4, expedited grace periods started using IPIs, and these -IPIs could force a rcu_read_unlock() to take the slowpath. -Therefore, this expedited-grace-period change required disabling of -interrupts, not just preemption. - -

-For RCU's part, the preemptible-RCU rcu_read_unlock() -implementation must be written carefully to avoid similar deadlocks. +The preemptible-RCU rcu_read_unlock() +implementation must therefore be written carefully to avoid deadlocks +involving the scheduler's runqueue and priority-inheritance locks. In particular, rcu_read_unlock() must tolerate an interrupt where the interrupt handler invokes both rcu_read_lock() and rcu_read_unlock(). @@ -2425,7 +2404,7 @@ negative nesting levels to avoid destructive recursion via interrupt handler's use of RCU.

-This pair of mutual scheduler-RCU requirements came as a +This scheduler-RCU requirement came as a complete surprise.

@@ -2436,9 +2415,28 @@ when running context-switch-heavy workloads when built with CONFIG_NO_HZ_FULL=y did come as a surprise [PDF]. RCU has made good progress towards meeting this requirement, even -for context-switch-have CONFIG_NO_HZ_FULL=y workloads, +for context-switch-heavy CONFIG_NO_HZ_FULL=y workloads, but there is room for further improvement. +

+In the past, it was forbidden to disable interrupts across an +rcu_read_unlock() unless that interrupt-disabled region +of code also included the matching rcu_read_lock(). +Violating this restriction could result in deadlocks involving the +scheduler's runqueue and priority-inheritance spinlocks. +This restriction was lifted when interrupt-disabled calls to +rcu_read_unlock() started deferring the reporting of +the resulting RCU-preempt quiescent state until the end of that +interrupts-disabled region. +This deferred reporting means that the scheduler's runqueue and +priority-inheritance locks cannot be held while reporting an RCU-preempt +quiescent state, which lifts the earlier restriction, at least from +a deadlock perspective. +Unfortunately, real-time systems using RCU priority boosting may +need this restriction to remain in effect because deferred +quiescent-state reporting also defers deboosting, which in turn +degrades real-time latencies. +

Tracing and RCU

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b3dbf9502fd0..01b101e4dc39 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -116,6 +116,11 @@ static inline void rcu_irq_exit_irqson(void) { } static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } static inline void exit_rcu(void) { } +static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t) +{ + return false; +} +static inline void rcu_preempt_deferred_qs(struct task_struct *t) { } #ifdef CONFIG_SRCU void rcu_scheduler_starting(void); #else /* #ifndef CONFIG_SRCU */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 692506c8fe1a..8f341c566a86 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -443,6 +443,7 @@ static void rcu_momentary_dyntick_idle(void) { raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); rcu_dynticks_momentary_idle(); + rcu_preempt_deferred_qs(current); } /* @@ -791,6 +792,7 @@ static void rcu_eqs_enter_common(bool user) do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); + rcu_preempt_deferred_qs(current); __this_cpu_inc(disable_rcu_irq_enter); rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */ rcu_dynticks_eqs_enter(); /* After this, tracing works again. */ @@ -2909,6 +2911,12 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(!rdp->beenonline); + /* Report any deferred quiescent states if preemption enabled. */ + if (!(preempt_count() & PREEMPT_MASK)) + rcu_preempt_deferred_qs(current); + else if (rcu_preempt_need_deferred_qs(current)) + resched_cpu(rdp->cpu); /* Provoke future context switch. */ + /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); @@ -3876,6 +3884,7 @@ void rcu_report_dead(unsigned int cpu) rcu_report_exp_rdp(&rcu_sched_state, this_cpu_ptr(rcu_sched_state.rda), true); preempt_enable(); + rcu_preempt_deferred_qs(current); for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e1f285f0a70..0705723a03a9 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -204,6 +204,7 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible gpnum/completed wrap. */ + bool deferred_qs; /* This CPU awaiting a deferred QS? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. 
*/ unsigned long ticks_this_gp; /* The number of scheduling-clock */ @@ -474,6 +475,8 @@ static void rcu_cleanup_after_idle(void); static void rcu_prepare_for_idle(void); static void rcu_idle_count_callbacks_posted(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); +static bool rcu_preempt_need_deferred_qs(struct task_struct *t); +static void rcu_preempt_deferred_qs(struct task_struct *t); static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 81b4d4fd1277..161e31de2c61 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -235,6 +235,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake) { + WRITE_ONCE(rdp->deferred_qs, false); rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); } @@ -666,32 +667,70 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); */ static void sync_rcu_exp_handler(void *info) { - struct rcu_data *rdp; + unsigned long flags; struct rcu_state *rsp = info; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_node *rnp = rdp->mynode; struct task_struct *t = current; /* - * Within an RCU read-side critical section, request that the next - * rcu_read_unlock() report. Unless this RCU read-side critical - * section has already blocked, in which case it is already set - * up for the expedited grace period to wait on it. + * First, the common case of not being in an RCU read-side + * critical section. If also enabled or idle, immediately + * report the quiescent state, otherwise defer. */ - if (t->rcu_read_lock_nesting > 0 && - !t->rcu_read_unlock_special.b.blocked) { - t->rcu_read_unlock_special.b.exp_need_qs = true; + if (!t->rcu_read_lock_nesting) { + if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || + rcu_dynticks_curr_cpu_in_eqs()) { + rcu_report_exp_rdp(rsp, rdp, true); + } else { + rdp->deferred_qs = true; + resched_cpu(rdp->cpu); + } return; } /* - * We are either exiting an RCU read-side critical section (negative - * values of t->rcu_read_lock_nesting) or are not in one at all - * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU - * read-side critical section that blocked before this expedited - * grace period started. Either way, we can immediately report - * the quiescent state. + * Second, the less-common case of being in an RCU read-side + * critical section. In this case we can count on a future + * rcu_read_unlock(). However, this rcu_read_unlock() might + * execute on some other CPU, but in that case there will be + * a future context switch. Either way, if the expedited + * grace period is still waiting on this CPU, set ->deferred_qs + * so that the eventual quiescent state will be reported. + * Note that there is a large group of race conditions that + * can have caused this quiescent state to already have been + * reported, so we really do need to check ->expmask. 
*/ - rdp = this_cpu_ptr(rsp->rda); - rcu_report_exp_rdp(rsp, rdp, true); + if (t->rcu_read_lock_nesting > 0) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (rnp->expmask & rdp->grpmask) + rdp->deferred_qs = true; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + + /* + * The final and least likely case is where the interrupted + * code was just about to or just finished exiting the RCU-preempt + * read-side critical section, and no, we can't tell which. + * So either way, set ->deferred_qs to flag later code that + * a quiescent state is required. + * + * If the CPU is fully enabled (or if some buggy RCU-preempt + * read-side critical section is being used from idle), just + * invoke rcu_preempt_defer_qs() to immediately report the + * quiescent state. We cannot use rcu_read_unlock_special() + * because we are in an interrupt handler, which will cause that + * function to take an early exit without doing anything. + * + * Otherwise, use resched_cpu() to force a context switch after + * the CPU enables everything. + */ + rdp->deferred_qs = true; + if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) + rcu_preempt_deferred_qs(t); + else + resched_cpu(rdp->cpu); } /** diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8b3102d22823..c525d134bbd8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -358,6 +358,9 @@ static void rcu_preempt_note_context_switch(bool preempt) * behalf of preempted instance of __rcu_read_unlock(). */ rcu_read_unlock_special(t); + rcu_preempt_deferred_qs(t); + } else { + rcu_preempt_deferred_qs(t); } /* @@ -407,54 +410,51 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) } /* - * Handle special cases during rcu_read_unlock(), such as needing to - * notify RCU core processing or task having blocked during the RCU - * read-side critical section. + * Report deferred quiescent states. The deferral time can + * be quite short, for example, in the case of the call from + * rcu_read_unlock_special(). */ -void rcu_read_unlock_special(struct task_struct *t) +static void +rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) { bool empty_exp; bool empty_norm; bool empty_exp_now; - unsigned long flags; struct list_head *np; bool drop_boost_mutex = false; struct rcu_data *rdp; struct rcu_node *rnp; union rcu_special special; - /* NMI handlers cannot block and cannot safely manipulate state. */ - if (in_nmi()) - return; - - local_irq_save(flags); - /* * If RCU core is waiting for this CPU to exit its critical section, * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; + rdp = this_cpu_ptr(rcu_state_p->rda); + if (!special.s && !rdp->deferred_qs) { + local_irq_restore(flags); + return; + } if (special.b.need_qs) { rcu_preempt_qs(); t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s) { + if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { local_irq_restore(flags); return; } } /* - * Respond to a request for an expedited grace period, but only if - * we were not preempted, meaning that we were running on the same - * CPU throughout. If we were preempted, the exp_need_qs flag - * would have been cleared at the time of the first preemption, - * and the quiescent state would be reported when we were dequeued. + * Respond to a request by an expedited grace period for a + * quiescent state from this CPU. 
Note that requests from + * tasks are handled when removing the task from the + * blocked-tasks list below. */ - if (special.b.exp_need_qs) { - WARN_ON_ONCE(special.b.blocked); + if (special.b.exp_need_qs || rdp->deferred_qs) { t->rcu_read_unlock_special.b.exp_need_qs = false; - rdp = this_cpu_ptr(rcu_state_p->rda); + rdp->deferred_qs = false; rcu_report_exp_rdp(rcu_state_p, rdp, true); if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); @@ -462,19 +462,6 @@ void rcu_read_unlock_special(struct task_struct *t) } } - /* Hardware IRQ handlers cannot block, complain if they get here. */ - if (in_irq() || in_serving_softirq()) { - lockdep_rcu_suspicious(__FILE__, __LINE__, - "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); - pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n", - t->rcu_read_unlock_special.s, - t->rcu_read_unlock_special.b.blocked, - t->rcu_read_unlock_special.b.exp_need_qs, - t->rcu_read_unlock_special.b.need_qs); - local_irq_restore(flags); - return; - } - /* Clean up if blocked during RCU read-side critical section. */ if (special.b.blocked) { t->rcu_read_unlock_special.b.blocked = false; @@ -543,6 +530,72 @@ void rcu_read_unlock_special(struct task_struct *t) } } +/* + * Is a deferred quiescent-state pending, and are we also not in + * an RCU read-side critical section? It is the caller's responsibility + * to ensure it is otherwise safe to report any deferred quiescent + * states. The reason for this is that it is safe to report a + * quiescent state during context switch even though preemption + * is disabled. This function cannot be expected to understand these + * nuances, so the caller must handle them. + */ +static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +{ + return (this_cpu_ptr(&rcu_preempt_data)->deferred_qs || + READ_ONCE(t->rcu_read_unlock_special.s)) && + !t->rcu_read_lock_nesting; +} + +/* + * Report a deferred quiescent state if needed and safe to do so. + * As with rcu_preempt_need_deferred_qs(), "safe" involves only + * not being in an RCU read-side critical section. The caller must + * evaluate safety in terms of interrupt, softirq, and preemption + * disabling. + */ +static void rcu_preempt_deferred_qs(struct task_struct *t) +{ + unsigned long flags; + bool couldrecurse = t->rcu_read_lock_nesting >= 0; + + if (!rcu_preempt_need_deferred_qs(t)) + return; + if (couldrecurse) + t->rcu_read_lock_nesting -= INT_MIN; + local_irq_save(flags); + rcu_preempt_deferred_qs_irqrestore(t, flags); + if (couldrecurse) + t->rcu_read_lock_nesting += INT_MIN; +} + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ +void rcu_read_unlock_special(struct task_struct *t) +{ + unsigned long flags; + bool preempt_bh_were_disabled = + !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); + bool irqs_were_disabled; + + /* NMI handlers cannot block and cannot safely manipulate state. */ + if (in_nmi()) + return; + + local_irq_save(flags); + irqs_were_disabled = irqs_disabled_flags(flags); + if ((preempt_bh_were_disabled || irqs_were_disabled) && + t->rcu_read_unlock_special.b.blocked) { + /* Need to defer quiescent state until everything is enabled. 
*/ + raise_softirq_irqoff(RCU_SOFTIRQ); + local_irq_restore(flags); + return; + } + rcu_preempt_deferred_qs_irqrestore(t, flags); +} + /* * Dump detailed information for all tasks blocking the current RCU * grace period on the specified rcu_node structure. @@ -674,10 +727,20 @@ static void rcu_preempt_check_callbacks(void) { struct task_struct *t = current; - if (t->rcu_read_lock_nesting == 0) { - rcu_preempt_qs(); + if (t->rcu_read_lock_nesting > 0 || + (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { + /* No QS, force context switch if deferred. */ + if (rcu_preempt_need_deferred_qs(t)) + resched_cpu(smp_processor_id()); + } else if (rcu_preempt_need_deferred_qs(t)) { + rcu_preempt_deferred_qs(t); /* Report deferred QS. */ + return; + } else if (!t->rcu_read_lock_nesting) { + rcu_preempt_qs(); /* Report immediate QS. */ return; } + + /* If GP is oldish, ask for help from rcu_read_unlock_special(). */ if (t->rcu_read_lock_nesting > 0 && __this_cpu_read(rcu_data_p->core_needs_qs) && __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm)) @@ -803,6 +866,7 @@ void exit_rcu(void) barrier(); t->rcu_read_unlock_special.b.blocked = true; __rcu_read_unlock(); + rcu_preempt_deferred_qs(current); } #else /* #ifdef CONFIG_PREEMPT_RCU */ @@ -843,6 +907,16 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) return false; } +/* + * Because there is no preemptible RCU, there can be no deferred quiescent + * states. + */ +static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +{ + return false; +} +static void rcu_preempt_deferred_qs(struct task_struct *t) { } + /* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. From c54b096ee0ee03a4d50998ba9069ced4c0bf858a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jun 2018 14:45:25 -0700 Subject: [PATCH 1391/1640] UPSTREAM: rcu: Apply RCU-bh QSes to RCU-sched and RCU-preempt when safe One necessary step towards consolidating the three flavors of RCU is to make sure that the resulting consolidated "one flavor to rule them all" correctly handles networking denial-of-service attacks. One thing that allows RCU-bh to do so is that __do_softirq() invokes rcu_bh_qs() every so often, and so something similar has to happen for consolidated RCU. This must be done carefully. For example, if a preemption-disabled region of code takes an interrupt which does softirq processing before returning, consolidated RCU must ignore the resulting rcu_bh_qs() invocations -- preemption is still disabled, and that means an RCU reader for the consolidated flavor. This commit therefore creates a new rcu_softirq_qs() that is called only from the ksoftirqd task, thus avoiding the interrupted-a-preempted-region problem. This new rcu_softirq_qs() function invokes rcu_sched_qs(), rcu_preempt_qs(), and rcu_preempt_deferred_qs(). The latter call handles any deferred quiescent states. Note that __do_softirq() still invokes rcu_bh_qs(). It will continue to do so until a later stage of cleanup when the RCU-bh flavor is removed. Change-Id: Id4e2d284f3f268aa7acf0f8a8b6e9f8d890785c0 Signed-off-by: Paul E. McKenney [ paulmck: Fix !SMP issue located by kbuild test robot. 
] --- include/linux/rcutiny.h | 5 +++++ include/linux/rcutree.h | 1 + kernel/rcu/tree.c | 7 +++++++ kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 5 +++++ kernel/softirq.c | 2 ++ 6 files changed, 21 insertions(+) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 01b101e4dc39..7980be8e0080 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -90,6 +90,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, call_rcu(head, func); } +static inline void rcu_softirq_qs(void) +{ + rcu_sched_qs(); +} + #define rcu_note_context_switch(preempt) \ do { \ rcu_sched_qs(); \ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 37d6fd3b7ff8..2c44a87c328d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,6 +30,7 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H +void rcu_softirq_qs(void); void rcu_note_context_switch(bool preempt); int rcu_needs_cpu(u64 basem, u64 *nextevt); void rcu_cpu_stall_reset(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8f341c566a86..5a28ff21b3c5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -254,6 +254,13 @@ void rcu_bh_qs(void) } } +void rcu_softirq_qs(void) +{ + rcu_sched_qs(); + rcu_preempt_qs(); + rcu_preempt_deferred_qs(current); +} + /* * Steal a bit from the bottom of ->dynticks for idle entry/exit * control. Initially this is for TLB flushing. diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0705723a03a9..f25b2e52ae08 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -448,6 +448,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); +static void rcu_preempt_qs(void); static void rcu_preempt_note_context_switch(bool preempt); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c525d134bbd8..cc1c4fcfbd0a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -882,6 +882,11 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } +/* Because preemptible RCU does not exist, we can ignore its QSes. */ +static void rcu_preempt_qs(void) +{ +} + /* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. diff --git a/kernel/softirq.c b/kernel/softirq.c index d48995a6da29..cf5ee5146221 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -310,6 +310,8 @@ restart: __this_cpu_write(active_softirqs, 0); rcu_bh_qs(); + if (__this_cpu_read(ksoftirqd) == current) + rcu_softirq_qs(); local_irq_disable(); pending = local_softirq_pending(); From 189d1c3ce6e65b4ed78b17e5a030e5c94e9682c3 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Tue, 19 Mar 2024 13:44:40 -0700 Subject: [PATCH 1392/1640] UPSTREAM: bpf: report RCU QS in cpumap kthread [ Upstream commit 00bf63122459e87193ee7f1bc6161c83a525569f ] Under heavy load, cpumap kernel threads can be busy polling packets from redirect queues and block out RCU tasks from reaching quiescent states. It is insufficient to just call cond_resched() in such a context. Periodically raising a consolidated RCU QS before cond_resched() fixes the problem. Fixes: 6710e1126934 ("bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP") Reviewed-by: Jesper Dangaard Brouer Change-Id: Ieb6b0c55ead9af0905b44aeeecada34bc050d988 Signed-off-by: Yan Zhai Acked-by: Paul E.
McKenney Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/c17b9f1517e19d813da3ede5ed33ee18496bb5d8.1710877680.git.yan@cloudflare.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- kernel/bpf/cpumap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a367fc850393..19be747f4e5a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -249,6 +249,7 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; + unsigned long last_qs = jiffies; set_current_state(TASK_INTERRUPTIBLE); @@ -271,10 +272,12 @@ static int cpu_map_kthread_run(void *data) if (__ptr_ring_empty(rcpu->queue)) { schedule(); sched = 1; + last_qs = jiffies; } else { __set_current_state(TASK_RUNNING); } } else { + rcu_softirq_qs_periodic(last_qs); sched = cond_resched(); } From 4b430e7d6761fd12ea8474fce656c04a0c8dee74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 7 Mar 2024 13:03:35 +0100 Subject: [PATCH 1393/1640] UPSTREAM: bpf: Fix DEVMAP_HASH overflow check on 32-bit arches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 281d464a34f540de166cee74b723e97ac2515ec3 upstream. The devmap code allocates a number of hash buckets equal to the next power of two of the max_entries value provided when creating the map. When rounding up to the next power of two, the 32-bit variable storing the number of buckets can overflow, and the code checks for overflow by checking if the truncated 32-bit value is equal to 0. However, on 32-bit arches the rounding up itself can overflow mid-way through, because it ends up doing a left-shift of 32 bits on an unsigned long value. If the size of an unsigned long is four bytes, this is undefined behaviour, so there is no guarantee that we'll end up with a nice and tidy 0-value at the end. Syzbot managed to turn this into a crash on arm32 by creating a DEVMAP_HASH with max_entries > 0x80000000 and then trying to update it. Fix this by moving the overflow check before the rounding-up operation.
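To make the undefined behaviour concrete, here is a minimal sketch (illustrative only, not part of the patch; the values are hypothetical):

	/* With a 4-byte unsigned long, roundup_pow_of_two() on a value
	 * above 2^31 evaluates 1UL << 32 -- undefined behaviour -- so
	 * the old "!n_buckets" test may never observe a tidy 0.
	 */
	u32 max_entries = 0x80000001;			/* > 2^31 */
	u32 n_buckets;

	if (max_entries > 1UL << 31)			/* reject first ... */
		return -EINVAL;
	n_buckets = roundup_pow_of_two(max_entries);	/* ... never UB */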
Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Link: https://lore.kernel.org/r/000000000000ed666a0611af6818@google.com Reported-and-tested-by: syzbot+8cd36f6b65f3cafd400a@syzkaller.appspotmail.com Change-Id: I53ca51e9f63a42656deea2edfb572ae44f21a737 Signed-off-by: Toke Høiland-Jørgensen Message-ID: <20240307120340.99577-2-toke@redhat.com> Signed-off-by: Alexei Starovoitov Signed-off-by: Pu Lehui Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/devmap.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 4b2819b0a05a..2370fc31169f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -130,10 +130,13 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) cost = (u64) sizeof(struct list_head) * num_possible_cpus(); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); - - if (!dtab->n_buckets) /* Overflow check */ + /* hash table size must be power of 2; roundup_pow_of_two() can + * overflow into UB on 32-bit arches, so check that first + */ + if (dtab->map.max_entries > 1UL << 31) return -EINVAL; + + dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; } else { cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); From f48f5006fc8a539205653c597a1a2fb5cf011635 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 Sep 2024 21:17:46 +0200 Subject: [PATCH 1394/1640] UPSTREAM: bpf: Fix bpf_strtol and bpf_strtoul helpers for 32bit [ Upstream commit cfe69c50b05510b24e26ccb427c7cc70beafd6c1 ] The bpf_strtol() and bpf_strtoul() helpers are currently broken on 32bit: The argument type ARG_PTR_TO_LONG is BPF-side "long", not kernel-side "long", and is therefore always a fixed 64 bits no matter whether the underlying architecture is 64 or 32 bit. This contract breaks for the two mentioned helpers, since their BPF_CALL definitions were added with {unsigned,}long *res. Meaning, the transition from BPF-side "long" (BPF program) to kernel-side "long" (BPF helper) breaks here. Both helpers call __bpf_strtoll() with "long long" correctly, but then assign the result through a 32-bit "*(long *)" on 32-bit architectures. From a BPF program's point of view, this means the upper bits will be seen as uninitialised. Therefore, change both BPF_CALL signatures to {s,u}64 types to fix this situation. Now, also changing the uapi/bpf.h helper documentation, which generates bpf_helper_defs.h for BPF programs, is tricky: Changing signatures there to __{s,u}64 would trigger compiler warnings (incompatible pointer types passing 'long *' to parameter of type '__s64 *' (aka 'long long *')) for existing BPF programs. Leaving the signatures as-is is fine, as from a BPF program's point of view they are still BPF-side "long" and thus equivalent to __{s,u}64 on 64- or 32-bit underlying architectures. Note that bpf_strtol() and bpf_strtoul() are the only helpers with this issue.
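A minimal sketch of the type mismatch (illustrative values, assuming a little-endian 32-bit kernel):

	/* BPF-side "long" is always 64 bit, so the program hands the
	 * helper a pointer to an 8-byte result slot. Storing through a
	 * kernel "long *" on 32 bit writes only 4 of those bytes.
	 */
	long long _res = 123456789;	/* value from __bpf_strtoll() */
	u64 slot;			/* result slot in BPF program memory */

	*(long *)&slot = _res;	/* 32 bit: upper 4 bytes left stale */
	*(s64 *)&slot = _res;	/* fixed {s,u}64 signature: full 64-bit store */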
Fixes: d7a4cb9b6705 ("bpf: Introduce bpf_strtol and bpf_strtoul helpers") Reported-by: Alexei Starovoitov Change-Id: I4c0c917ccd08e6aa49816d580ac5cf4e60dff6b3 Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/481fcec8-c12c-9abb-8ecb-76c71c009959@iogearbox.net Link: https://lore.kernel.org/r/20240913191754.13290-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/helpers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8c3c6323d2ed..5859edfb5ad9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -457,7 +457,7 @@ static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, } BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, - long *, res) + s64 *, res) { long long _res; int err; @@ -482,7 +482,7 @@ const struct bpf_func_proto bpf_strtol_proto = { }; BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, - unsigned long *, res) + u64 *, res) { unsigned long long _res; bool is_negative; From 9177d979f03230fd5d06dd6226967737884bb3cd Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 6 Dec 2024 19:06:16 +0800 Subject: [PATCH 1395/1640] UPSTREAM: bpf: Handle BPF_EXIST and BPF_NOEXIST for LPM trie MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit eae6a075e9537dd69891cf77ca5a88fa8a28b4a1 ] Add the currently missing handling for the BPF_EXIST and BPF_NOEXIST flags. These flags can be specified by users and are relevant since LPM trie supports exact matches during update. Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Reviewed-by: Toke Høiland-Jørgensen Acked-by: Daniel Borkmann Change-Id: Ic37b09992c159b5b50aa99843f5bf4d5f330b859 Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20241206110622.1161752-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/lpm_trie.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index c372be6df264..1f92d531b446 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -364,6 +364,10 @@ static int trie_update_elem(struct bpf_map *map, * simply assign the @new_node to that slot and be done. */ if (!node) { + if (flags == BPF_EXIST) { + ret = -ENOENT; + goto out; + } rcu_assign_pointer(*slot, new_node); goto out; } @@ -372,18 +376,31 @@ static int trie_update_elem(struct bpf_map *map, * which already has the correct data array set. */ if (node->prefixlen == matchlen) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) { + if (flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out; + } + trie->n_entries--; + } else if (flags == BPF_EXIST) { + ret = -ENOENT; + goto out; + } + new_node->child[0] = node->child[0]; new_node->child[1] = node->child[1]; - if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) - trie->n_entries--; - rcu_assign_pointer(*slot, new_node); kfree_rcu(node, rcu); goto out; } + if (flags == BPF_EXIST) { + ret = -ENOENT; + goto out; + } + /* If the new node matches the prefix completely, it must be inserted * as an ancestor. Simply insert it between @node and *@slot. 
*/ From cfaa01d9cae855c96902a3baf775fe9d2ac8a7b4 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 6 Dec 2024 19:06:18 +0800 Subject: [PATCH 1396/1640] UPSTREAM: bpf: Fix exact match conditions in trie_get_next_key() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 27abc7b3fa2e09bbe41e2924d328121546865eda ] trie_get_next_key() uses node->prefixlen == key->prefixlen to identify an exact match. However, this is incorrect because when the target key doesn't fully match the found node (e.g., node->prefixlen != matchlen), these two nodes may still have the same prefixlen. It will return the expected result when the passed key exists in the trie. However, when a recently-deleted or nonexistent key is passed to trie_get_next_key(), it may skip keys and return an incorrect result. Fix it by using node->prefixlen == matchlen to identify exact matches. When the condition is true after the search, it also implies that node->prefixlen equals key->prefixlen; otherwise, the search would have returned NULL instead. Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map") Reviewed-by: Toke Høiland-Jørgensen Change-Id: I350b5fd58131f2ac176492b94e7b8bb6079a784d Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20241206110622.1161752-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/lpm_trie.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1f92d531b446..f726ceb8d7e9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -655,7 +655,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) struct lpm_trie_node **node_stack = NULL; int err = 0, stack_ptr = -1; unsigned int next_bit; - size_t matchlen; + size_t matchlen = 0; /* The get_next_key follows postorder. For the 4 node example in * the top of this file, the trie_get_next_key() returns the following @@ -694,7 +694,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) next_bit = extract_bit(key->data, node->prefixlen); node = rcu_dereference(node->child[next_bit]); } - if (!node || node->prefixlen != key->prefixlen || + if (!node || node->prefixlen != matchlen || (node->flags & LPM_TREE_NODE_FLAG_IM)) goto find_leftmost; From 297517d543e4c1daaf2f6b90c40848c0be63b4dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 19 Dec 2019 07:09:59 +0100 Subject: [PATCH 1397/1640] UPSTREAM: xdp: Simplify devmap cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 0536b85239b8440735cdd910aae0eb076ebbb439 ] After the RCU flavor consolidation [1], call_rcu() and synchronize_rcu() wait for preempt-disable regions (NAPI) in addition to the read-side critical sections. As a result of this, the cleanup code in devmap can be simplified: * There is no longer a need to flush in __dev_map_entry_free, since we know that this has been done when the call_rcu() callback is triggered. * When freeing the map, there is no need to explicitly wait for a flush. It's guaranteed to be done after the synchronize_rcu() call in dev_map_free(). The rcu_barrier() is still needed, so that the map is not freed prior to the elements.
[1] https://lwn.net/Articles/777036/ Change-Id: I5493def8c4d2279d1d95964d5acf880da76b57c1 Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20191219061006.21980-2-bjorn.topel@gmail.com Stable-dep-of: ab244dd7cf4c ("bpf: fix OOB devmap writes when deleting elements") Signed-off-by: Sasha Levin --- kernel/bpf/devmap.c | 43 +++++-------------------------------------- 1 file changed, 5 insertions(+), 38 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2370fc31169f..e63ecb21e055 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -206,7 +206,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i, cpu; + int i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -226,18 +226,6 @@ static void dev_map_free(struct bpf_map *map) /* Make sure prior __dev_map_entry_free() have completed. */ rcu_barrier(); - /* To ensure all pending flush operations have completed wait for flush - * list to empty on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new items will be added. - */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; @@ -351,8 +339,7 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, - bool in_napi_ctx) +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) { struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; @@ -390,11 +377,7 @@ error: for (i = 0; i < bq->count; i++) { struct xdp_frame *xdpf = bq->q[i]; - /* RX path under NAPI protection, can return frames faster */ - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); drops++; } goto out; @@ -415,7 +398,7 @@ void __dev_map_flush(struct bpf_map *map) rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) - bq_xmit_all(bq, XDP_XMIT_FLUSH, true); + bq_xmit_all(bq, XDP_XMIT_FLUSH); rcu_read_unlock(); } @@ -446,7 +429,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(bq, 0, true); + bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -515,27 +498,11 @@ static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) return dev ? 
&dev->ifindex : NULL; } -static void dev_map_flush_old(struct bpf_dtab_netdev *dev) -{ - if (dev->dev->netdev_ops->ndo_xdp_xmit) { - struct xdp_bulk_queue *bq; - int cpu; - - rcu_read_lock(); - for_each_online_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(bq, XDP_XMIT_FLUSH, false); - } - rcu_read_unlock(); - } -} - static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); - dev_map_flush_old(dev); free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); From bf3252d6181b23c7c4c3e1fe746d5435a75d539f Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 22 Nov 2024 13:10:30 +0100 Subject: [PATCH 1398/1640] UPSTREAM: bpf: fix OOB devmap writes when deleting elements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ab244dd7cf4c291f82faacdc50b45cc0f55b674d ] Jordy reported issue against XSKMAP which also applies to DEVMAP - the index used for accessing map entry, due to being a signed integer, causes the OOB writes. Fix is simple as changing the type from int to u32, however, when compared to XSKMAP case, one more thing needs to be addressed. When map is released from system via dev_map_free(), we iterate through all of the entries and an iterator variable is also an int, which implies OOB accesses. Again, change it to be u32. Example splat below: [ 160.724676] BUG: unable to handle page fault for address: ffffc8fc2c001000 [ 160.731662] #PF: supervisor read access in kernel mode [ 160.736876] #PF: error_code(0x0000) - not-present page [ 160.742095] PGD 0 P4D 0 [ 160.744678] Oops: Oops: 0000 [#1] PREEMPT SMP [ 160.749106] CPU: 1 UID: 0 PID: 520 Comm: kworker/u145:12 Not tainted 6.12.0-rc1+ #487 [ 160.757050] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019 [ 160.767642] Workqueue: events_unbound bpf_map_free_deferred [ 160.773308] RIP: 0010:dev_map_free+0x77/0x170 [ 160.777735] Code: 00 e8 fd 91 ed ff e8 b8 73 ed ff 41 83 7d 18 19 74 6e 41 8b 45 24 49 8b bd f8 00 00 00 31 db 85 c0 74 48 48 63 c3 48 8d 04 c7 <48> 8b 28 48 85 ed 74 30 48 8b 7d 18 48 85 ff 74 05 e8 b3 52 fa ff [ 160.796777] RSP: 0018:ffffc9000ee1fe38 EFLAGS: 00010202 [ 160.802086] RAX: ffffc8fc2c001000 RBX: 0000000080000000 RCX: 0000000000000024 [ 160.809331] RDX: 0000000000000000 RSI: 0000000000000024 RDI: ffffc9002c001000 [ 160.816576] RBP: 0000000000000000 R08: 0000000000000023 R09: 0000000000000001 [ 160.823823] R10: 0000000000000001 R11: 00000000000ee6b2 R12: dead000000000122 [ 160.831066] R13: ffff88810c928e00 R14: ffff8881002df405 R15: 0000000000000000 [ 160.838310] FS: 0000000000000000(0000) GS:ffff8897e0c40000(0000) knlGS:0000000000000000 [ 160.846528] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 160.852357] CR2: ffffc8fc2c001000 CR3: 0000000005c32006 CR4: 00000000007726f0 [ 160.859604] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 160.866847] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 160.874092] PKRU: 55555554 [ 160.876847] Call Trace: [ 160.879338] [ 160.881477] ? __die+0x20/0x60 [ 160.884586] ? page_fault_oops+0x15a/0x450 [ 160.888746] ? search_extable+0x22/0x30 [ 160.892647] ? search_bpf_extables+0x5f/0x80 [ 160.896988] ? exc_page_fault+0xa9/0x140 [ 160.900973] ? asm_exc_page_fault+0x22/0x30 [ 160.905232] ? dev_map_free+0x77/0x170 [ 160.909043] ? 
dev_map_free+0x58/0x170 [ 160.912857] bpf_map_free_deferred+0x51/0x90 [ 160.917196] process_one_work+0x142/0x370 [ 160.921272] worker_thread+0x29e/0x3b0 [ 160.925082] ? rescuer_thread+0x4b0/0x4b0 [ 160.929157] kthread+0xd4/0x110 [ 160.932355] ? kthread_park+0x80/0x80 [ 160.936079] ret_from_fork+0x2d/0x50 [ 160.943396] ? kthread_park+0x80/0x80 [ 160.950803] ret_from_fork_asm+0x11/0x20 [ 160.958482] Fixes: 546ac1ffb70d ("bpf: add devmap, a map for storing net device references") CC: stable@vger.kernel.org Reported-by: Jordy Zomer Suggested-by: Jordy Zomer Reviewed-by: Toke Høiland-Jørgensen Acked-by: John Fastabend Change-Id: Ic953af967455efbe281f1f8ad31d43d836d6a95c Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20241122121030.716788-3-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/devmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e63ecb21e055..40d424737859 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -206,7 +206,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i; + u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -512,7 +512,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; - int k = *(u32 *)key; + u32 k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; @@ -535,7 +535,7 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; - int k = *(u32 *)key; + u32 k = *(u32 *)key; unsigned long flags; int ret = -ENOENT; From 2edabcbd56e171b86c5124d306849854a035e1ce Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 26 Jan 2020 16:14:00 -0800 Subject: [PATCH 1399/1640] UPSTREAM: bpf, xdp: Update devmap comments to reflect napi/rcu usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 42a84a8cd0ff0cbff5a4595e1304c4567a30267d upstream. Now that we rely on synchronize_rcu and call_rcu waiting to exit perempt-disable regions (NAPI) lets update the comments to reflect this. Fixes: 0536b85239b84 ("xdp: Simplify devmap cleanup") Change-Id: I9ac355ce6acfcfa536d6a86ef225438f36174df5 Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Acked-by: Song Liu Link: https://lore.kernel.org/bpf/1580084042-11598-2-git-send-email-john.fastabend@gmail.com Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/devmap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 40d424737859..08ff40e3921c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -210,10 +210,12 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were - * disconnected from events. Wait for outstanding critical sections in - * these programs to complete. The rcu critical section only guarantees - * no further reads against netdev_map. It does __not__ ensure pending - * flush operations (if any) are complete. 
+ * disconnected from events. The following synchronize_rcu() guarantees + * both rcu read critical sections complete and waits for + * preempt-disable regions (NAPI being the relevant context here) so we + * are certain there will be no further reads against the netdev_map and + * all flush operations are complete. Flush operations can only be done + * from NAPI context for this reason. */ spin_lock(&dev_map_lock); @@ -518,12 +520,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; /* Use call_rcu() here to ensure any rcu critical sections have - * completed, but this does not guarantee a flush has happened - * yet. Because driver side rcu_read_lock/unlock only protects the - * running XDP program. However, for pending flush operations the - * dev and ctx are stored in another per cpu map. And additionally, - * the driver tear down ensures all soft irqs are complete before - * removing the net device in the case of dev_put equals zero. + * completed as well as any flush operations because call_rcu + * will wait for preempt-disable region to complete, NAPI in this + * context. And additionally, the driver tear down ensures all + * soft irqs are complete before removing the net device in the + * case of dev_put equals zero. */ old_dev = xchg(&dtab->netdev_map[k], NULL); if (old_dev) From 797ecdec81bc331c02ccf17b07391600fb931c1a Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Tue, 10 Dec 2024 11:42:45 +0000 Subject: [PATCH 1400/1640] UPSTREAM: bpf: fix potential error return [ Upstream commit c4441ca86afe4814039ee1b32c39d833c1a16bbc ] The bpf_remove_insns() function returns WARN_ON_ONCE(error), where error is a result of bpf_adj_branches(), and thus should always be 0. However, if for any reason it is not 0, it will be converted to a boolean by WARN_ON_ONCE and returned to user space as 1, not an actual error value. Fix this by returning the original err after the WARN check. Change-Id: I6c5a83da8e1cf62f5eef5e11828975a8bb63add7 Signed-off-by: Anton Protopopov Acked-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20241210114245.836164-1-aspsk@isovalent.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3b3154988b15..7f1a82d332b6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -504,6 +504,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) { + int err; + /* Branch offsets can't overflow when program is shrinking, no need * to call bpf_adj_branches(..., true) here */ @@ -511,7 +513,9 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) sizeof(struct bpf_insn) * (prog->len - off - cnt)); prog->len -= cnt; - return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); + err = bpf_adj_branches(prog, off, off + cnt, off, false); + WARN_ON_ONCE(err); + return err; } static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) From be018d8b043e96c7c9000b8739f6c6d45fe3c6fd Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Thu, 30 Aug 2018 07:51:53 -0700 Subject: [PATCH 1401/1640] UPSTREAM: bpf: add TCP_SAVE_SYN/TCP_SAVED_SYN options for bpf_(set|get)sockopt Adding support for two new bpf get/set sockopts: TCP_SAVE_SYN (set) and TCP_SAVED_SYN (get). This would allow a bpf program to build logic based on data from the ingress SYN packet (e.g.
doing tcp's tos/ tclass reflection (see sample prog)) and do it transparently from userspace program point of view. Change-Id: I4c1327418b21fc676348335e1e7255faea8005d0 Signed-off-by: Nikita V. Shirokov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- net/core/filter.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index f3581df507b3..a1ca8ef82724 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4000,6 +4000,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, tp->snd_ssthresh = val; } break; + case TCP_SAVE_SYN: + if (val < 0 || val > 1) + ret = -EINVAL; + else + tp->save_syn = val; + break; default: ret = -EINVAL; } @@ -4025,21 +4031,32 @@ static const struct bpf_func_proto bpf_setsockopt_proto = { BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + struct inet_connection_sock *icsk; struct sock *sk = bpf_sock->sk; + struct tcp_sock *tp; if (!sk_fullsock(sk)) goto err_clear; - #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { - if (optname == TCP_CONGESTION) { - struct inet_connection_sock *icsk = inet_csk(sk); + switch (optname) { + case TCP_CONGESTION: + icsk = inet_csk(sk); if (!icsk->icsk_ca_ops || optlen <= 1) goto err_clear; strncpy(optval, icsk->icsk_ca_ops->name, optlen); optval[optlen - 1] = 0; - } else { + break; + case TCP_SAVED_SYN: + tp = tcp_sk(sk); + + if (optlen <= 0 || !tp->saved_syn || + optlen > tp->saved_syn[0]) + goto err_clear; + memcpy(optval, tp->saved_syn + 1, optlen); + break; + default: goto err_clear; } } else if (level == SOL_IP) { From 42c3991b450d7a9fe642c57c0feff9d09a4dcf3b Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 7 Sep 2018 14:50:05 +0200 Subject: [PATCH 1402/1640] UPSTREAM: net/core/filter: fix unused-variable warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with CONFIG_INET=n will show the warning below: net/core/filter.c: In function ‘____bpf_getsockopt’: net/core/filter.c:4048:19: warning: unused variable ‘tp’ [-Wunused-variable] struct tcp_sock *tp; ^~ net/core/filter.c:4046:31: warning: unused variable ‘icsk’ [-Wunused-variable] struct inet_connection_sock *icsk; ^~~~ Move the variable declarations inside the {} block where they are used. 
Fixes: 1e215300f138 ("bpf: add TCP_SAVE_SYN/TCP_SAVED_SYN options for bpf_(set|get)sockopt") Change-Id: I5858bc736599f893b5b2060590ad378bb29aefdd Signed-off-by: Anders Roxell Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index a1ca8ef82724..31793ba74407 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4031,14 +4031,15 @@ static const struct bpf_func_proto bpf_setsockopt_proto = { BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { - struct inet_connection_sock *icsk; struct sock *sk = bpf_sock->sk; - struct tcp_sock *tp; if (!sk_fullsock(sk)) goto err_clear; #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { + struct inet_connection_sock *icsk; + struct tcp_sock *tp; + switch (optname) { case TCP_CONGESTION: icsk = inet_csk(sk); From d11a19b8095c312dc61a7941e0c8d8c1aede9084 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 22:32:26 -0700 Subject: [PATCH 1403/1640] BACKPORT: net: core: Fix build with CONFIG_IPV6=m Stephen Rothwell reports the following link failure with IPv6 as module: x86_64-linux-gnu-ld: net/core/filter.o: in function `sk_lookup': (.text+0x19219): undefined reference to `__udp6_lib_lookup' Fix the build by only enabling the IPv6 socket lookup if IPv6 support is compiled into the kernel. Change-Id: I4abb2d1f0cedc8ecf32deac3960beb03664c9c07 Signed-off-by: Joe Stringer Signed-off-by: Daniel Borkmann --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 31793ba74407..ec3af7adfec2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4867,7 +4867,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &udp_table, NULL); -#if IS_ENABLED(CONFIG_IPV6) +#if IS_REACHABLE(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; From 9fca6e334356522b7c253e5c4c3b25435d557c12 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 9 Nov 2018 10:54:00 -0800 Subject: [PATCH 1404/1640] UPSTREAM: bpf: Fix IPv6 dport byte order in bpf_sk_lookup_udp Lookup functions in sk_lookup have different expectations about the byte order of provided arguments. Specifically __inet_lookup, __udp4_lib_lookup and __udp6_lib_lookup expect dport to be in network byte order and do ntohs(dport) internally. At the same time __inet6_lookup expects dport to be in host byte order and correspondingly names the argument hnum. sk_lookup works correctly with __inet_lookup, __udp4_lib_lookup and __inet6_lookup with regard to dport. But in the __udp6_lib_lookup case it uses host instead of the expected network byte order. This makes the result returned by bpf_sk_lookup_udp for IPv6 incorrect. The patch fixes the byte order of dport passed to __udp6_lib_lookup. Originally sk_lookup properly handled UDPv6, but not TCPv6. 5ef0ae84f02a fixes TCPv6 but breaks UDPv6.
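The byte-order contract can be summarised in a short sketch (the port value is illustrative):

	/* Tuple fields arrive in network byte order; the lookup helpers
	 * disagree about what they expect:
	 *   __inet6_lookup(..., hnum, ...)      takes host order    -> ntohs()
	 *   __udp6_lib_lookup(..., dport, ...)  takes network order -> as-is
	 */
	__be16 dport = tuple->ipv6.dport;	/* e.g. htons(4242) on the wire */
	u16 hnum = ntohs(dport);		/* 4242, for __inet6_lookup() only */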
Fixes: 5ef0ae84f02a ("bpf: Fix IPv6 dport byte-order in bpf_sk_lookup") Change-Id: I1c1382742adce803efe85654f8f663d38011153b Signed-off-by: Andrey Ignatov Acked-by: Joe Stringer Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index ec3af7adfec2..785cc9633140 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4871,17 +4871,16 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; - u16 hnum = ntohs(tuple->ipv6.dport); if (proto == IPPROTO_TCP) sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); else if (likely(ipv6_bpf_stub)) sk = ipv6_bpf_stub->udp6_lib_lookup(net, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, tuple->ipv6.dport, dif, sdif, &udp_table, NULL); #endif From 28b0c25cf2a26b72411cba4376d800492827ca4b Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Mon, 15 Oct 2018 10:27:45 -0700 Subject: [PATCH 1405/1640] BACKPORT: bpf: Allow sk_lookup with IPv6 module This is a more complete fix than d71019b54bff ("net: core: Fix build with CONFIG_IPV6=m"), so that IPv6 sockets may be looked up if the IPv6 module is loaded (not just if it's compiled in). Change-Id: Ib4efdfd6df78e6de511a6d1b26c84a6b79028c7e Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 785cc9633140..4eb8d8e380a0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4867,7 +4867,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &udp_table, NULL); -#if IS_REACHABLE(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; From 0b90590c9f2b46194b527966a40e9f87165761d4 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 19 Oct 2018 19:56:49 -0700 Subject: [PATCH 1406/1640] BACKPORT: bpf: sk_msg program helper bpf_msg_push_data This allows a user to push data into a msg using sk_msg program types. The format is as follows: bpf_msg_push_data(msg, offset, len, flags). This will insert 'len' bytes at offset 'offset'. For example, to prepend 10 bytes at the front of the message the user can call bpf_msg_push_data(msg, 0, 10, 0); This invalidates the data bounds, so the BPF user will then have to recheck them after calling this. After this the msg size will have been updated and the user is free to write into the added bytes. We allow any offset/len as long as it is within the (data, data_end) range. However, a copy will be required if the ring is full and it's possible for the helper to fail with ENOMEM or EINVAL errors, which need to be handled by the BPF program. This can be used similarly to XDP metadata to pass data between the sk_msg layer and lower layers.
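A hypothetical SK_MSG program using the helper could look like the sketch below (the program, its name, and the 10-byte header are illustrative, not taken from this patch):

	SEC("sk_msg")
	int prepend_header(struct sk_msg_md *msg)
	{
		/* Insert 10 bytes at offset 0; ENOMEM/EINVAL are possible. */
		if (bpf_msg_push_data(msg, 0, 10, 0))
			return SK_DROP;

		/* The helper invalidated data/data_end; re-check bounds. */
		void *data = (void *)(long)msg->data;
		void *data_end = (void *)(long)msg->data_end;
		if (data + 10 > data_end)
			return SK_PASS;

		__builtin_memset(data, 0, 10);	/* fill the new header bytes */
		return SK_PASS;
	}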
Change-Id: Ib70acf2419e2941d0bb67c3331b1dd007688e4e8 Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 5 ++ include/uapi/linux/bpf.h | 17 +++++ net/core/filter.c | 134 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 156 insertions(+) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index ce7aeeed62da..aa7e4815279d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -189,6 +189,11 @@ static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) return &msg->sg.data[which]; } +static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which) +{ + return msg->sg.data[which]; +} + static inline struct page *sk_msg_page(struct sk_msg *msg, int which) { return sg_page(sk_msg_elem(msg, which)); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 640ddc081bc6..52334193f056 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2567,6 +2567,23 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf-local-storage cannot be found. + * + * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) + * Description + * For socket policies, insert *len* bytes into msg at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the msg. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails) in these cases BPF programs will get an appropriate + * error and BPF programs will need to handle them. + * + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/net/core/filter.c b/net/core/filter.c index 4eb8d8e380a0..58e7cd30b432 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2319,6 +2319,137 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; + u32 new, i = 0, l, space, copy = 0, offset = 0; + u8 *raw, *to, *from; + struct page *page; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + offset += l; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + if (start >= offset + l) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + /* If no space available will fallback to copy, we need at + * least one scatterlist elem available to push data into + * when start aligns to the beginning of an element or two + * when it falls inside an element. We handle the start equals + * offset case because its the common case for inserting a + * header. 
+ */ + if (!space || (space == 1 && start != offset)) + copy = msg->sg.data[i].length; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, + get_order(copy + len)); + if (unlikely(!page)) + return -ENOMEM; + + if (copy) { + int front, back; + + raw = page_address(page); + + psge = sk_msg_elem(msg, i); + front = start - offset; + back = psge->length - front; + from = sg_virt(psge); + + if (front) + memcpy(raw, from, front); + + if (back) { + from += front; + to = raw + front + len; + + memcpy(to, from, back); + } + + put_page(sg_page(psge)); + } else if (start - offset) { + psge = sk_msg_elem(msg, i); + rsge = sk_msg_elem_cpy(msg, i); + + psge->length = start - offset; + rsge.length -= psge->length; + rsge.offset += start; + + sk_msg_iter_var_next(i); + sg_unmark_end(psge); + sk_msg_iter_next(msg, end); + } + + /* Slot(s) to place newly allocated data */ + new = i; + + /* Shift one or two slots as needed */ + if (!copy) { + sge = sk_msg_elem_cpy(msg, i); + + sk_msg_iter_var_next(i); + sg_unmark_end(&sge); + sk_msg_iter_next(msg, end); + + nsge = sk_msg_elem_cpy(msg, i); + if (rsge.length) { + sk_msg_iter_var_next(i); + nnsge = sk_msg_elem_cpy(msg, i); + } + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sge = nsge; + sk_msg_iter_var_next(i); + if (rsge.length) { + nsge = nnsge; + nnsge = sk_msg_elem_cpy(msg, i); + } else { + nsge = sk_msg_elem_cpy(msg, i); + } + } + } + + /* Place newly allocated data buffer */ + sk_mem_charge(msg->sk, len); + msg->sg.size += len; + msg->sg.copy[new] = false; + sg_set_page(&msg->sg.data[new], page, len + copy, 0); + if (rsge.length) { + get_page(sg_page(&rsge)); + sk_msg_iter_var_next(new); + msg->sg.data[new] = rsge; + } + + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_push_data_proto = { + .func = bpf_msg_push_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -5351,6 +5482,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_head || func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || + func == bpf_msg_push_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || @@ -5690,6 +5822,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; + case BPF_FUNC_msg_push_data: + return &bpf_msg_push_data_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; default: From 7810ca032cfbbe0a0e1f76ab43e5cff81a62a8af Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 24 Oct 2018 22:05:44 +0200 Subject: [PATCH 1407/1640] BACKPORT: bpf: disallow direct packet access for unpriv in cg_skb Commit b39b5f411dcf ("bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB") added support for returning pkt pointers for direct packet access. Given this program type is allowed for both unprivileged and privileged users, we shouldn't allow unprivileged ones to use it, e.g. besides others one reason would be to avoid any potential speculation on the packet test itself, thus guard this for root only. 
Fixes: b39b5f411dcf ("bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB") Change-Id: I7ef3a39fe858666bf346410cba55e2659d7503fb Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Song Liu Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 58e7cd30b432..57b0cfface3f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6070,7 +6070,13 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range(struct __sk_buff, wire_len): return false; + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): + if (!capable(CAP_SYS_ADMIN)) + return false; + break; } + if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): From 15cc489cfec8fdeaa4cc35774cc150f37e55bb74 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 27 Oct 2018 00:49:02 +0200 Subject: [PATCH 1408/1640] UPSTREAM: bpf: fix wrong helper enablement in cgroup local storage Commit cd3394317653 ("bpf: introduce the bpf_get_local_storage() helper function") enabled the bpf_get_local_storage() helper also for BPF program types where it does not make sense to use them. They have been added both in sk_skb_func_proto() and sk_msg_func_proto() even though both program types are not invoked in combination with cgroups, and neither through BPF_PROG_RUN_ARRAY(). In the latter the bpf_cgroup_storage_set() is set shortly before BPF program invocation. Later, the helper bpf_get_local_storage() retrieves this prior set up per-cpu pointer and hands the buffer to the BPF program. The map argument in there solely retrieves the enum bpf_cgroup_storage_type from a local storage map associated with the program and based on the type returns either the global or per-cpu storage. However, there is no specific association between the program's map and the actual content in bpf_cgroup_storage[]. Meaning, any BPF program that would have been properly run from the cgroup side through BPF_PROG_RUN_ARRAY() where bpf_cgroup_storage_set() was performed, and that is later unloaded such that prog / maps are teared down will cause a use after free if that pointer is retrieved from programs that are not run through BPF_PROG_RUN_ARRAY() but have the cgroup local storage helper enabled in their func proto. Lets just remove it from the two sock_map program types to fix it. Auditing through the types where this helper is enabled, it appears that these are the only ones where it was mistakenly allowed. 
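The lifetime rule is easiest to see in the cgroup run path. The ordering below is a simplified sketch built from the functions named in this message (field and type names vary across kernel versions; illustrative only, not verbatim kernel code):

	/* The per-cpu pointer read back by bpf_get_local_storage() is
	 * published immediately before the program runs.
	 */
	bpf_cgroup_storage_set(prog->aux->cgroup_storage);	/* publish */
	ret = BPF_PROG_RUN(prog, ctx);		/* helper reads pointer here */
	/* sk_skb/sk_msg programs are never run through this path, so any
	 * pointer they would read may reference storage already torn down.
	 */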
Fixes: cd3394317653 ("bpf: introduce the bpf_get_local_storage() helper function") Change-Id: Ib39af90d6476223c61f3ea008a9d6408d80ab5bc Signed-off-by: Daniel Borkmann Cc: Roman Gushchin Acked-by: John Fastabend Acked-by: Roman Gushchin Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 57b0cfface3f..6fdef3b41d6d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5824,8 +5824,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5856,8 +5854,6 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; From 6c675662a87bce6b134ab6e1d5682662401a2582 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Wed, 7 Nov 2018 16:12:01 -0800 Subject: [PATCH 1409/1640] UPSTREAM: bpf: add perf event notification support for sock_ops This patch allows eBPF programs that use sock_ops to send perf-based event notifications using bpf_perf_event_output(). Our main use case for this is the following: We would like to monitor some subset of TCP sockets in user-space (the monitoring application would define the 4-tuples it wants to monitor), using TCP_INFO stats to analyze reported problems. The idea is to use those stats to see where the bottlenecks are likely to be ("is it application-limited?" or "is there evidence of BufferBloat in the path?" etc). Today we can do this by periodically polling for tcp_info, but this could be made more efficient if the kernel would asynchronously notify the application via tcp_info when some "interesting" thresholds (e.g., "RTT variance > X", or "total_retrans > Y" etc) are reached. And to make this effective, it is better if we could apply the threshold check *before* constructing the tcp_info netlink notification, so that we don't waste resources constructing notifications that will be discarded by the filter. This work solves the problem by adding perf-event-based notification support for sock_ops. The eBPF program can thus be designed to apply any desired filters to the bpf_sock_ops and trigger a perf event notification based on the evaluation from the filter. The user space component can use these perf event notifications to either read any state managed by the eBPF program, or issue a TCP_INFO netlink call if desired.
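A sketch of the kind of sock_ops program this enables (the map layout, threshold, and all names are hypothetical; total_retrans is one of the fields exposed via struct bpf_sock_ops):

	struct bpf_map_def SEC("maps") perf_events = {
		.type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
		.key_size    = sizeof(int),
		.value_size  = sizeof(__u32),
		.max_entries = 128,	/* >= number of possible CPUs */
	};

	SEC("sockops")
	int tcp_monitor(struct bpf_sock_ops *skops)
	{
		__u32 retrans = skops->total_retrans;

		/* Threshold evaluated in-kernel, before any notification
		 * is constructed, so uninteresting sockets cost nothing.
		 */
		if (retrans > 16)
			bpf_perf_event_output(skops, &perf_events,
					      BPF_F_CURRENT_CPU,
					      &retrans, sizeof(retrans));
		return 1;
	}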
Change-Id: I136f6f728ad1cf15deba3a5fb49c7498c1fdf894 Signed-off-by: Sowmini Varadhan Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- net/core/filter.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 6fdef3b41d6d..6efe4934adb8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4003,6 +4003,26 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +static const struct bpf_func_proto bpf_sockopt_event_output_proto = { + .func = bpf_sockopt_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -5800,6 +5820,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_perf_event_output: + return &bpf_sockopt_event_output_proto; default: return bpf_base_func_proto(func_id); } From c047c3f791d82d90e09f35a66b7b1bd007430ed7 Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 26 Nov 2018 13:42:41 -0800 Subject: [PATCH 1410/1640] UPSTREAM: bpf: Avoid unnecessary instruction in convert_bpf_ld_abs() 'offset' is constant and if it is zero, no need to subtract it from BPF_REG_TMP. Change-Id: I022afe1b2b24d213e0623a9777e32f1e3861816b Signed-off-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 6efe4934adb8..9c247eac2d32 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -469,7 +469,8 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); - *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + if (offset) + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { From c5b4d886f33ccb5b39cad471030ec5c847d6453e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 12 Jun 2019 17:18:47 +0800 Subject: [PATCH 1411/1640] UPSTREAM: bpf: Fix build error without CONFIG_INET If CONFIG_INET is not set, building fails: kernel/bpf/verifier.o: In function `check_mem_access': verifier.c: undefined reference to `bpf_xdp_sock_is_valid_access' kernel/bpf/verifier.o: In function `convert_ctx_accesses': verifier.c: undefined reference to `bpf_xdp_sock_convert_ctx_access' Reported-by: Hulk Robot Fixes: fada7fdc83c0 ("bpf: Allow bpf_map_lookup_elem() on an xskmap") Change-Id: I2d182f8b194ef0d689e7bffb55298f5d4325a4a2 Signed-off-by: YueHaibing Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f9424c86dfb5..e1ea9d8f73ec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -738,13 +738,6 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(struct bpf_map *map); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); -bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, - struct bpf_insn_access_aux *info); -u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, - u32 *target_size); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) @@ -1114,6 +1107,15 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); + +bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info); + +u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size); #else static inline bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, @@ -1130,6 +1132,21 @@ static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, { return 0; } +static inline bool bpf_xdp_sock_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + return false; +} + +static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + return 0; +} #endif /* CONFIG_INET */ #endif /* _LINUX_BPF_H */ From 28b99379b251556030aba48ec043d25f0b9a1488 Mon Sep 17 00:00:00 2001 From: Tim Zimmermann Date: Mon, 18 Aug 2025 05:11:50 +0000 Subject: [PATCH 1412/1640] Squashed revert of 4.14 tls 
backports Revert "net: generalize sk_alloc_sg to work with scatterlist rings" This reverts commit e351d8782539f93a7aea83055b03de0585653f1d. Revert "sock: make static tls function alloc_sg generic sock helper" This reverts commit 1a4d78e879a1f8234444af3cb7c207d422873843. Revert "net/tls: Fixed return value when tls_complete_pending_work() fails" This reverts commit 39d9e1c62e3f8b5569a0d9bf4d47a3d5d8ae02e2. Revert "tls: Use correct sk->sk_prot for IPV6" This reverts commit 2a0f5919e1e6a1c0423d895ab75eb15f94a67c69. Revert "tls: don't override sk_write_space if tls_set_sw_offload fails." This reverts commit 2b8b2e76222f1241cec381eb0ec599ee33e9cd00. Revert "tls: Avoid copying crypto_info again after cipher_type check." This reverts commit 93f16446c8ddacbf26ae8408ecd9c9c397b8d5b4. Revert "tls: Fix TLS ulp context leak, when TLS_TX setsockopt is not used." This reverts commit 797b8bb47fb27ee49a3b59ad110e5264585415aa. Revert "tls: Add function to update the TLS socket configuration" This reverts commit 25f03991a5210f31491e1e2fccec7cf0d080772e. Revert "tls: possible hang when do_tcp_sendpages hits sndbuf is full case" This reverts commit f0a8c1257fc3e8310a14c8f9d654620370cf568f. Revert "tls: clear key material from kernel memory when do_tls_setsockopt_conf fails" This reverts commit 18fef87e05d3569979bfce16ad7284259f60b52c. Revert "tls: zero the crypto information from tls_context before freeing" This reverts commit 0c0334299a7e085849c83bffe754626ba517d5ee. Revert "tls: don't copy the key out of tls12_crypto_info_aes_gcm_128" This reverts commit 10cacaf13189711ce5c81222aaae3cd61a5ca848. Revert "net/tls: Set count of SG entries if sk_alloc_sg returns -ENOSPC" This reverts commit 04f625fc5a68905f47349ec54538eff5523f29a5. Revert "tcp, ulp: add alias for all ulp modules" This reverts commit 0c02e0c3fd13f9b6eb4b5f7ce3b739e7c348ae2b. Revert "sock: fix sg page frag coalescing in sk_alloc_sg" This reverts commit 464e2326a7f5cd39d82f96750bb56a9d302edf8d. Revert "tls: Stricter error checking in zerocopy sendmsg path" This reverts commit 30a7a7b04f8b4e38b1af9acda2a4dd533e260ed2. Revert "tls: fix use-after-free in tls_push_record" This reverts commit 5e8a5c30546f731b20591eb8cffc7db299286d05. Revert "tls: retrun the correct IV in getsockopt" This reverts commit 94203f213c1938aec35f063c411bc5a73a9b814b. Revert "net/tls: Fix connection stall on partial tls record" This reverts commit 8e1b8e327903a77ca78233b9ae3e8ba2ef3c4364. Revert "net/tls: Don't recursively call push_record during tls_write_space callbacks" This reverts commit 3ac0f3e0b823a1217896ae70a731c825bf26f778. Revert "tls: reset crypto_info when do_tls_setsockopt_tx fails" This reverts commit ed10b9affb3af663d2cdb8a1e9134fb0a6358ab0. Revert "tls: return -EBUSY if crypto_info is already set" This reverts commit 2f54941c886c9f4d788ab0d4a81830a177ab7fd1. Revert "tls: fix sw_ctx leak" This reverts commit 3a28f04bc4c20e56d75327165f2922a525225b7c. Revert "net/tls: Only attach to sockets in ESTABLISHED state" This reverts commit a022bbe393fbe3a1f471ee94d846be03f7fe2136. Revert "net/tls: Fix inverted error codes to avoid endless loop" This reverts commit d3048a12f3eccc00d62db373df4cd50b1218f6f1. Revert "tls: Use kzalloc for aead_request allocation" This reverts commit f0e1cd056e99c1ac7ec59f46ad152c11f1b69570. Revert "uapi: fix linux/tls.h userspace compilation error" This reverts commit 33e58deefa0907ffba4d5a4c69f3641a649358d3. 
--- include/net/sock.h | 4 - include/net/tcp.h | 4 - include/net/tls.h | 25 ++---- include/uapi/linux/tls.h | 4 + net/core/sock.c | 58 -------------- net/ipv4/tcp_ulp.c | 2 +- net/tls/tls_main.c | 165 +++++++++++---------------------------- net/tls/tls_sw.c | 135 ++++++++++++++++++++++---------- 8 files changed, 149 insertions(+), 248 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 9f58e77fe19a..bf4234303856 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2234,10 +2234,6 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); -int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int sg_start, int *sg_curr, unsigned int *sg_size, - int first_coalesce); - /* * Default write policy as shown to user space via poll/select/SIGIO */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 2c8f18224acc..2beba3dc6369 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2159,10 +2159,6 @@ int tcp_set_ulp(struct sock *sk, const char *name); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); -#define MODULE_ALIAS_TCP_ULP(name) \ - __MODULE_INFO(alias, alias_userspace, name); \ - __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) - struct sk_msg; struct sk_psock; diff --git a/include/net/tls.h b/include/net/tls.h index 604fd982da19..b89d397dd62f 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -35,10 +35,6 @@ #define _TLS_OFFLOAD_H #include -#include -#include -#include -#include #include @@ -79,18 +75,14 @@ enum { TLS_PENDING_CLOSED_RECORD }; -union tls_crypto_context { - struct tls_crypto_info info; - struct tls12_crypto_info_aes_gcm_128 aes_gcm_128; -}; - struct tls_context { - union tls_crypto_context crypto_send; + union { + struct tls_crypto_info crypto_send; + struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128; + }; void *priv_ctx; - u8 tx_conf:2; - u16 prepend_size; u16 tag_size; u16 overhead_size; @@ -102,10 +94,10 @@ struct tls_context { struct scatterlist *partially_sent_record; u16 partially_sent_offset; unsigned long flags; - bool in_tcp_sendpages; u16 pending_open_record_frags; int (*push_pending_record)(struct sock *sk, int flags); + void (*free_resources)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_proto_close)(struct sock *sk, long timeout); @@ -130,7 +122,6 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); -void tls_sw_free_tx_resources(struct sock *sk); void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); void tls_icsk_clean_acked(struct sock *sk); @@ -173,7 +164,7 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) static inline void tls_err_abort(struct sock *sk) { - sk->sk_err = EBADMSG; + sk->sk_err = -EBADMSG; sk->sk_error_report(sk); } @@ -212,8 +203,8 @@ static inline void tls_fill_prepend(struct tls_context *ctx, * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE */ buf[0] = record_type; - buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.info.version); - buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.info.version); + buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.version); + buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.version); /* we can use IV for nonce explicit according to spec */ buf[3] = pkt_len >> 8; buf[4] = pkt_len & 0xFF; diff --git 
a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index 293b2cdad88d..d5e0682ab837 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -35,6 +35,10 @@ #define _UAPI_LINUX_TLS_H #include +#include +#include +#include +#include /* TLS socket options */ #define TLS_TX 1 /* Set transmit parameters */ diff --git a/net/core/sock.c b/net/core/sock.c index 5981b4443e3b..342113238216 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2429,64 +2429,6 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, - int first_coalesce) -{ - int sg_curr = *sg_curr_index, use = 0, rc = 0; - unsigned int size = *sg_curr_size; - struct page_frag *pfrag; - struct scatterlist *sge; - - len -= size; - pfrag = sk_page_frag(sk); - while (len > 0) { - unsigned int orig_offset; - - if (!sk_page_frag_refill(sk, pfrag)) { - rc = -ENOMEM; - goto out; - } - - use = min_t(int, len, pfrag->size - pfrag->offset); - if (!sk_wmem_schedule(sk, use)) { - rc = -ENOMEM; - goto out; - } - - sk_mem_charge(sk, use); - size += use; - orig_offset = pfrag->offset; - pfrag->offset += use; - - sge = sg + sg_curr - 1; - if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page && - sg->offset + sg->length == orig_offset) { - sg->length += use; - } else { - sge = sg + sg_curr; - sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); - sg_curr++; - if (sg_curr == MAX_SKB_FRAGS) - sg_curr = 0; - - if (sg_curr == sg_start) { - rc = -ENOSPC; - break; - } - } - - len -= use; - } -out: - *sg_curr_size = size; - *sg_curr_index = sg_curr; - return rc; -} -EXPORT_SYMBOL(sk_alloc_sg); - static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 07bf9e02df13..d8e6a42c6b3e 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -39,7 +39,7 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) #ifdef CONFIG_MODULES if (!ulp && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); - request_module("tcp-ulp-%s", name); + request_module("%s", name); rcu_read_lock(); ulp = tcp_ulp_find(name); } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 3205335bc7b9..b6e1363cb048 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -44,30 +44,9 @@ MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_ALIAS_TCP_ULP("tls"); -enum { - TLSV4, - TLSV6, - TLS_NUM_PROTS, -}; - -enum { - TLS_BASE_TX, - TLS_SW_TX, - TLS_NUM_CONFIG, -}; - -static struct proto *saved_tcpv6_prot; -static DEFINE_MUTEX(tcpv6_prot_mutex); -static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG]; - -static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) -{ - int ip_ver = sk->sk_family == AF_INET6 ? 
TLSV6 : TLSV4; - - sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf]; -} +static struct proto tls_base_prot; +static struct proto tls_sw_prot; int wait_on_pending_writer(struct sock *sk, long *timeo) { @@ -108,7 +87,6 @@ int tls_push_sg(struct sock *sk, size = sg->length - offset; offset += sg->offset; - ctx->in_tcp_sendpages = true; while (1) { if (sg_is_last(sg)) sendpage_flags = flags; @@ -129,7 +107,6 @@ retry: offset -= sg->offset; ctx->partially_sent_offset = offset; ctx->partially_sent_record = (void *)sg; - ctx->in_tcp_sendpages = false; return ret; } @@ -144,8 +121,6 @@ retry: } clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags); - ctx->in_tcp_sendpages = false; - ctx->sk_write_space(sk); return 0; } @@ -215,15 +190,6 @@ static void tls_write_space(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); - /* If in_tcp_sendpages call lower protocol write space handler - * to ensure we wake up any waiting operations there. For example - * if do_tcp_sendpages where to call sk_wait_event. - */ - if (ctx->in_tcp_sendpages) { - ctx->sk_write_space(sk); - return; - } - if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) { gfp_t sk_allocation = sk->sk_allocation; int rc; @@ -243,15 +209,6 @@ static void tls_write_space(struct sock *sk) ctx->sk_write_space(sk); } -static void tls_ctx_free(struct tls_context *ctx) -{ - if (!ctx) - return; - - memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send)); - kfree(ctx); -} - static void tls_sk_proto_close(struct sock *sk, long timeout) { struct tls_context *ctx = tls_get_ctx(sk); @@ -259,12 +216,6 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) void (*sk_proto_close)(struct sock *sk, long timeout); lock_sock(sk); - sk_proto_close = ctx->sk_proto_close; - - if (ctx->tx_conf == TLS_BASE_TX) { - tls_ctx_free(ctx); - goto skip_tx_cleanup; - } if (!tls_complete_pending_work(sk, ctx, 0, &timeo)) tls_handle_open_record(sk, 0); @@ -281,16 +232,13 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) sg++; } } - + ctx->free_resources(sk); kfree(ctx->rec_seq); kfree(ctx->iv); - if (ctx->tx_conf == TLS_SW_TX) { - tls_sw_free_tx_resources(sk); - tls_ctx_free(ctx); - } + sk_proto_close = ctx->sk_proto_close; + kfree(ctx); -skip_tx_cleanup: release_sock(sk); sk_proto_close(sk, timeout); } @@ -317,7 +265,7 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, } /* get user crypto info */ - crypto_info = &ctx->crypto_send.info; + crypto_info = &ctx->crypto_send; if (!TLS_CRYPTO_INFO_READY(crypto_info)) { rc = -EBUSY; @@ -343,8 +291,7 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, goto out; } lock_sock(sk); - memcpy(crypto_info_aes_gcm_128->iv, - ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + memcpy(crypto_info_aes_gcm_128->iv, ctx->iv, TLS_CIPHER_AES_GCM_128_IV_SIZE); release_sock(sk); if (copy_to_user(optval, @@ -391,43 +338,46 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, unsigned int optlen) { - struct tls_crypto_info *crypto_info; + struct tls_crypto_info *crypto_info, tmp_crypto_info; struct tls_context *ctx = tls_get_ctx(sk); + struct proto *prot = NULL; int rc = 0; - int tx_conf; if (!optval || (optlen < sizeof(*crypto_info))) { rc = -EINVAL; goto out; } - crypto_info = &ctx->crypto_send.info; - /* Currently we don't support set crypto info more than one time */ - if (TLS_CRYPTO_INFO_READY(crypto_info)) { - rc = -EBUSY; - goto out; - } - - rc = copy_from_user(crypto_info, 
optval, sizeof(*crypto_info)); + rc = copy_from_user(&tmp_crypto_info, optval, sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto out; } /* check version */ - if (crypto_info->version != TLS_1_2_VERSION) { + if (tmp_crypto_info.version != TLS_1_2_VERSION) { rc = -ENOTSUPP; - goto err_crypto_info; + goto out; } - switch (crypto_info->cipher_type) { + /* get user crypto info */ + crypto_info = &ctx->crypto_send; + + /* Currently we don't support set crypto info more than one time */ + if (TLS_CRYPTO_INFO_READY(crypto_info)) + goto out; + + switch (tmp_crypto_info.cipher_type) { case TLS_CIPHER_AES_GCM_128: { if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) { rc = -EINVAL; - goto err_crypto_info; + goto out; } - rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), - optlen - sizeof(*crypto_info)); + rc = copy_from_user( + crypto_info, + optval, + sizeof(struct tls12_crypto_info_aes_gcm_128)); + if (rc) { rc = -EFAULT; goto err_crypto_info; @@ -436,23 +386,25 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, } default: rc = -EINVAL; - goto err_crypto_info; + goto out; } + ctx->sk_write_space = sk->sk_write_space; + sk->sk_write_space = tls_write_space; + + ctx->sk_proto_close = sk->sk_prot->close; + /* currently SW is default, we will have ethtool in future */ rc = tls_set_sw_offload(sk, ctx); - tx_conf = TLS_SW_TX; + prot = &tls_sw_prot; if (rc) goto err_crypto_info; - ctx->tx_conf = tx_conf; - update_sk_prot(sk, ctx); - ctx->sk_write_space = sk->sk_write_space; - sk->sk_write_space = tls_write_space; + sk->sk_prot = prot; goto out; err_crypto_info: - memzero_explicit(crypto_info, sizeof(union tls_crypto_context)); + memset(crypto_info, 0, sizeof(*crypto_info)); out: return rc; } @@ -486,34 +438,12 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, return do_tls_setsockopt(sk, optname, optval, optlen); } -static void build_protos(struct proto *prot, struct proto *base) -{ - prot[TLS_BASE_TX] = *base; - prot[TLS_BASE_TX].setsockopt = tls_setsockopt; - prot[TLS_BASE_TX].getsockopt = tls_getsockopt; - prot[TLS_BASE_TX].close = tls_sk_proto_close; - - prot[TLS_SW_TX] = prot[TLS_BASE_TX]; - prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; - prot[TLS_SW_TX].sendpage = tls_sw_sendpage; -} - static int tls_init(struct sock *sk) { - int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; int rc = 0; - /* The TLS ulp is currently supported only for TCP sockets - * in ESTABLISHED state. - * Supporting sockets in LISTEN state will require us - * to modify the accept implementation to clone rather then - * share the ulp context. 
- */ - if (sk->sk_state != TCP_ESTABLISHED) - return -ENOTSUPP; - /* allocate tls context */ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { @@ -523,21 +453,7 @@ static int tls_init(struct sock *sk) icsk->icsk_ulp_data = ctx; ctx->setsockopt = sk->sk_prot->setsockopt; ctx->getsockopt = sk->sk_prot->getsockopt; - ctx->sk_proto_close = sk->sk_prot->close; - - /* Build IPv6 TLS whenever the address of tcpv6_prot changes */ - if (ip_ver == TLSV6 && - unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { - mutex_lock(&tcpv6_prot_mutex); - if (likely(sk->sk_prot != saved_tcpv6_prot)) { - build_protos(tls_prots[TLSV6], sk->sk_prot); - smp_store_release(&saved_tcpv6_prot, sk->sk_prot); - } - mutex_unlock(&tcpv6_prot_mutex); - } - - ctx->tx_conf = TLS_BASE_TX; - update_sk_prot(sk, ctx); + sk->sk_prot = &tls_base_prot; out: return rc; } @@ -552,7 +468,14 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { static int __init tls_register(void) { - build_protos(tls_prots[TLSV4], &tcp_prot); + tls_base_prot = tcp_prot; + tls_base_prot.setsockopt = tls_setsockopt; + tls_base_prot.getsockopt = tls_getsockopt; + + tls_sw_prot = tls_base_prot; + tls_sw_prot.sendmsg = tls_sw_sendmsg; + tls_sw_prot.sendpage = tls_sw_sendpage; + tls_sw_prot.close = tls_sk_proto_close; tcp_register_ulp(&tcp_tls_ulp_ops); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 5db3b495782f..7d80040a37b6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -103,6 +103,63 @@ static void trim_both_sgl(struct sock *sk, int target_size) target_size); } +static int alloc_sg(struct sock *sk, int len, struct scatterlist *sg, + int *sg_num_elem, unsigned int *sg_size, + int first_coalesce) +{ + struct page_frag *pfrag; + unsigned int size = *sg_size; + int num_elem = *sg_num_elem, use = 0, rc = 0; + struct scatterlist *sge; + unsigned int orig_offset; + + len -= size; + pfrag = sk_page_frag(sk); + + while (len > 0) { + if (!sk_page_frag_refill(sk, pfrag)) { + rc = -ENOMEM; + goto out; + } + + use = min_t(int, len, pfrag->size - pfrag->offset); + + if (!sk_wmem_schedule(sk, use)) { + rc = -ENOMEM; + goto out; + } + + sk_mem_charge(sk, use); + size += use; + orig_offset = pfrag->offset; + pfrag->offset += use; + + sge = sg + num_elem - 1; + if (num_elem > first_coalesce && sg_page(sg) == pfrag->page && + sg->offset + sg->length == orig_offset) { + sg->length += use; + } else { + sge++; + sg_unmark_end(sge); + sg_set_page(sge, pfrag->page, use, orig_offset); + get_page(pfrag->page); + ++num_elem; + if (num_elem == MAX_SKB_FRAGS) { + rc = -ENOSPC; + break; + } + } + + len -= use; + } + goto out; + +out: + *sg_size = size; + *sg_num_elem = num_elem; + return rc; +} + static int alloc_encrypted_sg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); @@ -112,11 +169,6 @@ static int alloc_encrypted_sg(struct sock *sk, int len) rc = alloc_sg(sk, len, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, 0); - rc = sk_alloc_sg(sk, len, - ctx->sg_encrypted_data, 0, - &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size, 0); - return rc; } @@ -126,12 +178,9 @@ static int alloc_plaintext_sg(struct sock *sk, int len) struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); int rc = 0; - rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0, - &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, - tls_ctx->pending_open_record_frags); - - if (rc == -ENOSPC) - ctx->sg_plaintext_num_elem = ARRAY_SIZE(ctx->sg_plaintext_data); + rc = alloc_sg(sk, len, ctx->sg_plaintext_data, + 
&ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, + tls_ctx->pending_open_record_frags); return rc; } @@ -162,12 +211,18 @@ static void tls_free_both_sg(struct sock *sk) } static int tls_do_encryption(struct tls_context *tls_ctx, - struct tls_sw_context *ctx, - struct aead_request *aead_req, - size_t data_len) + struct tls_sw_context *ctx, size_t data_len, + gfp_t flags) { + unsigned int req_size = sizeof(struct aead_request) + + crypto_aead_reqsize(ctx->aead_send); + struct aead_request *aead_req; int rc; + aead_req = kmalloc(req_size, flags); + if (!aead_req) + return -ENOMEM; + ctx->sg_encrypted_data[0].offset += tls_ctx->prepend_size; ctx->sg_encrypted_data[0].length -= tls_ctx->prepend_size; @@ -180,6 +235,7 @@ static int tls_do_encryption(struct tls_context *tls_ctx, ctx->sg_encrypted_data[0].offset -= tls_ctx->prepend_size; ctx->sg_encrypted_data[0].length += tls_ctx->prepend_size; + kfree(aead_req); return rc; } @@ -188,14 +244,8 @@ static int tls_push_record(struct sock *sk, int flags, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); - struct aead_request *req; int rc; - req = kzalloc(sizeof(struct aead_request) + - crypto_aead_reqsize(ctx->aead_send), sk->sk_allocation); - if (!req) - return -ENOMEM; - sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); @@ -211,14 +261,15 @@ static int tls_push_record(struct sock *sk, int flags, tls_ctx->pending_open_record_frags = 0; set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags); - rc = tls_do_encryption(tls_ctx, ctx, req, ctx->sg_plaintext_size); + rc = tls_do_encryption(tls_ctx, ctx, ctx->sg_plaintext_size, + sk->sk_allocation); if (rc < 0) { /* If we are called from write_space and * we fail, we need to set this SOCK_NOSPACE * to trigger another write_space in the future. 
*/ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - goto out_req; + return rc; } free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, @@ -233,8 +284,6 @@ static int tls_push_record(struct sock *sk, int flags, tls_err_abort(sk); tls_advance_record_sn(sk, tls_ctx); -out_req: - kfree(req); return rc; } @@ -332,7 +381,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); - int ret; + int ret = 0; int required_size; long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); bool eor = !(msg->msg_flags & MSG_MORE); @@ -347,8 +396,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) lock_sock(sk); - ret = tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo); - if (ret) + if (tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo)) goto send_end; if (unlikely(msg->msg_controllen)) { @@ -359,7 +407,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) while (msg_data_left(msg)) { if (sk->sk_err) { - ret = -sk->sk_err; + ret = sk->sk_err; goto send_end; } @@ -401,7 +449,7 @@ alloc_encrypted: ret = tls_push_record(sk, msg->msg_flags, record_type); if (!ret) continue; - if (ret < 0) + if (ret == -EAGAIN) goto send_end; copied -= try_to_copy; @@ -484,7 +532,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); - int ret; + int ret = 0; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); bool eor; size_t orig_size = size; @@ -504,8 +552,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - ret = tls_complete_pending_work(sk, tls_ctx, flags, &timeo); - if (ret) + if (tls_complete_pending_work(sk, tls_ctx, flags, &timeo)) goto sendpage_end; /* Call the sk_stream functions to manage the sndbuf mem. 
*/ @@ -513,7 +560,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, size_t copy, required_size; if (sk->sk_err) { - ret = -sk->sk_err; + ret = sk->sk_err; goto sendpage_end; } @@ -592,7 +639,7 @@ sendpage_end: return ret; } -void tls_sw_free_tx_resources(struct sock *sk) +static void tls_sw_free_resources(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); @@ -607,6 +654,7 @@ void tls_sw_free_tx_resources(struct sock *sk) int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) { + char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; struct tls_sw_context *sw_ctx; @@ -631,8 +679,9 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) } ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; + ctx->free_resources = tls_sw_free_resources; - crypto_info = &ctx->crypto_send.info; + crypto_info = &ctx->crypto_send; switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: { nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; @@ -648,17 +697,18 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) } default: rc = -EINVAL; - goto free_priv; + goto out; } ctx->prepend_size = TLS_HEADER_SIZE + nonce_size; ctx->tag_size = tag_size; ctx->overhead_size = ctx->prepend_size + ctx->tag_size; ctx->iv_size = iv_size; - ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, GFP_KERNEL); + ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + GFP_KERNEL); if (!ctx->iv) { rc = -ENOMEM; - goto free_priv; + goto out; } memcpy(ctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); memcpy(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); @@ -697,14 +747,16 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) ctx->push_pending_record = tls_sw_push_pending_record; - rc = crypto_aead_setkey(sw_ctx->aead_send, gcm_128_info->key, + memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); + + rc = crypto_aead_setkey(sw_ctx->aead_send, keyval, TLS_CIPHER_AES_GCM_128_KEY_SIZE); if (rc) goto free_aead; rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tag_size); if (!rc) - return 0; + goto out; free_aead: crypto_free_aead(sw_ctx->aead_send); @@ -715,9 +767,6 @@ free_rec_seq: free_iv: kfree(ctx->iv); ctx->iv = NULL; -free_priv: - kfree(ctx->priv_ctx); - ctx->priv_ctx = NULL; out: return rc; } From a35836f5e0301d4abd336dc431d76263447dee05 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:44 +0200 Subject: [PATCH 1413/1640] UPSTREAM: tls: Use kzalloc for aead_request allocation Use kzalloc for aead_request allocation as we don't set all the bits in the request. Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Ilya Lesokhin Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7d80040a37b6..f00383a37622 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -219,7 +219,7 @@ static int tls_do_encryption(struct tls_context *tls_ctx, struct aead_request *aead_req; int rc; - aead_req = kmalloc(req_size, flags); + aead_req = kzalloc(req_size, flags); if (!aead_req) return -ENOMEM; From 5165d55a1d5fe3cb948f7c01981f7a892bed75d1 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:45 +0200 Subject: [PATCH 1414/1640] UPSTREAM: tls: Add function to update the TLS socket configuration The tx configuration is now stored in ctx->tx_conf, and sk->sk_prot is updated through a function. This will simplify things when we add rx and support for different possible tx and rx cross configurations. Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/net/tls.h | 2 ++ net/tls/tls_main.c | 46 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index b89d397dd62f..f058a6e08eaa 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -83,6 +83,8 @@ struct tls_context { void *priv_ctx; + u8 tx_conf:2; + u16 prepend_size; u16 tag_size; u16 overhead_size; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b6e1363cb048..fa657f2171ed 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -45,8 +45,18 @@ MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); MODULE_LICENSE("Dual BSD/GPL"); -static struct proto tls_base_prot; -static struct proto tls_sw_prot; +enum { + TLS_BASE_TX, + TLS_SW_TX, + TLS_NUM_CONFIG, +}; + +static struct proto tls_prots[TLS_NUM_CONFIG]; + +static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) +{ + sk->sk_prot = &tls_prots[ctx->tx_conf]; +} int wait_on_pending_writer(struct sock *sk, long *timeo) { @@ -340,8 +350,8 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, { struct tls_crypto_info *crypto_info, tmp_crypto_info; struct tls_context *ctx = tls_get_ctx(sk); - struct proto *prot = NULL; int rc = 0; + int tx_conf; if (!optval || (optlen < sizeof(*crypto_info))) { rc = -EINVAL; goto out; } @@ -396,11 +406,12 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, /* currently SW is default, we will have ethtool in future */ rc = tls_set_sw_offload(sk, ctx); - prot = &tls_sw_prot; + tx_conf = TLS_SW_TX; if (rc) goto err_crypto_info; - sk->sk_prot = prot; + ctx->tx_conf = tx_conf; + update_sk_prot(sk, ctx); goto out; err_crypto_info: @@ -453,7 +464,9 @@ static int tls_init(struct sock *sk) icsk->icsk_ulp_data = ctx; ctx->setsockopt = sk->sk_prot->setsockopt; ctx->getsockopt = sk->sk_prot->getsockopt; - sk->sk_prot = &tls_base_prot; + + ctx->tx_conf = TLS_BASE_TX; + update_sk_prot(sk, ctx); out: return rc; } @@ -466,16 +479,21 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .init = tls_init, }; +static void build_protos(struct proto *prot, struct proto *base) +{ + prot[TLS_BASE_TX] = *base; + prot[TLS_BASE_TX].setsockopt = tls_setsockopt; + prot[TLS_BASE_TX].getsockopt = tls_getsockopt; + + prot[TLS_SW_TX] = prot[TLS_BASE_TX]; + prot[TLS_SW_TX].close = tls_sk_proto_close; + prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; + prot[TLS_SW_TX].sendpage = tls_sw_sendpage; +} + static int __init tls_register(void) { - tls_base_prot = tcp_prot; - tls_base_prot.setsockopt = tls_setsockopt; -
tls_base_prot.getsockopt = tls_getsockopt; - - tls_sw_prot = tls_base_prot; - tls_sw_prot.sendmsg = tls_sw_sendmsg; - tls_sw_prot.sendpage = tls_sw_sendpage; - tls_sw_prot.close = tls_sk_proto_close; + build_protos(tls_prots, &tcp_prot); tcp_register_ulp(&tcp_tls_ulp_ops); From 76905b01fe032f73cba51fcdf7e6e94de05907ed Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:46 +0200 Subject: [PATCH 1415/1640] UPSTREAM: tls: Fix TLS ulp context leak, when TLS_TX setsockopt is not used. Previously the TLS ulp context would leak if we attached a TLS ulp to a socket but did not use the TLS_TX setsockopt, or did use it but it failed. This patch solves the issue by overriding prot[TLS_BASE_TX].close and fixing tls_sk_proto_close to work properly when it's called with ctx->tx_conf == TLS_BASE_TX. This patch also removes ctx->free_resources as we can use ctx->tx_conf to obtain the relevant information. Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/net/tls.h | 2 +- net/tls/tls_main.c | 22 ++++++++++++++-------- net/tls/tls_sw.c | 4 ++-- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index f058a6e08eaa..7cb58a6b8fd0 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -99,7 +99,6 @@ struct tls_context { u16 pending_open_record_frags; int (*push_pending_record)(struct sock *sk, int flags); - void (*free_resources)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_proto_close)(struct sock *sk, long timeout); @@ -124,6 +123,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); +void tls_sw_free_tx_resources(struct sock *sk); void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); void tls_icsk_clean_acked(struct sock *sk); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index fa657f2171ed..66fdcb3a13f2 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -226,6 +226,12 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) void (*sk_proto_close)(struct sock *sk, long timeout); lock_sock(sk); + sk_proto_close = ctx->sk_proto_close; + + if (ctx->tx_conf == TLS_BASE_TX) { + kfree(ctx); + goto skip_tx_cleanup; + } if (!tls_complete_pending_work(sk, ctx, 0, &timeo)) tls_handle_open_record(sk, 0); @@ -242,13 +248,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) sg++; } } - ctx->free_resources(sk); + kfree(ctx->rec_seq); kfree(ctx->iv); - sk_proto_close = ctx->sk_proto_close; - kfree(ctx); + if (ctx->tx_conf == TLS_SW_TX) + tls_sw_free_tx_resources(sk); +skip_tx_cleanup: release_sock(sk); sk_proto_close(sk, timeout); } @@ -402,8 +409,6 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, ctx->sk_write_space = sk->sk_write_space; sk->sk_write_space = tls_write_space; - ctx->sk_proto_close = sk->sk_prot->close; - /* currently SW is default, we will have ethtool in future */ rc = tls_set_sw_offload(sk, ctx); tx_conf = TLS_SW_TX; @@ -464,6 +469,7 @@ static int tls_init(struct sock *sk) icsk->icsk_ulp_data = ctx; ctx->setsockopt = sk->sk_prot->setsockopt; ctx->getsockopt = sk->sk_prot->getsockopt; + ctx->sk_proto_close = sk->sk_prot->close; ctx->tx_conf = TLS_BASE_TX; update_sk_prot(sk, ctx); @@ -482,11 +488,11 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { static void build_protos(struct proto *prot,
struct proto *base) { prot[TLS_BASE_TX] = *base; - prot[TLS_BASE_TX].setsockopt = tls_setsockopt; - prot[TLS_BASE_TX].getsockopt = tls_getsockopt; + prot[TLS_BASE_TX].setsockopt = tls_setsockopt; + prot[TLS_BASE_TX].getsockopt = tls_getsockopt; + prot[TLS_BASE_TX].close = tls_sk_proto_close; prot[TLS_SW_TX] = prot[TLS_BASE_TX]; - prot[TLS_SW_TX].close = tls_sk_proto_close; prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; prot[TLS_SW_TX].sendpage = tls_sw_sendpage; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f00383a37622..fcd92a9c2d06 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -639,7 +639,7 @@ sendpage_end: return ret; } -static void tls_sw_free_resources(struct sock *sk) +void tls_sw_free_tx_resources(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); @@ -650,6 +650,7 @@ static void tls_sw_free_resources(struct sock *sk) tls_free_both_sg(sk); kfree(ctx); + kfree(tls_ctx); } int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) @@ -679,7 +680,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) } ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; - ctx->free_resources = tls_sw_free_resources; crypto_info = &ctx->crypto_send; switch (crypto_info->cipher_type) { From b38afa863d2470fd5e72a50a4f42d4d6a59c76f5 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:47 +0200 Subject: [PATCH 1416/1640] UPSTREAM: tls: Move tls_make_aad to header to allow sharing move tls_make_aad as it is going to be reused by the device offload code and rx path. Remove unused recv parameter. Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/net/tls.h | 15 +++++++++++++++ net/tls/tls_sw.c | 18 +----------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 7cb58a6b8fd0..70becd0a9299 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -214,6 +214,21 @@ static inline void tls_fill_prepend(struct tls_context *ctx, ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); } +static inline void tls_make_aad(char *buf, + size_t size, + char *record_sequence, + int record_sequence_size, + unsigned char record_type) +{ + memcpy(buf, record_sequence, record_sequence_size); + + buf[8] = record_type; + buf[9] = TLS_1_2_VERSION_MAJOR; + buf[10] = TLS_1_2_VERSION_MINOR; + buf[11] = size >> 8; + buf[12] = size & 0xFF; +} + static inline struct tls_context *tls_get_ctx(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index fcd92a9c2d06..73d19210dd49 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -39,22 +39,6 @@ #include -static inline void tls_make_aad(int recv, - char *buf, - size_t size, - char *record_sequence, - int record_sequence_size, - unsigned char record_type) -{ - memcpy(buf, record_sequence, record_sequence_size); - - buf[8] = record_type; - buf[9] = TLS_1_2_VERSION_MAJOR; - buf[10] = TLS_1_2_VERSION_MINOR; - buf[11] = size >> 8; - buf[12] = size & 0xFF; -} - static void trim_sg(struct sock *sk, struct scatterlist *sg, int *sg_num_elem, unsigned int *sg_size, int target_size) { @@ -249,7 +233,7 @@ static int tls_push_record(struct sock *sk, int flags, sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); - tls_make_aad(0, ctx->aad_space, ctx->sg_plaintext_size, + tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size, tls_ctx->rec_seq, 
tls_ctx->rec_seq_size, record_type); From 1a2c4f1f3a6e6288431f85e36bbd7fd45732c270 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:48 +0200 Subject: [PATCH 1417/1640] UPSTREAM: tls: Avoid copying crypto_info again after cipher_type check. Avoid copying crypto_info again after the cipher_type check to avoid TOCTOU exploits. The temporary array on the stack is removed as we don't really need it. Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- net/tls/tls_main.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 66fdcb3a13f2..b2188529bc94 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -355,7 +355,7 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, unsigned int optlen) { - struct tls_crypto_info *crypto_info, tmp_crypto_info; + struct tls_crypto_info *crypto_info; struct tls_context *ctx = tls_get_ctx(sk); int rc = 0; int tx_conf; @@ -365,36 +365,31 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, goto out; } - rc = copy_from_user(&tmp_crypto_info, optval, sizeof(*crypto_info)); + crypto_info = &ctx->crypto_send; + /* Currently we don't support set crypto info more than one time */ + if (TLS_CRYPTO_INFO_READY(crypto_info)) + goto out; + + rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto out; } /* check version */ - if (tmp_crypto_info.version != TLS_1_2_VERSION) { + if (crypto_info->version != TLS_1_2_VERSION) { rc = -ENOTSUPP; - goto out; + goto err_crypto_info; } - /* get user crypto info */ - crypto_info = &ctx->crypto_send; - - /* Currently we don't support set crypto info more than one time */ - if (TLS_CRYPTO_INFO_READY(crypto_info)) - goto out; - - switch (tmp_crypto_info.cipher_type) { + switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: { if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) { rc = -EINVAL; goto out; } - rc = copy_from_user( - crypto_info, - optval, - sizeof(struct tls12_crypto_info_aes_gcm_128)); - + rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), + optlen - sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto err_crypto_info; From 818dd4d1919e611ffaf24ac1b46769f2b67ae896 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:49 +0200 Subject: [PATCH 1418/1640] UPSTREAM: tls: don't override sk_write_space if tls_set_sw_offload fails. If we fail to enable tls in the kernel, we shouldn't override the sk_write_space callback. Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Ilya Lesokhin Signed-off-by: David S.
Miller --- net/tls/tls_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b2188529bc94..eb605aa3b5ac 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -401,9 +401,6 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, goto out; } - ctx->sk_write_space = sk->sk_write_space; - sk->sk_write_space = tls_write_space; - /* currently SW is default, we will have ethtool in future */ rc = tls_set_sw_offload(sk, ctx); tx_conf = TLS_SW_TX; @@ -412,6 +409,8 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, ctx->tx_conf = tx_conf; update_sk_prot(sk, ctx); + ctx->sk_write_space = sk->sk_write_space; + sk->sk_write_space = tls_write_space; goto out; err_crypto_info: From bafa3252a714f80336b6c28d2a10636d08bffb3b Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 14 Nov 2017 06:30:11 +0300 Subject: [PATCH 1419/1640] UPSTREAM: uapi: fix linux/tls.h userspace compilation error Move inclusion of a private kernel header from uapi/linux/tls.h to its only user - net/tls.h, to fix the following linux/tls.h userspace compilation error: /usr/include/linux/tls.h:41:21: fatal error: net/tcp.h: No such file or directory As up to this point uapi/linux/tls.h was totally unusable for userspace, clean up this header file further by moving other redundant includes to net/tls.h. Fixes: 3c4d7559159b ("tls: kernel TLS support") Cc: # v4.13+ Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/net/tls.h | 4 ++++ include/uapi/linux/tls.h | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 70becd0a9299..936cfc5cab7d 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -35,6 +35,10 @@ #define _TLS_OFFLOAD_H #include +#include +#include +#include +#include #include diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index d5e0682ab837..293b2cdad88d 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -35,10 +35,6 @@ #define _UAPI_LINUX_TLS_H #include -#include -#include -#include -#include /* TLS socket options */ #define TLS_TX 1 /* Set transmit parameters */ From be1ea7d7f1d9746a893ae2cdc16951b5708d149f Mon Sep 17 00:00:00 2001 From: "r.hering@avm.de" Date: Fri, 12 Jan 2018 15:42:06 +0100 Subject: [PATCH 1420/1640] UPSTREAM: net/tls: Fix inverted error codes to avoid endless loop sendfile() calls can hang endlessly when using kernel TLS if a socket error occurs. Socket error codes must be inverted by kernel TLS before returning because they are stored with positive sign. If returned non-inverted, they are interpreted as the number of bytes sent, causing endless looping of the splice mechanism behind sendfile(). Signed-off-by: Robert Hering Signed-off-by: David S.
Miller --- include/net/tls.h | 2 +- net/tls/tls_sw.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 936cfc5cab7d..9185e53a743c 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -170,7 +170,7 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) static inline void tls_err_abort(struct sock *sk) { - sk->sk_err = -EBADMSG; + sk->sk_err = EBADMSG; sk->sk_error_report(sk); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 73d19210dd49..9773571b6a34 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -391,7 +391,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) while (msg_data_left(msg)) { if (sk->sk_err) { - ret = sk->sk_err; + ret = -sk->sk_err; goto send_end; } @@ -544,7 +544,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, size_t copy, required_size; if (sk->sk_err) { - ret = sk->sk_err; + ret = -sk->sk_err; goto sendpage_end; } From 00592312f88339af8a77146290635ba133944cb6 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Tue, 16 Jan 2018 15:31:52 +0200 Subject: [PATCH 1421/1640] UPSTREAM: net/tls: Only attach to sockets in ESTABLISHED state Calling accept on a TCP socket with a TLS ulp attached results in two sockets that share the same ulp context. The ulp context is freed when a socket is destroyed, so after one of the sockets is released, the second one will trigger a use after free when it tries to access the ulp context attached to it. We restrict the TLS ulp to sockets in ESTABLISHED state to prevent the scenario above. Fixes: 3c4d7559159b ("tls: kernel TLS support") Reported-by: syzbot+904e7cd6c5c741609228@syzkaller.appspotmail.com Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- net/tls/tls_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index eb605aa3b5ac..f13afb4a51cc 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -454,6 +454,15 @@ static int tls_init(struct sock *sk) struct tls_context *ctx; int rc = 0; + /* The TLS ulp is currently supported only for TCP sockets + * in ESTABLISHED state. + * Supporting sockets in LISTEN state will require us + * to modify the accept implementation to clone rather then + * share the ulp context. + */ + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTSUPP; + /* allocate tls context */ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { From 3e4982d3dff45dab78fdacc975dffc2a1788e1f1 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 16 Jan 2018 16:04:26 +0100 Subject: [PATCH 1422/1640] UPSTREAM: tls: fix sw_ctx leak During setsockopt(SOL_TCP, TLS_TX), if initialization of the software context fails in tls_set_sw_offload(), we leak sw_ctx. We also don't reassign ctx->priv_ctx to NULL, so we can't even do another attempt to set it up on the same socket, as it will fail with -EEXIST. Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Sabrina Dubroca Signed-off-by: David S.
Miller --- net/tls/tls_sw.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9773571b6a34..61f394d369bf 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -681,18 +681,17 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) } default: rc = -EINVAL; - goto out; + goto free_priv; } ctx->prepend_size = TLS_HEADER_SIZE + nonce_size; ctx->tag_size = tag_size; ctx->overhead_size = ctx->prepend_size + ctx->tag_size; ctx->iv_size = iv_size; - ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - GFP_KERNEL); + ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, GFP_KERNEL); if (!ctx->iv) { rc = -ENOMEM; - goto out; + goto free_priv; } memcpy(ctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); memcpy(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); @@ -740,7 +739,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tag_size); if (!rc) - goto out; + return 0; free_aead: crypto_free_aead(sw_ctx->aead_send); @@ -751,6 +750,9 @@ free_rec_seq: free_iv: kfree(ctx->iv); ctx->iv = NULL; +free_priv: + kfree(ctx->priv_ctx); + ctx->priv_ctx = NULL; out: return rc; } From 67d94d905751594db96128dabddc7414d48b00ad Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 16 Jan 2018 16:04:27 +0100 Subject: [PATCH 1423/1640] UPSTREAM: tls: return -EBUSY if crypto_info is already set do_tls_setsockopt_tx returns 0 without doing anything when crypto_info is already set. Silent failure is confusing for users. Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/tls/tls_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index f13afb4a51cc..7bc11c6e01d9 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -367,8 +367,10 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, crypto_info = &ctx->crypto_send; /* Currently we don't support set crypto info more than one time */ - if (TLS_CRYPTO_INFO_READY(crypto_info)) + if (TLS_CRYPTO_INFO_READY(crypto_info)) { + rc = -EBUSY; goto out; + } rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); if (rc) { From d381d7e791a0a32374b6e94e726b7347e4398f0c Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 16 Jan 2018 16:04:28 +0100 Subject: [PATCH 1424/1640] UPSTREAM: tls: reset crypto_info when do_tls_setsockopt_tx fails The current code copies directly from userspace to ctx->crypto_send, but doesn't always reinitialize it to 0 on failure. This causes any subsequent attempt to use this setsockopt to fail because of the TLS_CRYPTO_INFO_READY check, even though crypto_info is not actually ready. This should result in a correctly set up socket after the 3rd call, but currently it does not: size_t s = sizeof(struct tls12_crypto_info_aes_gcm_128); struct tls12_crypto_info_aes_gcm_128 crypto_good = { .info.version = TLS_1_2_VERSION, .info.cipher_type = TLS_CIPHER_AES_GCM_128, }; struct tls12_crypto_info_aes_gcm_128 crypto_bad_type = crypto_good; crypto_bad_type.info.cipher_type = 42; setsockopt(sock, SOL_TLS, TLS_TX, &crypto_bad_type, s); setsockopt(sock, SOL_TLS, TLS_TX, &crypto_good, s - 1); setsockopt(sock, SOL_TLS, TLS_TX, &crypto_good, s); Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Sabrina Dubroca Signed-off-by: David S.
Miller --- net/tls/tls_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 7bc11c6e01d9..1596f494e364 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -388,7 +388,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, case TLS_CIPHER_AES_GCM_128: { if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) { rc = -EINVAL; - goto out; + goto err_crypto_info; } rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), optlen - sizeof(*crypto_info)); @@ -400,7 +400,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, } default: rc = -EINVAL; - goto out; + goto err_crypto_info; } /* currently SW is default, we will have ethtool in future */ From ac968ac2b6f4d8720be83f31379e75b318ac4abb Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Fri, 19 Jan 2018 12:30:13 -0800 Subject: [PATCH 1425/1640] UPSTREAM: tls: Correct length of scatterlist in tls_sw_sendpage The scatterlist is reused by both sendmsg and sendfile. If a sendmsg of a smaller number of pages is followed by a sendfile of a larger number of pages, the scatterlist may be too short, resulting in a crash in gcm_encrypt. Add sg_unmark_end to make the list the correct length. tls_sw_sendmsg already calls sg_unmark_end correctly when it allocates memory in alloc_sg, or in zerocopy_from_iter. Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 61f394d369bf..0a9b72fbd761 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -577,6 +577,8 @@ alloc_payload: get_page(page); sg = ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem; sg_set_page(sg, page, copy, offset); + sg_unmark_end(sg); + ctx->sg_plaintext_num_elem++; sk_mem_charge(sk, copy); From 6b730c7f8fefac0be1a81a2c87b85fecbcaa2ada Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Wed, 31 Jan 2018 21:34:37 +0530 Subject: [PATCH 1426/1640] UPSTREAM: tls: Add support for encryption using async offload accelerator Async crypto accelerators (e.g. drivers/crypto/caam) support offloading the GCM operation. If they are enabled, crypto_aead_encrypt() returns the error code -EINPROGRESS. In this case tls_do_encryption() needs to wait on a completion until the response for the crypto offload request is received. Signed-off-by: Vakul Garg Signed-off-by: David S.
--- include/net/tls.h | 2 ++ net/tls/tls_sw.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/net/tls.h b/include/net/tls.h index 9185e53a743c..4913430ab807 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -57,6 +58,7 @@ struct tls_sw_context { struct crypto_aead *aead_send; + struct crypto_wait async_wait; /* Sending context */ char aad_space[TLS_AAD_SPACE_SIZE]; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0a9b72fbd761..f26376e954ae 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -214,7 +214,11 @@ static int tls_do_encryption(struct tls_context *tls_ctx, aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out, data_len, tls_ctx->iv); - rc = crypto_aead_encrypt(aead_req); + + aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &ctx->async_wait); + + rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait); ctx->sg_encrypted_data[0].offset -= tls_ctx->prepend_size; ctx->sg_encrypted_data[0].length += tls_ctx->prepend_size; @@ -665,6 +669,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) goto out; } + crypto_init_wait(&sw_ctx->async_wait); + ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; crypto_info = &ctx->crypto_send; From ae1f6c5cbba89124d47318c321b17c0f2b551e68 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Wed, 14 Feb 2018 10:46:06 +0200 Subject: [PATCH 1427/1640] UPSTREAM: tls: return the correct IV in getsockopt Current code returns four bytes of salt followed by four bytes of IV. This patch returns all eight bytes of IV. Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- net/tls/tls_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 1596f494e364..6181c2ffc95d 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -308,7 +308,8 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, goto out; } lock_sock(sk); - memcpy(crypto_info_aes_gcm_128->iv, ctx->iv, + memcpy(crypto_info_aes_gcm_128->iv, + ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); release_sock(sk); if (copy_to_user(optval, From fd1084ef2f5c8f8469616e54472cd1c2f914bb60 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Wed, 14 Feb 2018 10:46:07 +0200 Subject: [PATCH 1428/1640] UPSTREAM: tls: reset the crypto info if copy_from_user fails copy_from_user could copy partial information; as a result, TLS_CRYPTO_INFO_READY(crypto_info) could be true while crypto_info is using uninitialized data. This patch resets crypto_info when copy_from_user fails. Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller
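The hazard here is that copy_from_user() only reports how many bytes were left uncopied, so a fault can leave the destination partially written. A sketch of the defensive pattern the diff below applies, with the socket-option plumbing elided and the helper name purely illustrative:

#include <linux/string.h>
#include <linux/uaccess.h>
#include <net/tls.h>

static int fetch_crypto_info(struct tls_crypto_info *dst,
                             const void __user *optval)
{
        if (copy_from_user(dst, optval, sizeof(*dst))) {
                /* Wipe the partial copy so a readiness check such as
                 * TLS_CRYPTO_INFO_READY() cannot match stale garbage. */
                memset(dst, 0, sizeof(*dst));
                return -EFAULT;
        }
        return 0;
}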
--- net/tls/tls_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 6181c2ffc95d..6fd628f8437d 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -376,7 +376,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); if (rc) { rc = -EFAULT; - goto out; + goto err_crypto_info; } /* check version */ From 68c2d38bf0eef7b3e91f5b9d6b988cfae16c18a8 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Wed, 14 Feb 2018 10:46:08 +0200 Subject: [PATCH 1429/1640] UPSTREAM: tls: getsockopt return record sequence number Return the TLS record sequence number in getsockopt. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- net/tls/tls_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 6fd628f8437d..feef12b333b8 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -311,6 +311,8 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, memcpy(crypto_info_aes_gcm_128->iv, ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); + memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->rec_seq, + TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE); release_sock(sk); if (copy_to_user(optval, crypto_info_aes_gcm_128, From ef10ea195dde02a810f3ac1ce7527bf6d63ddaa9 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Tue, 27 Feb 2018 14:18:39 +0200 Subject: [PATCH 1430/1640] UPSTREAM: tls: Use correct sk->sk_prot for IPV6 The tls ulp overrides sk->sk_prot with new tls-specific proto structs. The tls-specific structs were previously based on the ipv4-specific tcp_prot struct. As a result, attaching the tls ulp to an ipv6 tcp socket replaced some ipv6 callbacks with the ipv4 equivalents. This patch adds ipv6 tls proto structs and uses them when attached to ipv6 sockets. Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Boris Pismenny Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- net/tls/tls_main.c | 52 +++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index feef12b333b8..fc47921edf42 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -45,17 +45,27 @@ MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); MODULE_LICENSE("Dual BSD/GPL"); +enum { + TLSV4, + TLSV6, + TLS_NUM_PROTS, +}; + enum { TLS_BASE_TX, TLS_SW_TX, TLS_NUM_CONFIG, }; -static struct proto tls_prots[TLS_NUM_CONFIG]; +static struct proto *saved_tcpv6_prot; +static DEFINE_MUTEX(tcpv6_prot_mutex); +static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG]; static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) { - sk->sk_prot = &tls_prots[ctx->tx_conf]; + int ip_ver = sk->sk_family == AF_INET6 ?
TLSV6 : TLSV4; + + sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf]; } int wait_on_pending_writer(struct sock *sk, long *timeo) @@ -453,8 +463,21 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, return do_tls_setsockopt(sk, optname, optval, optlen); } +static void build_protos(struct proto *prot, struct proto *base) +{ + prot[TLS_BASE_TX] = *base; + prot[TLS_BASE_TX].setsockopt = tls_setsockopt; + prot[TLS_BASE_TX].getsockopt = tls_getsockopt; + prot[TLS_BASE_TX].close = tls_sk_proto_close; + + prot[TLS_SW_TX] = prot[TLS_BASE_TX]; + prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; + prot[TLS_SW_TX].sendpage = tls_sw_sendpage; +} + static int tls_init(struct sock *sk) { + int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; int rc = 0; @@ -479,6 +502,17 @@ static int tls_init(struct sock *sk) ctx->getsockopt = sk->sk_prot->getsockopt; ctx->sk_proto_close = sk->sk_prot->close; + /* Build IPv6 TLS whenever the address of tcpv6_prot changes */ + if (ip_ver == TLSV6 && + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { + mutex_lock(&tcpv6_prot_mutex); + if (likely(sk->sk_prot != saved_tcpv6_prot)) { + build_protos(tls_prots[TLSV6], sk->sk_prot); + smp_store_release(&saved_tcpv6_prot, sk->sk_prot); + } + mutex_unlock(&tcpv6_prot_mutex); + } + ctx->tx_conf = TLS_BASE_TX; update_sk_prot(sk, ctx); out: @@ -493,21 +527,9 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .init = tls_init, }; -static void build_protos(struct proto *prot, struct proto *base) -{ - prot[TLS_BASE_TX] = *base; - prot[TLS_BASE_TX].setsockopt = tls_setsockopt; - prot[TLS_BASE_TX].getsockopt = tls_getsockopt; - prot[TLS_BASE_TX].close = tls_sk_proto_close; - - prot[TLS_SW_TX] = prot[TLS_BASE_TX]; - prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; - prot[TLS_SW_TX].sendpage = tls_sw_sendpage; -} - static int __init tls_register(void) { - build_protos(tls_prots, &tcp_prot); + build_protos(tls_prots[TLSV4], &tcp_prot); tcp_register_ulp(&tcp_tls_ulp_ops); From 78187fd3145462b60d701ebccd9cf37f7919e918 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:56:49 -0700 Subject: [PATCH 1431/1640] UPSTREAM: sock: make static tls function alloc_sg generic sock helper The TLS ULP module builds scatterlists from a sock using page_frag_refill(). This is going to be useful for other ULPs so move it into sock file for more general use. In the process remove useless goto at end of while loop. Signed-off-by: John Fastabend Acked-by: David S. 
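A caller's-eye sketch of the helper as exported by the diff below, before the ring generalization in the following patch changes its signature; the wrapper name and bookkeeping arguments are illustrative, not from the patch:

#include <linux/scatterlist.h>
#include <net/sock.h>

static int grow_plaintext(struct sock *sk, struct scatterlist *sg,
                          int *nelem, unsigned int *size, int want)
{
        /* sk_alloc_sg() refills the socket's page_frag, charges the pages
         * to sk, coalesces contiguous chunks into the previous entry, and
         * returns -ENOSPC once all MAX_SKB_FRAGS entries are in use. */
        return sk_alloc_sg(sk, want, sg, nelem, size, 0);
}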
Miller Signed-off-by: Daniel Borkmann --- include/net/sock.h | 4 +++ net/core/sock.c | 56 +++++++++++++++++++++++++++++++++++++ net/tls/tls_sw.c | 69 +++++----------------------------------------- 3 files changed, 67 insertions(+), 62 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index bf4234303856..ce08e52bc586 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2234,6 +2234,10 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); +int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, + int *sg_num_elem, unsigned int *sg_size, + int first_coalesce); + /* * Default write policy as shown to user space via poll/select/SIGIO */ diff --git a/net/core/sock.c b/net/core/sock.c index 342113238216..fb05e0ab5c60 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2429,6 +2429,62 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); +int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, + int *sg_num_elem, unsigned int *sg_size, + int first_coalesce) +{ + struct page_frag *pfrag; + unsigned int size = *sg_size; + int num_elem = *sg_num_elem, use = 0, rc = 0; + struct scatterlist *sge; + unsigned int orig_offset; + + len -= size; + pfrag = sk_page_frag(sk); + + while (len > 0) { + if (!sk_page_frag_refill(sk, pfrag)) { + rc = -ENOMEM; + goto out; + } + + use = min_t(int, len, pfrag->size - pfrag->offset); + + if (!sk_wmem_schedule(sk, use)) { + rc = -ENOMEM; + goto out; + } + + sk_mem_charge(sk, use); + size += use; + orig_offset = pfrag->offset; + pfrag->offset += use; + + sge = sg + num_elem - 1; + if (num_elem > first_coalesce && sg_page(sg) == pfrag->page && + sg->offset + sg->length == orig_offset) { + sg->length += use; + } else { + sge++; + sg_unmark_end(sge); + sg_set_page(sge, pfrag->page, use, orig_offset); + get_page(pfrag->page); + ++num_elem; + if (num_elem == MAX_SKB_FRAGS) { + rc = -ENOSPC; + break; + } + } + + len -= use; + } +out: + *sg_size = size; + *sg_num_elem = num_elem; + return rc; +} +EXPORT_SYMBOL(sk_alloc_sg); + static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f26376e954ae..0fc8a24c6473 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -87,71 +87,16 @@ static void trim_both_sgl(struct sock *sk, int target_size) target_size); } -static int alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, - int first_coalesce) -{ - struct page_frag *pfrag; - unsigned int size = *sg_size; - int num_elem = *sg_num_elem, use = 0, rc = 0; - struct scatterlist *sge; - unsigned int orig_offset; - - len -= size; - pfrag = sk_page_frag(sk); - - while (len > 0) { - if (!sk_page_frag_refill(sk, pfrag)) { - rc = -ENOMEM; - goto out; - } - - use = min_t(int, len, pfrag->size - pfrag->offset); - - if (!sk_wmem_schedule(sk, use)) { - rc = -ENOMEM; - goto out; - } - - sk_mem_charge(sk, use); - size += use; - orig_offset = pfrag->offset; - pfrag->offset += use; - - sge = sg + num_elem - 1; - if (num_elem > first_coalesce && sg_page(sg) == pfrag->page && - sg->offset + sg->length == orig_offset) { - sg->length += use; - } else { - sge++; - sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); - ++num_elem; - if (num_elem == MAX_SKB_FRAGS) { - rc = -ENOSPC; - break; - } - } - - len -= use; - } - goto out; - -out: - 
*sg_size = size; - *sg_num_elem = num_elem; - return rc; -} - static int alloc_encrypted_sg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); int rc = 0; - rc = alloc_sg(sk, len, ctx->sg_encrypted_data, - &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, 0); + rc = sk_alloc_sg(sk, len, + ctx->sg_encrypted_data, + &ctx->sg_encrypted_num_elem, + &ctx->sg_encrypted_size, 0); return rc; } @@ -162,9 +107,9 @@ static int alloc_plaintext_sg(struct sock *sk, int len) struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); int rc = 0; - rc = alloc_sg(sk, len, ctx->sg_plaintext_data, - &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, - tls_ctx->pending_open_record_frags); + rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, + &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, + tls_ctx->pending_open_record_frags); return rc; } From 272864ba69beaf2f515c85b346c58db7bdfefded Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:05 -0700 Subject: [PATCH 1432/1640] UPSTREAM: net: generalize sk_alloc_sg to work with scatterlist rings The current implementation of sk_alloc_sg expects scatterlist to always start at entry 0 and complete at entry MAX_SKB_FRAGS. Future patches will want to support starting at arbitrary offset into scatterlist so add an additional sg_start parameters and then default to the current values in TLS code paths. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/sock.h | 2 +- net/core/sock.c | 27 ++++++++++++++++----------- net/tls/tls_sw.c | 4 ++-- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index ce08e52bc586..9f58e77fe19a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2235,7 +2235,7 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, + int sg_start, int *sg_curr, unsigned int *sg_size, int first_coalesce); /* diff --git a/net/core/sock.c b/net/core/sock.c index fb05e0ab5c60..ddf4f8277f96 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2430,19 +2430,20 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) EXPORT_SYMBOL(sk_page_frag_refill); int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, + int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, int first_coalesce) { + int sg_curr = *sg_curr_index, use = 0, rc = 0; + unsigned int size = *sg_curr_size; struct page_frag *pfrag; - unsigned int size = *sg_size; - int num_elem = *sg_num_elem, use = 0, rc = 0; struct scatterlist *sge; - unsigned int orig_offset; len -= size; pfrag = sk_page_frag(sk); while (len > 0) { + unsigned int orig_offset; + if (!sk_page_frag_refill(sk, pfrag)) { rc = -ENOMEM; goto out; @@ -2460,17 +2461,21 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, orig_offset = pfrag->offset; pfrag->offset += use; - sge = sg + num_elem - 1; - if (num_elem > first_coalesce && sg_page(sg) == pfrag->page && + sge = sg + sg_curr - 1; + if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page && sg->offset + sg->length == orig_offset) { sg->length += use; } else { - sge++; + sge = sg + sg_curr; sg_unmark_end(sge); sg_set_page(sge, pfrag->page, use, orig_offset); get_page(pfrag->page); - ++num_elem; - if 
(num_elem == MAX_SKB_FRAGS) { + sg_curr++; + + if (sg_curr == MAX_SKB_FRAGS) + sg_curr = 0; + + if (sg_curr == sg_start) { rc = -ENOSPC; break; } @@ -2479,8 +2484,8 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, len -= use; } out: - *sg_size = size; - *sg_num_elem = num_elem; + *sg_curr_size = size; + *sg_curr_index = sg_curr; return rc; } EXPORT_SYMBOL(sk_alloc_sg); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0fc8a24c6473..057a558ed6d7 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -94,7 +94,7 @@ static int alloc_encrypted_sg(struct sock *sk, int len) int rc = 0; rc = sk_alloc_sg(sk, len, - ctx->sg_encrypted_data, + ctx->sg_encrypted_data, 0, &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, 0); @@ -107,7 +107,7 @@ static int alloc_plaintext_sg(struct sock *sk, int len) struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); int rc = 0; - rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, + rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0, &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, tls_ctx->pending_open_record_frags); From e98babba2f85836ec82b0a8939e8b33fa8184a92 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 22 Mar 2018 10:09:53 -0700 Subject: [PATCH 1433/1640] UPSTREAM: tls: Generalize zerocopy_from_iter Refactor zerocopy_from_iter to take arguments for pages and size, such that it can be used for both tx and rx. RX will also support zerocopy direct to output iter, as long as the full message can be copied at once (a large enough userspace buffer was provided). Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 057a558ed6d7..ca1d20de3d2c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -226,23 +226,24 @@ static int tls_sw_push_pending_record(struct sock *sk, int flags) } static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, - int length) + int length, int *pages_used, + unsigned int *size_used, + struct scatterlist *to, int to_max_pages, + bool charge) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); struct page *pages[MAX_SKB_FRAGS]; size_t offset; ssize_t copied, use; int i = 0; - unsigned int size = ctx->sg_plaintext_size; - int num_elem = ctx->sg_plaintext_num_elem; + unsigned int size = *size_used; + int num_elem = *pages_used; int rc = 0; int maxpages; while (length > 0) { i = 0; - maxpages = ARRAY_SIZE(ctx->sg_plaintext_data) - num_elem; + maxpages = to_max_pages - num_elem; if (maxpages == 0) { rc = -EFAULT; goto out; @@ -262,10 +263,11 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, while (copied) { use = min_t(int, copied, PAGE_SIZE - offset); - sg_set_page(&ctx->sg_plaintext_data[num_elem], + sg_set_page(&to[num_elem], pages[i], use, offset); - sg_unmark_end(&ctx->sg_plaintext_data[num_elem]); - sk_mem_charge(sk, use); + sg_unmark_end(&to[num_elem]); + if (charge) + sk_mem_charge(sk, use); offset = 0; copied -= use; @@ -276,8 +278,9 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, } out: - ctx->sg_plaintext_size = size; - ctx->sg_plaintext_num_elem = num_elem; + *size_used = size; + *pages_used = num_elem; + return rc; } @@ -374,7 +377,11 @@ alloc_encrypted: if (full_record || eor) { ret = zerocopy_from_iter(sk, &msg->msg_iter, - try_to_copy); + try_to_copy, &ctx->sg_plaintext_num_elem, + &ctx->sg_plaintext_size, + 
ctx->sg_plaintext_data, + ARRAY_SIZE(ctx->sg_plaintext_data), + true); if (ret) goto fallback_to_reg_send; From 270cc13079620fca2c2f576a15dbc3bb71cb4c6d Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 22 Mar 2018 10:10:06 -0700 Subject: [PATCH 1434/1640] UPSTREAM: tls: Move cipher info to a separate struct Separate tx crypto parameters to a separate cipher_context struct. The same parameters will be used for rx using the same struct. tls_advance_record_sn is modified to only take the cipher info. Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 26 ++++++++++++--------- net/tls/tls_main.c | 8 +++---- net/tls/tls_sw.c | 58 ++++++++++++++++++++++++---------------------- 3 files changed, 49 insertions(+), 43 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 4913430ab807..019e52db1817 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -81,6 +81,16 @@ enum { TLS_PENDING_CLOSED_RECORD }; +struct cipher_context { + u16 prepend_size; + u16 tag_size; + u16 overhead_size; + u16 iv_size; + char *iv; + u16 rec_seq_size; + char *rec_seq; +}; + struct tls_context { union { struct tls_crypto_info crypto_send; @@ -91,13 +101,7 @@ struct tls_context { u8 tx_conf:2; - u16 prepend_size; - u16 tag_size; - u16 overhead_size; - u16 iv_size; - char *iv; - u16 rec_seq_size; - char *rec_seq; + struct cipher_context tx; struct scatterlist *partially_sent_record; u16 partially_sent_offset; @@ -190,7 +194,7 @@ static inline bool tls_bigint_increment(unsigned char *seq, int len) } static inline void tls_advance_record_sn(struct sock *sk, - struct tls_context *ctx) + struct cipher_context *ctx) { if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size)) tls_err_abort(sk); @@ -203,9 +207,9 @@ static inline void tls_fill_prepend(struct tls_context *ctx, size_t plaintext_len, unsigned char record_type) { - size_t pkt_len, iv_size = ctx->iv_size; + size_t pkt_len, iv_size = ctx->tx.iv_size; - pkt_len = plaintext_len + iv_size + ctx->tag_size; + pkt_len = plaintext_len + iv_size + ctx->tx.tag_size; /* we cover nonce explicit here as well, so buf should be of * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE @@ -217,7 +221,7 @@ static inline void tls_fill_prepend(struct tls_context *ctx, buf[3] = pkt_len >> 8; buf[4] = pkt_len & 0xFF; memcpy(buf + TLS_NONCE_OFFSET, - ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); + ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); } static inline void tls_make_aad(char *buf, diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index fc47921edf42..1019c0329534 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -259,8 +259,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) } } - kfree(ctx->rec_seq); - kfree(ctx->iv); + kfree(ctx->tx.rec_seq); + kfree(ctx->tx.iv); if (ctx->tx_conf == TLS_SW_TX) tls_sw_free_tx_resources(sk); @@ -319,9 +319,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, } lock_sock(sk); memcpy(crypto_info_aes_gcm_128->iv, - ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); - memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->rec_seq, + memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->tx.rec_seq, TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE); release_sock(sk); if (copy_to_user(optval, diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index ca1d20de3d2c..338d743bcc21 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -79,7 +79,7 @@ static void trim_both_sgl(struct sock *sk, int 
target_size) target_size); if (target_size > 0) - target_size += tls_ctx->overhead_size; + target_size += tls_ctx->tx.overhead_size; trim_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem, @@ -152,21 +152,21 @@ static int tls_do_encryption(struct tls_context *tls_ctx, if (!aead_req) return -ENOMEM; - ctx->sg_encrypted_data[0].offset += tls_ctx->prepend_size; - ctx->sg_encrypted_data[0].length -= tls_ctx->prepend_size; + ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; + ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out, - data_len, tls_ctx->iv); + data_len, tls_ctx->tx.iv); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &ctx->async_wait); rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait); - ctx->sg_encrypted_data[0].offset -= tls_ctx->prepend_size; - ctx->sg_encrypted_data[0].length += tls_ctx->prepend_size; + ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; + ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; kfree(aead_req); return rc; @@ -183,7 +183,7 @@ static int tls_push_record(struct sock *sk, int flags, sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size, - tls_ctx->rec_seq, tls_ctx->rec_seq_size, + tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, record_type); tls_fill_prepend(tls_ctx, @@ -216,7 +216,7 @@ static int tls_push_record(struct sock *sk, int flags, if (rc < 0 && rc != -EAGAIN) tls_err_abort(sk); - tls_advance_record_sn(sk, tls_ctx); + tls_advance_record_sn(sk, &tls_ctx->tx); return rc; } @@ -357,7 +357,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } required_size = ctx->sg_plaintext_size + try_to_copy + - tls_ctx->overhead_size; + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -420,7 +420,7 @@ alloc_plaintext: &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, ctx->sg_plaintext_size + - tls_ctx->overhead_size); + tls_ctx->tx.overhead_size); } ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy); @@ -512,7 +512,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, full_record = true; } required_size = ctx->sg_plaintext_size + copy + - tls_ctx->overhead_size; + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -644,24 +644,26 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) goto free_priv; } - ctx->prepend_size = TLS_HEADER_SIZE + nonce_size; - ctx->tag_size = tag_size; - ctx->overhead_size = ctx->prepend_size + ctx->tag_size; - ctx->iv_size = iv_size; - ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, GFP_KERNEL); - if (!ctx->iv) { + ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size; + ctx->tx.tag_size = tag_size; + ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size; + ctx->tx.iv_size = iv_size; + ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + GFP_KERNEL); + if (!ctx->tx.iv) { rc = -ENOMEM; goto free_priv; } - memcpy(ctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); - memcpy(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); - ctx->rec_seq_size = rec_seq_size; - ctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); - if (!ctx->rec_seq) { + memcpy(ctx->tx.iv, gcm_128_info->salt, + TLS_CIPHER_AES_GCM_128_SALT_SIZE); + memcpy(ctx->tx.iv + 
TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); + ctx->tx.rec_seq_size = rec_seq_size; + ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + if (!ctx->tx.rec_seq) { rc = -ENOMEM; goto free_iv; } - memcpy(ctx->rec_seq, rec_seq, rec_seq_size); + memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size); sg_init_table(sw_ctx->sg_encrypted_data, ARRAY_SIZE(sw_ctx->sg_encrypted_data)); @@ -697,7 +699,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) if (rc) goto free_aead; - rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tag_size); + rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tx.tag_size); if (!rc) return 0; @@ -705,11 +707,11 @@ free_aead: crypto_free_aead(sw_ctx->aead_send); sw_ctx->aead_send = NULL; free_rec_seq: - kfree(ctx->rec_seq); - ctx->rec_seq = NULL; + kfree(ctx->tx.rec_seq); + ctx->tx.rec_seq = NULL; free_iv: - kfree(ctx->iv); - ctx->iv = NULL; + kfree(ctx->tx.iv); + ctx->tx.iv = NULL; free_priv: kfree(ctx->priv_ctx); ctx->priv_ctx = NULL; From 05261ad1577ba1834680f39a2ff7becbb717df16 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 22 Mar 2018 10:10:15 -0700 Subject: [PATCH 1435/1640] UPSTREAM: tls: Pass error code explicitly to tls_err_abort Pass EBADMSG explicitly to tls_err_abort. Receive path will pass additional codes - EMSGSIZE if framing is larger than max TLS record size, EINVAL if TLS version mismatch. Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 6 +++--- net/tls/tls_sw.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 019e52db1817..6b44875a78e5 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -174,9 +174,9 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) return tls_ctx->pending_open_record_frags; } -static inline void tls_err_abort(struct sock *sk) +static inline void tls_err_abort(struct sock *sk, int err) { - sk->sk_err = EBADMSG; + sk->sk_err = err; sk->sk_error_report(sk); } @@ -197,7 +197,7 @@ static inline void tls_advance_record_sn(struct sock *sk, struct cipher_context *ctx) { if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size)) - tls_err_abort(sk); + tls_err_abort(sk, EBADMSG); tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, ctx->iv_size); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 338d743bcc21..1c79d9ad1731 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -214,7 +214,7 @@ static int tls_push_record(struct sock *sk, int flags, /* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */ rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags); if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk); + tls_err_abort(sk, EBADMSG); tls_advance_record_sn(sk, &tls_ctx->tx); return rc; From 3873d1352de6ac7c787b7cd556c91d08236d4a39 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 22 Mar 2018 10:10:26 -0700 Subject: [PATCH 1436/1640] UPSTREAM: tls: Refactor variable names Several config variables are prefixed with tx, drop the prefix since these will be used for both tx and rx. Signed-off-by: Dave Watson Signed-off-by: David S. 
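As background for the tx/rx split in the surrounding patches: each direction's cipher_context carries rec_seq, a big-endian byte counter that tls_advance_record_sn() bumps via tls_bigint_increment(), aborting the connection with EBADMSG on wrap-around. A standalone sketch of that increment:

#include <stdbool.h>
#include <stddef.h>

/* Increment a big-endian counter in place; returns true only when the
 * carry propagates out of the most significant byte (wrap-around). */
static bool bigint_increment(unsigned char *seq, size_t len)
{
        size_t i = len;

        while (i-- > 0) {
                if (++seq[i] != 0)
                        return false;   /* carry stopped at this byte */
        }
        return true;
}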
Miller --- include/net/tls.h | 2 +- net/tls/tls_main.c | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 6b44875a78e5..095b72283861 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -99,7 +99,7 @@ struct tls_context { void *priv_ctx; - u8 tx_conf:2; + u8 conf:2; struct cipher_context tx; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 1019c0329534..38d1f0c93e39 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -52,7 +52,7 @@ enum { }; enum { - TLS_BASE_TX, + TLS_BASE, TLS_SW_TX, TLS_NUM_CONFIG, }; @@ -65,7 +65,7 @@ static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) { int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; - sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf]; + sk->sk_prot = &tls_prots[ip_ver][ctx->conf]; } int wait_on_pending_writer(struct sock *sk, long *timeo) @@ -238,7 +238,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) lock_sock(sk); sk_proto_close = ctx->sk_proto_close; - if (ctx->tx_conf == TLS_BASE_TX) { + if (ctx->conf == TLS_BASE) { kfree(ctx); goto skip_tx_cleanup; } @@ -262,7 +262,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); - if (ctx->tx_conf == TLS_SW_TX) + if (ctx->conf == TLS_SW_TX) tls_sw_free_tx_resources(sk); skip_tx_cleanup: @@ -371,7 +371,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, struct tls_crypto_info *crypto_info; struct tls_context *ctx = tls_get_ctx(sk); int rc = 0; - int tx_conf; + int conf; if (!optval || (optlen < sizeof(*crypto_info))) { rc = -EINVAL; @@ -418,11 +418,11 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, /* currently SW is default, we will have ethtool in future */ rc = tls_set_sw_offload(sk, ctx); - tx_conf = TLS_SW_TX; + conf = TLS_SW_TX; if (rc) goto err_crypto_info; - ctx->tx_conf = tx_conf; + ctx->conf = conf; update_sk_prot(sk, ctx); ctx->sk_write_space = sk->sk_write_space; sk->sk_write_space = tls_write_space; @@ -465,12 +465,12 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, static void build_protos(struct proto *prot, struct proto *base) { - prot[TLS_BASE_TX] = *base; - prot[TLS_BASE_TX].setsockopt = tls_setsockopt; - prot[TLS_BASE_TX].getsockopt = tls_getsockopt; - prot[TLS_BASE_TX].close = tls_sk_proto_close; + prot[TLS_BASE] = *base; + prot[TLS_BASE].setsockopt = tls_setsockopt; + prot[TLS_BASE].getsockopt = tls_getsockopt; + prot[TLS_BASE].close = tls_sk_proto_close; - prot[TLS_SW_TX] = prot[TLS_BASE_TX]; + prot[TLS_SW_TX] = prot[TLS_BASE]; prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; prot[TLS_SW_TX].sendpage = tls_sw_sendpage; } @@ -513,7 +513,7 @@ static int tls_init(struct sock *sk) mutex_unlock(&tcpv6_prot_mutex); } - ctx->tx_conf = TLS_BASE_TX; + ctx->conf = TLS_BASE; update_sk_prot(sk, ctx); out: return rc; From 27156b1459620afd3b3982ebecd04f076b8ad423 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 22 Mar 2018 10:10:35 -0700 Subject: [PATCH 1437/1640] UPSTREAM: tls: RX path for ktls Add rx path for tls software implementation. recvmsg, splice_read, and poll are implemented. An additional sockopt TLS_RX is added, with the same interface as TLS_TX. Either TLS_RX or TLS_TX may be provided separately, or together (with two different setsockopt calls with appropriate keys). Control messages are passed via CMSG in a similar way to transmit.
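As a userspace sketch of that control-message interface: the record type arrives as a single byte of SOL_TLS/TLS_GET_RECORD_TYPE ancillary data, so a receiver can be written roughly as below (helper name and buffer sizing are illustrative; the SOL_TLS fallback define matches the kernel's value):

#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/tls.h>

#ifndef SOL_TLS
#define SOL_TLS 282
#endif

static ssize_t recv_record(int sock, void *buf, size_t len,
                           unsigned char *type)
{
        char cbuf[CMSG_SPACE(sizeof(unsigned char))];
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct msghdr msg = {
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = cbuf,
                .msg_controllen = sizeof(cbuf),
        };
        ssize_t n = recvmsg(sock, &msg, 0);

        if (n >= 0) {
                struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

                if (cmsg && cmsg->cmsg_level == SOL_TLS &&
                    cmsg->cmsg_type == TLS_GET_RECORD_TYPE)
                        *type = *CMSG_DATA(cmsg);
        }
        return n;
}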
If no cmsg buffer is passed, then only application data records will be passed to userspace, and EIO is returned for other types of alerts. EBADMSG is passed for decryption errors, and EMSGSIZE is passed for framing too big, and EBADMSG for framing too small (matching openssl semantics). EINVAL is returned for TLS versions that do not match the original setsockopt call. All are unrecoverable. strparser is used to parse TLS framing. Decryption is done directly in to userspace buffers if they are large enough to support it, otherwise sk_cow_data is called (similar to ipsec), and buffers are decrypted in place and copied. splice_read always decrypts in place, since no buffers are provided to decrypt in to. sk_poll is overridden, and only returns POLLIN if a full TLS message is received. Otherwise we wait for strparser to finish reading a full frame. Actual decryption is only done during recvmsg or splice_read calls. Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 27 +- include/uapi/linux/tls.h | 2 + net/tls/Kconfig | 1 + net/tls/tls_main.c | 62 ++++- net/tls/tls_sw.c | 581 +++++++++++++++++++++++++++++++++++---- 5 files changed, 606 insertions(+), 67 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 095b72283861..437a746300bf 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -40,6 +40,7 @@ #include #include #include +#include #include @@ -58,8 +59,18 @@ struct tls_sw_context { struct crypto_aead *aead_send; + struct crypto_aead *aead_recv; struct crypto_wait async_wait; + /* Receive context */ + struct strparser strp; + void (*saved_data_ready)(struct sock *sk); + unsigned int (*sk_poll)(struct file *file, struct socket *sock, + struct poll_table_struct *wait); + struct sk_buff *recv_pkt; + u8 control; + bool decrypted; + /* Sending context */ char aad_space[TLS_AAD_SPACE_SIZE]; @@ -96,12 +107,17 @@ struct tls_context { struct tls_crypto_info crypto_send; struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128; }; + union { + struct tls_crypto_info crypto_recv; + struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128; + }; void *priv_ctx; u8 conf:2; struct cipher_context tx; + struct cipher_context rx; struct scatterlist *partially_sent_record; u16 partially_sent_offset; @@ -128,12 +144,19 @@ int tls_sk_attach(struct sock *sk, int optname, char __user *optval, unsigned int optlen); -int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx); +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); -void tls_sw_free_tx_resources(struct sock *sk); +void tls_sw_free_resources(struct sock *sk); +int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len); +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); +ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); void tls_icsk_clean_acked(struct sock *sk); diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index 293b2cdad88d..c6633e97eca4 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -38,6 +38,7 @@ /* TLS socket options */ #define TLS_TX 1 /* Set transmit parameters */ 
+#define TLS_RX 2 /* Set receive parameters */ /* Supported versions */ #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) @@ -59,6 +60,7 @@ #define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE 8 #define TLS_SET_RECORD_TYPE 1 +#define TLS_GET_RECORD_TYPE 2 struct tls_crypto_info { __u16 version; diff --git a/net/tls/Kconfig b/net/tls/Kconfig index eb583038c67e..89b8745a986f 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -7,6 +7,7 @@ config TLS select CRYPTO select CRYPTO_AES select CRYPTO_GCM + select STREAM_PARSER default n ---help--- Enable kernel support for TLS protocol. This allows symmetric diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 38d1f0c93e39..8a87ee7c4d70 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -54,12 +54,15 @@ enum { enum { TLS_BASE, TLS_SW_TX, + TLS_SW_RX, + TLS_SW_RXTX, TLS_NUM_CONFIG, }; static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG]; +static struct proto_ops tls_sw_proto_ops; static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) { @@ -261,9 +264,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); + kfree(ctx->rx.rec_seq); + kfree(ctx->rx.iv); - if (ctx->conf == TLS_SW_TX) - tls_sw_free_tx_resources(sk); + if (ctx->conf == TLS_SW_TX || + ctx->conf == TLS_SW_RX || + ctx->conf == TLS_SW_RXTX) { + tls_sw_free_resources(sk); + } skip_tx_cleanup: release_sock(sk); @@ -365,8 +373,8 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, return do_tls_getsockopt(sk, optname, optval, optlen); } -static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, - unsigned int optlen) +static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, + unsigned int optlen, int tx) { struct tls_crypto_info *crypto_info; struct tls_context *ctx = tls_get_ctx(sk); @@ -378,7 +386,11 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, goto out; } - crypto_info = &ctx->crypto_send; + if (tx) + crypto_info = &ctx->crypto_send; + else + crypto_info = &ctx->crypto_recv; + /* Currently we don't support set crypto info more than one time */ if (TLS_CRYPTO_INFO_READY(crypto_info)) { rc = -EBUSY; @@ -417,15 +429,31 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, } /* currently SW is default, we will have ethtool in future */ - rc = tls_set_sw_offload(sk, ctx); - conf = TLS_SW_TX; + if (tx) { + rc = tls_set_sw_offload(sk, ctx, 1); + if (ctx->conf == TLS_SW_RX) + conf = TLS_SW_RXTX; + else + conf = TLS_SW_TX; + } else { + rc = tls_set_sw_offload(sk, ctx, 0); + if (ctx->conf == TLS_SW_TX) + conf = TLS_SW_RXTX; + else + conf = TLS_SW_RX; + } + if (rc) goto err_crypto_info; ctx->conf = conf; update_sk_prot(sk, ctx); - ctx->sk_write_space = sk->sk_write_space; - sk->sk_write_space = tls_write_space; + if (tx) { + ctx->sk_write_space = sk->sk_write_space; + sk->sk_write_space = tls_write_space; + } else { + sk->sk_socket->ops = &tls_sw_proto_ops; + } goto out; err_crypto_info: @@ -441,8 +469,10 @@ static int do_tls_setsockopt(struct sock *sk, int optname, switch (optname) { case TLS_TX: + case TLS_RX: lock_sock(sk); - rc = do_tls_setsockopt_tx(sk, optval, optlen); + rc = do_tls_setsockopt_conf(sk, optval, optlen, + optname == TLS_TX); release_sock(sk); break; default: @@ -473,6 +503,14 @@ static void build_protos(struct proto *prot, struct proto *base) prot[TLS_SW_TX] = prot[TLS_BASE]; prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; 
prot[TLS_SW_TX].sendpage = tls_sw_sendpage; + + prot[TLS_SW_RX] = prot[TLS_BASE]; + prot[TLS_SW_RX].recvmsg = tls_sw_recvmsg; + prot[TLS_SW_RX].close = tls_sk_proto_close; + + prot[TLS_SW_RXTX] = prot[TLS_SW_TX]; + prot[TLS_SW_RXTX].recvmsg = tls_sw_recvmsg; + prot[TLS_SW_RXTX].close = tls_sk_proto_close; } static int tls_init(struct sock *sk) @@ -531,6 +569,10 @@ static int __init tls_register(void) { build_protos(tls_prots[TLSV4], &tcp_prot); + tls_sw_proto_ops = inet_stream_ops; + tls_sw_proto_ops.poll = tls_sw_poll; + tls_sw_proto_ops.splice_read = tls_sw_splice_read; + tcp_register_ulp(&tcp_tls_ulp_ops); return 0; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 1c79d9ad1731..4dc766b03f00 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -34,11 +34,60 @@ * SOFTWARE. */ +#include #include #include +#include #include +static int tls_do_decryption(struct sock *sk, + struct scatterlist *sgin, + struct scatterlist *sgout, + char *iv_recv, + size_t data_len, + struct sk_buff *skb, + gfp_t flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + struct aead_request *aead_req; + + int ret; + unsigned int req_size = sizeof(struct aead_request) + + crypto_aead_reqsize(ctx->aead_recv); + + aead_req = kzalloc(req_size, flags); + if (!aead_req) + return -ENOMEM; + + aead_request_set_tfm(aead_req, ctx->aead_recv); + aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); + aead_request_set_crypt(aead_req, sgin, sgout, + data_len + tls_ctx->rx.tag_size, + (u8 *)iv_recv); + aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &ctx->async_wait); + + ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); + + if (ret < 0) + goto out; + + rxm->offset += tls_ctx->rx.prepend_size; + rxm->full_len -= tls_ctx->rx.overhead_size; + tls_advance_record_sn(sk, &tls_ctx->rx); + + ctx->decrypted = true; + + ctx->saved_data_ready(sk); + +out: + kfree(aead_req); + return ret; +} + static void trim_sg(struct sock *sk, struct scatterlist *sg, int *sg_num_elem, unsigned int *sg_size, int target_size) { @@ -581,13 +630,404 @@ sendpage_end: return ret; } -void tls_sw_free_tx_resources(struct sock *sk) +static struct sk_buff *tls_wait_data(struct sock *sk, int flags, + long timeo, int *err) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct sk_buff *skb; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + while (!(skb = ctx->recv_pkt)) { + if (sk->sk_err) { + *err = sock_error(sk); + return NULL; + } + + if (sock_flag(sk, SOCK_DONE)) + return NULL; + + if ((flags & MSG_DONTWAIT) || !timeo) { + *err = -EAGAIN; + return NULL; + } + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + + /* Handle signals */ + if (signal_pending(current)) { + *err = sock_intr_errno(timeo); + return NULL; + } + } + + return skb; +} + +static int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + tls_ctx->rx.iv_size]; + struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2]; + struct scatterlist *sgin = &sgin_arr[0]; + struct strp_msg *rxm = strp_msg(skb); + int ret, nsg = ARRAY_SIZE(sgin_arr); + char 
aad_recv[TLS_AAD_SPACE_SIZE]; + struct sk_buff *unused; + + ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, + iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + tls_ctx->rx.iv_size); + if (ret < 0) + return ret; + + memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + if (!sgout) { + nsg = skb_cow_data(skb, 0, &unused) + 1; + sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation); + if (!sgout) + sgout = sgin; + } + + sg_init_table(sgin, nsg); + sg_set_buf(&sgin[0], aad_recv, sizeof(aad_recv)); + + nsg = skb_to_sgvec(skb, &sgin[1], + rxm->offset + tls_ctx->rx.prepend_size, + rxm->full_len - tls_ctx->rx.prepend_size); + + tls_make_aad(aad_recv, + rxm->full_len - tls_ctx->rx.overhead_size, + tls_ctx->rx.rec_seq, + tls_ctx->rx.rec_seq_size, + ctx->control); + + ret = tls_do_decryption(sk, sgin, sgout, iv, + rxm->full_len - tls_ctx->rx.overhead_size, + skb, sk->sk_allocation); + + if (sgin != &sgin_arr[0]) + kfree(sgin); + + return ret; +} + +static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, + unsigned int len) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + + if (len < rxm->full_len) { + rxm->offset += len; + rxm->full_len -= len; + + return false; + } + + /* Finished with message */ + ctx->recv_pkt = NULL; + kfree_skb(skb); + strp_unpause(&ctx->strp); + + return true; +} + +int tls_sw_recvmsg(struct sock *sk, + struct msghdr *msg, + size_t len, + int nonblock, + int flags, + int *addr_len) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + unsigned char control; + struct strp_msg *rxm; + struct sk_buff *skb; + ssize_t copied = 0; + bool cmsg = false; + int err = 0; + long timeo; + + flags |= nonblock; + + if (unlikely(flags & MSG_ERRQUEUE)) + return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + do { + bool zc = false; + int chunk = 0; + + skb = tls_wait_data(sk, flags, timeo, &err); + if (!skb) + goto recv_end; + + rxm = strp_msg(skb); + if (!cmsg) { + int cerr; + + cerr = put_cmsg(msg, SOL_TLS, TLS_GET_RECORD_TYPE, + sizeof(ctx->control), &ctx->control); + cmsg = true; + control = ctx->control; + if (ctx->control != TLS_RECORD_TYPE_DATA) { + if (cerr || msg->msg_flags & MSG_CTRUNC) { + err = -EIO; + goto recv_end; + } + } + } else if (control != ctx->control) { + goto recv_end; + } + + if (!ctx->decrypted) { + int page_count; + int to_copy; + + page_count = iov_iter_npages(&msg->msg_iter, + MAX_SKB_FRAGS); + to_copy = rxm->full_len - tls_ctx->rx.overhead_size; + if (to_copy <= len && page_count < MAX_SKB_FRAGS && + likely(!(flags & MSG_PEEK))) { + struct scatterlist sgin[MAX_SKB_FRAGS + 1]; + char unused[21]; + int pages = 0; + + zc = true; + sg_init_table(sgin, MAX_SKB_FRAGS + 1); + sg_set_buf(&sgin[0], unused, 13); + + err = zerocopy_from_iter(sk, &msg->msg_iter, + to_copy, &pages, + &chunk, &sgin[1], + MAX_SKB_FRAGS, false); + if (err < 0) + goto fallback_to_reg_recv; + + err = decrypt_skb(sk, skb, sgin); + for (; pages > 0; pages--) + put_page(sg_page(&sgin[pages])); + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto recv_end; + } + } else { +fallback_to_reg_recv: + err = decrypt_skb(sk, skb, NULL); + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto recv_end; + } + } + ctx->decrypted = true; + } + + if (!zc) { + chunk = min_t(unsigned int, rxm->full_len, len); + err = skb_copy_datagram_msg(skb, rxm->offset, msg, + 
chunk); + if (err < 0) + goto recv_end; + } + + copied += chunk; + len -= chunk; + if (likely(!(flags & MSG_PEEK))) { + u8 control = ctx->control; + + if (tls_sw_advance_skb(sk, skb, chunk)) { + /* Return full control message to + * userspace before trying to parse + * another message type + */ + msg->msg_flags |= MSG_EOR; + if (control != TLS_RECORD_TYPE_DATA) + goto recv_end; + } + } + } while (len); + +recv_end: + release_sock(sk); + return copied ? : err; +} + +ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sock->sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm = NULL; + struct sock *sk = sock->sk; + struct sk_buff *skb; + ssize_t copied = 0; + int err = 0; + long timeo; + int chunk; + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + skb = tls_wait_data(sk, flags, timeo, &err); + if (!skb) + goto splice_read_end; + + /* splice does not support reading control messages */ + if (ctx->control != TLS_RECORD_TYPE_DATA) { + err = -ENOTSUPP; + goto splice_read_end; + } + + if (!ctx->decrypted) { + err = decrypt_skb(sk, skb, NULL); + + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto splice_read_end; + } + ctx->decrypted = true; + } + rxm = strp_msg(skb); + + chunk = min_t(unsigned int, rxm->full_len, len); + copied = skb_splice_bits(skb, sk, rxm->offset, pipe, chunk, flags); + if (copied < 0) + goto splice_read_end; + + if (likely(!(flags & MSG_PEEK))) + tls_sw_advance_skb(sk, skb, copied); + +splice_read_end: + release_sock(sk); + return copied ? : err; +} + +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + unsigned int ret; + struct sock *sk = sock->sk; + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + + /* Grab POLLOUT and POLLHUP from the underlying socket */ + ret = ctx->sk_poll(file, sock, wait); + + /* Clear POLLIN bits, and set based on recv_pkt */ + ret &= ~(POLLIN | POLLRDNORM); + if (ctx->recv_pkt) + ret |= POLLIN | POLLRDNORM; + + return ret; +} + +static int tls_read_size(struct strparser *strp, struct sk_buff *skb) +{ + struct tls_context *tls_ctx = tls_get_ctx(strp->sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + char header[tls_ctx->rx.prepend_size]; + struct strp_msg *rxm = strp_msg(skb); + size_t cipher_overhead; + size_t data_len = 0; + int ret; + + /* Verify that we have a full TLS header, or wait for more data */ + if (rxm->offset + tls_ctx->rx.prepend_size > skb->len) + return 0; + + /* Linearize header to local buffer */ + ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size); + + if (ret < 0) + goto read_failure; + + ctx->control = header[0]; + + data_len = ((header[4] & 0xFF) | (header[3] << 8)); + + cipher_overhead = tls_ctx->rx.tag_size + tls_ctx->rx.iv_size; + + if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead) { + ret = -EMSGSIZE; + goto read_failure; + } + if (data_len < cipher_overhead) { + ret = -EBADMSG; + goto read_failure; + } + + if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.version) || + header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.version)) { + ret = -EINVAL; + goto read_failure; + } + + return data_len + TLS_HEADER_SIZE; + +read_failure: + tls_err_abort(strp->sk, ret); + + return ret; +} + +static void tls_queue(struct strparser *strp, struct sk_buff *skb) +{ + struct tls_context *tls_ctx = tls_get_ctx(strp->sk); + 
struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm; + + rxm = strp_msg(skb); + + ctx->decrypted = false; + + ctx->recv_pkt = skb; + strp_pause(strp); + + strp->sk->sk_state_change(strp->sk); +} + +static void tls_data_ready(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + + strp_data_ready(&ctx->strp); +} + +void tls_sw_free_resources(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); if (ctx->aead_send) crypto_free_aead(ctx->aead_send); + if (ctx->aead_recv) { + if (ctx->recv_pkt) { + kfree_skb(ctx->recv_pkt); + ctx->recv_pkt = NULL; + } + crypto_free_aead(ctx->aead_recv); + strp_stop(&ctx->strp); + write_lock_bh(&sk->sk_callback_lock); + sk->sk_data_ready = ctx->saved_data_ready; + write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); + strp_done(&ctx->strp); + lock_sock(sk); + } tls_free_both_sg(sk); @@ -595,12 +1035,15 @@ void tls_sw_free_tx_resources(struct sock *sk) kfree(tls_ctx); } -int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; struct tls_sw_context *sw_ctx; + struct cipher_context *cctx; + struct crypto_aead **aead; + struct strp_callbacks cb; u16 nonce_size, tag_size, iv_size, rec_seq_size; char *iv, *rec_seq; int rc = 0; @@ -610,22 +1053,29 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) goto out; } - if (ctx->priv_ctx) { - rc = -EEXIST; - goto out; + if (!ctx->priv_ctx) { + sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL); + if (!sw_ctx) { + rc = -ENOMEM; + goto out; + } + crypto_init_wait(&sw_ctx->async_wait); + } else { + sw_ctx = ctx->priv_ctx; } - sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL); - if (!sw_ctx) { - rc = -ENOMEM; - goto out; - } - - crypto_init_wait(&sw_ctx->async_wait); - ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; - crypto_info = &ctx->crypto_send; + if (tx) { + crypto_info = &ctx->crypto_send; + cctx = &ctx->tx; + aead = &sw_ctx->aead_send; + } else { + crypto_info = &ctx->crypto_recv; + cctx = &ctx->rx; + aead = &sw_ctx->aead_recv; + } + switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: { nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; @@ -644,48 +1094,49 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) goto free_priv; } - ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size; - ctx->tx.tag_size = tag_size; - ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size; - ctx->tx.iv_size = iv_size; - ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - GFP_KERNEL); - if (!ctx->tx.iv) { + cctx->prepend_size = TLS_HEADER_SIZE + nonce_size; + cctx->tag_size = tag_size; + cctx->overhead_size = cctx->prepend_size + cctx->tag_size; + cctx->iv_size = iv_size; + cctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + GFP_KERNEL); + if (!cctx->iv) { rc = -ENOMEM; goto free_priv; } - memcpy(ctx->tx.iv, gcm_128_info->salt, - TLS_CIPHER_AES_GCM_128_SALT_SIZE); - memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); - ctx->tx.rec_seq_size = rec_seq_size; - ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); - if (!ctx->tx.rec_seq) { + memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); + 
cctx->rec_seq_size = rec_seq_size; + cctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + if (!cctx->rec_seq) { rc = -ENOMEM; goto free_iv; } - memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size); + memcpy(cctx->rec_seq, rec_seq, rec_seq_size); - sg_init_table(sw_ctx->sg_encrypted_data, - ARRAY_SIZE(sw_ctx->sg_encrypted_data)); - sg_init_table(sw_ctx->sg_plaintext_data, - ARRAY_SIZE(sw_ctx->sg_plaintext_data)); + if (tx) { + sg_init_table(sw_ctx->sg_encrypted_data, + ARRAY_SIZE(sw_ctx->sg_encrypted_data)); + sg_init_table(sw_ctx->sg_plaintext_data, + ARRAY_SIZE(sw_ctx->sg_plaintext_data)); - sg_init_table(sw_ctx->sg_aead_in, 2); - sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space, - sizeof(sw_ctx->aad_space)); - sg_unmark_end(&sw_ctx->sg_aead_in[1]); - sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data); - sg_init_table(sw_ctx->sg_aead_out, 2); - sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space, - sizeof(sw_ctx->aad_space)); - sg_unmark_end(&sw_ctx->sg_aead_out[1]); - sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data); + sg_init_table(sw_ctx->sg_aead_in, 2); + sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space, + sizeof(sw_ctx->aad_space)); + sg_unmark_end(&sw_ctx->sg_aead_in[1]); + sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data); + sg_init_table(sw_ctx->sg_aead_out, 2); + sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space, + sizeof(sw_ctx->aad_space)); + sg_unmark_end(&sw_ctx->sg_aead_out[1]); + sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data); + } - if (!sw_ctx->aead_send) { - sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0); - if (IS_ERR(sw_ctx->aead_send)) { - rc = PTR_ERR(sw_ctx->aead_send); - sw_ctx->aead_send = NULL; + if (!*aead) { + *aead = crypto_alloc_aead("gcm(aes)", 0, 0); + if (IS_ERR(*aead)) { + rc = PTR_ERR(*aead); + *aead = NULL; goto free_rec_seq; } } @@ -694,21 +1145,41 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); - rc = crypto_aead_setkey(sw_ctx->aead_send, keyval, + rc = crypto_aead_setkey(*aead, keyval, TLS_CIPHER_AES_GCM_128_KEY_SIZE); if (rc) goto free_aead; - rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tx.tag_size); - if (!rc) - return 0; + rc = crypto_aead_setauthsize(*aead, cctx->tag_size); + if (rc) + goto free_aead; + + if (!tx) { + /* Set up strparser */ + memset(&cb, 0, sizeof(cb)); + cb.rcv_msg = tls_queue; + cb.parse_msg = tls_read_size; + + strp_init(&sw_ctx->strp, sk, &cb); + + write_lock_bh(&sk->sk_callback_lock); + sw_ctx->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = tls_data_ready; + write_unlock_bh(&sk->sk_callback_lock); + + sw_ctx->sk_poll = sk->sk_socket->ops->poll; + + strp_check_rcv(&sw_ctx->strp); + } + + goto out; free_aead: - crypto_free_aead(sw_ctx->aead_send); - sw_ctx->aead_send = NULL; + crypto_free_aead(*aead); + *aead = NULL; free_rec_seq: - kfree(ctx->tx.rec_seq); - ctx->tx.rec_seq = NULL; + kfree(cctx->rec_seq); + cctx->rec_seq = NULL; free_iv: kfree(ctx->tx.iv); ctx->tx.iv = NULL; From 12c5a37865e5bf508c19968aa32444b1fbe00430 Mon Sep 17 00:00:00 2001 From: Atul Gupta Date: Sat, 31 Mar 2018 21:41:52 +0530 Subject: [PATCH 1438/1640] UPSTREAM: tls: support for Inline tls record Facility to register Inline TLS drivers to net/tls. Setup TLS_HW_RECORD prot to listen on offload device. Cases handled - Inline TLS device exists, setup prot for TLS_HW_RECORD - Atleast one Inline TLS exists, sets TLS_HW_RECORD. 
Signed-off-by: Atul Gupta Reviewed-by: Steve Wise Signed-off-by: David S. Miller --- include/net/tls.h | 32 ++++++++++++- net/tls/tls_main.c | 114 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 142 insertions(+), 4 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 437a746300bf..3da8e13a6d96 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -56,6 +56,32 @@ #define TLS_RECORD_TYPE_DATA 0x17 #define TLS_AAD_SPACE_SIZE 13 +#define TLS_DEVICE_NAME_MAX 32 + +/* + * This structure defines the routines for Inline TLS driver. + * The following routines are optional and filled with a + * null pointer if not defined. + * + * @name: the name of the registered Inline TLS device + * @dev_list: Inline TLS device list + * int (*feature)(struct tls_device *device); + * Called to return Inline TLS driver capability + * + * int (*hash)(struct tls_device *device, struct sock *sk); + * This function sets the Inline driver for listen and programs + * device-specific functionality as required + * + * void (*unhash)(struct tls_device *device, struct sock *sk); + * This function cleans up the listen state set by the Inline TLS driver + */ +struct tls_device { + char name[TLS_DEVICE_NAME_MAX]; + struct list_head dev_list; + int (*feature)(struct tls_device *device); + int (*hash)(struct tls_device *device, struct sock *sk); + void (*unhash)(struct tls_device *device, struct sock *sk); +}; struct tls_sw_context { struct crypto_aead *aead_send; @@ -114,7 +140,7 @@ struct tls_context { void *priv_ctx; - u8 conf:2; + u8 conf:3; struct cipher_context tx; struct cipher_context rx; @@ -135,6 +161,8 @@ struct tls_context { int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); + int (*hash)(struct sock *sk); + void (*unhash)(struct sock *sk); }; int wait_on_pending_writer(struct sock *sk, long *timeo); @@ -283,5 +311,7 @@ static inline struct tls_offload_context *tls_offload_ctx( int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); +void tls_register_device(struct tls_device *device); +void tls_unregister_device(struct tls_device *device); #endif /* _TLS_OFFLOAD_H */ diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 8a87ee7c4d70..69f830c3e191 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -56,11 +57,14 @@ enum { TLS_SW_TX, TLS_SW_RX, TLS_SW_RXTX, + TLS_HW_RECORD, TLS_NUM_CONFIG, }; static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_mutex); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG]; static struct proto_ops tls_sw_proto_ops; @@ -241,8 +245,12 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) lock_sock(sk); sk_proto_close = ctx->sk_proto_close; + if (ctx->conf == TLS_HW_RECORD) + goto skip_tx_cleanup; + if (ctx->conf == TLS_BASE) { kfree(ctx); + ctx = NULL; goto skip_tx_cleanup; } @@ -276,6 +284,11 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) skip_tx_cleanup: release_sock(sk); sk_proto_close(sk, timeout); + /* free ctx for TLS_HW_RECORD, used by tcp_set_state + * for sk->sk_prot->unhash [tls_hw_unhash] + */ + if (ctx && ctx->conf == TLS_HW_RECORD) + kfree(ctx); } static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, @@ -493,6 +506,79 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, return
do_tls_setsockopt(sk, optname, optval, optlen); } +static struct tls_context *create_ctx(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tls_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + icsk->icsk_ulp_data = ctx; + return ctx; +} + +static int tls_hw_prot(struct sock *sk) +{ + struct tls_context *ctx; + struct tls_device *dev; + int rc = 0; + + mutex_lock(&device_mutex); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->feature && dev->feature(dev)) { + ctx = create_ctx(sk); + if (!ctx) + goto out; + + ctx->hash = sk->sk_prot->hash; + ctx->unhash = sk->sk_prot->unhash; + ctx->sk_proto_close = sk->sk_prot->close; + ctx->conf = TLS_HW_RECORD; + update_sk_prot(sk, ctx); + rc = 1; + break; + } + } +out: + mutex_unlock(&device_mutex); + return rc; +} + +static void tls_hw_unhash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_device *dev; + + mutex_lock(&device_mutex); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->unhash) + dev->unhash(dev, sk); + } + mutex_unlock(&device_mutex); + ctx->unhash(sk); +} + +static int tls_hw_hash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_device *dev; + int err; + + err = ctx->hash(sk); + mutex_lock(&device_mutex); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->hash) + err |= dev->hash(dev, sk); + } + mutex_unlock(&device_mutex); + + if (err) + tls_hw_unhash(sk); + return err; +} + static void build_protos(struct proto *prot, struct proto *base) { prot[TLS_BASE] = *base; @@ -511,15 +597,22 @@ static void build_protos(struct proto *prot, struct proto *base) prot[TLS_SW_RXTX] = prot[TLS_SW_TX]; prot[TLS_SW_RXTX].recvmsg = tls_sw_recvmsg; prot[TLS_SW_RXTX].close = tls_sk_proto_close; + + prot[TLS_HW_RECORD] = *base; + prot[TLS_HW_RECORD].hash = tls_hw_hash; + prot[TLS_HW_RECORD].unhash = tls_hw_unhash; + prot[TLS_HW_RECORD].close = tls_sk_proto_close; } static int tls_init(struct sock *sk) { int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; - struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; int rc = 0; + if (tls_hw_prot(sk)) + goto out; + /* The TLS ulp is currently supported only for TCP sockets * in ESTABLISHED state. 
* Supporting sockets in LISTEN state will require us @@ -530,12 +623,11 @@ static int tls_init(struct sock *sk) return -ENOTSUPP; /* allocate tls context */ - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = create_ctx(sk); if (!ctx) { rc = -ENOMEM; goto out; } - icsk->icsk_ulp_data = ctx; ctx->setsockopt = sk->sk_prot->setsockopt; ctx->getsockopt = sk->sk_prot->getsockopt; ctx->sk_proto_close = sk->sk_prot->close; @@ -557,6 +649,22 @@ out: return rc; } +void tls_register_device(struct tls_device *device) +{ + mutex_lock(&device_mutex); + list_add_tail(&device->dev_list, &device_list); + mutex_unlock(&device_mutex); +} +EXPORT_SYMBOL(tls_register_device); + +void tls_unregister_device(struct tls_device *device) +{ + mutex_lock(&device_mutex); + list_del(&device->dev_list); + mutex_unlock(&device_mutex); +} +EXPORT_SYMBOL(tls_unregister_device); + static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .name = "tls", .uid = TCP_ULP_TLS, From a08603d2390d713e6eda72c8d1cfcd5c622a52ca Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Apr 2018 17:52:34 -0700 Subject: [PATCH 1439/1640] UPSTREAM: net/tls: Remove VLA usage In the quest to remove VLAs from the kernel[1], this replaces the VLA size with the only possible size used in the code, and adds a mechanism to double-check future IV sizes. [1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Signed-off-by: Kees Cook Acked-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 4dc766b03f00..71e79597f940 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -41,6 +41,8 @@ #include #include +#define MAX_IV_SIZE TLS_CIPHER_AES_GCM_128_IV_SIZE + static int tls_do_decryption(struct sock *sk, struct scatterlist *sgin, struct scatterlist *sgout, @@ -673,7 +675,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); - char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + tls_ctx->rx.iv_size]; + char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE]; struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2]; struct scatterlist *sgin = &sgin_arr[0]; struct strp_msg *rxm = strp_msg(skb); @@ -1094,6 +1096,12 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto free_priv; } + /* Sanity-check the IV size for stack allocations. */ + if (iv_size > MAX_IV_SIZE) { + rc = -EINVAL; + goto free_priv; + } + cctx->prepend_size = TLS_HEADER_SIZE + nonce_size; cctx->tag_size = tag_size; cctx->overhead_size = cctx->prepend_size + cctx->tag_size; From d1e7f2debd1c15acd1ed76f6c7bb4bb33eda00e9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 24 Apr 2018 13:36:58 +0100 Subject: [PATCH 1440/1640] UPSTREAM: net/tls: remove redundant second null check on sgout A duplicated null check on sgout is redundant as it is known to be already true because of the identical earlier check. Remove it. Detected by cppcheck: net/tls/tls_sw.c:696: (warning) Identical inner 'if' condition is always true. Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 71e79597f940..6ed1c02cfc94 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -693,8 +693,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, if (!sgout) { nsg = skb_cow_data(skb, 0, &unused) + 1; sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation); - if (!sgout) - sgout = sgin; + sgout = sgin; } sg_init_table(sgin, nsg); From 5a4d6219aa454c89827fd6f5108355394d6af6b3 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Mon, 30 Apr 2018 10:16:15 +0300 Subject: [PATCH 1441/1640] UPSTREAM: net/tls: Split conf to rx + tx In TLS inline crypto, we can have one direction in software and another in hardware. Thus, we split the TLS configuration to separate structures for receive and transmit. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/tls.h | 51 +++++++++++------ net/tls/tls_main.c | 95 ++++++++++++++++--------------- net/tls/tls_sw.c | 136 ++++++++++++++++++++++++++------------------- 3 files changed, 158 insertions(+), 124 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 3da8e13a6d96..95a8c60b36be 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -83,21 +83,10 @@ struct tls_device { void (*unhash)(struct tls_device *device, struct sock *sk); }; -struct tls_sw_context { +struct tls_sw_context_tx { struct crypto_aead *aead_send; - struct crypto_aead *aead_recv; struct crypto_wait async_wait; - /* Receive context */ - struct strparser strp; - void (*saved_data_ready)(struct sock *sk); - unsigned int (*sk_poll)(struct file *file, struct socket *sock, - struct poll_table_struct *wait); - struct sk_buff *recv_pkt; - u8 control; - bool decrypted; - - /* Sending context */ char aad_space[TLS_AAD_SPACE_SIZE]; unsigned int sg_plaintext_size; @@ -114,6 +103,19 @@ struct tls_sw_context { struct scatterlist sg_aead_out[2]; }; +struct tls_sw_context_rx { + struct crypto_aead *aead_recv; + struct crypto_wait async_wait; + + struct strparser strp; + void (*saved_data_ready)(struct sock *sk); + unsigned int (*sk_poll)(struct file *file, struct socket *sock, + struct poll_table_struct *wait); + struct sk_buff *recv_pkt; + u8 control; + bool decrypted; +}; + enum { TLS_PENDING_CLOSED_RECORD }; @@ -138,9 +140,15 @@ struct tls_context { struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128; }; - void *priv_ctx; + struct list_head list; + struct net_device *netdev; + refcount_t refcount; - u8 conf:3; + void *priv_ctx_tx; + void *priv_ctx_rx; + + u8 tx_conf:3; + u8 rx_conf:3; struct cipher_context tx; struct cipher_context rx; @@ -177,7 +185,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); -void tls_sw_free_resources(struct sock *sk); +void tls_sw_free_resources_tx(struct sock *sk); +void tls_sw_free_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); unsigned int tls_sw_poll(struct file *file, struct socket *sock, @@ -297,16 +306,22 @@ static inline struct tls_context *tls_get_ctx(const struct sock *sk) return icsk->icsk_ulp_data; } -static inline struct tls_sw_context *tls_sw_ctx( +static inline struct tls_sw_context_rx *tls_sw_ctx_rx( const struct tls_context *tls_ctx) { - return (struct tls_sw_context *)tls_ctx->priv_ctx; + return 
(struct tls_sw_context_rx *)tls_ctx->priv_ctx_rx; +} + +static inline struct tls_sw_context_tx *tls_sw_ctx_tx( + const struct tls_context *tls_ctx) +{ + return (struct tls_sw_context_tx *)tls_ctx->priv_ctx_tx; } static inline struct tls_offload_context *tls_offload_ctx( const struct tls_context *tls_ctx) { - return (struct tls_offload_context *)tls_ctx->priv_ctx; + return (struct tls_offload_context *)tls_ctx->priv_ctx_tx; } int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 69f830c3e191..94b10312ff26 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -51,12 +51,9 @@ enum { TLSV6, TLS_NUM_PROTS, }; - enum { TLS_BASE, - TLS_SW_TX, - TLS_SW_RX, - TLS_SW_RXTX, + TLS_SW, TLS_HW_RECORD, TLS_NUM_CONFIG, }; @@ -65,14 +62,14 @@ static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); static LIST_HEAD(device_list); static DEFINE_MUTEX(device_mutex); -static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG]; +static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG]; static struct proto_ops tls_sw_proto_ops; -static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) +static void update_sk_prot(struct sock *sk, struct tls_context *ctx) { int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; - sk->sk_prot = &tls_prots[ip_ver][ctx->conf]; + sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf]; } int wait_on_pending_writer(struct sock *sk, long *timeo) @@ -245,10 +242,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) lock_sock(sk); sk_proto_close = ctx->sk_proto_close; - if (ctx->conf == TLS_HW_RECORD) + if (ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD) goto skip_tx_cleanup; - if (ctx->conf == TLS_BASE) { + if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE) { kfree(ctx); ctx = NULL; goto skip_tx_cleanup; @@ -270,15 +267,17 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) } } - kfree(ctx->tx.rec_seq); - kfree(ctx->tx.iv); - kfree(ctx->rx.rec_seq); - kfree(ctx->rx.iv); + /* We need these for tls_sw_fallback handling of other packets */ + if (ctx->tx_conf == TLS_SW) { + kfree(ctx->tx.rec_seq); + kfree(ctx->tx.iv); + tls_sw_free_resources_tx(sk); + } - if (ctx->conf == TLS_SW_TX || - ctx->conf == TLS_SW_RX || - ctx->conf == TLS_SW_RXTX) { - tls_sw_free_resources(sk); + if (ctx->rx_conf == TLS_SW) { + kfree(ctx->rx.rec_seq); + kfree(ctx->rx.iv); + tls_sw_free_resources_rx(sk); } skip_tx_cleanup: @@ -287,7 +286,8 @@ skip_tx_cleanup: /* free ctx for TLS_HW_RECORD, used by tcp_set_state * for sk->sk_prot->unhash [tls_hw_unhash] */ - if (ctx && ctx->conf == TLS_HW_RECORD) + if (ctx && ctx->tx_conf == TLS_HW_RECORD && + ctx->rx_conf == TLS_HW_RECORD) kfree(ctx); } @@ -441,25 +441,21 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, goto err_crypto_info; } - /* currently SW is default, we will have ethtool in future */ if (tx) { rc = tls_set_sw_offload(sk, ctx, 1); - if (ctx->conf == TLS_SW_RX) - conf = TLS_SW_RXTX; - else - conf = TLS_SW_TX; + conf = TLS_SW; } else { rc = tls_set_sw_offload(sk, ctx, 0); - if (ctx->conf == TLS_SW_TX) - conf = TLS_SW_RXTX; - else - conf = TLS_SW_RX; + conf = TLS_SW; } if (rc) goto err_crypto_info; - ctx->conf = conf; + if (tx) + ctx->tx_conf = conf; + else + ctx->rx_conf = conf; update_sk_prot(sk, ctx); if (tx) { ctx->sk_write_space = sk->sk_write_space; @@ -535,7 +531,8 @@ static int tls_hw_prot(struct sock *sk) ctx->hash = sk->sk_prot->hash; 
ctx->unhash = sk->sk_prot->unhash; ctx->sk_proto_close = sk->sk_prot->close; - ctx->conf = TLS_HW_RECORD; + ctx->rx_conf = TLS_HW_RECORD; + ctx->tx_conf = TLS_HW_RECORD; update_sk_prot(sk, ctx); rc = 1; break; @@ -579,29 +576,30 @@ static int tls_hw_hash(struct sock *sk) return err; } -static void build_protos(struct proto *prot, struct proto *base) +static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], + struct proto *base) { - prot[TLS_BASE] = *base; - prot[TLS_BASE].setsockopt = tls_setsockopt; - prot[TLS_BASE].getsockopt = tls_getsockopt; - prot[TLS_BASE].close = tls_sk_proto_close; + prot[TLS_BASE][TLS_BASE] = *base; + prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt; + prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt; + prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close; - prot[TLS_SW_TX] = prot[TLS_BASE]; - prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; - prot[TLS_SW_TX].sendpage = tls_sw_sendpage; + prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; + prot[TLS_SW][TLS_BASE].sendmsg = tls_sw_sendmsg; + prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage; - prot[TLS_SW_RX] = prot[TLS_BASE]; - prot[TLS_SW_RX].recvmsg = tls_sw_recvmsg; - prot[TLS_SW_RX].close = tls_sk_proto_close; + prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; + prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; + prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; - prot[TLS_SW_RXTX] = prot[TLS_SW_TX]; - prot[TLS_SW_RXTX].recvmsg = tls_sw_recvmsg; - prot[TLS_SW_RXTX].close = tls_sk_proto_close; + prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE]; + prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; + prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; - prot[TLS_HW_RECORD] = *base; - prot[TLS_HW_RECORD].hash = tls_hw_hash; - prot[TLS_HW_RECORD].unhash = tls_hw_unhash; - prot[TLS_HW_RECORD].close = tls_sk_proto_close; + prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; + prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_hw_hash; + prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_hw_unhash; + prot[TLS_HW_RECORD][TLS_HW_RECORD].close = tls_sk_proto_close; } static int tls_init(struct sock *sk) @@ -643,7 +641,8 @@ static int tls_init(struct sock *sk) mutex_unlock(&tcpv6_prot_mutex); } - ctx->conf = TLS_BASE; + ctx->tx_conf = TLS_BASE; + ctx->rx_conf = TLS_BASE; update_sk_prot(sk, ctx); out: return rc; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 6ed1c02cfc94..5c3909c311f1 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -52,7 +52,7 @@ static int tls_do_decryption(struct sock *sk, gfp_t flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct strp_msg *rxm = strp_msg(skb); struct aead_request *aead_req; @@ -122,7 +122,7 @@ out: static void trim_both_sgl(struct sock *sk, int target_size) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); trim_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, @@ -141,7 +141,7 @@ static void trim_both_sgl(struct sock *sk, int target_size) static int alloc_encrypted_sg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); int rc = 0; rc = sk_alloc_sg(sk, len, @@ -155,7 +155,7 @@ static int alloc_encrypted_sg(struct sock *sk, int len) static int alloc_plaintext_sg(struct sock *sk, int len) { struct tls_context 
*tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); int rc = 0; rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0, @@ -181,7 +181,7 @@ static void free_sg(struct sock *sk, struct scatterlist *sg, static void tls_free_both_sg(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size); @@ -191,7 +191,7 @@ static void tls_free_both_sg(struct sock *sk) } static int tls_do_encryption(struct tls_context *tls_ctx, - struct tls_sw_context *ctx, size_t data_len, + struct tls_sw_context_tx *ctx, size_t data_len, gfp_t flags) { unsigned int req_size = sizeof(struct aead_request) + @@ -227,7 +227,7 @@ static int tls_push_record(struct sock *sk, int flags, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); int rc; sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); @@ -339,7 +339,7 @@ static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, int bytes) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct scatterlist *sg = ctx->sg_plaintext_data; int copy, i, rc = 0; @@ -367,7 +367,7 @@ out: int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); int ret = 0; int required_size; long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -522,7 +522,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); int ret = 0; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); bool eor; @@ -636,7 +636,7 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, long timeo, int *err) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct sk_buff *skb; DEFINE_WAIT_FUNC(wait, woken_wake_function); @@ -674,7 +674,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE]; struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2]; struct scatterlist *sgin = &sgin_arr[0]; @@ -723,7 +723,7 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, unsigned int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct strp_msg *rxm = strp_msg(skb); if (len < rxm->full_len) { @@ -749,7 +749,7 @@ int tls_sw_recvmsg(struct sock *sk, int *addr_len) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); unsigned 
char control; struct strp_msg *rxm; struct sk_buff *skb; @@ -869,7 +869,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, size_t len, unsigned int flags) { struct tls_context *tls_ctx = tls_get_ctx(sock->sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct strp_msg *rxm = NULL; struct sock *sk = sock->sk; struct sk_buff *skb; @@ -922,7 +922,7 @@ unsigned int tls_sw_poll(struct file *file, struct socket *sock, unsigned int ret; struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); /* Grab POLLOUT and POLLHUP from the underlying socket */ ret = ctx->sk_poll(file, sock, wait); @@ -938,7 +938,7 @@ unsigned int tls_sw_poll(struct file *file, struct socket *sock, static int tls_read_size(struct strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); char header[tls_ctx->rx.prepend_size]; struct strp_msg *rxm = strp_msg(skb); size_t cipher_overhead; @@ -987,7 +987,7 @@ read_failure: static void tls_queue(struct strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct strp_msg *rxm; rxm = strp_msg(skb); @@ -1003,18 +1003,28 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb) static void tls_data_ready(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); strp_data_ready(&ctx->strp); } -void tls_sw_free_resources(struct sock *sk) +void tls_sw_free_resources_tx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); if (ctx->aead_send) crypto_free_aead(ctx->aead_send); + tls_free_both_sg(sk); + + kfree(ctx); +} + +void tls_sw_free_resources_rx(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + if (ctx->aead_recv) { if (ctx->recv_pkt) { kfree_skb(ctx->recv_pkt); @@ -1030,10 +1040,7 @@ void tls_sw_free_resources(struct sock *sk) lock_sock(sk); } - tls_free_both_sg(sk); - kfree(ctx); - kfree(tls_ctx); } int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) @@ -1041,7 +1048,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; - struct tls_sw_context *sw_ctx; + struct tls_sw_context_tx *sw_ctx_tx = NULL; + struct tls_sw_context_rx *sw_ctx_rx = NULL; struct cipher_context *cctx; struct crypto_aead **aead; struct strp_callbacks cb; @@ -1054,27 +1062,32 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto out; } - if (!ctx->priv_ctx) { - sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL); - if (!sw_ctx) { + if (tx) { + sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); + if (!sw_ctx_tx) { rc = -ENOMEM; goto out; } - crypto_init_wait(&sw_ctx->async_wait); + crypto_init_wait(&sw_ctx_tx->async_wait); + ctx->priv_ctx_tx = sw_ctx_tx; } else { - sw_ctx = ctx->priv_ctx; + 
sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); + if (!sw_ctx_rx) { + rc = -ENOMEM; + goto out; + } + crypto_init_wait(&sw_ctx_rx->async_wait); + ctx->priv_ctx_rx = sw_ctx_rx; } - ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; - if (tx) { crypto_info = &ctx->crypto_send; cctx = &ctx->tx; - aead = &sw_ctx->aead_send; + aead = &sw_ctx_tx->aead_send; } else { crypto_info = &ctx->crypto_recv; cctx = &ctx->rx; - aead = &sw_ctx->aead_recv; + aead = &sw_ctx_rx->aead_recv; } switch (crypto_info->cipher_type) { @@ -1121,22 +1134,24 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } memcpy(cctx->rec_seq, rec_seq, rec_seq_size); - if (tx) { - sg_init_table(sw_ctx->sg_encrypted_data, - ARRAY_SIZE(sw_ctx->sg_encrypted_data)); - sg_init_table(sw_ctx->sg_plaintext_data, - ARRAY_SIZE(sw_ctx->sg_plaintext_data)); + if (sw_ctx_tx) { + sg_init_table(sw_ctx_tx->sg_encrypted_data, + ARRAY_SIZE(sw_ctx_tx->sg_encrypted_data)); + sg_init_table(sw_ctx_tx->sg_plaintext_data, + ARRAY_SIZE(sw_ctx_tx->sg_plaintext_data)); - sg_init_table(sw_ctx->sg_aead_in, 2); - sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space, - sizeof(sw_ctx->aad_space)); - sg_unmark_end(&sw_ctx->sg_aead_in[1]); - sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data); - sg_init_table(sw_ctx->sg_aead_out, 2); - sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space, - sizeof(sw_ctx->aad_space)); - sg_unmark_end(&sw_ctx->sg_aead_out[1]); - sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data); + sg_init_table(sw_ctx_tx->sg_aead_in, 2); + sg_set_buf(&sw_ctx_tx->sg_aead_in[0], sw_ctx_tx->aad_space, + sizeof(sw_ctx_tx->aad_space)); + sg_unmark_end(&sw_ctx_tx->sg_aead_in[1]); + sg_chain(sw_ctx_tx->sg_aead_in, 2, + sw_ctx_tx->sg_plaintext_data); + sg_init_table(sw_ctx_tx->sg_aead_out, 2); + sg_set_buf(&sw_ctx_tx->sg_aead_out[0], sw_ctx_tx->aad_space, + sizeof(sw_ctx_tx->aad_space)); + sg_unmark_end(&sw_ctx_tx->sg_aead_out[1]); + sg_chain(sw_ctx_tx->sg_aead_out, 2, + sw_ctx_tx->sg_encrypted_data); } if (!*aead) { @@ -1161,22 +1176,22 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) if (rc) goto free_aead; - if (!tx) { + if (sw_ctx_rx) { /* Set up strparser */ memset(&cb, 0, sizeof(cb)); cb.rcv_msg = tls_queue; cb.parse_msg = tls_read_size; - strp_init(&sw_ctx->strp, sk, &cb); + strp_init(&sw_ctx_rx->strp, sk, &cb); write_lock_bh(&sk->sk_callback_lock); - sw_ctx->saved_data_ready = sk->sk_data_ready; + sw_ctx_rx->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx->sk_poll = sk->sk_socket->ops->poll; + sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; - strp_check_rcv(&sw_ctx->strp); + strp_check_rcv(&sw_ctx_rx->strp); } goto out; @@ -1188,11 +1203,16 @@ free_rec_seq: kfree(cctx->rec_seq); cctx->rec_seq = NULL; free_iv: - kfree(ctx->tx.iv); - ctx->tx.iv = NULL; + kfree(cctx->iv); + cctx->iv = NULL; free_priv: - kfree(ctx->priv_ctx); - ctx->priv_ctx = NULL; + if (tx) { + kfree(ctx->priv_ctx_tx); + ctx->priv_ctx_tx = NULL; + } else { + kfree(ctx->priv_ctx_rx); + ctx->priv_ctx_rx = NULL; + } out: return rc; } From e5ba6368eeb82f3c7e32235dbf9bc0b433f609ba Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 30 Apr 2018 10:16:16 +0300 Subject: [PATCH 1442/1640] UPSTREAM: net/tls: Add generic NIC offload infrastructure This patch adds a generic infrastructure to offload TLS crypto to a network device. 
It enables the kernel TLS socket to skip encryption and authentication operations on the transmit side of the data path, leaving those computationally expensive operations to the NIC. The NIC offload infrastructure builds TLS records and pushes them to the TCP layer just like the SW KTLS implementation, using the same API. TCP segmentation is mostly unaffected. Currently the only exception is that we prevent mixed SKBs where only part of the payload requires offload. In the future we are likely to add a similar restriction following a change cipher spec record. The notable differences between SW KTLS and NIC offloaded TLS implementations are as follows: 1. The offloaded implementation builds "plaintext TLS records"; those records contain plaintext instead of ciphertext and placeholder bytes instead of authentication tags. 2. The offloaded implementation maintains a mapping from TCP sequence number to TLS records. Thus, given a TCP SKB sent from a NIC offloaded TLS socket, we can use the TLS NIC offload infrastructure to obtain enough context to encrypt the payload of the SKB. A TLS record is released when the last byte of the record is acked; this is done through the new icsk_clean_acked callback. The infrastructure should be extensible to support various NIC offload implementations. However, it is currently written with the implementation below in mind: The NIC assumes that packets from each offloaded stream are sent as plaintext and in order. It keeps track of the TLS records in the TCP stream. When a packet marked for offload is transmitted, the NIC encrypts the payload in place and puts authentication tags in the relevant placeholders. The responsibility for handling out-of-order packets (i.e. TCP retransmission, qdisc drops) falls on the netdev driver. The netdev driver keeps track of the expected TCP SN from the NIC's perspective. If the next packet to transmit matches the expected TCP SN, the driver advances the expected TCP SN and transmits the packet with a TLS offload indication. If the next packet to transmit does not match the expected TCP SN, the driver calls the TLS layer to obtain the TLS record that covers the TCP sequence of the packet to be transmitted. Using this TLS record, the driver posts a work entry on the transmit queue to reconstruct the NIC TLS state required for the offload of the out-of-order packet. It updates the expected TCP SN accordingly and transmits the now in-order packet. The same queue is used for packet transmission and TLS context reconstruction to avoid the need for flushing the transmit queue before issuing the context reconstruction request (a driver-side sketch of this bookkeeping follows below). Signed-off-by: Ilya Lesokhin Signed-off-by: Boris Pismenny Signed-off-by: Aviad Yehezkel Signed-off-by: David S. Miller
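As a rough illustration of the expected-SN bookkeeping described above, a driver's transmit path might look like this. This is a sketch only: the foo_* names, the ring structure, and the simplified resync policy are assumptions, not part of this series; only tls_is_sk_tx_device_offloaded(), tls_get_ctx(), tls_offload_ctx(), and tls_get_record() come from the patch itself.

#include <net/tcp.h>
#include <net/tls.h>

struct foo_tx_ring {
	u32 expected_tcp_sn;	/* next TCP SN the NIC expects on this stream */
};

/* Returns 1 if the skb can go out with a TLS offload indication,
 * 0 if this is not a TLS-offloaded stream, negative on error.
 */
static int foo_tls_tx_check(struct foo_tx_ring *ring, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct tls_offload_context *ctx;
	struct tls_record_info *record;
	unsigned long flags;
	u32 payload_len, seq;
	u64 record_sn;

	if (!sk || !tls_is_sk_tx_device_offloaded(sk))
		return 0;

	seq = ntohl(tcp_hdr(skb)->seq);
	payload_len = skb->len - skb_transport_offset(skb) - tcp_hdrlen(skb);

	if (likely(seq == ring->expected_tcp_sn)) {
		/* in order: the NIC encrypts the payload in place */
		ring->expected_tcp_sn = seq + payload_len;
		return 1;
	}

	/* out of order (retransmission, qdisc drop): find the TLS record
	 * covering this sequence number so the NIC state can be rebuilt
	 */
	ctx = tls_offload_ctx(tls_get_ctx(sk));
	spin_lock_irqsave(&ctx->lock, flags);
	record = tls_get_record(ctx, seq, &record_sn);
	spin_unlock_irqrestore(&ctx->lock, flags);
	if (!record)
		return -EINVAL;	/* record already acked and freed */

	/* here: post a work entry replaying 'record' (record_sn) on the
	 * send queue to reconstruct the NIC crypto state, then transmit
	 */
	ring->expected_tcp_sn = seq + payload_len;
	return 1;
}

Posting the reconstruction work entry on the same send queue as the skb is what lets a real driver avoid flushing the queue before resync, as the message above notes.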
--- include/net/tls.h | 69 ++- net/tls/Kconfig | 10 + net/tls/Makefile | 2 + net/tls/tls_device.c | 764 ++++++++++++++++++++++++++++++++++ net/tls/tls_device_fallback.c | 450 ++++++++++++++++++++ net/tls/tls_main.c | 42 +- 6 files changed, 1332 insertions(+), 5 deletions(-) create mode 100644 net/tls/tls_device.c create mode 100644 net/tls/tls_device_fallback.c diff --git a/include/net/tls.h b/include/net/tls.h index 95a8c60b36be..8c56809eb384 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -116,6 +116,37 @@ struct tls_sw_context_rx { bool decrypted; }; +struct tls_record_info { + struct list_head list; + u32 end_seq; + int len; + int num_frags; + skb_frag_t frags[MAX_SKB_FRAGS]; +}; + +struct tls_offload_context { + struct crypto_aead *aead_send; + spinlock_t lock; /* protects records list */ + struct list_head records_list; + struct tls_record_info *open_record; + struct tls_record_info *retransmit_hint; + u64 hint_record_sn; + u64 unacked_record_sn; + + struct scatterlist sg_tx_data[MAX_SKB_FRAGS]; + void (*sk_destruct)(struct sock *sk); + u8 driver_state[]; + /* The TLS layer reserves room for driver-specific state. + * Currently the belief is that there is not enough + * driver-specific state to justify another layer of indirection + */ +#define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *))) +}; + +#define TLS_OFFLOAD_CONTEXT_SIZE \ + (ALIGN(sizeof(struct tls_offload_context), sizeof(void *)) + \ + TLS_DRIVER_STATE_SIZE) + enum { TLS_PENDING_CLOSED_RECORD }; @@ -195,9 +226,28 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); -void tls_icsk_clean_acked(struct sock *sk); +int tls_set_device_offload(struct sock *sk, struct tls_context *ctx); +int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +int tls_device_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); +void tls_device_sk_destruct(struct sock *sk); +void tls_device_init(void); +void tls_device_cleanup(void); +struct tls_record_info *tls_get_record(struct tls_offload_context *context, + u32 seq, u64 *p_record_sn); + +static inline bool tls_record_is_start_marker(struct tls_record_info *rec) +{ + return rec->len == 0; +} + +static inline u32 tls_record_start_seq(struct tls_record_info *rec) +{ + return rec->end_seq - rec->len; +} + +void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); int tls_push_sg(struct sock *sk, struct tls_context *ctx, struct scatterlist *sg, u16 first_offset, int flags); @@ -234,6 +284,13 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) return tls_ctx->pending_open_record_frags; } +static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) +{ + return sk_fullsock(sk) && + /* matches smp_store_release in tls_set_device_offload */ + smp_load_acquire(&sk->sk_destruct) == &tls_device_sk_destruct; +} + static inline void tls_err_abort(struct sock *sk, int err) { sk->sk_err = err; @@ -329,4 +386,12 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, void tls_register_device(struct tls_device *device); void tls_unregister_device(struct tls_device *device); +struct sk_buff *tls_validate_xmit_skb(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); + +int tls_sw_fallback_init(struct sock *sk, + struct tls_offload_context *offload_ctx, + struct tls_crypto_info *crypto_info); + #endif /* _TLS_OFFLOAD_H */ diff --git
a/net/tls/Kconfig b/net/tls/Kconfig index 89b8745a986f..73f05ece53d0 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -14,3 +14,13 @@ config TLS encryption handling of the TLS protocol to be done in-kernel. If unsure, say N. + +config TLS_DEVICE + bool "Transport Layer Security HW offload" + depends on TLS + select SOCK_VALIDATE_XMIT + default n + help + Enable kernel support for HW offload of the TLS protocol. + + If unsure, say N. diff --git a/net/tls/Makefile b/net/tls/Makefile index a930fd1c4f7b..4d6b728a67d0 100644 --- a/net/tls/Makefile +++ b/net/tls/Makefile @@ -5,3 +5,5 @@ obj-$(CONFIG_TLS) += tls.o tls-y := tls_main.o tls_sw.o + +tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c new file mode 100644 index 000000000000..ac45d6224e41 --- /dev/null +++ b/net/tls/tls_device.c @@ -0,0 +1,764 @@ +/* Copyright (c) 2018, Mellanox Technologies All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* device_offload_lock is used to synchronize tls_dev_add + * against NETDEV_DOWN notifications. 
+ */ +static DECLARE_RWSEM(device_offload_lock); + +static void tls_device_gc_task(struct work_struct *work); + +static DECLARE_WORK(tls_device_gc_work, tls_device_gc_task); +static LIST_HEAD(tls_device_gc_list); +static LIST_HEAD(tls_device_list); +static DEFINE_SPINLOCK(tls_device_lock); + +static void tls_device_free_ctx(struct tls_context *ctx) +{ + struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx); + + kfree(offload_ctx); + kfree(ctx); +} + +static void tls_device_gc_task(struct work_struct *work) +{ + struct tls_context *ctx, *tmp; + unsigned long flags; + LIST_HEAD(gc_list); + + spin_lock_irqsave(&tls_device_lock, flags); + list_splice_init(&tls_device_gc_list, &gc_list); + spin_unlock_irqrestore(&tls_device_lock, flags); + + list_for_each_entry_safe(ctx, tmp, &gc_list, list) { + struct net_device *netdev = ctx->netdev; + + if (netdev) { + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_TX); + dev_put(netdev); + } + + list_del(&ctx->list); + tls_device_free_ctx(ctx); + } +} + +static void tls_device_queue_ctx_destruction(struct tls_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&tls_device_lock, flags); + list_move_tail(&ctx->list, &tls_device_gc_list); + + /* schedule_work inside the spinlock + * to make sure tls_device_down waits for that work. + */ + schedule_work(&tls_device_gc_work); + + spin_unlock_irqrestore(&tls_device_lock, flags); +} + +/* We assume that the socket is already connected */ +static struct net_device *get_netdev_for_sock(struct sock *sk) +{ + struct dst_entry *dst = sk_dst_get(sk); + struct net_device *netdev = NULL; + + if (likely(dst)) { + netdev = dst->dev; + dev_hold(netdev); + } + + dst_release(dst); + + return netdev; +} + +static void destroy_record(struct tls_record_info *record) +{ + int nr_frags = record->num_frags; + skb_frag_t *frag; + + while (nr_frags-- > 0) { + frag = &record->frags[nr_frags]; + __skb_frag_unref(frag); + } + kfree(record); +} + +static void delete_all_records(struct tls_offload_context *offload_ctx) +{ + struct tls_record_info *info, *temp; + + list_for_each_entry_safe(info, temp, &offload_ctx->records_list, list) { + list_del(&info->list); + destroy_record(info); + } + + offload_ctx->retransmit_hint = NULL; +} + +static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_record_info *info, *temp; + struct tls_offload_context *ctx; + u64 deleted_records = 0; + unsigned long flags; + + if (!tls_ctx) + return; + + ctx = tls_offload_ctx(tls_ctx); + + spin_lock_irqsave(&ctx->lock, flags); + info = ctx->retransmit_hint; + if (info && !before(acked_seq, info->end_seq)) { + ctx->retransmit_hint = NULL; + list_del(&info->list); + destroy_record(info); + deleted_records++; + } + + list_for_each_entry_safe(info, temp, &ctx->records_list, list) { + if (before(acked_seq, info->end_seq)) + break; + list_del(&info->list); + + destroy_record(info); + deleted_records++; + } + + ctx->unacked_record_sn += deleted_records; + spin_unlock_irqrestore(&ctx->lock, flags); +} + +/* At this point, there should be no references on this + * socket and no in-flight SKBs associated with this + * socket, so it is safe to free all the resources. 
+ */ +void tls_device_sk_destruct(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + + if (ctx->open_record) + destroy_record(ctx->open_record); + + delete_all_records(ctx); + crypto_free_aead(ctx->aead_send); + ctx->sk_destruct(sk); + clean_acked_data_disable(inet_csk(sk)); + + if (refcount_dec_and_test(&tls_ctx->refcount)) + tls_device_queue_ctx_destruction(tls_ctx); +} +EXPORT_SYMBOL(tls_device_sk_destruct); + +static void tls_append_frag(struct tls_record_info *record, + struct page_frag *pfrag, + int size) +{ + skb_frag_t *frag; + + frag = &record->frags[record->num_frags - 1]; + if (frag->page.p == pfrag->page && + frag->page_offset + frag->size == pfrag->offset) { + frag->size += size; + } else { + ++frag; + frag->page.p = pfrag->page; + frag->page_offset = pfrag->offset; + frag->size = size; + ++record->num_frags; + get_page(pfrag->page); + } + + pfrag->offset += size; + record->len += size; +} + +static int tls_push_record(struct sock *sk, + struct tls_context *ctx, + struct tls_offload_context *offload_ctx, + struct tls_record_info *record, + struct page_frag *pfrag, + int flags, + unsigned char record_type) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct page_frag dummy_tag_frag; + skb_frag_t *frag; + int i; + + /* fill prepend */ + frag = &record->frags[0]; + tls_fill_prepend(ctx, + skb_frag_address(frag), + record->len - ctx->tx.prepend_size, + record_type); + + /* HW doesn't care about the data in the tag, because it fills it. */ + dummy_tag_frag.page = skb_frag_page(frag); + dummy_tag_frag.offset = 0; + + tls_append_frag(record, &dummy_tag_frag, ctx->tx.tag_size); + record->end_seq = tp->write_seq + record->len; + spin_lock_irq(&offload_ctx->lock); + list_add_tail(&record->list, &offload_ctx->records_list); + spin_unlock_irq(&offload_ctx->lock); + offload_ctx->open_record = NULL; + set_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags); + tls_advance_record_sn(sk, &ctx->tx); + + for (i = 0; i < record->num_frags; i++) { + frag = &record->frags[i]; + sg_unmark_end(&offload_ctx->sg_tx_data[i]); + sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag), + frag->size, frag->page_offset); + sk_mem_charge(sk, frag->size); + get_page(skb_frag_page(frag)); + } + sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]); + + /* all ready, send */ + return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); +} + +static int tls_create_new_record(struct tls_offload_context *offload_ctx, + struct page_frag *pfrag, + size_t prepend_size) +{ + struct tls_record_info *record; + skb_frag_t *frag; + + record = kmalloc(sizeof(*record), GFP_KERNEL); + if (!record) + return -ENOMEM; + + frag = &record->frags[0]; + __skb_frag_set_page(frag, pfrag->page); + frag->page_offset = pfrag->offset; + skb_frag_size_set(frag, prepend_size); + + get_page(pfrag->page); + pfrag->offset += prepend_size; + + record->num_frags = 1; + record->len = prepend_size; + offload_ctx->open_record = record; + return 0; +} + +static int tls_do_allocation(struct sock *sk, + struct tls_offload_context *offload_ctx, + struct page_frag *pfrag, + size_t prepend_size) +{ + int ret; + + if (!offload_ctx->open_record) { + if (unlikely(!skb_page_frag_refill(prepend_size, pfrag, + sk->sk_allocation))) { + sk->sk_prot->enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return -ENOMEM; + } + + ret = tls_create_new_record(offload_ctx, pfrag, prepend_size); + if (ret) + return ret; + + if (pfrag->size > pfrag->offset) + return 0; + } + + 
if (!sk_page_frag_refill(sk, pfrag)) + return -ENOMEM; + + return 0; +} + +static int tls_push_data(struct sock *sk, + struct iov_iter *msg_iter, + size_t size, int flags, + unsigned char record_type) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; + int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); + struct tls_record_info *record = ctx->open_record; + struct page_frag *pfrag; + size_t orig_size = size; + u32 max_open_record_len; + int copy, rc = 0; + bool done = false; + long timeo; + + if (flags & + ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST)) + return -ENOTSUPP; + + if (sk->sk_err) + return -sk->sk_err; + + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + rc = tls_complete_pending_work(sk, tls_ctx, flags, &timeo); + if (rc < 0) + return rc; + + pfrag = sk_page_frag(sk); + + /* TLS_HEADER_SIZE is not counted as part of the TLS record, and + * we need to leave room for an authentication tag. + */ + max_open_record_len = TLS_MAX_PAYLOAD_SIZE + + tls_ctx->tx.prepend_size; + do { + rc = tls_do_allocation(sk, ctx, pfrag, + tls_ctx->tx.prepend_size); + if (rc) { + rc = sk_stream_wait_memory(sk, &timeo); + if (!rc) + continue; + + record = ctx->open_record; + if (!record) + break; +handle_error: + if (record_type != TLS_RECORD_TYPE_DATA) { + /* avoid sending partial + * record with type != + * application_data + */ + size = orig_size; + destroy_record(record); + ctx->open_record = NULL; + } else if (record->len > tls_ctx->tx.prepend_size) { + goto last_record; + } + + break; + } + + record = ctx->open_record; + copy = min_t(size_t, size, (pfrag->size - pfrag->offset)); + copy = min_t(size_t, copy, (max_open_record_len - record->len)); + + if (copy_from_iter_nocache(page_address(pfrag->page) + + pfrag->offset, + copy, msg_iter) != copy) { + rc = -EFAULT; + goto handle_error; + } + tls_append_frag(record, pfrag, copy); + + size -= copy; + if (!size) { +last_record: + tls_push_record_flags = flags; + if (more) { + tls_ctx->pending_open_record_frags = + record->num_frags; + break; + } + + done = true; + } + + if (done || record->len >= max_open_record_len || + (record->num_frags >= MAX_SKB_FRAGS - 1)) { + rc = tls_push_record(sk, + tls_ctx, + ctx, + record, + pfrag, + tls_push_record_flags, + record_type); + if (rc < 0) + break; + } + } while (!done); + + if (orig_size - size > 0) + rc = orig_size - size; + + return rc; +} + +int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + unsigned char record_type = TLS_RECORD_TYPE_DATA; + int rc; + + lock_sock(sk); + + if (unlikely(msg->msg_controllen)) { + rc = tls_proccess_cmsg(sk, msg, &record_type); + if (rc) + goto out; + } + + rc = tls_push_data(sk, &msg->msg_iter, size, + msg->msg_flags, record_type); + +out: + release_sock(sk); + return rc; +} + +int tls_device_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct iov_iter msg_iter; + char *kaddr = kmap(page); + struct kvec iov; + int rc; + + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + + lock_sock(sk); + + if (flags & MSG_OOB) { + rc = -ENOTSUPP; + goto out; + } + + iov.iov_base = kaddr + offset; + iov.iov_len = size; + iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, &iov, 1, size); + rc = tls_push_data(sk, &msg_iter, size, + flags, TLS_RECORD_TYPE_DATA); + kunmap(page); + +out: + release_sock(sk); + return rc; +} + +struct tls_record_info *tls_get_record(struct 
tls_offload_context *context, + u32 seq, u64 *p_record_sn) +{ + u64 record_sn = context->hint_record_sn; + struct tls_record_info *info; + + info = context->retransmit_hint; + if (!info || + before(seq, info->end_seq - info->len)) { + /* if retransmit_hint is irrelevant, start + * from the beginning of the list + */ + info = list_first_entry(&context->records_list, + struct tls_record_info, list); + record_sn = context->unacked_record_sn; + } + + list_for_each_entry_from(info, &context->records_list, list) { + if (before(seq, info->end_seq)) { + if (!context->retransmit_hint || + after(info->end_seq, + context->retransmit_hint->end_seq)) { + context->hint_record_sn = record_sn; + context->retransmit_hint = info; + } + *p_record_sn = record_sn; + return info; + } + record_sn++; + } + + return NULL; +} +EXPORT_SYMBOL(tls_get_record); + +static int tls_device_push_pending_record(struct sock *sk, int flags) +{ + struct iov_iter msg_iter; + + iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); + return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); +} + +int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) +{ + u16 nonce_size, tag_size, iv_size, rec_seq_size; + struct tls_record_info *start_marker_record; + struct tls_offload_context *offload_ctx; + struct tls_crypto_info *crypto_info; + struct net_device *netdev; + char *iv, *rec_seq; + struct sk_buff *skb; + int rc = -EINVAL; + __be64 rcd_sn; + + if (!ctx) + goto out; + + if (ctx->priv_ctx_tx) { + rc = -EEXIST; + goto out; + } + + start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL); + if (!start_marker_record) { + rc = -ENOMEM; + goto out; + } + + offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL); + if (!offload_ctx) { + rc = -ENOMEM; + goto free_marker_record; + } + + crypto_info = &ctx->crypto_send; + switch (crypto_info->cipher_type) { + case TLS_CIPHER_AES_GCM_128: + nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; + tag_size = TLS_CIPHER_AES_GCM_128_TAG_SIZE; + iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; + iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv; + rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE; + rec_seq = + ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq; + break; + default: + rc = -EINVAL; + goto free_offload_ctx; + } + + ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size; + ctx->tx.tag_size = tag_size; + ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size; + ctx->tx.iv_size = iv_size; + ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + GFP_KERNEL); + if (!ctx->tx.iv) { + rc = -ENOMEM; + goto free_offload_ctx; + } + + memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); + + ctx->tx.rec_seq_size = rec_seq_size; + ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + if (!ctx->tx.rec_seq) { + rc = -ENOMEM; + goto free_iv; + } + memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size); + + rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info); + if (rc) + goto free_rec_seq; + + /* start at rec_seq - 1 to account for the start marker record */ + memcpy(&rcd_sn, ctx->tx.rec_seq, sizeof(rcd_sn)); + offload_ctx->unacked_record_sn = be64_to_cpu(rcd_sn) - 1; + + start_marker_record->end_seq = tcp_sk(sk)->write_seq; + start_marker_record->len = 0; + start_marker_record->num_frags = 0; + + INIT_LIST_HEAD(&offload_ctx->records_list); + list_add_tail(&start_marker_record->list, &offload_ctx->records_list); + spin_lock_init(&offload_ctx->lock); + + clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked);
ctx->push_pending_record = tls_device_push_pending_record; + offload_ctx->sk_destruct = sk->sk_destruct; + + /* TLS offload is greatly simplified if we don't send + * SKBs where only part of the payload needs to be encrypted. + * So mark the last skb in the write queue as end of record. + */ + skb = tcp_write_queue_tail(sk); + if (skb) + TCP_SKB_CB(skb)->eor = 1; + + refcount_set(&ctx->refcount, 1); + + /* We support starting offload on multiple sockets + * concurrently, so we only need a read lock here. + * This lock must precede get_netdev_for_sock to prevent races between + * NETDEV_DOWN and setsockopt. + */ + down_read(&device_offload_lock); + netdev = get_netdev_for_sock(sk); + if (!netdev) { + pr_err_ratelimited("%s: netdev not found\n", __func__); + rc = -EINVAL; + goto release_lock; + } + + if (!(netdev->features & NETIF_F_HW_TLS_TX)) { + rc = -ENOTSUPP; + goto release_netdev; + } + + /* Avoid offloading if the device is down + * We don't want to offload new flows after + * the NETDEV_DOWN event + */ + if (!(netdev->flags & IFF_UP)) { + rc = -EINVAL; + goto release_netdev; + } + + ctx->priv_ctx_tx = offload_ctx; + rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, + &ctx->crypto_send, + tcp_sk(sk)->write_seq); + if (rc) + goto release_netdev; + + ctx->netdev = netdev; + + spin_lock_irq(&tls_device_lock); + list_add_tail(&ctx->list, &tls_device_list); + spin_unlock_irq(&tls_device_lock); + + sk->sk_validate_xmit_skb = tls_validate_xmit_skb; + /* following this assignment tls_is_sk_tx_device_offloaded + * will return true and the context might be accessed + * by the netdev's xmit function. + */ + smp_store_release(&sk->sk_destruct, + &tls_device_sk_destruct); + up_read(&device_offload_lock); + goto out; + +release_netdev: + dev_put(netdev); +release_lock: + up_read(&device_offload_lock); + clean_acked_data_disable(inet_csk(sk)); + crypto_free_aead(offload_ctx->aead_send); +free_rec_seq: + kfree(ctx->tx.rec_seq); +free_iv: + kfree(ctx->tx.iv); +free_offload_ctx: + kfree(offload_ctx); + ctx->priv_ctx_tx = NULL; +free_marker_record: + kfree(start_marker_record); +out: + return rc; +} + +static int tls_device_down(struct net_device *netdev) +{ + struct tls_context *ctx, *tmp; + unsigned long flags; + LIST_HEAD(list); + + /* Request a write lock to block new offload attempts */ + down_write(&device_offload_lock); + + spin_lock_irqsave(&tls_device_lock, flags); + list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) { + if (ctx->netdev != netdev || + !refcount_inc_not_zero(&ctx->refcount)) + continue; + + list_move(&ctx->list, &list); + } + spin_unlock_irqrestore(&tls_device_lock, flags); + + list_for_each_entry_safe(ctx, tmp, &list, list) { + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_TX); + ctx->netdev = NULL; + dev_put(netdev); + list_del_init(&ctx->list); + + if (refcount_dec_and_test(&ctx->refcount)) + tls_device_free_ctx(ctx); + } + + up_write(&device_offload_lock); + + flush_work(&tls_device_gc_work); + + return NOTIFY_DONE; +} + +static int tls_dev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (!(dev->features & NETIF_F_HW_TLS_TX)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + case NETDEV_FEAT_CHANGE: + if (dev->tlsdev_ops && + dev->tlsdev_ops->tls_dev_add && + dev->tlsdev_ops->tls_dev_del) + return NOTIFY_DONE; + else + return NOTIFY_BAD; + case NETDEV_DOWN: + return tls_device_down(dev); + } + return NOTIFY_DONE; 
+} + +static struct notifier_block tls_dev_notifier = { + .notifier_call = tls_dev_event, +}; + +void __init tls_device_init(void) +{ + register_netdevice_notifier(&tls_dev_notifier); +} + +void __exit tls_device_cleanup(void) +{ + unregister_netdevice_notifier(&tls_dev_notifier); + flush_work(&tls_device_gc_work); +} diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c new file mode 100644 index 000000000000..748914abdb60 --- /dev/null +++ b/net/tls/tls_device_fallback.c @@ -0,0 +1,450 @@ +/* Copyright (c) 2018, Mellanox Technologies All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk) +{ + struct scatterlist *src = walk->sg; + int diff = walk->offset - src->offset; + + sg_set_page(sg, sg_page(src), + src->length - diff, walk->offset); + + scatterwalk_crypto_chain(sg, sg_next(src), 0, 2); +} + +static int tls_enc_record(struct aead_request *aead_req, + struct crypto_aead *aead, char *aad, + char *iv, __be64 rcd_sn, + struct scatter_walk *in, + struct scatter_walk *out, int *in_len) +{ + unsigned char buf[TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE]; + struct scatterlist sg_in[3]; + struct scatterlist sg_out[3]; + u16 len; + int rc; + + len = min_t(int, *in_len, ARRAY_SIZE(buf)); + + scatterwalk_copychunks(buf, in, len, 0); + scatterwalk_copychunks(buf, out, len, 1); + + *in_len -= len; + if (!*in_len) + return 0; + + scatterwalk_pagedone(in, 0, 1); + scatterwalk_pagedone(out, 1, 1); + + len = buf[4] | (buf[3] << 8); + len -= TLS_CIPHER_AES_GCM_128_IV_SIZE; + + tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE, + (char *)&rcd_sn, sizeof(rcd_sn), buf[0]); + + memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE, + TLS_CIPHER_AES_GCM_128_IV_SIZE); + + sg_init_table(sg_in, ARRAY_SIZE(sg_in)); + sg_init_table(sg_out, ARRAY_SIZE(sg_out)); + sg_set_buf(sg_in, aad, TLS_AAD_SPACE_SIZE); + sg_set_buf(sg_out, aad, TLS_AAD_SPACE_SIZE); + chain_to_walk(sg_in + 1, in); + chain_to_walk(sg_out + 1, out); + + *in_len -= len; + if (*in_len < 0) { + *in_len += TLS_CIPHER_AES_GCM_128_TAG_SIZE; + /* the input buffer doesn't contain the entire record. 
+ * trim len accordingly. The resulting authentication tag + * will contain garbage, but we don't care, so we won't + * include any of it in the output skb + * Note that we assume the output buffer length + * is larger than input buffer length + tag size + */ + if (*in_len < 0) + len += *in_len; + + *in_len = 0; + } + + if (*in_len) { + scatterwalk_copychunks(NULL, in, len, 2); + scatterwalk_pagedone(in, 0, 1); + scatterwalk_copychunks(NULL, out, len, 2); + scatterwalk_pagedone(out, 1, 1); + } + + len -= TLS_CIPHER_AES_GCM_128_TAG_SIZE; + aead_request_set_crypt(aead_req, sg_in, sg_out, len, iv); + + rc = crypto_aead_encrypt(aead_req); + + return rc; +} + +static void tls_init_aead_request(struct aead_request *aead_req, + struct crypto_aead *aead) +{ + aead_request_set_tfm(aead_req, aead); + aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); +} + +static struct aead_request *tls_alloc_aead_request(struct crypto_aead *aead, + gfp_t flags) +{ + unsigned int req_size = sizeof(struct aead_request) + + crypto_aead_reqsize(aead); + struct aead_request *aead_req; + + aead_req = kzalloc(req_size, flags); + if (aead_req) + tls_init_aead_request(aead_req, aead); + return aead_req; +} + +static int tls_enc_records(struct aead_request *aead_req, + struct crypto_aead *aead, struct scatterlist *sg_in, + struct scatterlist *sg_out, char *aad, char *iv, + u64 rcd_sn, int len) +{ + struct scatter_walk out, in; + int rc; + + scatterwalk_start(&in, sg_in); + scatterwalk_start(&out, sg_out); + + do { + rc = tls_enc_record(aead_req, aead, aad, iv, + cpu_to_be64(rcd_sn), &in, &out, &len); + rcd_sn++; + + } while (rc == 0 && len); + + scatterwalk_done(&in, 0, 0); + scatterwalk_done(&out, 1, 0); + + return rc; +} + +/* Can't use icsk->icsk_af_ops->send_check here because the ip addresses + * might have been changed by NAT. + */ +static void update_chksum(struct sk_buff *skb, int headln) +{ + struct tcphdr *th = tcp_hdr(skb); + int datalen = skb->len - headln; + const struct ipv6hdr *ipv6h; + const struct iphdr *iph; + + /* We only changed the payload so if we are using partial we don't + * need to update anything.
+ */ + if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) + return; + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + + if (skb->sk->sk_family == AF_INET6) { + ipv6h = ipv6_hdr(skb); + th->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, + datalen, IPPROTO_TCP, 0); + } else { + iph = ip_hdr(skb); + th->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, + IPPROTO_TCP, 0); + } +} + +static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) +{ + skb_copy_header(nskb, skb); + + skb_put(nskb, skb->len); + memcpy(nskb->data, skb->data, headln); + update_chksum(nskb, headln); + + nskb->destructor = skb->destructor; + nskb->sk = skb->sk; + skb->destructor = NULL; + skb->sk = NULL; + refcount_add(nskb->truesize - skb->truesize, + &nskb->sk->sk_wmem_alloc); +} + +/* This function may be called after the user socket is already + * closed so make sure we don't use anything freed during + * tls_sk_proto_close here + */ + +static int fill_sg_in(struct scatterlist *sg_in, + struct sk_buff *skb, + struct tls_offload_context *ctx, + u64 *rcd_sn, + s32 *sync_size, + int *resync_sgs) +{ + int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); + int payload_len = skb->len - tcp_payload_offset; + u32 tcp_seq = ntohl(tcp_hdr(skb)->seq); + struct tls_record_info *record; + unsigned long flags; + int remaining; + int i; + + spin_lock_irqsave(&ctx->lock, flags); + record = tls_get_record(ctx, tcp_seq, rcd_sn); + if (!record) { + spin_unlock_irqrestore(&ctx->lock, flags); + WARN(1, "Record not found for seq %u\n", tcp_seq); + return -EINVAL; + } + + *sync_size = tcp_seq - tls_record_start_seq(record); + if (*sync_size < 0) { + int is_start_marker = tls_record_is_start_marker(record); + + spin_unlock_irqrestore(&ctx->lock, flags); + /* This should only occur if the relevant record was + * already acked. In that case it should be ok + * to drop the packet and avoid retransmission. + * + * There is a corner case where the packet contains + * both an acked and a non-acked record. + * We currently don't handle that case and rely + * on TCP to retransmit a packet that doesn't contain + * already acked payload.
+ */ + if (!is_start_marker) + *sync_size = 0; + return -EINVAL; + } + + remaining = *sync_size; + for (i = 0; remaining > 0; i++) { + skb_frag_t *frag = &record->frags[i]; + + __skb_frag_ref(frag); + sg_set_page(sg_in + i, skb_frag_page(frag), + skb_frag_size(frag), frag->page_offset); + + remaining -= skb_frag_size(frag); + + if (remaining < 0) + sg_in[i].length += remaining; + } + *resync_sgs = i; + + spin_unlock_irqrestore(&ctx->lock, flags); + if (skb_to_sgvec(skb, &sg_in[i], tcp_payload_offset, payload_len) < 0) + return -EINVAL; + + return 0; +} + +static void fill_sg_out(struct scatterlist sg_out[3], void *buf, + struct tls_context *tls_ctx, + struct sk_buff *nskb, + int tcp_payload_offset, + int payload_len, + int sync_size, + void *dummy_buf) +{ + sg_set_buf(&sg_out[0], dummy_buf, sync_size); + sg_set_buf(&sg_out[1], nskb->data + tcp_payload_offset, payload_len); + /* Add room for authentication tag produced by crypto */ + dummy_buf += sync_size; + sg_set_buf(&sg_out[2], dummy_buf, TLS_CIPHER_AES_GCM_128_TAG_SIZE); +} + +static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, + struct scatterlist sg_out[3], + struct scatterlist *sg_in, + struct sk_buff *skb, + s32 sync_size, u64 rcd_sn) +{ + int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); + struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + int payload_len = skb->len - tcp_payload_offset; + void *buf, *iv, *aad, *dummy_buf; + struct aead_request *aead_req; + struct sk_buff *nskb = NULL; + int buf_len; + + aead_req = tls_alloc_aead_request(ctx->aead_send, GFP_ATOMIC); + if (!aead_req) + return NULL; + + buf_len = TLS_CIPHER_AES_GCM_128_SALT_SIZE + + TLS_CIPHER_AES_GCM_128_IV_SIZE + + TLS_AAD_SPACE_SIZE + + sync_size + + TLS_CIPHER_AES_GCM_128_TAG_SIZE; + buf = kmalloc(buf_len, GFP_ATOMIC); + if (!buf) + goto free_req; + + iv = buf; + memcpy(iv, tls_ctx->crypto_send_aes_gcm_128.salt, + TLS_CIPHER_AES_GCM_128_SALT_SIZE); + aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE + + TLS_CIPHER_AES_GCM_128_IV_SIZE; + dummy_buf = aad + TLS_AAD_SPACE_SIZE; + + nskb = alloc_skb(skb_headroom(skb) + skb->len, GFP_ATOMIC); + if (!nskb) + goto free_buf; + + skb_reserve(nskb, skb_headroom(skb)); + + fill_sg_out(sg_out, buf, tls_ctx, nskb, tcp_payload_offset, + payload_len, sync_size, dummy_buf); + + if (tls_enc_records(aead_req, ctx->aead_send, sg_in, sg_out, aad, iv, + rcd_sn, sync_size + payload_len) < 0) + goto free_nskb; + + complete_skb(nskb, skb, tcp_payload_offset); + + /* validate_xmit_skb_list assumes that if the skb wasn't segmented + * nskb->prev will point to the skb itself + */ + nskb->prev = nskb; + +free_buf: + kfree(buf); +free_req: + kfree(aead_req); + return nskb; +free_nskb: + kfree_skb(nskb); + nskb = NULL; + goto free_buf; +} + +static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb) +{ + int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + int payload_len = skb->len - tcp_payload_offset; + struct scatterlist *sg_in, sg_out[3]; + struct sk_buff *nskb = NULL; + int sg_in_max_elements; + int resync_sgs = 0; + s32 sync_size = 0; + u64 rcd_sn; + + /* worst case is: + * MAX_SKB_FRAGS in tls_record_info + * MAX_SKB_FRAGS + 1 in SKB head and frags. 
+ */ + sg_in_max_elements = 2 * MAX_SKB_FRAGS + 1; + + if (!payload_len) + return skb; + + sg_in = kmalloc_array(sg_in_max_elements, sizeof(*sg_in), GFP_ATOMIC); + if (!sg_in) + goto free_orig; + + sg_init_table(sg_in, sg_in_max_elements); + sg_init_table(sg_out, ARRAY_SIZE(sg_out)); + + if (fill_sg_in(sg_in, skb, ctx, &rcd_sn, &sync_size, &resync_sgs)) { + /* bypass packets before kernel TLS socket option was set */ + if (sync_size < 0 && payload_len <= -sync_size) + nskb = skb_get(skb); + goto put_sg; + } + + nskb = tls_enc_skb(tls_ctx, sg_out, sg_in, skb, sync_size, rcd_sn); + +put_sg: + while (resync_sgs) + put_page(sg_page(&sg_in[--resync_sgs])); + kfree(sg_in); +free_orig: + kfree_skb(skb); + return nskb; +} + +struct sk_buff *tls_validate_xmit_skb(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb) +{ + if (dev == tls_get_ctx(sk)->netdev) + return skb; + + return tls_sw_fallback(sk, skb); +} + +int tls_sw_fallback_init(struct sock *sk, + struct tls_offload_context *offload_ctx, + struct tls_crypto_info *crypto_info) +{ + const u8 *key; + int rc; + + offload_ctx->aead_send = + crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(offload_ctx->aead_send)) { + rc = PTR_ERR(offload_ctx->aead_send); + pr_err_ratelimited("crypto_alloc_aead failed rc=%d\n", rc); + offload_ctx->aead_send = NULL; + goto err_out; + } + + key = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->key; + + rc = crypto_aead_setkey(offload_ctx->aead_send, key, + TLS_CIPHER_AES_GCM_128_KEY_SIZE); + if (rc) + goto free_aead; + + rc = crypto_aead_setauthsize(offload_ctx->aead_send, + TLS_CIPHER_AES_GCM_128_TAG_SIZE); + if (rc) + goto free_aead; + + return 0; +free_aead: + crypto_free_aead(offload_ctx->aead_send); +err_out: + return rc; +} diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 94b10312ff26..a3bff3388cc7 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -54,6 +54,9 @@ enum { enum { TLS_BASE, TLS_SW, +#ifdef CONFIG_TLS_DEVICE + TLS_HW, +#endif TLS_HW_RECORD, TLS_NUM_CONFIG, }; @@ -280,6 +283,15 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) tls_sw_free_resources_rx(sk); } +#ifdef CONFIG_TLS_DEVICE + if (ctx->tx_conf != TLS_HW) { +#else + { +#endif + kfree(ctx); + ctx = NULL; + } + skip_tx_cleanup: release_sock(sk); sk_proto_close(sk, timeout); @@ -442,8 +454,16 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, } if (tx) { - rc = tls_set_sw_offload(sk, ctx, 1); - conf = TLS_SW; +#ifdef CONFIG_TLS_DEVICE + rc = tls_set_device_offload(sk, ctx); + conf = TLS_HW; + if (rc) { +#else + { +#endif + rc = tls_set_sw_offload(sk, ctx, 1); + conf = TLS_SW; + } } else { rc = tls_set_sw_offload(sk, ctx, 0); conf = TLS_SW; @@ -596,6 +616,16 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; +#ifdef CONFIG_TLS_DEVICE + prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; + prot[TLS_HW][TLS_BASE].sendmsg = tls_device_sendmsg; + prot[TLS_HW][TLS_BASE].sendpage = tls_device_sendpage; + + prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; + prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; + prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage; +#endif + prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_hw_hash; prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_hw_unhash; @@ -630,7 +660,7 @@ static int tls_init(struct sock *sk) ctx->getsockopt = sk->sk_prot->getsockopt; 
ctx->sk_proto_close = sk->sk_prot->close; - /* Build IPv6 TLS whenever the address of tcpv6_prot changes */ + /* Build IPv6 TLS whenever the address of tcpv6 _prot changes */ if (ip_ver == TLSV6 && unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { mutex_lock(&tcpv6_prot_mutex); @@ -680,6 +710,9 @@ static int __init tls_register(void) tls_sw_proto_ops.poll = tls_sw_poll; tls_sw_proto_ops.splice_read = tls_sw_splice_read; +#ifdef CONFIG_TLS_DEVICE + tls_device_init(); +#endif tcp_register_ulp(&tcp_tls_ulp_ops); return 0; @@ -688,6 +721,9 @@ static int __init tls_register(void) static void __exit tls_unregister(void) { tcp_unregister_ulp(&tcp_tls_ulp_ops); +#ifdef CONFIG_TLS_DEVICE + tls_device_cleanup(); +#endif } module_init(tls_register); From b23e6ddce55a698e05b9e1fe43dd5282015deb21 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Tue, 1 May 2018 13:05:39 -0700 Subject: [PATCH 1443/1640] UPSTREAM: net/tls: Don't recursively call push_record during tls_write_space callbacks It is reported that in some cases, write_space may be called in do_tcp_sendpages, such that we recursively invoke do_tcp_sendpages again: [ 660.468802] ? do_tcp_sendpages+0x8d/0x580 [ 660.468826] ? tls_push_sg+0x74/0x130 [tls] [ 660.468852] ? tls_push_record+0x24a/0x390 [tls] [ 660.468880] ? tls_write_space+0x6a/0x80 [tls] ... tls_push_sg already does a loop over all sending sg's, so ignore any tls_write_space notifications until we are done sending. We then have to call the previous write_space to wake up poll() waiters after we are done with the send loop. Reported-by: Andre Tomt Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 1 + net/tls/tls_main.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/include/net/tls.h b/include/net/tls.h index 8c56809eb384..ee78f339b4b3 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -187,6 +187,7 @@ struct tls_context { struct scatterlist *partially_sent_record; u16 partially_sent_offset; unsigned long flags; + bool in_tcp_sendpages; u16 pending_open_record_frags; int (*push_pending_record)(struct sock *sk, int flags); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a3bff3388cc7..e9a47ecc9dec 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -114,6 +114,7 @@ int tls_push_sg(struct sock *sk, size = sg->length - offset; offset += sg->offset; + ctx->in_tcp_sendpages = true; while (1) { if (sg_is_last(sg)) sendpage_flags = flags; @@ -148,6 +149,8 @@ retry: } clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags); + ctx->in_tcp_sendpages = false; + ctx->sk_write_space(sk); return 0; } @@ -217,6 +220,10 @@ static void tls_write_space(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); + /* We are already sending pages, ignore notification */ + if (ctx->in_tcp_sendpages) + return; + if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) { gfp_t sk_allocation = sk->sk_allocation; int rc; From 23d0107403aafa196a3cc180ed5f01d25db2ef02 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 5 May 2018 08:35:04 -0700 Subject: [PATCH 1444/1640] BACKPORT: tls: fix use after free in tls_sk_proto_close syzbot reported a use-after-free in tls_sk_proto_close. Add a boolean value to clean this function up a bit.
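In outline, the repair is the classic flag-and-single-free pattern: the early path no longer frees the context itself, it only records that freeing is safe, and the kfree() happens exactly once past the cleanup label, so no later code can dereference a dangling ctx. A minimal sketch of the pattern, keeping only the names that appear in the diff below (the function name and the elided cleanup are simplifications, not the exact hunk):

static void tls_sk_proto_close_sketch(struct sock *sk)
{
	struct tls_context *ctx = tls_get_ctx(sk);
	bool free_ctx = false;

	if ((ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD) ||
	    (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE)) {
		free_ctx = true;	/* decide now, free once at the end */
		goto skip_tx_cleanup;
	}
	/* ... tx/rx cleanup that may still dereference ctx ... */
skip_tx_cleanup:
	if (free_ctx)
		kfree(ctx);
}

The syzbot report that motivated the change: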
BUG: KASAN: use-after-free in tls_sk_proto_close+0x8ab/0x9c0 net/tls/tls_main.c:297 Read of size 1 at addr ffff8801ae40a858 by task syz-executor363/4503 CPU: 0 PID: 4503 Comm: syz-executor363 Not tainted 4.17.0-rc3+ #34 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430 tls_sk_proto_close+0x8ab/0x9c0 net/tls/tls_main.c:297 inet_release+0x104/0x1f0 net/ipv4/af_inet.c:427 inet6_release+0x50/0x70 net/ipv6/af_inet6.c:460 sock_release+0x96/0x1b0 net/socket.c:594 sock_close+0x16/0x20 net/socket.c:1149 __fput+0x34d/0x890 fs/file_table.c:209 ____fput+0x15/0x20 fs/file_table.c:243 task_work_run+0x1e4/0x290 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x1aee/0x2730 kernel/exit.c:865 do_group_exit+0x16f/0x430 kernel/exit.c:968 get_signal+0x886/0x1960 kernel/signal.c:2469 do_signal+0x98/0x2040 arch/x86/kernel/signal.c:810 exit_to_usermode_loop+0x28a/0x310 arch/x86/entry/common.c:162 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:265 [inline] do_syscall_64+0x6ac/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4457b9 RSP: 002b:00007fdf4d766da8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: fffffffffffffe00 RBX: 00000000006dac3c RCX: 00000000004457b9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000006dac3c RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006dac38 R13: 3692738801137283 R14: 6bf92c39443c4c1d R15: 0000000000000006 Allocated by task 4498: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553 kmem_cache_alloc_trace+0x152/0x780 mm/slab.c:3620 kmalloc include/linux/slab.h:512 [inline] kzalloc include/linux/slab.h:701 [inline] create_ctx net/tls/tls_main.c:521 [inline] tls_init+0x1f9/0xb00 net/tls/tls_main.c:633 tcp_set_ulp+0x1bc/0x520 net/ipv4/tcp_ulp.c:153 do_tcp_setsockopt.isra.39+0x44a/0x2600 net/ipv4/tcp.c:2588 tcp_setsockopt+0xc1/0xe0 net/ipv4/tcp.c:2893 sock_common_setsockopt+0x9a/0xe0 net/core/sock.c:3039 __sys_setsockopt+0x1bd/0x390 net/socket.c:1903 __do_sys_setsockopt net/socket.c:1914 [inline] __se_sys_setsockopt net/socket.c:1911 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 4503: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kfree+0xd9/0x260 mm/slab.c:3813 tls_sw_free_resources+0x2a3/0x360 net/tls/tls_sw.c:1037 tls_sk_proto_close+0x67c/0x9c0 net/tls/tls_main.c:288 inet_release+0x104/0x1f0 net/ipv4/af_inet.c:427 inet6_release+0x50/0x70 net/ipv6/af_inet6.c:460 sock_release+0x96/0x1b0 net/socket.c:594 sock_close+0x16/0x20 net/socket.c:1149 __fput+0x34d/0x890 fs/file_table.c:209 ____fput+0x15/0x20 fs/file_table.c:243 task_work_run+0x1e4/0x290 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x1aee/0x2730 
kernel/exit.c:865 do_group_exit+0x16f/0x430 kernel/exit.c:968 get_signal+0x886/0x1960 kernel/signal.c:2469 do_signal+0x98/0x2040 arch/x86/kernel/signal.c:810 exit_to_usermode_loop+0x28a/0x310 arch/x86/entry/common.c:162 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:265 [inline] do_syscall_64+0x6ac/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff8801ae40a800 which belongs to the cache kmalloc-256 of size 256 The buggy address is located 88 bytes inside of 256-byte region [ffff8801ae40a800, ffff8801ae40a900) The buggy address belongs to the page: page:ffffea0006b90280 count:1 mapcount:0 mapping:ffff8801ae40a080 index:0x0 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffff8801ae40a080 0000000000000000 000000010000000c raw: ffffea0006bea9e0 ffffea0006bc94a0 ffff8801da8007c0 0000000000000000 page dumped because: kasan: bad access detected Fixes: dd0bed1665d6 ("tls: support for Inline tls record") Signed-off-by: Eric Dumazet Cc: Atul Gupta Cc: Steve Wise Cc: Ilya Lesokhin Cc: Aviad Yehezkel Cc: Dave Watson Reported-by: syzbot Signed-off-by: David S. Miller --- net/tls/tls_main.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index e9a47ecc9dec..f3b5c67bb5cf 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -248,16 +248,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) struct tls_context *ctx = tls_get_ctx(sk); long timeo = sock_sndtimeo(sk, 0); void (*sk_proto_close)(struct sock *sk, long timeout); + bool free_ctx = false; lock_sock(sk); sk_proto_close = ctx->sk_proto_close; - if (ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD) - goto skip_tx_cleanup; - - if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE) { - kfree(ctx); - ctx = NULL; + if ((ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD) || + (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE)) { + free_ctx = true; goto skip_tx_cleanup; } @@ -305,8 +303,7 @@ skip_tx_cleanup: /* free ctx for TLS_HW_RECORD, used by tcp_set_state * for sk->sk_prot->unhash [tls_hw_unhash] */ - if (ctx && ctx->tx_conf == TLS_HW_RECORD && - ctx->rx_conf == TLS_HW_RECORD) + if (free_ctx) kfree(ctx); } From 9ddce100ae34e024f1af7f7037f505835ccea8e8 Mon Sep 17 00:00:00 2001 From: Andre Tomt Date: Mon, 7 May 2018 04:24:39 +0200 Subject: [PATCH 1445/1640] UPSTREAM: net/tls: Fix connection stall on partial tls record In the case of writing a partial tls record we forgot to clear the ctx->in_tcp_sendpages flag, causing some connections to stall. Fixes: c212d2c7fc47 ("net/tls: Don't recursively call push_record during tls_write_space callbacks") Signed-off-by: Andre Tomt Signed-off-by: David S. Miller --- net/tls/tls_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index f3b5c67bb5cf..a279eb99c6d6 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -135,6 +135,7 @@ retry: offset -= sg->offset; ctx->partially_sent_offset = offset; ctx->partially_sent_record = (void *)sg; + ctx->in_tcp_sendpages = false; return ret; } From 75c743515438d3c76a504491d4d99b2bc5d6097a Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Thu, 10 May 2018 16:27:25 +0300 Subject: [PATCH 1446/1640] UPSTREAM: tls: Fix tls_device initialization Add sg table initialization to fix a BUG_ON encountered when enabling CONFIG_DEBUG_SG. 
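What CONFIG_DEBUG_SG checks is that every scatterlist entry carries SG_MAGIC before it is used; sg_init_table() establishes that, and also zeroes the entries and sets the end marker on the last one. A hedged sketch of the idiom on a fixed-size array (the function name, page, len and offset are placeholders, not code from this patch):

static void sg_debug_sketch(struct page *page, unsigned int len,
			    unsigned int offset)
{
	struct scatterlist sg[MAX_SKB_FRAGS];

	sg_init_table(sg, ARRAY_SIZE(sg));	/* zero entries, set SG_MAGIC, mark sg[last] as end */
	sg_set_page(&sg[0], page, len, offset);	/* without the init, BUG_ON under CONFIG_DEBUG_SG */
}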
Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- net/tls/tls_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index ac45d6224e41..a7a8f8e20ff3 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -604,6 +604,8 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) INIT_LIST_HEAD(&offload_ctx->records_list); list_add_tail(&start_marker_record->list, &offload_ctx->records_list); spin_lock_init(&offload_ctx->lock); + sg_init_table(offload_ctx->sg_tx_data, + ARRAY_SIZE(offload_ctx->sg_tx_data)); clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked); ctx->push_pending_record = tls_device_push_pending_record; From 60ad1e76edbda3852a45f265e0211f976fd70f58 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Wed, 16 May 2018 10:48:40 -0700 Subject: [PATCH 1447/1640] BACKPORT: tls: don't use stack memory in a scatterlist scatterlist code expects virt_to_page() to work, which fails with CONFIG_VMAP_STACK=y. Fixes: c46234ebb4d1e ("tls: RX path for ktls") Signed-off-by: Matt Mullins Signed-off-by: David S. Miller --- include/net/tls.h | 4 ++++ net/tls/tls_sw.c | 9 ++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index ee78f339b4b3..faccdf208fd1 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -101,6 +101,10 @@ struct tls_sw_context_tx { struct scatterlist sg_aead_in[2]; /* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */ struct scatterlist sg_aead_out[2]; + + char rx_aad_ciphertext[TLS_AAD_SPACE_SIZE]; + char rx_aad_plaintext[TLS_AAD_SPACE_SIZE]; + }; struct tls_sw_context_rx { diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 5c3909c311f1..839e1e165a0c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -680,7 +680,6 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgin = &sgin_arr[0]; struct strp_msg *rxm = strp_msg(skb); int ret, nsg = ARRAY_SIZE(sgin_arr); - char aad_recv[TLS_AAD_SPACE_SIZE]; struct sk_buff *unused; ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, @@ -697,13 +696,13 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, } sg_init_table(sgin, nsg); - sg_set_buf(&sgin[0], aad_recv, sizeof(aad_recv)); + sg_set_buf(&sgin[0], ctx->rx_aad_ciphertext, TLS_AAD_SPACE_SIZE); nsg = skb_to_sgvec(skb, &sgin[1], rxm->offset + tls_ctx->rx.prepend_size, rxm->full_len - tls_ctx->rx.prepend_size); - tls_make_aad(aad_recv, + tls_make_aad(ctx->rx_aad_ciphertext, rxm->full_len - tls_ctx->rx.overhead_size, tls_ctx->rx.rec_seq, tls_ctx->rx.rec_seq_size, @@ -802,12 +801,12 @@ int tls_sw_recvmsg(struct sock *sk, if (to_copy <= len && page_count < MAX_SKB_FRAGS && likely(!(flags & MSG_PEEK))) { struct scatterlist sgin[MAX_SKB_FRAGS + 1]; - char unused[21]; int pages = 0; zc = true; sg_init_table(sgin, MAX_SKB_FRAGS + 1); - sg_set_buf(&sgin[0], unused, 13); + sg_set_buf(&sgin[0], ctx->rx_aad_plaintext, + TLS_AAD_SPACE_SIZE); err = zerocopy_from_iter(sk, &msg->msg_iter, to_copy, &pages, From 791c1cd43c73439d29a6aa7b29e67bc863c206ae Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Wed, 6 Jun 2018 09:33:28 -0700 Subject: [PATCH 1448/1640] UPSTREAM: strparser: Add __strp_unpause and use it in ktls. strp_unpause queues strp_work in order to parse any messages that arrived while the strparser was paused. However, the process invoking strp_unpause could eagerly parse a buffered message itself if it held the sock lock. 
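In caller terms the change is small; a hedged sketch of the receive-path usage adopted by the tls hunk below, assuming the socket lock is already held, as it is on the tls_sw_recvmsg path that reaches tls_sw_advance_skb (the function name is illustrative):

static bool advance_skb_sketch(struct sock *sk, struct tls_sw_context_rx *ctx,
			       struct sk_buff *skb)
{
	/* caller holds lock_sock(sk) */
	ctx->recv_pkt = NULL;		/* finished with the current record */
	kfree_skb(skb);
	__strp_unpause(&ctx->strp);	/* parse the next buffered message inline
					 * rather than queueing strp_work */
	return true;
}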
__strp_unpause is an alternative to strp_unpause that avoids the scheduling overhead that results when a receiving thread unpauses the strparser and waits for the next message to be delivered by the workqueue thread. This patch more than doubled the IOPS achieved in a benchmark of NBD traffic encrypted using ktls. Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. Miller --- include/net/strparser.h | 2 ++ net/strparser/strparser.c | 13 +++++++++++++ net/tls/tls_sw.c | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/net/strparser.h b/include/net/strparser.h index d96b59f45eba..f177c87ce38b 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -90,6 +90,8 @@ static inline void strp_pause(struct strparser *strp) /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); +/* Must be called with process lock held (lock_sock) */ +void __strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index a68c754e84ea..332eb0df153f 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -497,6 +497,19 @@ int strp_init(struct strparser *strp, struct sock *sk, } EXPORT_SYMBOL_GPL(strp_init); +/* Sock process lock held (lock_sock) */ +void __strp_unpause(struct strparser *strp) +{ + strp->paused = 0; + + if (strp->need_bytes) { + if (strp_peek_len(strp) < strp->need_bytes) + return; + } + strp_read_sock(strp); +} +EXPORT_SYMBOL_GPL(__strp_unpause); + void strp_unpause(struct strparser *strp) { strp->paused = 0; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 839e1e165a0c..8ca57d01b18f 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -735,7 +735,7 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, /* Finished with message */ ctx->recv_pkt = NULL; kfree_skb(skb); - strp_unpause(&ctx->strp); + __strp_unpause(&ctx->strp); return true; } From 2a9287418218450449b509658c6cde9b73e4310c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Jun 2018 23:22:04 +0200 Subject: [PATCH 1449/1640] UPSTREAM: tls: fix NULL pointer dereference on poll While hacking on kTLS, I ran into the following panic from an unprivileged netserver / netperf TCP session: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 800000037f378067 P4D 800000037f378067 PUD 3c0e61067 PMD 0 Oops: 0010 [#1] SMP KASAN PTI CPU: 1 PID: 2289 Comm: netserver Not tainted 4.17.0+ #139 Hardware name: LENOVO 20FBCTO1WW/20FBCTO1WW, BIOS N1FET47W (1.21 ) 11/28/2016 RIP: 0010: (null) Code: Bad RIP value. RSP: 0018:ffff88036abcf740 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff88036f5f6800 RCX: 1ffff1006debed26 RDX: ffff88036abcf920 RSI: ffff8803cb1a4f00 RDI: ffff8803c258c280 RBP: ffff8803c258c280 R08: ffff8803c258c280 R09: ffffed006f559d48 R10: ffff88037aacea43 R11: ffffed006f559d49 R12: ffff8803c258c280 R13: ffff8803cb1a4f20 R14: 00000000000000db R15: ffffffffc168a350 FS: 00007f7e631f4700(0000) GS:ffff8803d1c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 00000003ccf64005 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? tls_sw_poll+0xa4/0x160 [tls] ? sock_poll+0x20a/0x680 ? do_select+0x77b/0x11a0 ? poll_schedule_timeout.constprop.12+0x130/0x130 ?
pick_link+0xb00/0xb00 ? read_word_at_a_time+0x13/0x20 ? vfs_poll+0x270/0x270 ? deref_stack_reg+0xad/0xe0 ? __read_once_size_nocheck.constprop.6+0x10/0x10 [...] Debugging further, it turns out that calling into ctx->sk_poll() is invalid since sk_poll itself is NULL which was saved from the original TCP socket in order for tls_sw_poll() to invoke it. Looks like the recent conversion from poll to poll_mask callback started in 152524231023 ("net: add support for ->poll_mask in proto_ops") missed to eventually convert kTLS, too: TCP's ->poll was converted over to the ->poll_mask in commit 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask") and therefore kTLS wrongly saved the ->poll old one which is now NULL. Convert kTLS over to use ->poll_mask instead. Also instead of POLLIN | POLLRDNORM use the proper EPOLLIN | EPOLLRDNORM bits as the case in tcp_poll_mask() as well that is mangled here. Fixes: 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask") Signed-off-by: Daniel Borkmann Cc: Christoph Hellwig Cc: Dave Watson Tested-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 6 ++---- net/tls/tls_main.c | 2 +- net/tls/tls_sw.c | 19 +++++++++---------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index faccdf208fd1..985b49a4c6ba 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -113,8 +113,7 @@ struct tls_sw_context_rx { struct strparser strp; void (*saved_data_ready)(struct sock *sk); - unsigned int (*sk_poll)(struct file *file, struct socket *sock, - struct poll_table_struct *wait); + __poll_t (*sk_poll_mask)(struct socket *sock, __poll_t events); struct sk_buff *recv_pkt; u8 control; bool decrypted; @@ -225,8 +224,7 @@ void tls_sw_free_resources_tx(struct sock *sk); void tls_sw_free_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -unsigned int tls_sw_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait); +__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a279eb99c6d6..3b2f9978b73a 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -712,7 +712,7 @@ static int __init tls_register(void) build_protos(tls_prots[TLSV4], &tcp_prot); tls_sw_proto_ops = inet_stream_ops; - tls_sw_proto_ops.poll = tls_sw_poll; + tls_sw_proto_ops.poll_mask = tls_sw_poll_mask; tls_sw_proto_ops.splice_read = tls_sw_splice_read; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 8ca57d01b18f..34895b7c132d 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -915,23 +915,22 @@ splice_read_end: return copied ? 
: err; } -unsigned int tls_sw_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait) +__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events) { - unsigned int ret; struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + __poll_t mask; - /* Grab POLLOUT and POLLHUP from the underlying socket */ - ret = ctx->sk_poll(file, sock, wait); + /* Grab EPOLLOUT and EPOLLHUP from the underlying socket */ + mask = ctx->sk_poll_mask(sock, events); - /* Clear POLLIN bits, and set based on recv_pkt */ - ret &= ~(POLLIN | POLLRDNORM); + /* Clear EPOLLIN bits, and set based on recv_pkt */ + mask &= ~(EPOLLIN | EPOLLRDNORM); if (ctx->recv_pkt) - ret |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; - return ret; + return mask; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -1188,7 +1187,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; + sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask; strp_check_rcv(&sw_ctx_rx->strp); } From 715c72f33a4a12b88044419649c2e1f82d79bb38 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 Jun 2018 03:07:45 +0200 Subject: [PATCH 1450/1640] UPSTREAM: tls: fix use-after-free in tls_push_record syzkaller managed to trigger a use-after-free in tls like the following: BUG: KASAN: use-after-free in tls_push_record.constprop.15+0x6a2/0x810 [tls] Write of size 1 at addr ffff88037aa08000 by task a.out/2317 CPU: 3 PID: 2317 Comm: a.out Not tainted 4.17.0+ #144 Hardware name: LENOVO 20FBCTO1WW/20FBCTO1WW, BIOS N1FET47W (1.21 ) 11/28/2016 Call Trace: dump_stack+0x71/0xab print_address_description+0x6a/0x280 kasan_report+0x258/0x380 ? tls_push_record.constprop.15+0x6a2/0x810 [tls] tls_push_record.constprop.15+0x6a2/0x810 [tls] tls_sw_push_pending_record+0x2e/0x40 [tls] tls_sk_proto_close+0x3fe/0x710 [tls] ? tcp_check_oom+0x4c0/0x4c0 ? tls_write_space+0x260/0x260 [tls] ? kmem_cache_free+0x88/0x1f0 inet_release+0xd6/0x1b0 __sock_release+0xc0/0x240 sock_close+0x11/0x20 __fput+0x22d/0x660 task_work_run+0x114/0x1a0 do_exit+0x71a/0x2780 ? mm_update_next_owner+0x650/0x650 ? handle_mm_fault+0x2f5/0x5f0 ? __do_page_fault+0x44f/0xa50 ? mm_fault_error+0x2d0/0x2d0 do_group_exit+0xde/0x300 __x64_sys_exit_group+0x3a/0x50 do_syscall_64+0x9a/0x300 ? page_fault+0x8/0x30 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This happened through fault injection where aead_req allocation in tls_do_encryption() eventually failed and we returned -ENOMEM from the function. Turns out that the use-after-free is triggered from tls_sw_sendmsg() in the second tls_push_record(). The error then triggers a jump to waiting for memory in sk_stream_wait_memory() resp. returning immediately in case of MSG_DONTWAIT. What follows is the trim_both_sgl(sk, orig_size), which drops elements from the sg list added via tls_sw_sendmsg(). Now the use-after-free gets triggered when the socket is being closed, where tls_sk_proto_close() callback is invoked. The tls_complete_pending_work() will figure that there's a pending closed tls record to be flushed and thus calls into the tls_push_pending_closed_record() from there. ctx->push_pending_record() is called from the latter, which is the tls_sw_push_pending_record() from sw path. This again calls into tls_push_record(). 
And here the tls_fill_prepend() will panic since the buffer address has been freed earlier via trim_both_sgl(). One way to fix it is to move the aead request allocation out of tls_do_encryption() early into tls_push_record(). This means we don't prep the tls header and advance the state to TLS_PENDING_CLOSED_RECORD before the allocation, which could potentially fail, has happened. That fixes the issue on my side. Fixes: 3c4d7559159b ("tls: kernel TLS support") Reported-by: syzbot+5c74af81c547738e1684@syzkaller.appspotmail.com Reported-by: syzbot+709f2810a6a05f11d4d3@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Acked-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 34895b7c132d..2945a3bd538c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -191,18 +191,12 @@ static void tls_free_both_sg(struct sock *sk) } static int tls_do_encryption(struct tls_context *tls_ctx, - struct tls_sw_context_tx *ctx, size_t data_len, - gfp_t flags) + struct tls_sw_context_tx *ctx, + struct aead_request *aead_req, + size_t data_len) { - unsigned int req_size = sizeof(struct aead_request) + - crypto_aead_reqsize(ctx->aead_send); - struct aead_request *aead_req; int rc; - aead_req = kzalloc(req_size, flags); - if (!aead_req) - return -ENOMEM; - ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; @@ -219,7 +213,6 @@ static int tls_do_encryption(struct tls_context *tls_ctx, ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; - kfree(aead_req); return rc; } @@ -228,8 +221,14 @@ static int tls_push_record(struct sock *sk, int flags, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct aead_request *req; int rc; + req = kzalloc(sizeof(struct aead_request) + + crypto_aead_reqsize(ctx->aead_send), sk->sk_allocation); + if (!req) + return -ENOMEM; + sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); @@ -245,15 +244,14 @@ static int tls_push_record(struct sock *sk, int flags, tls_ctx->pending_open_record_frags = 0; set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags); - rc = tls_do_encryption(tls_ctx, ctx, ctx->sg_plaintext_size, - sk->sk_allocation); + rc = tls_do_encryption(tls_ctx, ctx, req, ctx->sg_plaintext_size); if (rc < 0) { /* If we are called from write_space and * we fail, we need to set this SOCK_NOSPACE * to trigger another write_space in the future. */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - return rc; + goto out_req; } free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, @@ -268,6 +266,8 @@ static int tls_push_record(struct sock *sk, int flags, tls_err_abort(sk, EBADMSG); tls_advance_record_sn(sk, &tls_ctx->tx); +out_req: + kfree(req); return rc; } From a673e23a47a5293d5a350a3bcb6a362813981ece Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 Jun 2018 03:07:46 +0200 Subject: [PATCH 1451/1640] UPSTREAM: tls: fix waitall behavior in tls_sw_recvmsg Current behavior in tls_sw_recvmsg() is to wait for incoming tls messages and copy up to exactly len bytes of data that the user provided.
This is problematic in the sense that i) if no packet is currently queued in strparser we keep waiting until one has been processed and pushed into tls receive layer for tls_wait_data() to wake up and push the decrypted bits to user space. Given after tls decryption, we're back at streaming data, use sock_rcvlowat() hint from tcp socket instead. Retain current behavior with MSG_WAITALL flag and otherwise use the hint target for breaking the loop and returning to application. This is done if currently no ctx->recv_pkt is ready, otherwise continue to process it from our strparser backlog. Fixes: c46234ebb4d1 ("tls: RX path for ktls") Signed-off-by: Daniel Borkmann Acked-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2945a3bd538c..f127fac88acf 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -754,7 +754,7 @@ int tls_sw_recvmsg(struct sock *sk, struct sk_buff *skb; ssize_t copied = 0; bool cmsg = false; - int err = 0; + int target, err = 0; long timeo; flags |= nonblock; @@ -764,6 +764,7 @@ int tls_sw_recvmsg(struct sock *sk, lock_sock(sk); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { bool zc = false; @@ -856,6 +857,9 @@ fallback_to_reg_recv: goto recv_end; } } + /* If we have a new message from strparser, continue now. */ + if (copied >= target && !ctx->recv_pkt) + break; } while (len); recv_end: From 1ebc460c5b12064a987a42d5042f0229a0d2cad6 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Mon, 25 Jun 2018 01:37:50 +0530 Subject: [PATCH 1452/1640] UPSTREAM: tls: Removed unused variable Removed unused variable 'rxm' from tls_queue(). Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f127fac88acf..727433b37bb5 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -990,9 +990,6 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct strp_msg *rxm; - - rxm = strp_msg(skb); ctx->decrypted = false; From 720e2243ad70f262a841d44cc0aedd9a20e5c9b1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 25 Jun 2018 16:55:05 -0700 Subject: [PATCH 1453/1640] UPSTREAM: net/tls: Remove VLA usage on nonce It looks like the prior VLA removal, commit b16520f7493d ("net/tls: Remove VLA usage"), and a new VLA addition, commit c46234ebb4d1e ("tls: RX path for ktls"), passed in the night. This removes the newly added VLA, which happens to have its bounds based on the same max value. Signed-off-by: Kees Cook Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 727433b37bb5..173d8b89072d 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -941,7 +941,7 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - char header[tls_ctx->rx.prepend_size]; + char header[TLS_HEADER_SIZE + MAX_IV_SIZE]; struct strp_msg *rxm = strp_msg(skb); size_t cipher_overhead; size_t data_len = 0; @@ -951,6 +951,12 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) if (rxm->offset + tls_ctx->rx.prepend_size > skb->len) return 0; + /* Sanity-check size of on-stack buffer. */ + if (WARN_ON(tls_ctx->rx.prepend_size > sizeof(header))) { + ret = -EINVAL; + goto read_failure; + } + /* Linearize header to local buffer */ ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size); @@ -1108,7 +1114,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } /* Sanity-check the IV size for stack allocations. */ - if (iv_size > MAX_IV_SIZE) { + if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) { rc = -EINVAL; goto free_priv; } From ac594882e9f120a994f00a5b3df1f8f927ed1d2d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Jun 2018 09:43:44 -0700 Subject: [PATCH 1454/1640] BACKPORT: Revert changes to convert to ->poll_mask() and aio IOCB_CMD_POLL The poll() changes were not well thought out, and completely unexplained. They also caused a huge performance regression, because "->poll()" was no longer a trivial file operation that just called down to the underlying file operations, but instead did at least two indirect calls. Indirect calls are sadly slow now with the Spectre mitigation, but the performance problem could at least be largely mitigated by changing the "->get_poll_head()" operation to just have a per-file-descriptor pointer to the poll head instead. That gets rid of one of the new indirections. But that doesn't fix the new complexity that is completely unwarranted for the regular case. The (undocumented) reason for the poll() changes was some alleged AIO poll race fixing, but we don't make the common case slower and more complex for some uncommon special case, so this all really needs way more explanations and most likely a fundamental redesign. 
[ This revert is a revert of about 30 different commits, not reverted individually because that would just be unnecessarily messy - Linus ] [Linux4: Only apply tls specific bits] Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Linus Torvalds --- include/net/tls.h | 6 ++++-- net/tls/tls_main.c | 2 +- net/tls/tls_sw.c | 19 ++++++++++--------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 985b49a4c6ba..faccdf208fd1 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -113,7 +113,8 @@ struct tls_sw_context_rx { struct strparser strp; void (*saved_data_ready)(struct sock *sk); - __poll_t (*sk_poll_mask)(struct socket *sock, __poll_t events); + unsigned int (*sk_poll)(struct file *file, struct socket *sock, + struct poll_table_struct *wait); struct sk_buff *recv_pkt; u8 control; bool decrypted; @@ -224,7 +225,8 @@ void tls_sw_free_resources_tx(struct sock *sk); void tls_sw_free_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events); +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 3b2f9978b73a..a279eb99c6d6 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -712,7 +712,7 @@ static int __init tls_register(void) build_protos(tls_prots[TLSV4], &tcp_prot); tls_sw_proto_ops = inet_stream_ops; - tls_sw_proto_ops.poll_mask = tls_sw_poll_mask; + tls_sw_proto_ops.poll = tls_sw_poll; tls_sw_proto_ops.splice_read = tls_sw_splice_read; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 173d8b89072d..0d670c8adf18 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -919,22 +919,23 @@ splice_read_end: return copied ? : err; } -__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events) +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) { + unsigned int ret; struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - __poll_t mask; - /* Grab EPOLLOUT and EPOLLHUP from the underlying socket */ - mask = ctx->sk_poll_mask(sock, events); + /* Grab POLLOUT and POLLHUP from the underlying socket */ + ret = ctx->sk_poll(file, sock, wait); - /* Clear EPOLLIN bits, and set based on recv_pkt */ - mask &= ~(EPOLLIN | EPOLLRDNORM); + /* Clear POLLIN bits, and set based on recv_pkt */ + ret &= ~(POLLIN | POLLRDNORM); if (ctx->recv_pkt) - mask |= EPOLLIN | EPOLLRDNORM; + ret |= POLLIN | POLLRDNORM; - return mask; + return ret; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -1194,7 +1195,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask; + sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; strp_check_rcv(&sw_ctx_rx->strp); } From b92d8773935fa7bdab74d033f53d4f75a035ca84 Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Mon, 2 Jul 2018 10:25:05 -0700 Subject: [PATCH 1455/1640] UPSTREAM: tls: fix skb_to_sgvec returning unhandled error. The current code does not inspect the return value of skb_to_sgvec. 
This can cause a nullptr kernel panic when the malformed sgvec is passed into the crypto request. Checking the return value of skb_to_sgvec and skipping decryption if it is negative fixes this problem. Fixes: c46234ebb4d1 ("tls: RX path for ktls") Acked-by: Dave Watson Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0d670c8adf18..3bd7c141dbcd 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -701,6 +701,10 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, nsg = skb_to_sgvec(skb, &sgin[1], rxm->offset + tls_ctx->rx.prepend_size, rxm->full_len - tls_ctx->rx.prepend_size); + if (nsg < 0) { + ret = nsg; + goto out; + } tls_make_aad(ctx->rx_aad_ciphertext, rxm->full_len - tls_ctx->rx.overhead_size, @@ -712,6 +716,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, rxm->full_len - tls_ctx->rx.overhead_size, skb, sk->sk_allocation); +out: if (sgin != &sgin_arr[0]) kfree(sgin); From f9414364f8345f66e29b4761cde4c16091cea2fa Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Wed, 11 Jul 2018 14:32:20 +0530 Subject: [PATCH 1456/1640] UPSTREAM: net/tls: Use aead_request_alloc/free for request alloc/free Instead of kzalloc/free for aead_request allocation and free, use functions aead_request_alloc(), aead_request_free(). It ensures that any sensitive crypto material held in crypto transforms is securely erased from memory. Signed-off-by: Vakul Garg Acked-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 3bd7c141dbcd..7ababe83a681 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -57,14 +57,11 @@ static int tls_do_decryption(struct sock *sk, struct aead_request *aead_req; int ret; - unsigned int req_size = sizeof(struct aead_request) + - crypto_aead_reqsize(ctx->aead_recv); - aead_req = kzalloc(req_size, flags); + aead_req = aead_request_alloc(ctx->aead_recv, flags); if (!aead_req) return -ENOMEM; - aead_request_set_tfm(aead_req, ctx->aead_recv); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); aead_request_set_crypt(aead_req, sgin, sgout, data_len + tls_ctx->rx.tag_size, @@ -86,7 +83,7 @@ static int tls_do_decryption(struct sock *sk, ctx->saved_data_ready(sk); out: - kfree(aead_req); + aead_request_free(aead_req); return ret; } @@ -224,8 +221,7 @@ static int tls_push_record(struct sock *sk, int flags, struct aead_request *req; int rc; - req = kzalloc(sizeof(struct aead_request) + - crypto_aead_reqsize(ctx->aead_send), sk->sk_allocation); + req = aead_request_alloc(ctx->aead_send, sk->sk_allocation); if (!req) return -ENOMEM; @@ -267,7 +263,7 @@ static int tls_push_record(struct sock *sk, int flags, tls_advance_record_sn(sk, &tls_ctx->tx); out_req: - kfree(req); + aead_request_free(req); return rc; } From 7a849dd767f80e97c57577492ca8606f40980211 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:39 +0300 Subject: [PATCH 1457/1640] BACKPORT: tls: Refactor tls_offload variable names For symmetry, we rename tls_offload_context to tls_offload_context_tx before we add tls_offload_context_rx. Signed-off-by: Boris Pismenny Signed-off-by: David S. 
Miller --- include/net/tls.h | 16 ++++++++-------- net/tls/tls_device.c | 25 ++++++++++++------------- net/tls/tls_device_fallback.c | 8 ++++---- 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index faccdf208fd1..91aa96efd466 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -128,7 +128,7 @@ struct tls_record_info { skb_frag_t frags[MAX_SKB_FRAGS]; }; -struct tls_offload_context { +struct tls_offload_context_tx { struct crypto_aead *aead_send; spinlock_t lock; /* protects records list */ struct list_head records_list; @@ -147,8 +147,8 @@ struct tls_offload_context { #define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *))) }; -#define TLS_OFFLOAD_CONTEXT_SIZE \ - (ALIGN(sizeof(struct tls_offload_context), sizeof(void *)) + \ +#define TLS_OFFLOAD_CONTEXT_SIZE_TX \ + (ALIGN(sizeof(struct tls_offload_context_tx), sizeof(void *)) + \ TLS_DRIVER_STATE_SIZE) enum { @@ -239,7 +239,7 @@ void tls_device_sk_destruct(struct sock *sk); void tls_device_init(void); void tls_device_cleanup(void); -struct tls_record_info *tls_get_record(struct tls_offload_context *context, +struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn); static inline bool tls_record_is_start_marker(struct tls_record_info *rec) @@ -380,10 +380,10 @@ static inline struct tls_sw_context_tx *tls_sw_ctx_tx( return (struct tls_sw_context_tx *)tls_ctx->priv_ctx_tx; } -static inline struct tls_offload_context *tls_offload_ctx( - const struct tls_context *tls_ctx) +static inline struct tls_offload_context_tx * +tls_offload_ctx_tx(const struct tls_context *tls_ctx) { - return (struct tls_offload_context *)tls_ctx->priv_ctx_tx; + return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, @@ -396,7 +396,7 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, struct sk_buff *skb); int tls_sw_fallback_init(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); #endif /* _TLS_OFFLOAD_H */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a7a8f8e20ff3..332a5d1459b6 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,9 +52,8 @@ static DEFINE_SPINLOCK(tls_device_lock); static void tls_device_free_ctx(struct tls_context *ctx) { - struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx); + kfree(tls_offload_ctx_tx(ctx)); - kfree(offload_ctx); kfree(ctx); } @@ -125,7 +124,7 @@ static void destroy_record(struct tls_record_info *record) kfree(record); } -static void delete_all_records(struct tls_offload_context *offload_ctx) +static void delete_all_records(struct tls_offload_context_tx *offload_ctx) { struct tls_record_info *info, *temp; @@ -141,14 +140,14 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_record_info *info, *temp; - struct tls_offload_context *ctx; + struct tls_offload_context_tx *ctx; u64 deleted_records = 0; unsigned long flags; if (!tls_ctx) return; - ctx = tls_offload_ctx(tls_ctx); + ctx = tls_offload_ctx_tx(tls_ctx); spin_lock_irqsave(&ctx->lock, flags); info = ctx->retransmit_hint; @@ -179,7 +178,7 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct 
tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); if (ctx->open_record) destroy_record(ctx->open_record); @@ -219,7 +218,7 @@ static void tls_append_frag(struct tls_record_info *record, static int tls_push_record(struct sock *sk, struct tls_context *ctx, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_record_info *record, struct page_frag *pfrag, int flags, @@ -264,7 +263,7 @@ static int tls_push_record(struct sock *sk, return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); } -static int tls_create_new_record(struct tls_offload_context *offload_ctx, +static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { @@ -290,7 +289,7 @@ static int tls_create_new_record(struct tls_offload_context *offload_ctx, } static int tls_do_allocation(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { @@ -324,7 +323,7 @@ static int tls_push_data(struct sock *sk, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); struct tls_record_info *record = ctx->open_record; @@ -477,7 +476,7 @@ out: return rc; } -struct tls_record_info *tls_get_record(struct tls_offload_context *context, +struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { u64 record_sn = context->hint_record_sn; @@ -524,7 +523,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { u16 nonce_size, tag_size, iv_size, rec_seq_size; struct tls_record_info *start_marker_record; - struct tls_offload_context *offload_ctx; + struct tls_offload_context_tx *offload_ctx; struct tls_crypto_info *crypto_info; struct net_device *netdev; char *iv, *rec_seq; @@ -546,7 +545,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) goto out; } - offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL); + offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL); if (!offload_ctx) { rc = -ENOMEM; goto free_marker_record; diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 748914abdb60..d1d7dce38e0b 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -214,7 +214,7 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) static int fill_sg_in(struct scatterlist *sg_in, struct sk_buff *skb, - struct tls_offload_context *ctx, + struct tls_offload_context_tx *ctx, u64 *rcd_sn, s32 *sync_size, int *resync_sgs) @@ -299,7 +299,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, s32 sync_size, u64 rcd_sn) { int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int payload_len = skb->len - tcp_payload_offset; void *buf, *iv, *aad, *dummy_buf; struct aead_request *aead_req; @@ -361,7 +361,7 @@ static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb) { int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); 
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int payload_len = skb->len - tcp_payload_offset; struct scatterlist *sg_in, sg_out[3]; struct sk_buff *nskb = NULL; @@ -415,7 +415,7 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, } int tls_sw_fallback_init(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info) { const u8 *key; From 8641ab08bed74c0dfcd1ed5ed1159437a399c9fb Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:40 +0300 Subject: [PATCH 1458/1640] UPSTREAM: tls: Split decrypt_skb to two functions Previously, decrypt_skb also updated the TLS context. Now, decrypt_skb only decrypts the payload using the current context, while decrypt_skb_update also updates the state. Later, in the tls_device Rx flow, we will use decrypt_skb directly. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/tls.h | 2 ++ net/tls/tls_sw.c | 44 ++++++++++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 91aa96efd466..1eb8d1a5a12e 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -390,6 +390,8 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); void tls_register_device(struct tls_device *device); void tls_unregister_device(struct tls_device *device); +int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout); struct sk_buff *tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7ababe83a681..1f2805c5c4aa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -53,7 +53,6 @@ static int tls_do_decryption(struct sock *sk, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct strp_msg *rxm = strp_msg(skb); struct aead_request *aead_req; int ret; @@ -71,18 +70,6 @@ static int tls_do_decryption(struct sock *sk, ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); - if (ret < 0) - goto out; - - rxm->offset += tls_ctx->rx.prepend_size; - rxm->full_len -= tls_ctx->rx.overhead_size; - tls_advance_record_sn(sk, &tls_ctx->rx); - - ctx->decrypted = true; - - ctx->saved_data_ready(sk); - -out: aead_request_free(aead_req); return ret; } @@ -666,8 +653,29 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return skb; } -static int decrypt_skb(struct sock *sk, struct sk_buff *skb, - struct scatterlist *sgout) +static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + int err = 0; + + err = decrypt_skb(sk, skb, sgout); + if (err < 0) + return err; + + rxm->offset += tls_ctx->rx.prepend_size; + rxm->full_len -= tls_ctx->rx.overhead_size; + tls_advance_record_sn(sk, &tls_ctx->rx); + ctx->decrypted = true; + ctx->saved_data_ready(sk); + + return err; +} + +int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -817,7 +825,7 @@ int tls_sw_recvmsg(struct sock *sk, if (err < 0) goto fallback_to_reg_recv; - err = decrypt_skb(sk, skb, sgin); + err = decrypt_skb_update(sk, skb, sgin); for (; pages > 0; pages--) 
 				put_page(sg_page(&sgin[pages]));
 			if (err < 0) {
@@ -826,7 +834,7 @@ int tls_sw_recvmsg(struct sock *sk,
 			}
 		} else {
 fallback_to_reg_recv:
-			err = decrypt_skb(sk, skb, NULL);
+			err = decrypt_skb_update(sk, skb, NULL);
 			if (err < 0) {
 				tls_err_abort(sk, EBADMSG);
 				goto recv_end;
@@ -897,7 +905,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
 	}
 
 	if (!ctx->decrypted) {
-		err = decrypt_skb(sk, skb, NULL);
+		err = decrypt_skb_update(sk, skb, NULL);
 
 		if (err < 0) {
 			tls_err_abort(sk, EBADMSG);

From e198364162d7f81da5f6106c22742ccc1171890c Mon Sep 17 00:00:00 2001
From: Boris Pismenny
Date: Fri, 13 Jul 2018 14:33:41 +0300
Subject: [PATCH 1459/1640] UPSTREAM: tls: Split tls_sw_release_resources_rx

This patch splits tls_sw_release_resources_rx into two functions: one
which releases all inner software tls structures, and another that also
frees the containing structure.

In TLS_DEVICE we will need to release the software structures without
freeing the containing structure, which contains other information.

Signed-off-by: Boris Pismenny
Signed-off-by: David S. Miller
---
 include/net/tls.h |  1 +
 net/tls/tls_sw.c  | 10 +++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 1eb8d1a5a12e..1d7e8cf4de9f 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -223,6 +223,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 void tls_sw_close(struct sock *sk, long timeout);
 void tls_sw_free_resources_tx(struct sock *sk);
 void tls_sw_free_resources_rx(struct sock *sk);
+void tls_sw_release_resources_rx(struct sock *sk);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		   int nonblock, int flags, int *addr_len);
 unsigned int tls_sw_poll(struct file *file, struct socket *sock,
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 1f2805c5c4aa..711b1e68e0c3 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1035,7 +1035,7 @@ void tls_sw_free_resources_tx(struct sock *sk)
 	kfree(ctx);
 }
 
-void tls_sw_free_resources_rx(struct sock *sk)
+void tls_sw_release_resources_rx(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
@@ -1054,6 +1054,14 @@ void tls_sw_free_resources_rx(struct sock *sk)
 		strp_done(&ctx->strp);
 		lock_sock(sk);
 	}
+}
+
+void tls_sw_free_resources_rx(struct sock *sk)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+
+	tls_sw_release_resources_rx(sk);
 
 	kfree(ctx);
 }

From bc1c2f3351ed0f5bcd86c3a6d0c33c1d090dbbed Mon Sep 17 00:00:00 2001
From: Boris Pismenny
Date: Fri, 13 Jul 2018 14:33:42 +0300
Subject: [PATCH 1460/1640] UPSTREAM: tls: Fill software context without
 allocation

This patch allows tls_set_sw_offload to fill the context in case it was
already allocated previously. We will use it in TLS_DEVICE to fill the
RX software context.

Signed-off-by: Boris Pismenny
Signed-off-by: David S.
Miller --- net/tls/tls_sw.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 711b1e68e0c3..775518533270 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1086,28 +1086,38 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } if (tx) { - sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); - if (!sw_ctx_tx) { - rc = -ENOMEM; - goto out; + if (!ctx->priv_ctx_tx) { + sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); + if (!sw_ctx_tx) { + rc = -ENOMEM; + goto out; + } + ctx->priv_ctx_tx = sw_ctx_tx; + } else { + sw_ctx_tx = + (struct tls_sw_context_tx *)ctx->priv_ctx_tx; } - crypto_init_wait(&sw_ctx_tx->async_wait); - ctx->priv_ctx_tx = sw_ctx_tx; } else { - sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); - if (!sw_ctx_rx) { - rc = -ENOMEM; - goto out; + if (!ctx->priv_ctx_rx) { + sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); + if (!sw_ctx_rx) { + rc = -ENOMEM; + goto out; + } + ctx->priv_ctx_rx = sw_ctx_rx; + } else { + sw_ctx_rx = + (struct tls_sw_context_rx *)ctx->priv_ctx_rx; } - crypto_init_wait(&sw_ctx_rx->async_wait); - ctx->priv_ctx_rx = sw_ctx_rx; } if (tx) { + crypto_init_wait(&sw_ctx_tx->async_wait); crypto_info = &ctx->crypto_send; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; } else { + crypto_init_wait(&sw_ctx_rx->async_wait); crypto_info = &ctx->crypto_recv; cctx = &ctx->rx; aead = &sw_ctx_rx->aead_recv; From 89a8d85df7796cab97e7a8ae22057c768d1d3696 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:43 +0300 Subject: [PATCH 1461/1640] UPSTREAM: tls: Add rx inline crypto offload This patch completes the generic infrastructure to offload TLS crypto to a network device. It enables the kernel to skip decryption and authentication of some skbs marked as decrypted by the NIC. In the fast path, all packets received are decrypted by the NIC and the performance is comparable to plain TCP. This infrastructure doesn't require a TCP offload engine. Instead, the NIC only decrypts packets that contain the expected TCP sequence number. Out-Of-Order TCP packets are provided unmodified. As a result, at the worst case a received TLS record consists of both plaintext and ciphertext packets. These partially decrypted records must be reencrypted, only to be decrypted. The notable differences between SW KTLS Rx and this offload are as follows: 1. Partial decryption - Software must handle the case of a TLS record that was only partially decrypted by HW. This can happen due to packet reordering. 2. Resynchronization - tls_read_size calls the device driver to resynchronize HW after HW lost track of TLS record framing in the TCP stream. Signed-off-by: Boris Pismenny Signed-off-by: David S. 
Miller --- include/net/tls.h | 63 +++++++- net/tls/tls_device.c | 278 +++++++++++++++++++++++++++++++--- net/tls/tls_device_fallback.c | 1 + net/tls/tls_main.c | 32 ++-- net/tls/tls_sw.c | 24 ++- 5 files changed, 355 insertions(+), 43 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 1d7e8cf4de9f..91ef19a7f8f9 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -83,6 +83,16 @@ struct tls_device { void (*unhash)(struct tls_device *device, struct sock *sk); }; +enum { + TLS_BASE, + TLS_SW, +#ifdef CONFIG_TLS_DEVICE + TLS_HW, +#endif + TLS_HW_RECORD, + TLS_NUM_CONFIG, +}; + struct tls_sw_context_tx { struct crypto_aead *aead_send; struct crypto_wait async_wait; @@ -197,6 +207,7 @@ struct tls_context { int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); + void (*sk_destruct)(struct sock *sk); void (*sk_proto_close)(struct sock *sk, long timeout); int (*setsockopt)(struct sock *sk, int level, @@ -209,13 +220,27 @@ struct tls_context { void (*unhash)(struct sock *sk); }; +struct tls_offload_context_rx { + /* sw must be the first member of tls_offload_context_rx */ + struct tls_sw_context_rx sw; + atomic64_t resync_req; + u8 driver_state[]; + /* The TLS layer reserves room for driver specific state + * Currently the belief is that there is not enough + * driver specific state to justify another layer of indirection + */ +}; + +#define TLS_OFFLOAD_CONTEXT_SIZE_RX \ + (ALIGN(sizeof(struct tls_offload_context_rx), sizeof(void *)) + \ + TLS_DRIVER_STATE_SIZE) + int wait_on_pending_writer(struct sock *sk, long *timeo); int tls_sk_query(struct sock *sk, int optname, char __user *optval, int __user *optlen); int tls_sk_attach(struct sock *sk, int optname, char __user *optval, unsigned int optlen); - int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, @@ -290,11 +315,19 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) return tls_ctx->pending_open_record_frags; } +struct sk_buff * +tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); + static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) { - return sk_fullsock(sk) && - /* matches smp_store_release in tls_set_device_offload */ - smp_load_acquire(&sk->sk_destruct) == &tls_device_sk_destruct; +#ifdef CONFIG_SOCK_VALIDATE_XMIT + return sk_fullsock(sk) & + (smp_load_acquire(&sk->sk_validate_xmit_skb) == + &tls_validate_xmit_skb); +#else + return false; +#endif } static inline void tls_err_abort(struct sock *sk, int err) @@ -387,10 +420,27 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx) return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } +static inline struct tls_offload_context_rx * +tls_offload_ctx_rx(const struct tls_context *tls_ctx) +{ + return (struct tls_offload_context_rx *)tls_ctx->priv_ctx_rx; +} + +/* The TLS context is valid until sk_destruct is called */ +static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); + + atomic64_set(&rx_ctx->resync_req, ((((uint64_t)seq) << 32) | 1)); +} + + int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); void tls_register_device(struct tls_device *device); void tls_unregister_device(struct tls_device *device); +int 
tls_device_decrypted(struct sock *sk, struct sk_buff *skb); int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout); @@ -402,4 +452,9 @@ int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); +int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); + +void tls_device_offload_cleanup_rx(struct sock *sk); +void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn); + #endif /* _TLS_OFFLOAD_H */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 332a5d1459b6..4995d84d228d 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,7 +52,11 @@ static DEFINE_SPINLOCK(tls_device_lock); static void tls_device_free_ctx(struct tls_context *ctx) { - kfree(tls_offload_ctx_tx(ctx)); + if (ctx->tx_conf == TLS_HW) + kfree(tls_offload_ctx_tx(ctx)); + + if (ctx->rx_conf == TLS_HW) + kfree(tls_offload_ctx_rx(ctx)); kfree(ctx); } @@ -70,10 +74,11 @@ static void tls_device_gc_task(struct work_struct *work) list_for_each_entry_safe(ctx, tmp, &gc_list, list) { struct net_device *netdev = ctx->netdev; - if (netdev) { + if (netdev && ctx->tx_conf == TLS_HW) { netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); dev_put(netdev); + ctx->netdev = NULL; } list_del(&ctx->list); @@ -81,6 +86,22 @@ static void tls_device_gc_task(struct work_struct *work) } } +static void tls_device_attach(struct tls_context *ctx, struct sock *sk, + struct net_device *netdev) +{ + if (sk->sk_destruct != tls_device_sk_destruct) { + refcount_set(&ctx->refcount, 1); + dev_hold(netdev); + ctx->netdev = netdev; + spin_lock_irq(&tls_device_lock); + list_add_tail(&ctx->list, &tls_device_list); + spin_unlock_irq(&tls_device_lock); + + ctx->sk_destruct = sk->sk_destruct; + sk->sk_destruct = tls_device_sk_destruct; + } +} + static void tls_device_queue_ctx_destruction(struct tls_context *ctx) { unsigned long flags; @@ -180,13 +201,15 @@ void tls_device_sk_destruct(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); - if (ctx->open_record) - destroy_record(ctx->open_record); + tls_ctx->sk_destruct(sk); - delete_all_records(ctx); - crypto_free_aead(ctx->aead_send); - ctx->sk_destruct(sk); - clean_acked_data_disable(inet_csk(sk)); + if (tls_ctx->tx_conf == TLS_HW) { + if (ctx->open_record) + destroy_record(ctx->open_record); + delete_all_records(ctx); + crypto_free_aead(ctx->aead_send); + clean_acked_data_disable(inet_csk(sk)); + } if (refcount_dec_and_test(&tls_ctx->refcount)) tls_device_queue_ctx_destruction(tls_ctx); @@ -519,6 +542,118 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); } +void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct net_device *netdev = tls_ctx->netdev; + struct tls_offload_context_rx *rx_ctx; + u32 is_req_pending; + s64 resync_req; + u32 req_seq; + + if (tls_ctx->rx_conf != TLS_HW) + return; + + rx_ctx = tls_offload_ctx_rx(tls_ctx); + resync_req = atomic64_read(&rx_ctx->resync_req); + req_seq = ntohl(resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1); + is_req_pending = resync_req; + + if (unlikely(is_req_pending) && req_seq == seq && + atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) + netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, + seq + TLS_HEADER_SIZE - 1, + rcd_sn); +} + +static int tls_device_reencrypt(struct sock *sk, 
					struct sk_buff *skb)
+{
+	struct strp_msg *rxm = strp_msg(skb);
+	int err = 0, offset = rxm->offset, copy, nsg;
+	struct sk_buff *skb_iter, *unused;
+	struct scatterlist sg[1];
+	char *orig_buf, *buf;
+
+	orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE +
+			   TLS_CIPHER_AES_GCM_128_IV_SIZE, sk->sk_allocation);
+	if (!orig_buf)
+		return -ENOMEM;
+	buf = orig_buf;
+
+	nsg = skb_cow_data(skb, 0, &unused);
+	if (unlikely(nsg < 0)) {
+		err = nsg;
+		goto free_buf;
+	}
+
+	sg_init_table(sg, 1);
+	sg_set_buf(&sg[0], buf,
+		   rxm->full_len + TLS_HEADER_SIZE +
+		   TLS_CIPHER_AES_GCM_128_IV_SIZE);
+	skb_copy_bits(skb, offset, buf,
+		      TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE);
+
+	/* We are interested only in the decrypted data not the auth */
+	err = decrypt_skb(sk, skb, sg);
+	if (err != -EBADMSG)
+		goto free_buf;
+	else
+		err = 0;
+
+	copy = min_t(int, skb_pagelen(skb) - offset,
+		     rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+
+	if (skb->decrypted)
+		skb_store_bits(skb, offset, buf, copy);
+
+	offset += copy;
+	buf += copy;
+
+	skb_walk_frags(skb, skb_iter) {
+		copy = min_t(int, skb_iter->len,
+			     rxm->full_len - offset + rxm->offset -
+			     TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+
+		if (skb_iter->decrypted)
+			skb_store_bits(skb, offset, buf, copy);
+
+		offset += copy;
+		buf += copy;
+	}
+
+free_buf:
+	kfree(orig_buf);
+	return err;
+}
+
+int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx);
+	int is_decrypted = skb->decrypted;
+	int is_encrypted = !is_decrypted;
+	struct sk_buff *skb_iter;
+
+	/* Skip if it is already decrypted */
+	if (ctx->sw.decrypted)
+		return 0;
+
+	/* Check if all the data is decrypted already */
+	skb_walk_frags(skb, skb_iter) {
+		is_decrypted &= skb_iter->decrypted;
+		is_encrypted &= !skb_iter->decrypted;
+	}
+
+	ctx->sw.decrypted |= is_decrypted;
+
+	/* Return immediately if the record is either entirely plaintext or
+	 * entirely ciphertext. Otherwise reencrypt the partially decrypted
+	 * record.
+	 */
+	return (is_encrypted || is_decrypted) ? 0 :
+		tls_device_reencrypt(sk, skb);
+}
+
 int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
 {
 	u16 nonce_size, tag_size, iv_size, rec_seq_size;
@@ -608,7 +743,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
 
 	clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked);
 	ctx->push_pending_record = tls_device_push_pending_record;
-	offload_ctx->sk_destruct = sk->sk_destruct;
 
 	/* TLS offload is greatly simplified if we don't send
 	 * SKBs where only part of the payload needs to be encrypted.
@@ -618,8 +752,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
 	if (skb)
 		TCP_SKB_CB(skb)->eor = 1;
 
-	refcount_set(&ctx->refcount, 1);
-
 	/* We support starting offload on multiple sockets
 	 * concurrently, so we only need a read lock here.
 	 * This lock must precede get_netdev_for_sock to prevent races between
@@ -654,19 +786,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
 	if (rc)
 		goto release_netdev;
 
-	ctx->netdev = netdev;
+	tls_device_attach(ctx, sk, netdev);
 
-	spin_lock_irq(&tls_device_lock);
-	list_add_tail(&ctx->list, &tls_device_list);
-	spin_unlock_irq(&tls_device_lock);
-
-	sk->sk_validate_xmit_skb = tls_validate_xmit_skb;
 	/* following this assignment tls_is_sk_tx_device_offloaded
 	 * will return true and the context might be accessed
 	 * by the netdev's xmit function.
*/ - smp_store_release(&sk->sk_destruct, - &tls_device_sk_destruct); + smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb); + dev_put(netdev); up_read(&device_offload_lock); goto out; @@ -689,6 +816,105 @@ out: return rc; } +int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) +{ + struct tls_offload_context_rx *context; + struct net_device *netdev; + int rc = 0; + + /* We support starting offload on multiple sockets + * concurrently, so we only need a read lock here. + * This lock must precede get_netdev_for_sock to prevent races between + * NETDEV_DOWN and setsockopt. + */ + down_read(&device_offload_lock); + netdev = get_netdev_for_sock(sk); + if (!netdev) { + pr_err_ratelimited("%s: netdev not found\n", __func__); + rc = -EINVAL; + goto release_lock; + } + + if (!(netdev->features & NETIF_F_HW_TLS_RX)) { + pr_err_ratelimited("%s: netdev %s with no TLS offload\n", + __func__, netdev->name); + rc = -ENOTSUPP; + goto release_netdev; + } + + /* Avoid offloading if the device is down + * We don't want to offload new flows after + * the NETDEV_DOWN event + */ + if (!(netdev->flags & IFF_UP)) { + rc = -EINVAL; + goto release_netdev; + } + + context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL); + if (!context) { + rc = -ENOMEM; + goto release_netdev; + } + + ctx->priv_ctx_rx = context; + rc = tls_set_sw_offload(sk, ctx, 0); + if (rc) + goto release_ctx; + + rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, + &ctx->crypto_recv, + tcp_sk(sk)->copied_seq); + if (rc) { + pr_err_ratelimited("%s: The netdev has refused to offload this socket\n", + __func__); + goto free_sw_resources; + } + + tls_device_attach(ctx, sk, netdev); + goto release_netdev; + +free_sw_resources: + tls_sw_free_resources_rx(sk); +release_ctx: + ctx->priv_ctx_rx = NULL; +release_netdev: + dev_put(netdev); +release_lock: + up_read(&device_offload_lock); + return rc; +} + +void tls_device_offload_cleanup_rx(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct net_device *netdev; + + down_read(&device_offload_lock); + netdev = tls_ctx->netdev; + if (!netdev) + goto out; + + if (!(netdev->features & NETIF_F_HW_TLS_RX)) { + pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n", + __func__); + goto out; + } + + netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, + TLS_OFFLOAD_CTX_DIR_RX); + + if (tls_ctx->tx_conf != TLS_HW) { + dev_put(netdev); + tls_ctx->netdev = NULL; + } +out: + up_read(&device_offload_lock); + kfree(tls_ctx->rx.rec_seq); + kfree(tls_ctx->rx.iv); + tls_sw_release_resources_rx(sk); +} + static int tls_device_down(struct net_device *netdev) { struct tls_context *ctx, *tmp; @@ -709,8 +935,12 @@ static int tls_device_down(struct net_device *netdev) spin_unlock_irqrestore(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &list, list) { - netdev->tlsdev_ops->tls_dev_del(netdev, ctx, - TLS_OFFLOAD_CTX_DIR_TX); + if (ctx->tx_conf == TLS_HW) + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_TX); + if (ctx->rx_conf == TLS_HW) + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_RX); ctx->netdev = NULL; dev_put(netdev); list_del_init(&ctx->list); @@ -731,12 +961,16 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (!(dev->features & NETIF_F_HW_TLS_TX)) + if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case 
NETDEV_FEAT_CHANGE: + if ((dev->features & NETIF_F_HW_TLS_RX) && + !dev->tlsdev_ops->tls_dev_resync_rx) + return NOTIFY_BAD; + if (dev->tlsdev_ops && dev->tlsdev_ops->tls_dev_add && dev->tlsdev_ops->tls_dev_del) diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index d1d7dce38e0b..e3313c45663f 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -413,6 +413,7 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, return tls_sw_fallback(sk, skb); } +EXPORT_SYMBOL_GPL(tls_validate_xmit_skb); int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a279eb99c6d6..afaa4910bc7e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -51,15 +51,6 @@ enum { TLSV6, TLS_NUM_PROTS, }; -enum { - TLS_BASE, - TLS_SW, -#ifdef CONFIG_TLS_DEVICE - TLS_HW, -#endif - TLS_HW_RECORD, - TLS_NUM_CONFIG, -}; static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); @@ -290,7 +281,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) } #ifdef CONFIG_TLS_DEVICE - if (ctx->tx_conf != TLS_HW) { + if (ctx->rx_conf == TLS_HW) + tls_device_offload_cleanup_rx(sk); + + if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW) { #else { #endif @@ -470,8 +464,16 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, conf = TLS_SW; } } else { - rc = tls_set_sw_offload(sk, ctx, 0); - conf = TLS_SW; +#ifdef CONFIG_TLS_DEVICE + rc = tls_set_device_offload_rx(sk, ctx); + conf = TLS_HW; + if (rc) { +#else + { +#endif + rc = tls_set_sw_offload(sk, ctx, 0); + conf = TLS_SW; + } } if (rc) @@ -629,6 +631,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage; + + prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW]; + + prot[TLS_SW][TLS_HW] = prot[TLS_SW][TLS_SW]; + + prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW]; #endif prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 775518533270..568b296182fe 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -654,16 +654,25 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, } static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, - struct scatterlist *sgout) + struct scatterlist *sgout, bool *zc) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct strp_msg *rxm = strp_msg(skb); int err = 0; - err = decrypt_skb(sk, skb, sgout); +#ifdef CONFIG_TLS_DEVICE + err = tls_device_decrypted(sk, skb); if (err < 0) return err; +#endif + if (!ctx->decrypted) { + err = decrypt_skb(sk, skb, sgout); + if (err < 0) + return err; + } else { + *zc = false; + } rxm->offset += tls_ctx->rx.prepend_size; rxm->full_len -= tls_ctx->rx.overhead_size; @@ -825,7 +834,7 @@ int tls_sw_recvmsg(struct sock *sk, if (err < 0) goto fallback_to_reg_recv; - err = decrypt_skb_update(sk, skb, sgin); + err = decrypt_skb_update(sk, skb, sgin, &zc); for (; pages > 0; pages--) put_page(sg_page(&sgin[pages])); if (err < 0) { @@ -834,7 +843,7 @@ int tls_sw_recvmsg(struct sock *sk, } } else { fallback_to_reg_recv: - err = decrypt_skb_update(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); goto recv_end; @@ -889,6 +898,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t 
*ppos, int err = 0; long timeo; int chunk; + bool zc; lock_sock(sk); @@ -905,7 +915,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (!ctx->decrypted) { - err = decrypt_skb_update(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); @@ -994,6 +1004,10 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) goto read_failure; } +#ifdef CONFIG_TLS_DEVICE + handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset, + *(u64*)tls_ctx->rx.rec_seq); +#endif return data_len + TLS_HEADER_SIZE; read_failure: From f52195ace0f1590f0798bd8f8499c191ca4b19d1 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:44 +0300 Subject: [PATCH 1462/1640] UPSTREAM: tls: Fix zerocopy_from_iter iov handling zerocopy_from_iter iterates over the message, but it doesn't revert the updates made by the iov iteration. This patch fixes it. Now, the iov can be used after calling zerocopy_from_iter. Fixes: 3c4d75591 ("tls: kernel TLS support") Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 568b296182fe..0d336a21b501 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -263,7 +263,7 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, int length, int *pages_used, unsigned int *size_used, struct scatterlist *to, int to_max_pages, - bool charge) + bool charge, bool revert) { struct page *pages[MAX_SKB_FRAGS]; @@ -314,6 +314,8 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, out: *size_used = size; *pages_used = num_elem; + if (revert) + iov_iter_revert(from, size); return rc; } @@ -415,7 +417,7 @@ alloc_encrypted: &ctx->sg_plaintext_size, ctx->sg_plaintext_data, ARRAY_SIZE(ctx->sg_plaintext_data), - true); + true, false); if (ret) goto fallback_to_reg_send; @@ -830,7 +832,7 @@ int tls_sw_recvmsg(struct sock *sk, err = zerocopy_from_iter(sk, &msg->msg_iter, to_copy, &pages, &chunk, &sgin[1], - MAX_SKB_FRAGS, false); + MAX_SKB_FRAGS, false, true); if (err < 0) goto fallback_to_reg_recv; From a1225d6f8b8e2e87a8e85ac7c82a0015f5bfb17e Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 12 Jul 2018 08:03:43 -0700 Subject: [PATCH 1463/1640] UPSTREAM: tls: Stricter error checking in zerocopy sendmsg path In the zerocopy sendmsg() path, there are error checks to revert the zerocopy if we get any error code. syzkaller has discovered that tls_push_record can return -ECONNRESET, which is fatal, and happens after the point at which it is safe to revert the iter, as we've already passed the memory to do_tcp_sendpages. Previously this code could return -ENOMEM and we would want to revert the iter, but AFAIK this no longer returns ENOMEM after a447da7d004 ("tls: fix waitall behavior in tls_sw_recvmsg"), so we fail for all error codes. Reported-by: syzbot+c226690f7b3126c5ee04@syzkaller.appspotmail.com Reported-by: syzbot+709f2810a6a05f11d4d3@syzkaller.appspotmail.com Signed-off-by: Dave Watson Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0d336a21b501..0c2d029c9d4c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -425,7 +425,7 @@ alloc_encrypted: ret = tls_push_record(sk, msg->msg_flags, record_type); if (!ret) continue; - if (ret == -EAGAIN) + if (ret < 0) goto send_end; copied -= try_to_copy; From 8808d31692ddac928645354647c1bd91bc0a634b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 18 Jul 2018 08:27:41 -0500 Subject: [PATCH 1464/1640] UPSTREAM: tls: Fix copy-paste error in tls_device_reencrypt It seems that the proper structure to use in this particular case is *skb_iter* instead of skb. Addresses-Coverity-ID: 1471906 ("Copy-paste error") Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/tls/tls_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 4995d84d228d..1e968d238adf 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -615,7 +615,7 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) TLS_CIPHER_AES_GCM_128_TAG_SIZE); if (skb_iter->decrypted) - skb_store_bits(skb, offset, buf, copy); + skb_store_bits(skb_iter, offset, buf, copy); offset += copy; buf += copy; From 4d3a1649659dc505746fb819f1ae4ae6e0afccab Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Wed, 18 Jul 2018 16:22:27 -0700 Subject: [PATCH 1465/1640] UPSTREAM: tls: check RCV_SHUTDOWN in tls_wait_data The current code does not check sk->sk_shutdown & RCV_SHUTDOWN. tls_sw_recvmsg may return a positive value in the case where bytes have already been copied when the socket is shutdown. sk->sk_err has been cleared, causing the tls_wait_data to hang forever on a subsequent invocation. Checking sk->sk_shutdown & RCV_SHUTDOWN, as in tcp_recvmsg, fixes this problem. Fixes: c46234ebb4d1 ("tls: RX path for ktls") Acked-by: Dave Watson Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0c2d029c9d4c..03f1370f5db1 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -631,6 +631,9 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return NULL; } + if (sk->sk_shutdown & RCV_SHUTDOWN) + return NULL; + if (sock_flag(sk, SOCK_DONE)) return NULL; From 0d1f83752158b104447bc96d5963d1a5a3a3babe Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Tue, 24 Jul 2018 16:54:27 +0530 Subject: [PATCH 1466/1640] UPSTREAM: net/tls: Removed redundant checks for non-NULL Removed checks against non-NULL before calling kfree_skb() and crypto_free_aead(). These functions are safe to be called with NULL as an argument. Signed-off-by: Vakul Garg Acked-by: Dave Watson Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 03f1370f5db1..0687a7a4689f 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1047,8 +1047,7 @@ void tls_sw_free_resources_tx(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - if (ctx->aead_send) - crypto_free_aead(ctx->aead_send); + crypto_free_aead(ctx->aead_send); tls_free_both_sg(sk); kfree(ctx); @@ -1060,10 +1059,8 @@ void tls_sw_release_resources_rx(struct sock *sk) struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); if (ctx->aead_recv) { - if (ctx->recv_pkt) { - kfree_skb(ctx->recv_pkt); - ctx->recv_pkt = NULL; - } + kfree_skb(ctx->recv_pkt); + ctx->recv_pkt = NULL; crypto_free_aead(ctx->aead_recv); strp_stop(&ctx->strp); write_lock_bh(&sk->sk_callback_lock); From 923cba8151c7969a769aeed6670256d87e8523bb Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Wed, 25 Jul 2018 14:48:21 -0700 Subject: [PATCH 1467/1640] UPSTREAM: tls: Skip zerocopy path for ITER_KVEC The zerocopy path ultimately calls iov_iter_get_pages, which defines the step function for ITER_KVECs as simply, return -EFAULT. Taking the non-zerocopy path for ITER_KVECs avoids the unnecessary fallback. See https://lore.kernel.org/lkml/20150401023311.GL29656@ZenIV.linux.org.uk/T/#u for a discussion of why zerocopy for vmalloc data is not a good idea. Discovered while testing NBD traffic encrypted with ktls. Fixes: c46234ebb4d1 ("tls: RX path for ktls") Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0687a7a4689f..f9971717f7e0 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -362,6 +362,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int record_room; bool full_record; int orig_size; + bool is_kvec = msg->msg_iter.type & ITER_KVEC; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -ENOTSUPP; @@ -410,8 +411,7 @@ alloc_encrypted: try_to_copy -= required_size - ctx->sg_encrypted_size; full_record = true; } - - if (full_record || eor) { + if (!is_kvec && (full_record || eor)) { ret = zerocopy_from_iter(sk, &msg->msg_iter, try_to_copy, &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, @@ -779,6 +779,7 @@ int tls_sw_recvmsg(struct sock *sk, bool cmsg = false; int target, err = 0; long timeo; + bool is_kvec = msg->msg_iter.type & ITER_KVEC; flags |= nonblock; @@ -822,7 +823,7 @@ int tls_sw_recvmsg(struct sock *sk, page_count = iov_iter_npages(&msg->msg_iter, MAX_SKB_FRAGS); to_copy = rxm->full_len - tls_ctx->rx.overhead_size; - if (to_copy <= len && page_count < MAX_SKB_FRAGS && + if (!is_kvec && to_copy <= len && page_count < MAX_SKB_FRAGS && likely(!(flags & MSG_PEEK))) { struct scatterlist sgin[MAX_SKB_FRAGS + 1]; int pages = 0; From 6ee724bd04377aaced42b3f453f68f24387433e2 Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Thu, 26 Jul 2018 07:59:35 -0700 Subject: [PATCH 1468/1640] UPSTREAM: tls: Remove dead code in tls_sw_sendmsg tls_push_record either returns 0 on success or a negative value on failure. This patch removes code that would only be executed if tls_push_record were to return a positive value. Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f9971717f7e0..e80d70a1e138 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -423,12 +423,10 @@ alloc_encrypted: copied += try_to_copy; ret = tls_push_record(sk, msg->msg_flags, record_type); - if (!ret) - continue; - if (ret < 0) + if (ret) goto send_end; + continue; - copied -= try_to_copy; fallback_to_reg_send: iov_iter_revert(&msg->msg_iter, ctx->sg_plaintext_size - orig_size); From 1f093f51f4a592d628e6e29d48c7113cae42e0ee Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Thu, 26 Jul 2018 07:59:36 -0700 Subject: [PATCH 1469/1640] UPSTREAM: tls: Fix improper revert in zerocopy_from_iter The current code is problematic because the iov_iter is reverted and never advanced in the non-error case. This patch skips the revert in the non-error case. This patch also fixes the amount by which the iov_iter is reverted. Currently, iov_iter is reverted by size, which can be greater than the amount by which the iter was actually advanced. Instead, only revert by the amount that the iter was advanced. Fixes: 4718799817c5 ("tls: Fix zerocopy_from_iter iov handling") Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index e80d70a1e138..6deceb7c56ba 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -263,7 +263,7 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, int length, int *pages_used, unsigned int *size_used, struct scatterlist *to, int to_max_pages, - bool charge, bool revert) + bool charge) { struct page *pages[MAX_SKB_FRAGS]; @@ -312,10 +312,10 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, } out: + if (rc) + iov_iter_revert(from, size - *size_used); *size_used = size; *pages_used = num_elem; - if (revert) - iov_iter_revert(from, size); return rc; } @@ -417,7 +417,7 @@ alloc_encrypted: &ctx->sg_plaintext_size, ctx->sg_plaintext_data, ARRAY_SIZE(ctx->sg_plaintext_data), - true, false); + true); if (ret) goto fallback_to_reg_send; @@ -428,8 +428,6 @@ alloc_encrypted: continue; fallback_to_reg_send: - iov_iter_revert(&msg->msg_iter, - ctx->sg_plaintext_size - orig_size); trim_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, @@ -834,7 +832,7 @@ int tls_sw_recvmsg(struct sock *sk, err = zerocopy_from_iter(sk, &msg->msg_iter, to_copy, &pages, &chunk, &sgin[1], - MAX_SKB_FRAGS, false, true); + MAX_SKB_FRAGS, false); if (err < 0) goto fallback_to_reg_recv; From e21d36686a8a451dbbd4dee91fb457d80b2d7231 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Mon, 30 Jul 2018 16:08:33 +0530 Subject: [PATCH 1470/1640] UPSTREAM: net/tls: Use socket data_ready callback on record availability On receipt of a complete tls record, use socket's saved data_ready callback instead of state_change callback. In function tls_queue(), the TLS record is queued in encrypted state. But the decryption happen inline when tls_sw_recvmsg() or tls_sw_splice_read() get invoked. So it should be ok to notify the waiting context about the availability of data as soon as we could collect a full TLS record. For new data availability notification, sk_data_ready callback is more appropriate. It points to sock_def_readable() which wakes up specifically for EPOLLIN event. 
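A rough sketch of the resulting queueing path (simplified; the actual
one-line change is in the hunk below):

  /* Simplified from net/tls/tls_sw.c after this patch. A complete,
   * still-encrypted record has been parsed out of the TCP stream;
   * saved_data_ready is the socket's original sk_data_ready, i.e.
   * sock_def_readable(), so only EPOLLIN waiters are woken.
   */
  static void tls_queue(struct strparser *strp, struct sk_buff *skb)
  {
      struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
      struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);

      ctx->decrypted = false;           /* decryption happens at recvmsg time */
      ctx->recv_pkt = skb;              /* park the record for the reader */
      strp_pause(strp);                 /* one record in flight at a time */
      ctx->saved_data_ready(strp->sk);  /* EPOLLIN-specific wakeup */
  }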
This is in contrast to the socket callback sk_state_change which points to sock_def_wakeup() which issues a wakeup unconditionally (without event mask). Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 6deceb7c56ba..33838f11fafa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1028,7 +1028,7 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb) ctx->recv_pkt = skb; strp_pause(strp); - strp->sk->sk_state_change(strp->sk); + ctx->saved_data_ready(strp->sk); } static void tls_data_ready(struct sock *sk) From 9b45edfa3aa958a7a6730fd408580e35c893187d Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 1 Aug 2018 00:50:24 +0800 Subject: [PATCH 1471/1640] UPSTREAM: net/tls: Use kmemdup to simplify the code Kmemdup is better than kmalloc+memcpy. So replace them. Signed-off-by: zhong jiang Signed-off-by: David S. Miller --- net/tls/tls_device.c | 3 +-- net/tls/tls_sw.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 1e968d238adf..292742e50bfa 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -716,12 +716,11 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); ctx->tx.rec_seq_size = rec_seq_size; - ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + ctx->tx.rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL); if (!ctx->tx.rec_seq) { rc = -ENOMEM; goto free_iv; } - memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size); rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info); if (rc) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 33838f11fafa..ff3a6904a722 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1173,12 +1173,11 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); cctx->rec_seq_size = rec_seq_size; - cctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + cctx->rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL); if (!cctx->rec_seq) { rc = -ENOMEM; goto free_iv; } - memcpy(cctx->rec_seq, rec_seq, rec_seq_size); if (sw_ctx_tx) { sg_init_table(sw_ctx_tx->sg_encrypted_data, From c50758dce56c41942c41004070cdeda114d99bf0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 23 Jul 2018 10:01:33 -0700 Subject: [PATCH 1472/1640] UPSTREAM: crypto: scatterwalk - remove 'chain' argument from scatterwalk_crypto_chain() All callers pass chain=0 to scatterwalk_crypto_chain(). Remove this unneeded parameter. 
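For reference, with the dead 'chain' branch gone, the helper (as it
reads after this patch's hunk in include/crypto/scatterwalk.h) reduces
to:

  static inline void scatterwalk_crypto_chain(struct scatterlist *head,
                                              struct scatterlist *sg, int num)
  {
      if (sg)
          sg_chain(head, num, sg);      /* link the next table fragment */
      else
          sg_mark_end(head);            /* no continuation: terminate here */
  }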
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/lrw.c | 4 ++-- crypto/scatterwalk.c | 2 +- crypto/xts.c | 4 ++-- include/crypto/scatterwalk.h | 8 +------- net/tls/tls_device_fallback.c | 2 +- 5 files changed, 7 insertions(+), 13 deletions(-) diff --git a/crypto/lrw.c b/crypto/lrw.c index 1b73fec817cf..79b5bc97f5c8 100644 --- a/crypto/lrw.c +++ b/crypto/lrw.c @@ -189,7 +189,7 @@ static int post_crypt(struct skcipher_request *req) if (rctx->dst != sg) { rctx->dst[0] = *sg; sg_unmark_end(rctx->dst); - scatterwalk_crypto_chain(rctx->dst, sg_next(sg), 0, 2); + scatterwalk_crypto_chain(rctx->dst, sg_next(sg), 2); } rctx->dst[0].length -= offset - sg->offset; rctx->dst[0].offset = offset; @@ -266,7 +266,7 @@ static int pre_crypt(struct skcipher_request *req) if (rctx->src != sg) { rctx->src[0] = *sg; sg_unmark_end(rctx->src); - scatterwalk_crypto_chain(rctx->src, sg_next(sg), 0, 2); + scatterwalk_crypto_chain(rctx->src, sg_next(sg), 2); } rctx->src[0].length -= offset - sg->offset; rctx->src[0].offset = offset; diff --git a/crypto/scatterwalk.c b/crypto/scatterwalk.c index c16c94f88733..d0b92c1cd6e9 100644 --- a/crypto/scatterwalk.c +++ b/crypto/scatterwalk.c @@ -91,7 +91,7 @@ struct scatterlist *scatterwalk_ffwd(struct scatterlist dst[2], sg_init_table(dst, 2); sg_set_page(dst, sg_page(src), src->length - len, src->offset + len); - scatterwalk_crypto_chain(dst, sg_next(src), 0, 2); + scatterwalk_crypto_chain(dst, sg_next(src), 2); return dst; } diff --git a/crypto/xts.c b/crypto/xts.c index f5fba941d6f6..123552522b80 100644 --- a/crypto/xts.c +++ b/crypto/xts.c @@ -138,7 +138,7 @@ static int post_crypt(struct skcipher_request *req) if (rctx->dst != sg) { rctx->dst[0] = *sg; sg_unmark_end(rctx->dst); - scatterwalk_crypto_chain(rctx->dst, sg_next(sg), 0, 2); + scatterwalk_crypto_chain(rctx->dst, sg_next(sg), 2); } rctx->dst[0].length -= offset - sg->offset; rctx->dst[0].offset = offset; @@ -204,7 +204,7 @@ static int pre_crypt(struct skcipher_request *req) if (rctx->src != sg) { rctx->src[0] = *sg; sg_unmark_end(rctx->src); - scatterwalk_crypto_chain(rctx->src, sg_next(sg), 0, 2); + scatterwalk_crypto_chain(rctx->src, sg_next(sg), 2); } rctx->src[0].length -= offset - sg->offset; rctx->src[0].offset = offset; diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h index 880e6be9e95e..eac72840a7d2 100644 --- a/include/crypto/scatterwalk.h +++ b/include/crypto/scatterwalk.h @@ -22,14 +22,8 @@ #include static inline void scatterwalk_crypto_chain(struct scatterlist *head, - struct scatterlist *sg, - int chain, int num) + struct scatterlist *sg, int num) { - if (chain) { - head->length += sg->length; - sg = sg_next(sg); - } - if (sg) sg_chain(head, num, sg); else diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index e3313c45663f..6102169239d1 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -42,7 +42,7 @@ static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk) sg_set_page(sg, sg_page(src), src->length - diff, walk->offset); - scatterwalk_crypto_chain(sg, sg_next(src), 0, 2); + scatterwalk_crypto_chain(sg, sg_next(src), 2); } static int tls_enc_record(struct aead_request *aead_req, From 77de2ba611ad5a46fd3463894635a970a315b7dd Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Thu, 2 Aug 2018 20:43:10 +0530 Subject: [PATCH 1473/1640] UPSTREAM: net/tls: Mark the end in scatterlist table Function zerocopy_from_iter() unmarks the 'end' in input sgtable while adding new entries in it. 
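As an aside (a sketch of the standard scatterlist API behaviour, not
code from this patch), the end marker is what terminates table walks:
sg_nents() counts entries by following sg_next() until it returns NULL,
which only happens at an entry marked via sg_mark_end():

  /* Essentially what lib/scatterlist.c's sg_nents() does. */
  static int demo_sg_nents(struct scatterlist *sgl)
  {
      struct scatterlist *sg;
      int nents = 0;

      for (sg = sgl; sg; sg = sg_next(sg))  /* NULL past the marked end */
          nents++;
      return nents;
  }

Without the marker on the last used entry, such a walk runs past the
initialised region.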
The last entry in sgtable remained unmarked. This results in KASAN error report on using apis like sg_nents(). Before returning, the function needs to mark the 'end' in the last entry it adds. Signed-off-by: Vakul Garg Acked-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index ff3a6904a722..83d67df33f0c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -311,6 +311,9 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, } } + /* Mark the end in the last sg entry if newly added */ + if (num_elem > *pages_used) + sg_mark_end(&to[num_elem - 1]); out: if (rc) iov_iter_revert(from, size - *size_used); From 2a49f78f43c957e8c94a76b74754b08cf0f2c213 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Fri, 10 Aug 2018 20:46:41 +0530 Subject: [PATCH 1474/1640] UPSTREAM: net/tls: Combined memory allocation for decryption request For preparing decryption request, several memory chunks are required (aead_req, sgin, sgout, iv, aad). For submitting the decrypt request to an accelerator, it is required that the buffers which are read by the accelerator must be dma-able and not come from stack. The buffers for aad and iv can be separately kmalloced each, but it is inefficient. This patch does a combined allocation for preparing decryption request and then segments into aead_req || sgin || sgout || iv || aad. Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- include/net/tls.h | 4 - net/tls/tls_sw.c | 236 +++++++++++++++++++++++++++------------------- 2 files changed, 141 insertions(+), 99 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 91ef19a7f8f9..d5c683e8bb22 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -111,10 +111,6 @@ struct tls_sw_context_tx { struct scatterlist sg_aead_in[2]; /* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */ struct scatterlist sg_aead_out[2]; - - char rx_aad_ciphertext[TLS_AAD_SPACE_SIZE]; - char rx_aad_plaintext[TLS_AAD_SPACE_SIZE]; - }; struct tls_sw_context_rx { diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 83d67df33f0c..52fbe727d7c1 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -48,19 +48,13 @@ static int tls_do_decryption(struct sock *sk, struct scatterlist *sgout, char *iv_recv, size_t data_len, - struct sk_buff *skb, - gfp_t flags) + struct aead_request *aead_req) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct aead_request *aead_req; - int ret; - aead_req = aead_request_alloc(ctx->aead_recv, flags); - if (!aead_req) - return -ENOMEM; - + aead_request_set_tfm(aead_req, ctx->aead_recv); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); aead_request_set_crypt(aead_req, sgin, sgout, data_len + tls_ctx->rx.tag_size, @@ -69,8 +63,6 @@ static int tls_do_decryption(struct sock *sk, crypto_req_done, &ctx->async_wait); ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); - - aead_request_free(aead_req); return ret; } @@ -657,8 +649,132 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return skb; } +/* This function decrypts the input skb into either out_iov or in out_sg + * or in skb buffers itself. The input parameter 'zc' indicates if + * zero-copy mode needs to be tried or not. With zero-copy mode, either + * out_iov or out_sg must be non-NULL. In case both out_iov and out_sg are + * NULL, then the decryption happens inside skb buffers itself, i.e. 
+ * zero-copy gets disabled and 'zc' is updated. + */ + +static int decrypt_internal(struct sock *sk, struct sk_buff *skb, + struct iov_iter *out_iov, + struct scatterlist *out_sg, + int *chunk, bool *zc) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + int n_sgin, n_sgout, nsg, mem_size, aead_size, err, pages = 0; + struct aead_request *aead_req; + struct sk_buff *unused; + u8 *aad, *iv, *mem = NULL; + struct scatterlist *sgin = NULL; + struct scatterlist *sgout = NULL; + const int data_len = rxm->full_len - tls_ctx->rx.overhead_size; + + if (*zc && (out_iov || out_sg)) { + if (out_iov) + n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1; + else + n_sgout = sg_nents(out_sg); + } else { + n_sgout = 0; + *zc = false; + } + + n_sgin = skb_cow_data(skb, 0, &unused); + if (n_sgin < 1) + return -EBADMSG; + + /* Increment to accommodate AAD */ + n_sgin = n_sgin + 1; + + nsg = n_sgin + n_sgout; + + aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv); + mem_size = aead_size + (nsg * sizeof(struct scatterlist)); + mem_size = mem_size + TLS_AAD_SPACE_SIZE; + mem_size = mem_size + crypto_aead_ivsize(ctx->aead_recv); + + /* Allocate a single block of memory which contains + * aead_req || sgin[] || sgout[] || aad || iv. + * This order achieves correct alignment for aead_req, sgin, sgout. + */ + mem = kmalloc(mem_size, sk->sk_allocation); + if (!mem) + return -ENOMEM; + + /* Segment the allocated memory */ + aead_req = (struct aead_request *)mem; + sgin = (struct scatterlist *)(mem + aead_size); + sgout = sgin + n_sgin; + aad = (u8 *)(sgout + n_sgout); + iv = aad + TLS_AAD_SPACE_SIZE; + + /* Prepare IV */ + err = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, + iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + tls_ctx->rx.iv_size); + if (err < 0) { + kfree(mem); + return err; + } + memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + + /* Prepare AAD */ + tls_make_aad(aad, rxm->full_len - tls_ctx->rx.overhead_size, + tls_ctx->rx.rec_seq, tls_ctx->rx.rec_seq_size, + ctx->control); + + /* Prepare sgin */ + sg_init_table(sgin, n_sgin); + sg_set_buf(&sgin[0], aad, TLS_AAD_SPACE_SIZE); + err = skb_to_sgvec(skb, &sgin[1], + rxm->offset + tls_ctx->rx.prepend_size, + rxm->full_len - tls_ctx->rx.prepend_size); + if (err < 0) { + kfree(mem); + return err; + } + + if (n_sgout) { + if (out_iov) { + sg_init_table(sgout, n_sgout); + sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE); + + *chunk = 0; + err = zerocopy_from_iter(sk, out_iov, data_len, &pages, + chunk, &sgout[1], + (n_sgout - 1), false); + if (err < 0) + goto fallback_to_reg_recv; + } else if (out_sg) { + memcpy(sgout, out_sg, n_sgout * sizeof(*sgout)); + } else { + goto fallback_to_reg_recv; + } + } else { +fallback_to_reg_recv: + sgout = sgin; + pages = 0; + *chunk = 0; + *zc = false; + } + + /* Prepare and submit AEAD request */ + err = tls_do_decryption(sk, sgin, sgout, iv, data_len, aead_req); + + /* Release the pages in case iov was mapped to pages */ + for (; pages > 0; pages--) + put_page(sg_page(&sgout[pages])); + + kfree(mem); + return err; +} + static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, - struct scatterlist *sgout, bool *zc) + struct iov_iter *dest, int *chunk, bool *zc) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -671,7 +787,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, return err; #endif if (!ctx->decrypted) { - 
err = decrypt_skb(sk, skb, sgout); + err = decrypt_internal(sk, skb, dest, NULL, chunk, zc); if (err < 0) return err; } else { @@ -690,54 +806,10 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE]; - struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2]; - struct scatterlist *sgin = &sgin_arr[0]; - struct strp_msg *rxm = strp_msg(skb); - int ret, nsg = ARRAY_SIZE(sgin_arr); - struct sk_buff *unused; + bool zc = true; + int chunk; - ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, - iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - tls_ctx->rx.iv_size); - if (ret < 0) - return ret; - - memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); - if (!sgout) { - nsg = skb_cow_data(skb, 0, &unused) + 1; - sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation); - sgout = sgin; - } - - sg_init_table(sgin, nsg); - sg_set_buf(&sgin[0], ctx->rx_aad_ciphertext, TLS_AAD_SPACE_SIZE); - - nsg = skb_to_sgvec(skb, &sgin[1], - rxm->offset + tls_ctx->rx.prepend_size, - rxm->full_len - tls_ctx->rx.prepend_size); - if (nsg < 0) { - ret = nsg; - goto out; - } - - tls_make_aad(ctx->rx_aad_ciphertext, - rxm->full_len - tls_ctx->rx.overhead_size, - tls_ctx->rx.rec_seq, - tls_ctx->rx.rec_seq_size, - ctx->control); - - ret = tls_do_decryption(sk, sgin, sgout, iv, - rxm->full_len - tls_ctx->rx.overhead_size, - skb, sk->sk_allocation); - -out: - if (sgin != &sgin_arr[0]) - kfree(sgin); - - return ret; + return decrypt_internal(sk, skb, NULL, sgout, &chunk, &zc); } static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, @@ -816,43 +888,17 @@ int tls_sw_recvmsg(struct sock *sk, } if (!ctx->decrypted) { - int page_count; - int to_copy; - - page_count = iov_iter_npages(&msg->msg_iter, - MAX_SKB_FRAGS); - to_copy = rxm->full_len - tls_ctx->rx.overhead_size; - if (!is_kvec && to_copy <= len && page_count < MAX_SKB_FRAGS && - likely(!(flags & MSG_PEEK))) { - struct scatterlist sgin[MAX_SKB_FRAGS + 1]; - int pages = 0; + int to_copy = rxm->full_len - tls_ctx->rx.overhead_size; + if (!is_kvec && to_copy <= len && + likely(!(flags & MSG_PEEK))) zc = true; - sg_init_table(sgin, MAX_SKB_FRAGS + 1); - sg_set_buf(&sgin[0], ctx->rx_aad_plaintext, - TLS_AAD_SPACE_SIZE); - err = zerocopy_from_iter(sk, &msg->msg_iter, - to_copy, &pages, - &chunk, &sgin[1], - MAX_SKB_FRAGS, false); - if (err < 0) - goto fallback_to_reg_recv; - - err = decrypt_skb_update(sk, skb, sgin, &zc); - for (; pages > 0; pages--) - put_page(sg_page(&sgin[pages])); - if (err < 0) { - tls_err_abort(sk, EBADMSG); - goto recv_end; - } - } else { -fallback_to_reg_recv: - err = decrypt_skb_update(sk, skb, NULL, &zc); - if (err < 0) { - tls_err_abort(sk, EBADMSG); - goto recv_end; - } + err = decrypt_skb_update(sk, skb, &msg->msg_iter, + &chunk, &zc); + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto recv_end; } ctx->decrypted = true; } @@ -903,7 +949,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, int err = 0; long timeo; int chunk; - bool zc; + bool zc = false; lock_sock(sk); @@ -920,7 +966,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (!ctx->decrypted) { - err = decrypt_skb_update(sk, skb, NULL, &zc); + err = decrypt_skb_update(sk, skb, NULL, &chunk, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); From 
b4df825e9c9396f49e001bb6acd7a72f304b70b6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Thu, 16 Aug 2018 21:49:06 +0200
Subject: [PATCH 1475/1640] BACKPORT: tcp, ulp: add alias for all ulp modules

Let's not turn the TCP ULP lookup into an arbitrary module loader as we
only intend to load ULP modules through this mechanism, not other
unrelated kernel modules:

  [root@bar]# cat foo.c
  #include <sys/types.h>
  #include <sys/socket.h>
  #include <linux/tcp.h>
  #include <linux/in.h>

  int main(void)
  {
      int sock = socket(PF_INET, SOCK_STREAM, 0);
      setsockopt(sock, IPPROTO_TCP, TCP_ULP, "sctp", sizeof("sctp"));
      return 0;
  }

  [root@bar]# gcc foo.c -O2 -Wall
  [root@bar]# lsmod | grep sctp
  [root@bar]# ./a.out
  [root@bar]# lsmod | grep sctp
  sctp                 1077248  4
  libcrc32c              16384  3 nf_conntrack,nf_nat,sctp
  [root@bar]#

Fix it by adding a module alias to TCP ULP modules, so that probing a
module via request_module() will be limited to tcp-ulp-[name]. The
existing modules like kTLS will load fine given the tcp-ulp-tls alias,
but others will fail to load:

  [root@bar]# lsmod | grep sctp
  [root@bar]# ./a.out
  [root@bar]# lsmod | grep sctp
  [root@bar]#

Sockmap is not affected by this since it's either built-in or not.

Fixes: 734942cc4ea6 ("tcp: ULP infrastructure")
Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Acked-by: Song Liu
Signed-off-by: Alexei Starovoitov
---
 include/net/tcp.h  | 4 ++++
 net/ipv4/tcp_ulp.c | 2 +-
 net/tls/tls_main.c | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2beba3dc6369..2c8f18224acc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2159,6 +2159,10 @@ int tcp_set_ulp(struct sock *sk, const char *name);
 void tcp_get_available_ulp(char *buf, size_t len);
 void tcp_cleanup_ulp(struct sock *sk);
 
+#define MODULE_ALIAS_TCP_ULP(name)				\
+	__MODULE_INFO(alias, alias_userspace, name);		\
+	__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
+
 struct sk_msg;
 struct sk_psock;
 
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index d8e6a42c6b3e..07bf9e02df13 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -39,7 +39,7 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
 #ifdef CONFIG_MODULES
 	if (!ulp && capable(CAP_NET_ADMIN)) {
 		rcu_read_unlock();
-		request_module("%s", name);
+		request_module("tcp-ulp-%s", name);
 		rcu_read_lock();
 		ulp = tcp_ulp_find(name);
 	}
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index afaa4910bc7e..c83c59598f98 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -45,6 +45,7 @@ MODULE_AUTHOR("Mellanox Technologies");
 MODULE_DESCRIPTION("Transport Layer Security Support");
 MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_TCP_ULP("tls");
 
 enum {
 	TLSV4,

From dd60e463da1967815b7f8bfb1ede5b4732db34f7 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Wed, 22 Aug 2018 08:37:32 -0700
Subject: [PATCH 1476/1640] UPSTREAM: tls: possible hang when do_tcp_sendpages
 hits sndbuf is full case

Currently, the lower protocol's sk_write_space handler is not called if
TLS is sending a scatterlist via tls_push_sg. However, normally
tls_push_sg calls do_tcp_sendpage, which may be under memory pressure,
and that in turn may trigger a wait via sk_wait_event. Typically, this
happens when the in-flight bytes exceed the sndbuf size. In the normal
case when enough ACKs are received sk_write_space() will be called and
the sk_wait_event will be woken up allowing it to send more data and/or
return to the user.

But, in the TLS case, because the sk_write_space() handler does not
wake up the events, the above send will wait until the sndtimeo is
exceeded.
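The shape of the fix, as a sketch (the real change is the tls_main.c
hunk below), is to forward the write-space event instead of swallowing
it while a lower-layer send is in flight:

  /* Sketch only; see the diff below for the actual patch. */
  static void tls_write_space(struct sock *sk)
  {
      struct tls_context *ctx = tls_get_ctx(sk);

      if (ctx->in_tcp_sendpages) {
          ctx->sk_write_space(sk);   /* wake TCP's sk_wait_event waiters */
          return;
      }
      /* ... normal TLS write-space handling follows ... */
  }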
To fix this, pass the sk_write_space event to the lower layer's sk_write_space handler, which in the TCP case will wake any pending events. I observed the above while integrating sockmap and ktls. It initially appeared as test_sockmap (modified to use ktls) occasionally hanging. To reliably reproduce this, reduce the sndbuf size and stress the tls layer by sending many 1B sends, as in the sketch above. This results in every byte needing a header and each byte individually being sent to the crypto layer. Signed-off-by: John Fastabend Acked-by: Dave Watson Signed-off-by: Daniel Borkmann --- net/tls/tls_main.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index c83c59598f98..168b53fe3916 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -213,9 +213,14 @@ static void tls_write_space(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); - /* We are already sending pages, ignore notification */ - if (ctx->in_tcp_sendpages) + /* If in_tcp_sendpages call lower protocol write space handler + * to ensure we wake up any waiting operations there. For example + * if do_tcp_sendpages were to call sk_wait_event. + */ + if (ctx->in_tcp_sendpages) { + ctx->sk_write_space(sk); return; + } if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) { gfp_t sk_allocation = sk->sk_allocation; From 60f9ec1e56802391e80cba45aaa71e5ebabfe3c4 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Thu, 6 Sep 2018 21:41:40 +0530 Subject: [PATCH 1477/1640] UPSTREAM: net/tls: Set count of SG entries if sk_alloc_sg returns -ENOSPC tls_sw_sendmsg() allocates plaintext and encrypted SG entries using function sk_alloc_sg(). In case the number of SG entries hits MAX_SKB_FRAGS, sk_alloc_sg() returns -ENOSPC and sets the variable for the current SG index to '0'. This leads to calling of function tls_push_record() with 'sg_encrypted_num_elem = 0' and later causes a kernel crash. To fix this, set the number of SG elements to the number of elements in the plaintext/encrypted SG arrays in case sk_alloc_sg() returns -ENOSPC. Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 52fbe727d7c1..e28a6ff25d96 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -125,6 +125,9 @@ static int alloc_encrypted_sg(struct sock *sk, int len) &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, 0); + if (rc == -ENOSPC) + ctx->sg_encrypted_num_elem = ARRAY_SIZE(ctx->sg_encrypted_data); + return rc; } @@ -138,6 +141,9 @@ static int alloc_plaintext_sg(struct sock *sk, int len) &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, tls_ctx->pending_open_record_frags); + if (rc == -ENOSPC) + ctx->sg_plaintext_num_elem = ARRAY_SIZE(ctx->sg_plaintext_data); + return rc; } From 2b42fd651b9b4e38e3cf0e02d4557b0c006cb95c Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 12 Sep 2018 17:44:41 +0200 Subject: [PATCH 1478/1640] UPSTREAM: tls: don't copy the key out of tls12_crypto_info_aes_gcm_128 There's no need to copy the key to an on-stack buffer before calling crypto_aead_setkey(). Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Sabrina Dubroca Signed-off-by: David S.
Miller --- net/tls/tls_sw.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index e28a6ff25d96..f29b7c49cbf2 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1136,7 +1136,6 @@ void tls_sw_free_resources_rx(struct sock *sk) int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { - char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; struct tls_sw_context_tx *sw_ctx_tx = NULL; @@ -1265,9 +1264,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) ctx->push_pending_record = tls_sw_push_pending_record; - memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); - - rc = crypto_aead_setkey(*aead, keyval, + rc = crypto_aead_setkey(*aead, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); if (rc) goto free_aead; From c654a345cb2ab62e65f8c6caf1ff45c0c80f90ff Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 12 Sep 2018 17:44:42 +0200 Subject: [PATCH 1479/1640] UPSTREAM: tls: zero the crypto information from tls_context before freeing This contains key material in crypto_send_aes_gcm_128 and crypto_recv_aes_gcm_128. Introduce union tls_crypto_context, and replace the two identical unions directly embedded in struct tls_context with it. We can then use this union to clean up the memory in the new tls_ctx_free() function. Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/net/tls.h | 19 +++++++++---------- net/tls/tls_device.c | 6 +++--- net/tls/tls_device_fallback.c | 2 +- net/tls/tls_main.c | 20 +++++++++++++++----- net/tls/tls_sw.c | 8 ++++---- 5 files changed, 32 insertions(+), 23 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index d5c683e8bb22..0a769cf2f5f3 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -171,15 +171,14 @@ struct cipher_context { char *rec_seq; }; +union tls_crypto_context { + struct tls_crypto_info info; + struct tls12_crypto_info_aes_gcm_128 aes_gcm_128; +}; + struct tls_context { - union { - struct tls_crypto_info crypto_send; - struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128; - }; - union { - struct tls_crypto_info crypto_recv; - struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128; - }; + union tls_crypto_context crypto_send; + union tls_crypto_context crypto_recv; struct list_head list; struct net_device *netdev; @@ -367,8 +366,8 @@ static inline void tls_fill_prepend(struct tls_context *ctx, * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE */ buf[0] = record_type; - buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.version); - buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.version); + buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.info.version); + buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.info.version); /* we can use IV for nonce explicit according to spec */ buf[3] = pkt_len >> 8; buf[4] = pkt_len & 0xFF; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 292742e50bfa..961b07d4d41c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -686,7 +686,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) goto free_marker_record; } - crypto_info = &ctx->crypto_send; + crypto_info = &ctx->crypto_send.info; switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; @@ -780,7 +780,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) 
ctx->priv_ctx_tx = offload_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, - &ctx->crypto_send, + &ctx->crypto_send.info, tcp_sk(sk)->write_seq); if (rc) goto release_netdev; @@ -862,7 +862,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) goto release_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, - &ctx->crypto_recv, + &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); if (rc) { pr_err_ratelimited("%s: The netdev has refused to offload this socket\n", diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 6102169239d1..450a6dbc5a88 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -320,7 +320,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, goto free_req; iv = buf; - memcpy(iv, tls_ctx->crypto_send_aes_gcm_128.salt, + memcpy(iv, tls_ctx->crypto_send.aes_gcm_128.salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 168b53fe3916..e86a01abd200 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -241,6 +241,16 @@ static void tls_write_space(struct sock *sk) ctx->sk_write_space(sk); } +static void tls_ctx_free(struct tls_context *ctx) +{ + if (!ctx) + return; + + memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send)); + memzero_explicit(&ctx->crypto_recv, sizeof(ctx->crypto_recv)); + kfree(ctx); +} + static void tls_sk_proto_close(struct sock *sk, long timeout) { struct tls_context *ctx = tls_get_ctx(sk); @@ -294,7 +304,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) #else { #endif - kfree(ctx); + tls_ctx_free(ctx); ctx = NULL; } @@ -305,7 +315,7 @@ skip_tx_cleanup: * for sk->sk_prot->unhash [tls_hw_unhash] */ if (free_ctx) - kfree(ctx); + tls_ctx_free(ctx); } static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, @@ -330,7 +340,7 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, } /* get user crypto info */ - crypto_info = &ctx->crypto_send; + crypto_info = &ctx->crypto_send.info; if (!TLS_CRYPTO_INFO_READY(crypto_info)) { rc = -EBUSY; @@ -417,9 +427,9 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, } if (tx) - crypto_info = &ctx->crypto_send; + crypto_info = &ctx->crypto_send.info; else - crypto_info = &ctx->crypto_recv; + crypto_info = &ctx->crypto_recv.info; /* Currently we don't support set crypto info more than one time */ if (TLS_CRYPTO_INFO_READY(crypto_info)) { diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f29b7c49cbf2..9e918489f4fb 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1055,8 +1055,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) goto read_failure; } - if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.version) || - header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.version)) { + if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.info.version) || + header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.info.version)) { ret = -EINVAL; goto read_failure; } @@ -1180,12 +1180,12 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) if (tx) { crypto_init_wait(&sw_ctx_tx->async_wait); - crypto_info = &ctx->crypto_send; + crypto_info = &ctx->crypto_send.info; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; } else { crypto_init_wait(&sw_ctx_rx->async_wait); - crypto_info = &ctx->crypto_recv; + crypto_info = 
&ctx->crypto_recv.info; cctx = &ctx->rx; aead = &sw_ctx_rx->aead_recv; } From d0a4e4c4bd5f9385726abc7f445ea09237b9a9a7 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 12 Sep 2018 17:44:43 +0200 Subject: [PATCH 1480/1640] UPSTREAM: tls: clear key material from kernel memory when do_tls_setsockopt_conf fails Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Sabrina Dubroca Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/tls/tls_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index e86a01abd200..d6ae2b8237e9 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -509,7 +509,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, goto out; err_crypto_info: - memset(crypto_info, 0, sizeof(*crypto_info)); + memzero_explicit(crypto_info, sizeof(union tls_crypto_context)); out: return rc; } From 2077c90fb095e7fc6fc9835223284a46022af8e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Sep 2018 23:00:55 +0200 Subject: [PATCH 1481/1640] BACKPORT: tls: fix currently broken MSG_PEEK behavior In kTLS MSG_PEEK behavior is currently failing, strace example: [pid 2430] socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3 [pid 2430] socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 4 [pid 2430] bind(4, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("0.0.0.0")}, 16) = 0 [pid 2430] listen(4, 10) = 0 [pid 2430] getsockname(4, {sa_family=AF_INET, sin_port=htons(38855), sin_addr=inet_addr("0.0.0.0")}, [16]) = 0 [pid 2430] connect(3, {sa_family=AF_INET, sin_port=htons(38855), sin_addr=inet_addr("0.0.0.0")}, 16) = 0 [pid 2430] setsockopt(3, SOL_TCP, 0x1f /* TCP_??? */, [7564404], 4) = 0 [pid 2430] setsockopt(3, 0x11a /* SOL_?? */, 1, "\3\0033\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 40) = 0 [pid 2430] accept(4, {sa_family=AF_INET, sin_port=htons(49636), sin_addr=inet_addr("127.0.0.1")}, [16]) = 5 [pid 2430] setsockopt(5, SOL_TCP, 0x1f /* TCP_??? */, [7564404], 4) = 0 [pid 2430] setsockopt(5, 0x11a /* SOL_?? */, 2, "\3\0033\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 40) = 0 [pid 2430] close(4) = 0 [pid 2430] sendto(3, "test_read_peek", 14, 0, NULL, 0) = 14 [pid 2430] sendto(3, "_mult_recs\0", 11, 0, NULL, 0) = 11 [pid 2430] recvfrom(5, "test_read_peektest_read_peektest"..., 64, MSG_PEEK, NULL, NULL) = 64 As can be seen from strace, there are two TLS records sent, i) 'test_read_peek' and ii) '_mult_recs\0' where we end up peeking 'test_read_peektest_read_peektest'. This is clearly wrong, and what happens is that given peek cannot call into tls_sw_advance_skb() to unpause strparser and proceed with the next skb, we end up looping over the current one, copying the 'test_read_peek' over and over into the user provided buffer. Here, we can only peek into the currently held skb (current, full TLS record) as otherwise we would end up having to hold all the original skb(s) (depending on the peek depth) in a separate queue when unpausing strparser to process next records, minimally intrusive is to return only up to the current record's size (which likely was what c46234ebb4d1 ("tls: RX path for ktls") originally intended as well). 
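In userspace terms, the intended semantics are that a peek returns at most the current record (hedged sketch; the two records match the strace above, and fd is assumed to have kTLS RX configured):

    #include <assert.h>
    #include <string.h>
    #include <sys/types.h>
    #include <sys/socket.h>

    static void check_peek_first_record_only(int fd)
    {
            char buf[64];
            ssize_t n;

            /* Peer sent "test_read_peek" (14B) and "_mult_recs\0" (11B)
             * as two separate TLS records. MSG_PEEK must not splice
             * data from the second record into the peek of the first.
             */
            n = recv(fd, buf, sizeof(buf), MSG_PEEK);
            assert(n == 14 && !memcmp(buf, "test_read_peek", 14));
    }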
Thus, after patch we properly peek the first record: [pid 2046] wait4(2075, [pid 2075] socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3 [pid 2075] socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 4 [pid 2075] bind(4, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("0.0.0.0")}, 16) = 0 [pid 2075] listen(4, 10) = 0 [pid 2075] getsockname(4, {sa_family=AF_INET, sin_port=htons(55115), sin_addr=inet_addr("0.0.0.0")}, [16]) = 0 [pid 2075] connect(3, {sa_family=AF_INET, sin_port=htons(55115), sin_addr=inet_addr("0.0.0.0")}, 16) = 0 [pid 2075] setsockopt(3, SOL_TCP, 0x1f /* TCP_??? */, [7564404], 4) = 0 [pid 2075] setsockopt(3, 0x11a /* SOL_?? */, 1, "\3\0033\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 40) = 0 [pid 2075] accept(4, {sa_family=AF_INET, sin_port=htons(45732), sin_addr=inet_addr("127.0.0.1")}, [16]) = 5 [pid 2075] setsockopt(5, SOL_TCP, 0x1f /* TCP_??? */, [7564404], 4) = 0 [pid 2075] setsockopt(5, 0x11a /* SOL_?? */, 2, "\3\0033\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 40) = 0 [pid 2075] close(4) = 0 [pid 2075] sendto(3, "test_read_peek", 14, 0, NULL, 0) = 14 [pid 2075] sendto(3, "_mult_recs\0", 11, 0, NULL, 0) = 11 [pid 2075] recvfrom(5, "test_read_peek", 64, MSG_PEEK, NULL, NULL) = 14 Fixes: c46234ebb4d1 ("tls: RX path for ktls") Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9e918489f4fb..b9c6ecfbcfea 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -931,7 +931,15 @@ int tls_sw_recvmsg(struct sock *sk, if (control != TLS_RECORD_TYPE_DATA) goto recv_end; } + } else { + /* MSG_PEEK right now cannot look beyond current skb + * from strparser, meaning we cannot advance skb here + * and thus unpause strparser since we'd loose original + * one. + */ + break; } + /* If we have a new message from strparser, continue now. */ if (copied >= target && !ctx->recv_pkt) break; From 349bff46118991933022c7976d8d6d13a23eb684 Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Wed, 19 Dec 2018 17:18:22 +0530 Subject: [PATCH 1482/1640] UPSTREAM: net/tls: allocate tls context using GFP_ATOMIC [ Upstream commit c6ec179a0082e2e76e3a72050c2b99d3d0f3da3f ] create_ctx can be called from atomic context, hence use GFP_ATOMIC instead of GFP_KERNEL. [ 395.962599] BUG: sleeping function called from invalid context at mm/slab.h:421 [ 395.979896] in_atomic(): 1, irqs_disabled(): 0, pid: 16254, name: openssl [ 395.996564] 2 locks held by openssl/16254: [ 396.010492] #0: 00000000347acb52 (sk_lock-AF_INET){+.+.}, at: do_tcp_setsockopt.isra.44+0x13b/0x9a0 [ 396.029838] #1: 000000006c9552b5 (device_spinlock){+...}, at: tls_init+0x1d/0x280 [ 396.047675] CPU: 5 PID: 16254 Comm: openssl Tainted: G O 4.20.0-rc6+ #25 [ 396.066019] Hardware name: Supermicro X10SRA-F/X10SRA-F, BIOS 2.0c 09/25/2017 [ 396.083537] Call Trace: [ 396.096265] dump_stack+0x5e/0x8b [ 396.109876] ___might_sleep+0x216/0x250 [ 396.123940] kmem_cache_alloc_trace+0x1b0/0x240 [ 396.138800] create_ctx+0x1f/0x60 [ 396.152504] tls_init+0xbd/0x280 [ 396.166135] tcp_set_ulp+0x191/0x2d0 [ 396.180035] ? tcp_set_ulp+0x2c/0x2d0 [ 396.193960] do_tcp_setsockopt.isra.44+0x148/0x9a0 [ 396.209013] __sys_setsockopt+0x7c/0xe0 [ 396.223054] __x64_sys_setsockopt+0x20/0x30 [ 396.237378] do_syscall_64+0x4a/0x180 [ 396.251200] entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: df9d4a178022 ("net/tls: sleeping function from invalid context") Signed-off-by: Ganesh Goudar Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index d6ae2b8237e9..061d7bf163d3 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -550,7 +550,7 @@ static struct tls_context *create_ctx(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); if (!ctx) return NULL; From fa4ccd4f8fe5d1c0a1857e500edf39ef98281d89 Mon Sep 17 00:00:00 2001 From: Atul Gupta Date: Tue, 11 Dec 2018 02:19:40 -0800 Subject: [PATCH 1483/1640] UPSTREAM: net/tls: Init routines in create_ctx [ Upstream commit 6c0563e442528733219afe15c749eb2cc365da3f ] create_ctx is called from both tls_init and tls_hw_prot, hence initialize the function pointers in the common routine. Signed-off-by: Atul Gupta Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/tls/tls_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 061d7bf163d3..cb4846fa5860 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -555,6 +555,9 @@ static struct tls_context *create_ctx(struct sock *sk) return NULL; icsk->icsk_ulp_data = ctx; + ctx->setsockopt = sk->sk_prot->setsockopt; + ctx->getsockopt = sk->sk_prot->getsockopt; + ctx->sk_proto_close = sk->sk_prot->close; return ctx; } @@ -685,9 +688,6 @@ static int tls_init(struct sock *sk) rc = -ENOMEM; goto out; } - ctx->setsockopt = sk->sk_prot->setsockopt; - ctx->getsockopt = sk->sk_prot->getsockopt; - ctx->sk_proto_close = sk->sk_prot->close; /* Build IPv6 TLS whenever the address of tcpv6 _prot changes */ if (ip_ver == TLSV6 && From e0ec930b396e08c37d743325c10d75e57bc95b7b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 8 Apr 2019 17:59:50 -0700 Subject: [PATCH 1484/1640] UPSTREAM: net/tls: prevent bad memory access in tls_is_sk_tx_device_offloaded() [ Upstream commit b4f47f3848eb70986f75d06112af7b48b7f5f462 ] Unlike the '&&' operator, '&' does not have short-circuit evaluation semantics. IOW both sides of the operator always get evaluated. Fix the wrong operator in tls_is_sk_tx_device_offloaded(), which would lead to out-of-bounds access for non-full sockets. Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Reviewed-by: Simon Horman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/tls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/tls.h b/include/net/tls.h index 0a769cf2f5f3..c423b7d0b6ab 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -317,7 +317,7 @@ tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) { #ifdef CONFIG_SOCK_VALIDATE_XMIT - return sk_fullsock(sk) & + return sk_fullsock(sk) && (smp_load_acquire(&sk->sk_validate_xmit_skb) == &tls_validate_xmit_skb); #else From 2b457540dbfeb5219f91a1f26f1e1d6cb74428e1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Apr 2019 10:51:19 -0700 Subject: [PATCH 1485/1640] UPSTREAM: net/tls: fix refcount adjustment in fallback [ Upstream commit 9188d5ca454fd665145904267e726e9e8d122f5c ] Unlike atomic_add(), refcount_add() does not deal well with a negative argument.
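Concretely, refcount_t is unsigned and saturating, so a signed truesize delta cannot simply be handed to refcount_add(); it has to be split into an explicit sub or add. A hedged sketch of the pattern the fix below adopts (nskb, skb and sk as in complete_skb()):

    /* delta is signed: the reallocated fallback skb usually shrinks */
    int delta = nskb->truesize - skb->truesize;

    if (likely(delta < 0))
            /* shrink: explicitly subtract the (positive) magnitude */
            WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc));
    else if (delta)
            /* grow: only here is refcount_add() safe */
            refcount_add(delta, &sk->sk_wmem_alloc);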
TLS fallback code reallocates the skb and is very likely to shrink the truesize, leading to: [ 189.513254] WARNING: CPU: 5 PID: 0 at lib/refcount.c:81 refcount_add_not_zero_checked+0x15c/0x180 Call Trace: refcount_add_checked+0x6/0x40 tls_enc_skb+0xb93/0x13e0 [tls] Once wmem_allocated count saturates the application can no longer send data on the socket. This is similar to Eric's fixes for GSO, TCP: commit 7ec318feeed1 ("tcp: gso: avoid refcount_t warning from tcp_gso_segment()") and UDP: commit 575b65bc5bff ("udp: avoid refcount_t saturation in __udp_gso_segment()"). Unlike the GSO case, for TLS fallback it's likely that the skb has shrunk, so the "likely" annotation is the other way around (likely branch being "sub"). Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_device_fallback.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 450a6dbc5a88..ef8934fd8698 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -193,6 +193,9 @@ static void update_chksum(struct sk_buff *skb, int headln) static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) { + struct sock *sk = skb->sk; + int delta; + skb_copy_header(nskb, skb); skb_put(nskb, skb->len); @@ -200,11 +203,15 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) update_chksum(nskb, headln); nskb->destructor = skb->destructor; - nskb->sk = skb->sk; + nskb->sk = sk; skb->destructor = NULL; skb->sk = NULL; - refcount_add(nskb->truesize - skb->truesize, - &nskb->sk->sk_wmem_alloc); + + delta = nskb->truesize - skb->truesize; + if (likely(delta < 0)) + WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc)); + else if (delta) + refcount_add(delta, &sk->sk_wmem_alloc); } /* This function may be called after the user socket is already From 38afcf5d066f356e80bbb25106add39636d9dcb9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2019 16:51:38 -0700 Subject: [PATCH 1486/1640] UPSTREAM: net/tls: avoid potential deadlock in tls_set_device_offload_rx() [ Upstream commit 62ef81d5632634d5e310ed25b9b940b2b6612b46 ] If device supports offload, but offload fails tls_set_device_offload_rx() will call tls_sw_free_resources_rx() which (unhelpfully) releases and reacquires the socket lock. For a small fix release and reacquire the device_offload_lock. Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 961b07d4d41c..b61b893e9204 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -874,7 +874,9 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) goto release_netdev; free_sw_resources: + up_read(&device_offload_lock); tls_sw_free_resources_rx(sk); + down_read(&device_offload_lock); release_ctx: ctx->priv_ctx_rx = NULL; release_netdev: From 353038ddf9fef505394fc3c69e3b4c9134b335d7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2019 16:52:19 -0700 Subject: [PATCH 1487/1640] UPSTREAM: net/tls: don't leak IV and record seq when offload fails [ Upstream commit 12c7686111326148b4b5db189130522a4ad1be4a ] When device refuses the offload in tls_set_device_offload_rx() it calls tls_sw_free_resources_rx() to clean up software context state. Unfortunately, tls_sw_free_resources_rx() does not free all the state tls_set_sw_offload() allocated - it leaks IV and sequence number buffers. All other code paths which lead to tls_sw_release_resources_rx() (which tls_sw_free_resources_rx() calls) free those right before the call. Avoid the leak by moving freeing of iv and rec_seq into tls_sw_release_resources_rx(). Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_device.c | 2 -- net/tls/tls_main.c | 5 +---- net/tls/tls_sw.c | 3 +++ 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index b61b893e9204..c9588b682db4 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -911,8 +911,6 @@ void tls_device_offload_cleanup_rx(struct sock *sk) } out: up_read(&device_offload_lock); - kfree(tls_ctx->rx.rec_seq); - kfree(tls_ctx->rx.iv); tls_sw_release_resources_rx(sk); } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index cb4846fa5860..9538913a29d3 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -290,11 +290,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) tls_sw_free_resources_tx(sk); } - if (ctx->rx_conf == TLS_SW) { - kfree(ctx->rx.rec_seq); - kfree(ctx->rx.iv); + if (ctx->rx_conf == TLS_SW) tls_sw_free_resources_rx(sk); - } #ifdef CONFIG_TLS_DEVICE if (ctx->rx_conf == TLS_HW) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index b9c6ecfbcfea..6848a8196711 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1118,6 +1118,9 @@ void tls_sw_release_resources_rx(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + kfree(tls_ctx->rx.rec_seq); + kfree(tls_ctx->rx.iv); + if (ctx->aead_recv) { kfree_skb(ctx->recv_pkt); ctx->recv_pkt = NULL; From a705a0a7940e1a1251eeb8a22b7da94e06dfe7a6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Apr 2019 12:19:12 -0700 Subject: [PATCH 1488/1640] UPSTREAM: net/tls: avoid NULL pointer deref on nskb->sk in fallback [ Upstream commit 2dcb003314032c6efb13a065ffae60d164b2dd35 ] update_chksum() accesses nskb->sk before it has been set by complete_skb(), move the init up. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_device_fallback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index ef8934fd8698..426dd97725e4 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -200,13 +200,14 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) skb_put(nskb, skb->len); memcpy(nskb->data, skb->data, headln); - update_chksum(nskb, headln); nskb->destructor = skb->destructor; nskb->sk = sk; skb->destructor = NULL; skb->sk = NULL; + update_chksum(nskb, headln); + delta = nskb->truesize - skb->truesize; if (likely(delta < 0)) WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc)); From d9bd7ca2d278fa9b98415d5386354b9a099bc254 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2019 17:35:09 -0700 Subject: [PATCH 1489/1640] UPSTREAM: net/tls: don't copy negative amounts of data in reencrypt [ Upstream commit 97e1caa517e22d62a283b876fb8aa5f4672c83dd ] There is no guarantee the record starts before the skb frags. If we don't check for this condition, the copy amount will go negative, leading to reads and writes to random memory locations. Familiar hilarity ensues. Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_device.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index c9588b682db4..8538ee22a582 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -600,14 +600,16 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) else err = 0; - copy = min_t(int, skb_pagelen(skb) - offset, - rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + if (skb_pagelen(skb) > offset) { + copy = min_t(int, skb_pagelen(skb) - offset, + rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE); - if (skb->decrypted) - skb_store_bits(skb, offset, buf, copy); + if (skb->decrypted) + skb_store_bits(skb, offset, buf, copy); - offset += copy; - buf += copy; + offset += copy; + buf += copy; + } skb_walk_frags(skb, skb_iter) { copy = min_t(int, skb_iter->len, From 6eccb7bc6a9b77afd2bcc8348fc3d1e689e435a1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2019 17:35:10 -0700 Subject: [PATCH 1490/1640] UPSTREAM: net/tls: fix copy to fragments in reencrypt [ Upstream commit eb3d38d5adb520435d4e4af32529ccb13ccc9935 ] Fragments may contain data from other records, so we have to account for that when we calculate the destination and max length of the copy we can perform. Note that 'offset' is the offset within the message, so it can't be passed as the offset within the frag. Here skb_store_bits() would have realised the call is wrong and simply not copied data. Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Signed-off-by: David S.
Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_device.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 8538ee22a582..f4a19eac975d 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -569,7 +569,7 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) { struct strp_msg *rxm = strp_msg(skb); - int err = 0, offset = rxm->offset, copy, nsg; + int err = 0, offset = rxm->offset, copy, nsg, data_len, pos; struct sk_buff *skb_iter, *unused; struct scatterlist sg[1]; char *orig_buf, *buf; @@ -600,9 +600,10 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) else err = 0; + data_len = rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE; + if (skb_pagelen(skb) > offset) { - copy = min_t(int, skb_pagelen(skb) - offset, - rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + copy = min_t(int, skb_pagelen(skb) - offset, data_len); if (skb->decrypted) skb_store_bits(skb, offset, buf, copy); @@ -611,16 +612,30 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) buf += copy; } + pos = skb_pagelen(skb); skb_walk_frags(skb, skb_iter) { - copy = min_t(int, skb_iter->len, - rxm->full_len - offset + rxm->offset - - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + int frag_pos; + + /* Practically all frags must belong to msg if reencrypt + * is needed with current strparser and coalescing logic, + * but strparser may "get optimized", so let's be safe. + */ + if (pos + skb_iter->len <= offset) + goto done_with_frag; + if (pos >= data_len + rxm->offset) + break; + + frag_pos = offset - pos; + copy = min_t(int, skb_iter->len - frag_pos, + data_len + rxm->offset - offset); if (skb_iter->decrypted) - skb_store_bits(skb_iter, offset, buf, copy); + skb_store_bits(skb_iter, frag_pos, buf, copy); offset += copy; buf += copy; +done_with_frag: + pos += skb_iter->len; } free_buf: From 8125da2430db8bf2e531575d7b9a1996ba277a6c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 10 Apr 2019 11:04:30 -0700 Subject: [PATCH 1491/1640] UPSTREAM: net/tls: fix the IV leaks [ Upstream commit 5a03bc73abed6ae196c15e9950afde19d48be12c ] Commit f66de3ee2c16 ("net/tls: Split conf to rx + tx") made freeing of IV and record sequence number conditional to SW path only, but commit e8f69799810c ("net/tls: Add generic NIC offload infrastructure") also allocates that state for the device offload configuration. Remember to free it. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin --- net/tls/tls_device.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index f4a19eac975d..fdf22cb0b3e6 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,8 +52,11 @@ static DEFINE_SPINLOCK(tls_device_lock); static void tls_device_free_ctx(struct tls_context *ctx) { - if (ctx->tx_conf == TLS_HW) + if (ctx->tx_conf == TLS_HW) { kfree(tls_offload_ctx_tx(ctx)); + kfree(ctx->tx.rec_seq); + kfree(ctx->tx.iv); + } if (ctx->rx_conf == TLS_HW) kfree(tls_offload_ctx_rx(ctx)); From f324c86eedb617024a0c362b42af737b9e83fbcf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 May 2019 19:02:01 -0700 Subject: [PATCH 1492/1640] UPSTREAM: net/tls: fix state removal with feature flags off [ Upstream commit 3686637e507b48525fcea6fb91e1988bdbc14530 ] TLS offload drivers shouldn't (and currently don't) block the TLS offload feature changes based on whether there are active offloaded connections or not. This seems to be a good idea, because we want the admin to be able to disable the TLS offload at any time, and there is no clean way of disabling it for active connections (TX side is quite problematic). So if features are cleared existing connections will stay offloaded until they close, and new connections will not attempt offload to a given device. However, the offload state removal handling is currently broken if feature flags get cleared while there are active TLS offloads. RX side will completely bail from cleanup, even on normal remove path, leaving device state dangling, potentially causing issues when the 5-tuple is reused. It will also fail to release the netdev reference. Remove the RX-side warning message, in next release cycle it should be printed when features are disabled, rather than when connection dies, but for that we need a more efficient method of finding connection of a given netdev (a'la BPF offload code). Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_device.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index fdf22cb0b3e6..5db330db77a1 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -916,12 +916,6 @@ void tls_device_offload_cleanup_rx(struct sock *sk) if (!netdev) goto out; - if (!(netdev->features & NETIF_F_HW_TLS_RX)) { - pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n", - __func__); - goto out; - } - netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, TLS_OFFLOAD_CTX_DIR_RX); From 945dfea97b70e88582267c47c15a9b14bd98febc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 May 2019 19:02:02 -0700 Subject: [PATCH 1493/1640] UPSTREAM: net/tls: don't ignore netdev notifications if no TLS features [ Upstream commit c3f4a6c39cf269a40d45f813c05fa830318ad875 ] On device surprise removal path (the notifier) we can't bail just because the features are disabled. They may have been enabled during the lifetime of the device. This bug leads to leaking netdev references and use-after-frees if there are active connections while device features are cleared. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 5db330db77a1..8035bf495eb2 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -974,7 +974,8 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) + if (!dev->tlsdev_ops && + !(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { From 8f1a4b4b6867b4500fb32669bc365ac7da9a97a3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 4 Jun 2019 12:00:12 -0700 Subject: [PATCH 1494/1640] UPSTREAM: net/tls: replace the sleeping lock around RX resync with a bit lock [ Upstream commit e52972c11d6b1262964db96d65934196db621685 ] Commit 38030d7cb779 ("net/tls: avoid NULL-deref on resync during device removal") tried to fix a potential NULL-dereference by taking the context rwsem. Unfortunately the RX resync may get called from soft IRQ, so we can't use the rwsem to protect from the device disappearing. Because we are guaranteed there can be only one resync at a time (it's called from strparser) use a bit to indicate resync is busy and make device removal wait for the bit to get cleared. Note that there is a leftover "flags" field in struct tls_context already. Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/tls.h | 4 ++++ net/tls/tls_device.c | 27 +++++++++++++++++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index c423b7d0b6ab..954110575891 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -161,6 +161,10 @@ enum { TLS_PENDING_CLOSED_RECORD }; +enum tls_context_flags { + TLS_RX_SYNC_RUNNING = 0, +}; + struct cipher_context { u16 prepend_size; u16 tag_size; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 8035bf495eb2..ead29c2aefa7 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -545,10 +545,22 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); } +static void tls_device_resync_rx(struct tls_context *tls_ctx, + struct sock *sk, u32 seq, u64 rcd_sn) +{ + struct net_device *netdev; + + if (WARN_ON(test_and_set_bit(TLS_RX_SYNC_RUNNING, &tls_ctx->flags))) + return; + netdev = READ_ONCE(tls_ctx->netdev); + if (netdev) + netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, rcd_sn); + clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags); +} + void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct net_device *netdev = tls_ctx->netdev; struct tls_offload_context_rx *rx_ctx; u32 is_req_pending; s64 resync_req; @@ -563,10 +575,10 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) is_req_pending = resync_req; if (unlikely(is_req_pending) && req_seq == seq && - atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) - netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, - seq + TLS_HEADER_SIZE - 1, - rcd_sn); + atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) { + seq += TLS_HEADER_SIZE - 1; + tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); + } } static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) @@ -954,7 
+966,10 @@ static int tls_device_down(struct net_device *netdev) if (ctx->rx_conf == TLS_HW) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_RX); - ctx->netdev = NULL; + WRITE_ONCE(ctx->netdev, NULL); + smp_mb__before_atomic(); /* pairs with test_and_set_bit() */ + while (test_bit(TLS_RX_SYNC_RUNNING, &ctx->flags)) + usleep_range(10, 200); dev_put(netdev); list_del_init(&ctx->list); From 4c42ee6ae89e3b1106bb2ecbe30ad083c18db1ec Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 28 Jun 2019 16:11:39 -0700 Subject: [PATCH 1495/1640] UPSTREAM: net/tls: make sure offload also gets the keys wiped [ Upstream commit acd3e96d53a24d219f720ed4012b62723ae05da1 ] Commit 86029d10af18 ("tls: zero the crypto information from tls_context before freeing") added memzero_explicit() calls to clear the key material before freeing struct tls_context, but it missed tls_device.c has its own way of freeing this structure. Replace the missing free. Fixes: 86029d10af18 ("tls: zero the crypto information from tls_context before freeing") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/tls.h | 1 + net/tls/tls_device.c | 2 +- net/tls/tls_main.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 954110575891..98f5ad0319a2 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -234,6 +234,7 @@ struct tls_offload_context_rx { (ALIGN(sizeof(struct tls_offload_context_rx), sizeof(void *)) + \ TLS_DRIVER_STATE_SIZE) +void tls_ctx_free(struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); int tls_sk_query(struct sock *sk, int optname, char __user *optval, int __user *optlen); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index ead29c2aefa7..0a613e0ef3bf 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -61,7 +61,7 @@ static void tls_device_free_ctx(struct tls_context *ctx) if (ctx->rx_conf == TLS_HW) kfree(tls_offload_ctx_rx(ctx)); - kfree(ctx); + tls_ctx_free(ctx); } static void tls_device_gc_task(struct work_struct *work) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 9538913a29d3..2ed9e612fbe1 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -241,7 +241,7 @@ static void tls_write_space(struct sock *sk) ctx->sk_write_space(sk); } -static void tls_ctx_free(struct tls_context *ctx) +void tls_ctx_free(struct tls_context *ctx) { if (!ctx) return; From 6b77fe19379e237d9d94e41a3b9bd2f9c0eb3dc0 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Mon, 10 Sep 2018 22:53:46 +0530 Subject: [PATCH 1496/1640] UPSTREAM: net/tls: Fixed return value when tls_complete_pending_work() fails [ Upstream commit 150085791afb8054e11d2e080d4b9cd755dd7f69 ] In tls_sw_sendmsg() and tls_sw_sendpage(), the variable 'ret' has been set to return value of tls_complete_pending_work(). This allows return of proper error code if tls_complete_pending_work() fails. Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Vakul Garg Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_sw.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 6848a8196711..bbb2da70e870 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -354,7 +354,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - int ret = 0; + int ret; int required_size; long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); bool eor = !(msg->msg_flags & MSG_MORE); @@ -370,7 +370,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) lock_sock(sk); - if (tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo)) + ret = tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo); + if (ret) goto send_end; if (unlikely(msg->msg_controllen)) { @@ -505,7 +506,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - int ret = 0; + int ret; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); bool eor; size_t orig_size = size; @@ -525,7 +526,8 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - if (tls_complete_pending_work(sk, tls_ctx, flags, &timeo)) + ret = tls_complete_pending_work(sk, tls_ctx, flags, &timeo); + if (ret) goto sendpage_end; /* Call the sk_stream functions to manage the sndbuf mem. */ From 16d681b33034e38151d8b974d6eb662f19c6a728 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 Aug 2019 18:36:23 -0700 Subject: [PATCH 1497/1640] UPSTREAM: net/tls: swap sk_write_space on close [ Upstream commit 57c722e932cfb82e9820bbaae1b1f7222ea97b52 ] Now that we swap the original proto and clear the ULP pointer on close we have to make sure no callback will try to access the freed state. sk_write_space is not part of sk_prot, remember to swap it. Reported-by: syzbot+dcdc9deefaec44785f32@syzkaller.appspotmail.com Fixes: 95fa145479fb ("bpf: sockmap/tls, close can race with map free") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 2ed9e612fbe1..1f336a51393a 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -301,6 +301,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) #else { #endif + sk->sk_write_space = ctx->sk_write_space; tls_ctx_free(ctx); ctx = NULL; } From 4a8b2694c3363d913adca64735471532e1cf1fcf Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 14 Aug 2019 05:31:54 +0000 Subject: [PATCH 1498/1640] UPSTREAM: net: tls, fix sk_write_space NULL write when tx disabled [ Upstream commit d85f01775850a35eae47a0090839baf510c1ef12 ] The ctx->sk_write_space pointer is only set when TLS tx mode is enabled. When running without TX mode its a null pointer but we still set the sk sk_write_space pointer on close(). Fix the close path to only overwrite sk->sk_write_space when the current pointer is to the tls_write_space function indicating the tls module should clean it up properly as well. Reported-by: Hillf Danton Cc: Ying Xue Cc: Andrey Konovalov Fixes: 57c722e932cfb ("net/tls: swap sk_write_space on close") Signed-off-by: John Fastabend Reviewed-by: Jakub Kicinski Signed-off-by: David S. 
Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 1f336a51393a..ff79db73a448 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -301,7 +301,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) #else { #endif - sk->sk_write_space = ctx->sk_write_space; + if (sk->sk_write_space == tls_write_space) + sk->sk_write_space = ctx->sk_write_space; tls_ctx_free(ctx); ctx = NULL; } From 022889a77231d4b3956794ea658bf1fe774d0347 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 8 Jul 2019 19:53:18 -0700 Subject: [PATCH 1499/1640] UPSTREAM: net/tls: fix socket wmem accounting on fallback with netem [ Upstream commit 5c4b4608fe100838c62591877101128467e56c00 ] netem runs skb_orphan_partial() which "disconnects" the skb from normal TCP write memory accounting. We should not adjust sk->sk_wmem_alloc on the fallback path for such skbs. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/tls/tls_device_fallback.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 426dd97725e4..6cf832891b53 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -208,6 +208,10 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) update_chksum(nskb, headln); + /* sock_efree means the skb must have gone through skb_orphan_partial() */ + if (nskb->destructor == sock_efree) + return; + delta = nskb->truesize - skb->truesize; if (likely(delta < 0)) WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc)); From ccc641823e391f413cbea161b4518704a4f5bb55 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Wed, 19 Feb 2020 09:40:22 +0530 Subject: [PATCH 1500/1640] UPSTREAM: net/tls: Fix to avoid getting invalid tls record [ Upstream commit 06f5201c6392f998a49ca9c9173e2930c8eb51d8 ] Current code doesn't check if the tcp sequence number starts from (/after) the 1st record's start sequence number. It only checks if the seq number is before the 1st record's end sequence number. This problem will always be a possibility in the re-transmit case. If a record which belongs to a requested seq number is already deleted, tls_get_record will start looking into the list, and as per the check it will look if the seq number is before the end seq of the 1st record, which will always be true, so it will always return the 1st record; it should in fact return NULL. As part of the fix, start looking at each record only if the sequence number lies in the list, else return NULL. There is one more check added: the driver looks for the start marker record to handle tcp packets which are before the tls offload start sequence number, hence return the 1st record if the record is the tls start marker and the seq number is before the 1st record's starting sequence number. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Rohit Maheshwari Reviewed-by: Jakub Kicinski Signed-off-by: David S.
Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_device.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 0a613e0ef3bf..8f40bbfd60ea 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -506,7 +506,7 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { u64 record_sn = context->hint_record_sn; - struct tls_record_info *info; + struct tls_record_info *info, *last; info = context->retransmit_hint; if (!info || @@ -516,6 +516,25 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, */ info = list_first_entry(&context->records_list, struct tls_record_info, list); + + /* send the start_marker record if seq number is before the + * tls offload start marker sequence number. This record is + * required to handle TCP packets which are before TLS offload + * started. + * And if it's not start marker, look if this seq number + * belongs to the list. + */ + if (likely(!tls_record_is_start_marker(info))) { + /* we have the first record, get the last record to see + * if this seq number belongs to the list. + */ + last = list_last_entry(&context->records_list, + struct tls_record_info, list); + + if (!between(seq, tls_record_start_seq(info), + last->end_seq)) + return NULL; + } record_sn = context->unacked_record_sn; } From b73c502fb950fb93da7a0dc455954549b21ed2d1 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 10 Aug 2020 17:02:58 -0700 Subject: [PATCH 1501/1640] UPSTREAM: net/tls: Fix kmap usage [ Upstream commit b06c19d9f827f6743122795570bfc0c72db482b0 ] When MSG_OOB is specified to tls_device_sendpage() the mapped page is never unmapped. Hold off mapping the page until after the flags are checked and the page is actually needed. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Ira Weiny Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 8f40bbfd60ea..575d62130578 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -476,7 +476,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { struct iov_iter msg_iter; - char *kaddr = kmap(page); + char *kaddr; struct kvec iov; int rc; @@ -490,6 +490,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, goto out; } + kaddr = kmap(page); iov.iov_base = kaddr + offset; iov.iov_len = size; iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, &iov, 1, size); From e1805888a67d580667cc96aae1da068c10d78bfc Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Thu, 8 Oct 2020 00:10:21 +0530 Subject: [PATCH 1502/1640] UPSTREAM: net/tls: sendfile fails with ktls offload [ Upstream commit ea1dd3e9d080c961b9a451130b61c72dc9a5397b ] When sendpage first gets called and there is more data, 'more' in tls_push_data() gets set, which later sets pending_open_record_frags; but when there is no more data left in the file and tls_push_data() gets called for the last time, pending_open_record_frags doesn't get reset. Later, when the 2-byte encrypted alert comes in via sendmsg(), it first checks for pending_open_record_frags, and since this is set, it creates a record with 0 data bytes to encrypt, meaning the record length is prepend_size + tag_size only, which causes a problem.
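Seen from userspace, the failing sequence is roughly the following (hedged sketch; fd is assumed to already have kTLS TX offload configured, the function name is illustrative, and 21 is the TLS alert content type):

    #include <linux/tls.h>
    #include <sys/sendfile.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    #ifndef SOL_TLS
    #define SOL_TLS 282
    #endif

    static void sendfile_then_alert(int fd, int file_fd, size_t len)
    {
            char alert[2] = { 1, 0 };       /* warning, close_notify */
            char ctrl[CMSG_SPACE(1)] = { 0 };
            struct iovec iov = { .iov_base = alert, .iov_len = 2 };
            struct msghdr msg = {
                    .msg_iov = &iov, .msg_iovlen = 1,
                    .msg_control = ctrl, .msg_controllen = sizeof(ctrl),
            };
            struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

            sendfile(fd, file_fd, NULL, len); /* leaves the stale flag set */

            cmsg->cmsg_level = SOL_TLS;
            cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
            cmsg->cmsg_len = CMSG_LEN(1);
            *CMSG_DATA(cmsg) = 21;          /* TLS record type: alert */
            sendmsg(fd, &msg, 0);   /* hits the bogus zero-byte open record */
    }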
We should set/reset pending_open_record_frags based on the more bit. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_device.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 575d62130578..dd0fc2aa6875 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -351,13 +351,13 @@ static int tls_push_data(struct sock *sk, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; - int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); struct tls_record_info *record = ctx->open_record; struct page_frag *pfrag; size_t orig_size = size; u32 max_open_record_len; - int copy, rc = 0; + bool more = false; bool done = false; + int copy, rc = 0; long timeo; if (flags & @@ -422,9 +422,8 @@ handle_error: if (!size) { last_record: tls_push_record_flags = flags; - if (more) { - tls_ctx->pending_open_record_frags = - record->num_frags; + if (flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE)) { + more = true; break; } @@ -445,6 +444,8 @@ last_record: } } while (!done); + tls_ctx->pending_open_record_frags = more; + if (orig_size - size > 0) rc = orig_size - size; From 45f8f8844994ea881795b0a271ebb94b59bda2ae Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Thu, 19 Nov 2020 18:59:48 +0300 Subject: [PATCH 1503/1640] UPSTREAM: net/tls: missing received data after fast remote close [ Upstream commit 20ffc7adf53a5fd3d19751fbff7895bcca66686e ] In the case when the tcp socket receives a FIN after some data and the parser hasn't started before the data is read, the caller will receive an empty buffer. This behavior differs from that of a plain TCP socket and leads to special treatment in user-space. The flow that triggers the race is simple. The server sends a small amount of data right after the connection is configured to use TLS and closes the connection. In this case the receiver sees the TLS Handshake data and configures the TLS socket right after the Change Cipher Spec record. While the configuration is in process, the TCP socket receives a small Application Data record, an Encrypted Alert record and a FIN packet. So the TCP socket changes sk_shutdown to RCV_SHUTDOWN and sets the SK_DONE bit in sk_flag. The received data is not parsed upon arrival and is never sent to user-space. The patch unpauses the parser directly if we have unparsed data in the tcp receive queue.
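A hedged client-side sketch of the expected behavior once the parser is unpaused (fd is assumed to be a socket on which kTLS RX was configured only after the peer's data and FIN had already arrived; names are illustrative):

    #include <assert.h>
    #include <sys/socket.h>

    static void read_after_fast_close(int fd)
    {
            char buf[256];

            /* The queued Application Data record must be delivered
             * first, even though the socket has already seen a FIN.
             * Subsequent reads then surface the pending alert / EOF
             * as usual.
             */
            assert(recv(fd, buf, sizeof(buf), 0) > 0);
    }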
Fixes: fcf4793e278e ("tls: check RCV_SHUTDOWN in tls_wait_data") Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/1605801588-12236-1-git-send-email-vfedorenko@novek.ru Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/tls/tls_sw.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bbb2da70e870..7d761244a360 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -630,6 +630,12 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return NULL; } + if (!skb_queue_empty(&sk->sk_receive_queue)) { + __strp_unpause(&ctx->strp); + if (ctx->recv_pkt) + return ctx->recv_pkt; + } + if (sk->sk_shutdown & RCV_SHUTDOWN) return NULL; From 1e4e1908e2866dd4cf563ac2e621a8e5b17498fd Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 25 Nov 2020 14:18:10 -0800 Subject: [PATCH 1504/1640] UPSTREAM: net/tls: Protect from calling tls_dev_del for TLS RX twice [ Upstream commit 025cc2fb6a4e84e9a0552c0017dcd1c24b7ac7da ] tls_device_offload_cleanup_rx doesn't clear tls_ctx->netdev after calling tls_dev_del if TLX TX offload is also enabled. Clearing tls_ctx->netdev gets postponed until tls_device_gc_task. It leaves a time frame when tls_device_down may get called and call tls_dev_del for RX one extra time, confusing the driver, which may lead to a crash. This patch corrects this racy behavior by adding a flag to prevent tls_device_down from calling tls_dev_del the second time. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20201125221810.69870-1-saeedm@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- include/net/tls.h | 6 ++++++ net/tls/tls_device.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/net/tls.h b/include/net/tls.h index 98f5ad0319a2..9caef9bad075 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -163,6 +163,12 @@ enum { enum tls_context_flags { TLS_RX_SYNC_RUNNING = 0, + /* tls_dev_del was called for the RX side, device state was released, + * but tls_ctx->netdev might still be kept, because TX-side driver + * resources might not be released yet. Used to prevent the second + * tls_dev_del call in tls_device_down if it happens simultaneously. 
+ */ + TLS_RX_DEV_CLOSED = 2, }; struct cipher_context { diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index dd0fc2aa6875..228e3ce48d43 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -955,6 +955,8 @@ void tls_device_offload_cleanup_rx(struct sock *sk) if (tls_ctx->tx_conf != TLS_HW) { dev_put(netdev); tls_ctx->netdev = NULL; + } else { + set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags); } out: up_read(&device_offload_lock); @@ -984,7 +986,8 @@ static int tls_device_down(struct net_device *netdev) if (ctx->tx_conf == TLS_HW) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); - if (ctx->rx_conf == TLS_HW) + if (ctx->rx_conf == TLS_HW && + !test_bit(TLS_RX_DEV_CLOSED, &ctx->flags)) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_RX); WRITE_ONCE(ctx->netdev, NULL); From ccadd83a36819c844b8be781affe2427aaa1e397 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Fri, 15 Jul 2022 11:42:16 +0300 Subject: [PATCH 1505/1640] UPSTREAM: net/tls: Fix race in TLS device down flow [ Upstream commit f08d8c1bb97c48f24a82afaa2fd8c140f8d3da8b ] Socket destruction flow and tls_device_down function sync against each other using tls_device_lock and the context refcount, to guarantee the device resources are freed via tls_dev_del() by the end of tls_device_down. In the following unfortunate flow, this won't happen: - refcount is decreased to zero in tls_device_sk_destruct. - tls_device_down starts, skips the context as refcount is zero, going all the way until it flushes the gc work, and returns without freeing the device resources. - only then, tls_device_queue_ctx_destruction is called, queues the gc work and frees the context's device resources. Solve it by decreasing the refcount in the socket's destruction flow under the tls_device_lock, for perfect synchronization. This does not slow down the common, likely destructor flow, in which the refcount is decreased and the spinlock is acquired anyway. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Reviewed-by: Maxim Mikityanskiy Signed-off-by: Tariq Toukan Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/tls/tls_device.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 228e3ce48d43..b290eb3ae155 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -110,13 +110,16 @@ static void tls_device_queue_ctx_destruction(struct tls_context *ctx) unsigned long flags; spin_lock_irqsave(&tls_device_lock, flags); + if (unlikely(!refcount_dec_and_test(&ctx->refcount))) + goto unlock; + list_move_tail(&ctx->list, &tls_device_gc_list); /* schedule_work inside the spinlock * to make sure tls_device_down waits for that work. */ schedule_work(&tls_device_gc_work); - +unlock: spin_unlock_irqrestore(&tls_device_lock, flags); } @@ -214,8 +217,7 @@ void tls_device_sk_destruct(struct sock *sk) clean_acked_data_disable(inet_csk(sk)); } - if (refcount_dec_and_test(&tls_ctx->refcount)) - tls_device_queue_ctx_destruction(tls_ctx); + tls_device_queue_ctx_destruction(tls_ctx); } EXPORT_SYMBOL(tls_device_sk_destruct); From 3b7cad6968b6388bf787b62c3bcc3898aeb98a87 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Fri, 21 Sep 2018 09:46:13 +0530 Subject: [PATCH 1506/1640] UPSTREAM: net/tls: Add support for async encryption of records for performance In the current implementation, TLS records are encrypted and transmitted serially.
The implementation waits until the previously submitted user data has been encrypted, and only then starts transmitting the record. This encrypt-one-record-at-a-time approach is inefficient when asynchronous crypto accelerators are used. For each record, there are overheads of interrupts, driver softIRQ scheduling, etc. The crypto accelerator also sits idle most of the time while an encrypted record's pages are handed over to the TCP stack for transmission. This patch enables encryption of multiple records in parallel when an async-capable crypto accelerator is present in the system. This is achieved by allowing the user space application to send more data using sendmsg() even while previously issued data is being processed by the crypto accelerator. This requires returning control to the user space application after submitting the encryption request to the accelerator. This also means that the zero-copy mode of encryption cannot be used with an async accelerator, as we must be done with the user space application's buffer before returning from sendmsg(). There can be multiple records in flight to/from the accelerator. Each record is represented by 'struct tls_rec'. This is used to store the memory pages for the record. After the records are encrypted, they are added to a linked list called tx_ready_list, which contains encrypted TLS records sorted by TLS sequence number. The records from tx_ready_list are transmitted using a newly introduced function called tls_tx_records(). In sendmsg() and sendpage(), tx_ready_list is polled for any record ready to be transmitted after initiating encryption of new TLS records. This achieves parallel encryption and transmission of records when an async accelerator is present. The crypto accelerator could complete encryption after sendmsg()/sendpage() have already polled tx_ready_list. Therefore, a deferred work context is needed to transmit records from tx_ready_list. The deferred work context gets scheduled if applications are not sending much data through the socket. If the applications issue sendmsg()/sendpage() in quick succession, then the scheduling of tx_work_handler gets cancelled, as the tx_ready_list would be polled from the application's context itself. This saves the scheduling overhead of the deferred work. The patch also brings a side benefit. We are able to get rid of the concept of a CLOSED record. This is because once records are closed, they are either encrypted and placed into tx_ready_list or, if encryption fails, the socket error is set. This simplifies the kernel TLS send path. However, since tls_device.c is still using them, the macros and accessor functions for CLOSED records have been retained. Change-Id: I5225045ca49b797bde0e8b8cae92808e688a803a Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- include/net/tls.h | 70 +++++- net/tls/tls_main.c | 54 ++--- net/tls/tls_sw.c | 586 ++++++++++++++++++++++++++++++++++----------- 3 files changed, 522 insertions(+), 188 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 9caef9bad075..599e2c939a2a 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -41,7 +41,7 @@ #include #include #include - +#include #include @@ -93,24 +93,47 @@ enum { TLS_NUM_CONFIG, }; -struct tls_sw_context_tx { - struct crypto_aead *aead_send; - struct crypto_wait async_wait; - - char aad_space[TLS_AAD_SPACE_SIZE]; - - unsigned int sg_plaintext_size; - int sg_plaintext_num_elem; +/* TLS records are maintained in 'struct tls_rec'. It stores the memory pages + * allocated or mapped for each TLS record.
After encryption, the records are + * stored in a linked list. + */ +struct tls_rec { + struct list_head list; + int tx_flags; struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS]; - - unsigned int sg_encrypted_size; - int sg_encrypted_num_elem; struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS]; /* AAD | sg_plaintext_data | sg_tag */ struct scatterlist sg_aead_in[2]; /* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */ struct scatterlist sg_aead_out[2]; + + unsigned int sg_plaintext_size; + unsigned int sg_encrypted_size; + int sg_plaintext_num_elem; + int sg_encrypted_num_elem; + + char aad_space[TLS_AAD_SPACE_SIZE]; + struct aead_request aead_req; + u8 aead_req_ctx[]; +}; + +struct tx_work { + struct delayed_work work; + struct sock *sk; +}; + +struct tls_sw_context_tx { + struct crypto_aead *aead_send; + struct crypto_wait async_wait; + struct tx_work tx_work; + struct tls_rec *open_rec; + struct list_head tx_ready_list; + atomic_t encrypt_pending; + int async_notify; + +#define BIT_TX_SCHEDULED 0 + unsigned long tx_bitmask; }; struct tls_sw_context_rx { @@ -205,6 +228,8 @@ struct tls_context { struct scatterlist *partially_sent_record; u16 partially_sent_offset; + u64 tx_seq_number; /* Next TLS seqnum to be transmitted */ + unsigned long flags; bool in_tcp_sendpages; @@ -270,6 +295,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, void tls_device_sk_destruct(struct sock *sk); void tls_device_init(void); void tls_device_cleanup(void); +int tls_tx_records(struct sock *sk, int flags); struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn); @@ -288,6 +314,9 @@ void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); int tls_push_sg(struct sock *sk, struct tls_context *ctx, struct scatterlist *sg, u16 first_offset, int flags); +int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, + int flags); + int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx, int flags, long *timeo); @@ -321,6 +350,23 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) return tls_ctx->pending_open_record_frags; } +static inline bool is_tx_ready(struct tls_context *tls_ctx, + struct tls_sw_context_tx *ctx) +{ + struct tls_rec *rec; + u64 seq; + + rec = list_first_entry(&ctx->tx_ready_list, struct tls_rec, list); + if (!rec) + return false; + + seq = be64_to_cpup((const __be64 *)&rec->aad_space); + if (seq == tls_ctx->tx_seq_number) + return true; + else + return false; +} + struct sk_buff * tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, struct sk_buff *skb); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index ff79db73a448..822e0ce38c85 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -141,7 +141,6 @@ retry: size = sg->length; } - clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags); ctx->in_tcp_sendpages = false; ctx->sk_write_space(sk); @@ -193,15 +192,12 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, return rc; } -int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx, - int flags, long *timeo) +int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, + int flags) { struct scatterlist *sg; u16 offset; - if (!tls_is_partially_sent_record(ctx)) - return ctx->push_pending_record(sk, flags); - sg = ctx->partially_sent_record; offset = ctx->partially_sent_offset; @@ -209,9 +205,23 @@ int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx, return tls_push_sg(sk, ctx, sg,
offset, flags); } +int tls_push_pending_closed_record(struct sock *sk, + struct tls_context *tls_ctx, + int flags, long *timeo) +{ + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + + if (tls_is_partially_sent_record(tls_ctx) || + !list_empty(&ctx->tx_ready_list)) + return tls_tx_records(sk, flags); + else + return tls_ctx->push_pending_record(sk, flags); +} + static void tls_write_space(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx); /* If in_tcp_sendpages call lower protocol write space handler * to ensure we wake up any waiting operations there. For example @@ -222,20 +232,11 @@ static void tls_write_space(struct sock *sk) return; } - if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) { - gfp_t sk_allocation = sk->sk_allocation; - int rc; - long timeo = 0; - - sk->sk_allocation = GFP_ATOMIC; - rc = tls_push_pending_closed_record(sk, ctx, - MSG_DONTWAIT | - MSG_NOSIGNAL, - &timeo); - sk->sk_allocation = sk_allocation; - - if (rc < 0) - return; + /* Schedule the transmission if tx list is ready */ + if (is_tx_ready(ctx, tx_ctx) && !sk->sk_write_pending) { + /* Schedule the transmission */ + if (!test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask)) + schedule_delayed_work(&tx_ctx->tx_work.work, 0); } ctx->sk_write_space(sk); @@ -270,19 +271,6 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) if (!tls_complete_pending_work(sk, ctx, 0, &timeo)) tls_handle_open_record(sk, 0); - if (ctx->partially_sent_record) { - struct scatterlist *sg = ctx->partially_sent_record; - - while (1) { - put_page(sg_page(sg)); - sk_mem_uncharge(sk, sg->length); - - if (sg_is_last(sg)) - break; - sg++; - } - } - /* We need these for tls_sw_fallback handling of other packets */ if (ctx->tx_conf == TLS_SW) { kfree(ctx->tx.rec_seq); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7d761244a360..c5cdf91399f8 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -99,18 +99,19 @@ static void trim_both_sgl(struct sock *sk, int target_size) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; - trim_sg(sk, ctx->sg_plaintext_data, - &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size, + trim_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size, target_size); if (target_size > 0) target_size += tls_ctx->tx.overhead_size; - trim_sg(sk, ctx->sg_encrypted_data, - &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size, + trim_sg(sk, rec->sg_encrypted_data, + &rec->sg_encrypted_num_elem, + &rec->sg_encrypted_size, target_size); } @@ -118,15 +119,16 @@ static int alloc_encrypted_sg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; int rc = 0; rc = sk_alloc_sg(sk, len, - ctx->sg_encrypted_data, 0, - &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size, 0); + rec->sg_encrypted_data, 0, + &rec->sg_encrypted_num_elem, + &rec->sg_encrypted_size, 0); if (rc == -ENOSPC) - ctx->sg_encrypted_num_elem = ARRAY_SIZE(ctx->sg_encrypted_data); + rec->sg_encrypted_num_elem = ARRAY_SIZE(rec->sg_encrypted_data); return rc; } @@ -135,14 +137,15 @@ static int alloc_plaintext_sg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; int rc = 0; - rc = sk_alloc_sg(sk, len, 
ctx->sg_plaintext_data, 0, - &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, + rc = sk_alloc_sg(sk, len, rec->sg_plaintext_data, 0, + &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size, tls_ctx->pending_open_record_frags); if (rc == -ENOSPC) - ctx->sg_plaintext_num_elem = ARRAY_SIZE(ctx->sg_plaintext_data); + rec->sg_plaintext_num_elem = ARRAY_SIZE(rec->sg_plaintext_data); return rc; } @@ -164,37 +167,192 @@ static void tls_free_both_sg(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; - free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size); + /* Return if there is no open record */ + if (!rec) + return; - free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size); + free_sg(sk, rec->sg_encrypted_data, + &rec->sg_encrypted_num_elem, + &rec->sg_encrypted_size); + + free_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size); } -static int tls_do_encryption(struct tls_context *tls_ctx, +static bool append_tx_ready_list(struct tls_context *tls_ctx, + struct tls_sw_context_tx *ctx, + struct tls_rec *enc_rec) +{ + u64 new_seq = be64_to_cpup((const __be64 *)&enc_rec->aad_space); + struct list_head *pos; + + /* Need to insert encrypted record in tx_ready_list sorted + * as per sequence number. Traverse linked list from tail. + */ + list_for_each_prev(pos, &ctx->tx_ready_list) { + struct tls_rec *rec = (struct tls_rec *)pos; + u64 seq = be64_to_cpup((const __be64 *)&rec->aad_space); + + if (new_seq > seq) + break; + } + + list_add((struct list_head *)&enc_rec->list, pos); + + return is_tx_ready(tls_ctx, ctx); +} + +int tls_tx_records(struct sock *sk, int flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec, *tmp; + int tx_flags, rc = 0; + + if (tls_is_partially_sent_record(tls_ctx)) { + rec = list_first_entry(&ctx->tx_ready_list, + struct tls_rec, list); + + if (flags == -1) + tx_flags = rec->tx_flags; + else + tx_flags = flags; + + rc = tls_push_partial_record(sk, tls_ctx, tx_flags); + if (rc) + goto tx_err; + + /* Full record has been transmitted. 
+ * Remove the head of tx_ready_list + */ + tls_ctx->tx_seq_number++; + list_del(&rec->list); + kfree(rec); + } + + /* Tx all ready records which have expected sequence number */ + list_for_each_entry_safe(rec, tmp, &ctx->tx_ready_list, list) { + u64 seq = be64_to_cpup((const __be64 *)&rec->aad_space); + + if (seq == tls_ctx->tx_seq_number) { + if (flags == -1) + tx_flags = rec->tx_flags; + else + tx_flags = flags; + + rc = tls_push_sg(sk, tls_ctx, + &rec->sg_encrypted_data[0], + 0, tx_flags); + if (rc) + goto tx_err; + + tls_ctx->tx_seq_number++; + list_del(&rec->list); + kfree(rec); + } else { + break; + } + } + +tx_err: + if (rc < 0 && rc != -EAGAIN) + tls_err_abort(sk, EBADMSG); + + return rc; +} + +static void tls_encrypt_done(struct crypto_async_request *req, int err) +{ + struct aead_request *aead_req = (struct aead_request *)req; + struct sock *sk = req->data; + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec; + bool ready = false; + int pending; + + rec = container_of(aead_req, struct tls_rec, aead_req); + + rec->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; + + free_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size); + + /* Free the record if error is previously set on socket */ + if (err || sk->sk_err) { + free_sg(sk, rec->sg_encrypted_data, + &rec->sg_encrypted_num_elem, &rec->sg_encrypted_size); + + kfree(rec); + rec = NULL; + + /* If err is already set on socket, return the same code */ + if (sk->sk_err) { + ctx->async_wait.err = sk->sk_err; + } else { + ctx->async_wait.err = err; + tls_err_abort(sk, err); + } + } + + /* Append the record in tx queue */ + if (rec) + ready = append_tx_ready_list(tls_ctx, ctx, rec); + + pending = atomic_dec_return(&ctx->encrypt_pending); + + if (!pending && READ_ONCE(ctx->async_notify)) + complete(&ctx->async_wait.completion); + + if (!ready) + return; + + /* Schedule the transmission */ + if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) + schedule_delayed_work(&ctx->tx_work.work, 1); +} + +static int tls_do_encryption(struct sock *sk, + struct tls_context *tls_ctx, struct tls_sw_context_tx *ctx, struct aead_request *aead_req, size_t data_len) { + struct tls_rec *rec = ctx->open_rec; int rc; - ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; - ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); - aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out, + aead_request_set_crypt(aead_req, rec->sg_aead_in, + rec->sg_aead_out, data_len, tls_ctx->tx.iv); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &ctx->async_wait); + tls_encrypt_done, sk); - rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait); + atomic_inc(&ctx->encrypt_pending); - ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; - ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; + rc = crypto_aead_encrypt(aead_req); + if (!rc || rc != -EINPROGRESS) { + atomic_dec(&ctx->encrypt_pending); + rec->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; + } + /* Case of encryption failure */ + if (rc && 
rc != -EINPROGRESS) + return rc; + + /* Unhook the record from context if encryption is not failure */ + ctx->open_rec = NULL; + tls_advance_record_sn(sk, &tls_ctx->tx); return rc; } @@ -203,53 +361,49 @@ static int tls_push_record(struct sock *sk, int flags, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; struct aead_request *req; int rc; - req = aead_request_alloc(ctx->aead_send, sk->sk_allocation); - if (!req) - return -ENOMEM; + if (!rec) + return 0; - sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); - sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); + rec->tx_flags = flags; + req = &rec->aead_req; - tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size, + sg_mark_end(rec->sg_plaintext_data + rec->sg_plaintext_num_elem - 1); + sg_mark_end(rec->sg_encrypted_data + rec->sg_encrypted_num_elem - 1); + + tls_make_aad(rec->aad_space, rec->sg_plaintext_size, tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, record_type); tls_fill_prepend(tls_ctx, - page_address(sg_page(&ctx->sg_encrypted_data[0])) + - ctx->sg_encrypted_data[0].offset, - ctx->sg_plaintext_size, record_type); + page_address(sg_page(&rec->sg_encrypted_data[0])) + + rec->sg_encrypted_data[0].offset, + rec->sg_plaintext_size, record_type); tls_ctx->pending_open_record_frags = 0; - set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags); - rc = tls_do_encryption(tls_ctx, ctx, req, ctx->sg_plaintext_size); + rc = tls_do_encryption(sk, tls_ctx, ctx, req, rec->sg_plaintext_size); + if (rc == -EINPROGRESS) + return -EINPROGRESS; + + free_sg(sk, rec->sg_plaintext_data, &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size); + if (rc < 0) { - /* If we are called from write_space and - * we fail, we need to set this SOCK_NOSPACE - * to trigger another write_space in the future. - */ - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - goto out_req; + tls_err_abort(sk, EBADMSG); + return rc; } - free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size); + /* Put the record in tx_ready_list and start tx if permitted. + * This happens only when encryption is not asynchronous. 
+ */ + if (append_tx_ready_list(tls_ctx, ctx, rec)) + return tls_tx_records(sk, flags); - ctx->sg_encrypted_num_elem = 0; - ctx->sg_encrypted_size = 0; - - /* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */ - rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags); - if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk, EBADMSG); - - tls_advance_record_sn(sk, &tls_ctx->tx); -out_req: - aead_request_free(req); - return rc; + return 0; } static int tls_sw_push_pending_record(struct sock *sk, int flags) @@ -326,11 +480,12 @@ static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct scatterlist *sg = ctx->sg_plaintext_data; + struct tls_rec *rec = ctx->open_rec; + struct scatterlist *sg = rec->sg_plaintext_data; int copy, i, rc = 0; for (i = tls_ctx->pending_open_record_frags; - i < ctx->sg_plaintext_num_elem; ++i) { + i < rec->sg_plaintext_num_elem; ++i) { copy = sg[i].length; if (copy_from_iter( page_address(sg_page(&sg[i])) + sg[i].offset, @@ -350,34 +505,85 @@ out: return rc; } -int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +struct tls_rec *get_rec(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - int ret; - int required_size; + struct tls_rec *rec; + int mem_size; + + /* Return if we already have an open record */ + if (ctx->open_rec) + return ctx->open_rec; + + mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send); + + rec = kzalloc(mem_size, sk->sk_allocation); + if (!rec) + return NULL; + + sg_init_table(&rec->sg_plaintext_data[0], + ARRAY_SIZE(rec->sg_plaintext_data)); + sg_init_table(&rec->sg_encrypted_data[0], + ARRAY_SIZE(rec->sg_encrypted_data)); + + sg_init_table(rec->sg_aead_in, 2); + sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, + sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_in[1]); + sg_chain(rec->sg_aead_in, 2, rec->sg_plaintext_data); + + sg_init_table(rec->sg_aead_out, 2); + sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, + sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_out[1]); + sg_chain(rec->sg_aead_out, 2, rec->sg_encrypted_data); + + ctx->open_rec = rec; + + return rec; +} + +int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct crypto_tfm *tfm = crypto_aead_tfm(ctx->aead_send); + bool async_capable = tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; + unsigned char record_type = TLS_RECORD_TYPE_DATA; + bool is_kvec = msg->msg_iter.type & ITER_KVEC; bool eor = !(msg->msg_flags & MSG_MORE); size_t try_to_copy, copied = 0; - unsigned char record_type = TLS_RECORD_TYPE_DATA; - int record_room; + struct tls_rec *rec; + int required_size; + int num_async = 0; bool full_record; + int record_room; + int num_zc = 0; int orig_size; - bool is_kvec = msg->msg_iter.type & ITER_KVEC; + int ret; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -ENOTSUPP; lock_sock(sk); - ret = tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo); - if (ret) - goto send_end; + /* Wait till there is any pending write on socket */ + if (unlikely(sk->sk_write_pending)) { + ret = wait_on_pending_writer(sk, &timeo); + if (unlikely(ret)) + goto send_end; + } if (unlikely(msg->msg_controllen)) { ret = 
tls_proccess_cmsg(sk, msg, &record_type); - if (ret) - goto send_end; + if (ret) { + if (ret == -EINPROGRESS) + num_async++; + else if (ret != -EAGAIN) + goto send_end; + } } while (msg_data_left(msg)) { @@ -386,20 +592,27 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto send_end; } - orig_size = ctx->sg_plaintext_size; + rec = get_rec(sk); + if (!rec) { + ret = -ENOMEM; + goto send_end; + } + + orig_size = rec->sg_plaintext_size; full_record = false; try_to_copy = msg_data_left(msg); - record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size; + record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true; } - required_size = ctx->sg_plaintext_size + try_to_copy + + required_size = rec->sg_plaintext_size + try_to_copy + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; + alloc_encrypted: ret = alloc_encrypted_sg(sk, required_size); if (ret) { @@ -410,33 +623,39 @@ alloc_encrypted: * actually allocated. The difference is due * to max sg elements limit */ - try_to_copy -= required_size - ctx->sg_encrypted_size; + try_to_copy -= required_size - rec->sg_encrypted_size; full_record = true; } - if (!is_kvec && (full_record || eor)) { + + if (!is_kvec && (full_record || eor) && !async_capable) { ret = zerocopy_from_iter(sk, &msg->msg_iter, - try_to_copy, &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size, - ctx->sg_plaintext_data, - ARRAY_SIZE(ctx->sg_plaintext_data), + try_to_copy, &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size, + rec->sg_plaintext_data, + ARRAY_SIZE(rec->sg_plaintext_data), true); if (ret) goto fallback_to_reg_send; + num_zc++; copied += try_to_copy; ret = tls_push_record(sk, msg->msg_flags, record_type); - if (ret) - goto send_end; + if (ret) { + if (ret == -EINPROGRESS) + num_async++; + else if (ret != -EAGAIN) + goto send_end; + } continue; fallback_to_reg_send: - trim_sg(sk, ctx->sg_plaintext_data, - &ctx->sg_plaintext_num_elem, - &ctx->sg_plaintext_size, + trim_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size, orig_size); } - required_size = ctx->sg_plaintext_size + try_to_copy; + required_size = rec->sg_plaintext_size + try_to_copy; alloc_plaintext: ret = alloc_plaintext_sg(sk, required_size); if (ret) { @@ -447,13 +666,13 @@ alloc_plaintext: * actually allocated. 
The difference is due * to max sg elements limit */ - try_to_copy -= required_size - ctx->sg_plaintext_size; + try_to_copy -= required_size - rec->sg_plaintext_size; full_record = true; - trim_sg(sk, ctx->sg_encrypted_data, - &ctx->sg_encrypted_num_elem, - &ctx->sg_encrypted_size, - ctx->sg_plaintext_size + + trim_sg(sk, rec->sg_encrypted_data, + &rec->sg_encrypted_num_elem, + &rec->sg_encrypted_size, + rec->sg_plaintext_size + tls_ctx->tx.overhead_size); } @@ -463,13 +682,12 @@ alloc_plaintext: copied += try_to_copy; if (full_record || eor) { -push_record: ret = tls_push_record(sk, msg->msg_flags, record_type); if (ret) { - if (ret == -ENOMEM) - goto wait_for_memory; - - goto send_end; + if (ret == -EINPROGRESS) + num_async++; + else if (ret != -EAGAIN) + goto send_end; } } @@ -485,15 +703,37 @@ trim_sgl: goto send_end; } - if (tls_is_pending_closed_record(tls_ctx)) - goto push_record; - - if (ctx->sg_encrypted_size < required_size) + if (rec->sg_encrypted_size < required_size) goto alloc_encrypted; goto alloc_plaintext; } + if (!num_async) { + goto send_end; + } else if (num_zc) { + /* Wait for pending encryptions to get completed */ + smp_store_mb(ctx->async_notify, true); + + if (atomic_read(&ctx->encrypt_pending)) + crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + else + reinit_completion(&ctx->async_wait.completion); + + WRITE_ONCE(ctx->async_notify, false); + + if (ctx->async_wait.err) { + ret = ctx->async_wait.err; + copied = 0; + } + } + + /* Transmit if any encryptions have completed */ + if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { + cancel_delayed_work(&ctx->tx_work.work); + tls_tx_records(sk, msg->msg_flags); + } + send_end: ret = sk_stream_error(sk, msg->msg_flags, ret); @@ -504,16 +744,18 @@ send_end: int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - int ret; - long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - bool eor; - size_t orig_size = size; unsigned char record_type = TLS_RECORD_TYPE_DATA; + size_t orig_size = size; struct scatterlist *sg; + struct tls_rec *rec; + int num_async = 0; bool full_record; int record_room; + bool eor; + int ret; if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST)) @@ -526,9 +768,12 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - ret = tls_complete_pending_work(sk, tls_ctx, flags, &timeo); - if (ret) - goto sendpage_end; + /* Wait till there is any pending write on socket */ + if (unlikely(sk->sk_write_pending)) { + ret = wait_on_pending_writer(sk, &timeo); + if (unlikely(ret)) + goto sendpage_end; + } /* Call the sk_stream functions to manage the sndbuf mem. */ while (size > 0) { @@ -539,14 +784,20 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, goto sendpage_end; } + rec = get_rec(sk); + if (!rec) { + ret = -ENOMEM; + goto sendpage_end; + } + full_record = false; - record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size; + record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size; copy = size; if (copy >= record_room) { copy = record_room; full_record = true; } - required_size = ctx->sg_plaintext_size + copy + + required_size = rec->sg_plaintext_size + copy + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) @@ -561,33 +812,32 @@ alloc_payload: * actually allocated. 
The difference is due * to max sg elements limit */ - copy -= required_size - ctx->sg_plaintext_size; + copy -= required_size - rec->sg_plaintext_size; full_record = true; } get_page(page); - sg = ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem; + sg = rec->sg_plaintext_data + rec->sg_plaintext_num_elem; sg_set_page(sg, page, copy, offset); sg_unmark_end(sg); - ctx->sg_plaintext_num_elem++; + rec->sg_plaintext_num_elem++; sk_mem_charge(sk, copy); offset += copy; size -= copy; - ctx->sg_plaintext_size += copy; - tls_ctx->pending_open_record_frags = ctx->sg_plaintext_num_elem; + rec->sg_plaintext_size += copy; + tls_ctx->pending_open_record_frags = rec->sg_plaintext_num_elem; if (full_record || eor || - ctx->sg_plaintext_num_elem == - ARRAY_SIZE(ctx->sg_plaintext_data)) { -push_record: + rec->sg_plaintext_num_elem == + ARRAY_SIZE(rec->sg_plaintext_data)) { ret = tls_push_record(sk, flags, record_type); if (ret) { - if (ret == -ENOMEM) - goto wait_for_memory; - - goto sendpage_end; + if (ret == -EINPROGRESS) + num_async++; + else if (ret != -EAGAIN) + goto sendpage_end; } } continue; @@ -596,16 +846,20 @@ wait_for_sndbuf: wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { - trim_both_sgl(sk, ctx->sg_plaintext_size); + trim_both_sgl(sk, rec->sg_plaintext_size); goto sendpage_end; } - if (tls_is_pending_closed_record(tls_ctx)) - goto push_record; - goto alloc_payload; } + if (num_async) { + /* Transmit if any encryptions have completed */ + if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { + cancel_delayed_work(&ctx->tx_work.work); + tls_tx_records(sk, flags); + } + } sendpage_end: if (orig_size > size) ret = orig_size - size; @@ -1114,6 +1368,49 @@ void tls_sw_free_resources_tx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec, *tmp; + + /* Wait for any pending async encryptions to complete */ + smp_store_mb(ctx->async_notify, true); + if (atomic_read(&ctx->encrypt_pending)) + crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + + cancel_delayed_work_sync(&ctx->tx_work.work); + + /* Tx whatever records we can transmit and abandon the rest */ + tls_tx_records(sk, -1); + + /* Free up un-sent records in tx_ready_list. First, free + * the partially sent record if any at head of tx_list. 
+ */ + if (tls_ctx->partially_sent_record) { + struct scatterlist *sg = tls_ctx->partially_sent_record; + + while (1) { + put_page(sg_page(sg)); + sk_mem_uncharge(sk, sg->length); + + if (sg_is_last(sg)) + break; + sg++; + } + + tls_ctx->partially_sent_record = NULL; + + rec = list_first_entry(&ctx->tx_ready_list, + struct tls_rec, list); + list_del(&rec->list); + kfree(rec); + } + + list_for_each_entry_safe(rec, tmp, &ctx->tx_ready_list, list) { + free_sg(sk, rec->sg_encrypted_data, + &rec->sg_encrypted_num_elem, + &rec->sg_encrypted_size); + + list_del(&rec->list); + kfree(rec); + } crypto_free_aead(ctx->aead_send); tls_free_both_sg(sk); @@ -1153,6 +1450,24 @@ void tls_sw_free_resources_rx(struct sock *sk) kfree(ctx); } +/* The work handler to transmit the encrypted records in tx_ready_list */ +static void tx_work_handler(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct tx_work *tx_work = container_of(delayed_work, + struct tx_work, work); + struct sock *sk = tx_work->sk; + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + + if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) + return; + + lock_sock(sk); + tls_tx_records(sk, -1); + release_sock(sk); +} + int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { struct tls_crypto_info *crypto_info; @@ -1202,6 +1517,9 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) crypto_info = &ctx->crypto_send.info; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; + INIT_LIST_HEAD(&sw_ctx_tx->tx_ready_list); + INIT_DELAYED_WORK(&sw_ctx_tx->tx_work.work, tx_work_handler); + sw_ctx_tx->tx_work.sk = sk; } else { crypto_init_wait(&sw_ctx_rx->async_wait); crypto_info = &ctx->crypto_recv.info; @@ -1252,26 +1570,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto free_iv; } - if (sw_ctx_tx) { - sg_init_table(sw_ctx_tx->sg_encrypted_data, - ARRAY_SIZE(sw_ctx_tx->sg_encrypted_data)); - sg_init_table(sw_ctx_tx->sg_plaintext_data, - ARRAY_SIZE(sw_ctx_tx->sg_plaintext_data)); - - sg_init_table(sw_ctx_tx->sg_aead_in, 2); - sg_set_buf(&sw_ctx_tx->sg_aead_in[0], sw_ctx_tx->aad_space, - sizeof(sw_ctx_tx->aad_space)); - sg_unmark_end(&sw_ctx_tx->sg_aead_in[1]); - sg_chain(sw_ctx_tx->sg_aead_in, 2, - sw_ctx_tx->sg_plaintext_data); - sg_init_table(sw_ctx_tx->sg_aead_out, 2); - sg_set_buf(&sw_ctx_tx->sg_aead_out[0], sw_ctx_tx->aad_space, - sizeof(sw_ctx_tx->aad_space)); - sg_unmark_end(&sw_ctx_tx->sg_aead_out[1]); - sg_chain(sw_ctx_tx->sg_aead_out, 2, - sw_ctx_tx->sg_encrypted_data); - } - if (!*aead) { *aead = crypto_alloc_aead("gcm(aes)", 0, 0); if (IS_ERR(*aead)) { @@ -1308,6 +1606,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; strp_check_rcv(&sw_ctx_rx->strp); + } else { + ctx->tx_seq_number = be64_to_cpup((const __be64 *)rec_seq); } goto out; From ac6061160ce75129b67ef5b61ab4eb2916bdc114 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Mon, 24 Sep 2018 15:35:56 +0530 Subject: [PATCH 1507/1640] UPSTREAM: net/tls: Fixed race condition in async encryption On processors with multi-engine crypto accelerators, it is possible that multiple records get encrypted in parallel and their encryption completions are notified on different CPUs of a multicore processor. This leads to the situation where tls_encrypt_done() starts executing in parallel on different cores.
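For orientation before the details that follow, here is a hedged kernel-style sketch of the pattern this patch arrives at (the names mirror the patch, but this is illustrative code under those assumptions, not the exact kernel implementation): the irq-context completion callback only publishes a per-record flag, while all list manipulation stays in socket-locked application context.

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/compiler.h>
#include <asm/barrier.h>

struct sketch_rec {
	struct list_head list;	/* appended to tx_list in sendmsg() context */
	int tx_ready;		/* published by the completion callback */
};

/* Completion callback (may run in irq context): no list add/del here,
 * just publish that this record has been encrypted. */
static void sketch_encrypt_done(struct sketch_rec *rec)
{
	smp_store_mb(rec->tx_ready, 1);
}

/* Runs under lock_sock(): records were appended in sequence order, so
 * transmit from the head until the first not-yet-ready record. */
static void sketch_tx_records(struct list_head *tx_list)
{
	struct sketch_rec *rec, *tmp;

	list_for_each_entry_safe(rec, tmp, tx_list, list) {
		if (!READ_ONCE(rec->tx_ready))
			break;	/* head still being encrypted: stop */
		/* ... push rec's encrypted pages to TCP here ... */
		list_del(&rec->list);
		kfree(rec);
	}
}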
In the current implementation, encrypted records are queued to tx_ready_list in tls_encrypt_done(). This requires additions to the linked list 'tx_ready_list' to be protected. As tls_encrypt_done() could be executing in irq context, it is not possible to protect the linked list addition operation using a lock. To fix the problem, we remove the linked list addition operation from the irq context. We do tx_ready_list addition/removal operations from application context only and get rid of possible concurrent access to the linked list. Before starting encryption on the record, we add it to the tail of tx_ready_list. To prevent tls_tx_records() from transmitting it, we mark the record with a new flag 'tx_ready' in 'struct tls_rec'. When record encryption completes, tls_encrypt_done() only has to set the 'tx_ready' flag to true; a linked list add operation is not required. The changed logic brings some other side benefits. Since the records are always submitted in TLS sequence number order for encryption, the tx_ready_list always remains sorted, and addition of new records to it does not have to traverse the linked list. Lastly, we renamed tx_ready_list in 'struct tls_sw_context_tx' to 'tx_list'. This is because now some of the records at the tail may not be ready to transmit. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption") Change-Id: Ia6b257be1354681c9b9e6592e1636658025989c5 Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- include/net/tls.h | 16 +++------ net/tls/tls_main.c | 4 +-- net/tls/tls_sw.c | 81 +++++++++++++++++----------------------------- 3 files changed, 37 insertions(+), 64 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 599e2c939a2a..5eb17d3267ed 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -99,6 +99,7 @@ enum { */ struct tls_rec { struct list_head list; + int tx_ready; int tx_flags; struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS]; struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS]; @@ -128,7 +129,7 @@ struct tls_sw_context_tx { struct crypto_wait async_wait; struct tx_work tx_work; struct tls_rec *open_rec; - struct list_head tx_ready_list; + struct list_head tx_list; atomic_t encrypt_pending; int async_notify; @@ -228,7 +229,6 @@ struct tls_context { struct scatterlist *partially_sent_record; u16 partially_sent_offset; - u64 tx_seq_number; /* Next TLS seqnum to be transmitted */ unsigned long flags; bool in_tcp_sendpages; @@ -350,21 +350,15 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) return tls_ctx->pending_open_record_frags; } -static inline bool is_tx_ready(struct tls_context *tls_ctx, - struct tls_sw_context_tx *ctx) +static inline bool is_tx_ready(struct tls_sw_context_tx *ctx) { struct tls_rec *rec; - u64 seq; - rec = list_first_entry(&ctx->tx_ready_list, struct tls_rec, list); + rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); if (!rec) return false; - seq = be64_to_cpup((const __be64 *)&rec->aad_space); - if (seq == tls_ctx->tx_seq_number) - return true; - else - return false; + return READ_ONCE(rec->tx_ready); } struct sk_buff * diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 822e0ce38c85..d956837ba75e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -212,7 +212,7 @@ int tls_push_pending_closed_record(struct sock *sk, struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); if (tls_is_partially_sent_record(tls_ctx) || - !list_empty(&ctx->tx_ready_list)) + !list_empty(&ctx->tx_list)) return tls_tx_records(sk, flags); else return
tls_ctx->push_pending_record(sk, flags); @@ -233,7 +233,7 @@ static void tls_write_space(struct sock *sk) } /* Schedule the transmission if tx list is ready */ - if (is_tx_ready(ctx, tx_ctx) && !sk->sk_write_pending) { + if (is_tx_ready(tx_ctx) && !sk->sk_write_pending) { /* Schedule the transmission */ if (!test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask)) schedule_delayed_work(&tx_ctx->tx_work.work, 0); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index c5cdf91399f8..eac0d1e32275 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -182,29 +182,6 @@ static void tls_free_both_sg(struct sock *sk) &rec->sg_plaintext_size); } -static bool append_tx_ready_list(struct tls_context *tls_ctx, - struct tls_sw_context_tx *ctx, - struct tls_rec *enc_rec) -{ - u64 new_seq = be64_to_cpup((const __be64 *)&enc_rec->aad_space); - struct list_head *pos; - - /* Need to insert encrypted record in tx_ready_list sorted - * as per sequence number. Traverse linked list from tail. - */ - list_for_each_prev(pos, &ctx->tx_ready_list) { - struct tls_rec *rec = (struct tls_rec *)pos; - u64 seq = be64_to_cpup((const __be64 *)&rec->aad_space); - - if (new_seq > seq) - break; - } - - list_add((struct list_head *)&enc_rec->list, pos); - - return is_tx_ready(tls_ctx, ctx); -} - int tls_tx_records(struct sock *sk, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); @@ -213,7 +190,7 @@ int tls_tx_records(struct sock *sk, int flags) int tx_flags, rc = 0; if (tls_is_partially_sent_record(tls_ctx)) { - rec = list_first_entry(&ctx->tx_ready_list, + rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); if (flags == -1) @@ -226,18 +203,15 @@ int tls_tx_records(struct sock *sk, int flags) goto tx_err; /* Full record has been transmitted. - * Remove the head of tx_ready_list + * Remove the head of tx_list */ - tls_ctx->tx_seq_number++; list_del(&rec->list); kfree(rec); } - /* Tx all ready records which have expected sequence number */ - list_for_each_entry_safe(rec, tmp, &ctx->tx_ready_list, list) { - u64 seq = be64_to_cpup((const __be64 *)&rec->aad_space); - - if (seq == tls_ctx->tx_seq_number) { + /* Tx all ready records */ + list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) { + if (READ_ONCE(rec->tx_ready)) { if (flags == -1) tx_flags = rec->tx_flags; else @@ -249,7 +223,6 @@ int tls_tx_records(struct sock *sk, int flags) if (rc) goto tx_err; - tls_ctx->tx_seq_number++; list_del(&rec->list); kfree(rec); } else { @@ -299,9 +272,18 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) } } - /* Append the record in tx queue */ - if (rec) - ready = append_tx_ready_list(tls_ctx, ctx, rec); + if (rec) { + struct tls_rec *first_rec; + + /* Mark the record as ready for transmission */ + smp_store_mb(rec->tx_ready, true); + + /* If received record is at head of tx_list, schedule tx */ + first_rec = list_first_entry(&ctx->tx_list, + struct tls_rec, list); + if (rec == first_rec) + ready = true; + } pending = atomic_dec_return(&ctx->encrypt_pending); @@ -337,6 +319,8 @@ static int tls_do_encryption(struct sock *sk, aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, tls_encrypt_done, sk); + /* Add the record in tx_list */ + list_add_tail((struct list_head *)&rec->list, &ctx->tx_list); atomic_inc(&ctx->encrypt_pending); rc = crypto_aead_encrypt(aead_req); @@ -346,9 +330,12 @@ static int tls_do_encryption(struct sock *sk, rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; } - /* Case of encryption failure */ - if (rc && rc != -EINPROGRESS) + if (!rc) { + 
WRITE_ONCE(rec->tx_ready, true); + } else if (rc != -EINPROGRESS) { + list_del(&rec->list); return rc; + } /* Unhook the record from context if encryption is not failure */ ctx->open_rec = NULL; @@ -397,13 +384,7 @@ static int tls_push_record(struct sock *sk, int flags, return rc; } - /* Put the record in tx_ready_list and start tx if permitted. - * This happens only when encryption is not asynchronous. - */ - if (append_tx_ready_list(tls_ctx, ctx, rec)) - return tls_tx_records(sk, flags); - - return 0; + return tls_tx_records(sk, flags); } static int tls_sw_push_pending_record(struct sock *sk, int flags) @@ -1380,7 +1361,7 @@ void tls_sw_free_resources_tx(struct sock *sk) /* Tx whatever records we can transmit and abandon the rest */ tls_tx_records(sk, -1); - /* Free up un-sent records in tx_ready_list. First, free + /* Free up un-sent records in tx_list. First, free * the partially sent record if any at head of tx_list. */ if (tls_ctx->partially_sent_record) { @@ -1397,13 +1378,13 @@ void tls_sw_free_resources_tx(struct sock *sk) tls_ctx->partially_sent_record = NULL; - rec = list_first_entry(&ctx->tx_ready_list, + rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); list_del(&rec->list); kfree(rec); } - list_for_each_entry_safe(rec, tmp, &ctx->tx_ready_list, list) { + list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) { free_sg(sk, rec->sg_encrypted_data, &rec->sg_encrypted_num_elem, &rec->sg_encrypted_size); @@ -1450,7 +1431,7 @@ void tls_sw_free_resources_rx(struct sock *sk) kfree(ctx); } -/* The work handler to transmit the encrypted records in tx_ready_list */ +/* The work handler to transmit the encrypted records in tx_list */ static void tx_work_handler(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); @@ -1517,7 +1498,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) crypto_info = &ctx->crypto_send.info; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; - INIT_LIST_HEAD(&sw_ctx_tx->tx_ready_list); + INIT_LIST_HEAD(&sw_ctx_tx->tx_list); INIT_DELAYED_WORK(&sw_ctx_tx->tx_work.work, tx_work_handler); sw_ctx_tx->tx_work.sk = sk; } else { @@ -1606,8 +1587,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; strp_check_rcv(&sw_ctx_rx->strp); - } else { - ctx->tx_seq_number = be64_to_cpup((const __be64 *)rec_seq); } goto out; From 73c836459eae74ff49ffb2b3a0f344703ad92a23 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Mon, 24 Sep 2018 16:09:49 +0530 Subject: [PATCH 1508/1640] UPSTREAM: tls: Fixed uninitialised vars warning In tls_sw_sendmsg() and tls_sw_sendpage(), it is possible that the uninitialised variable 'ret' gets passed to sk_stream_error(). So initialise the local variable 'ret' to '0'. The warnings were detected by the 'smatch' tool. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption") Change-Id: Ifea9534895ee9b7a860a8aab955c05a12907daec Signed-off-by: Vakul Garg Signed-off-by: David S.
Miller --- net/tls/tls_sw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index eac0d1e32275..d58da8e94d69 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -543,7 +543,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int record_room; int num_zc = 0; int orig_size; - int ret; + int ret = 0; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -ENOTSUPP; @@ -735,8 +735,8 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, int num_async = 0; bool full_record; int record_room; + int ret = 0; bool eor; - int ret; if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST)) From 5113bf92aa79834768f56c9934f5fceed91996b0 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Tue, 25 Sep 2018 16:26:17 +0530 Subject: [PATCH 1509/1640] UPSTREAM: tls: Fix socket mem accounting error under async encryption The current async encryption implementation sometimes triggered a socket memory accounting error during socket close, resulting in a kernel warning call trace. The root cause of the problem is that the socket variable sk_forward_alloc gets corrupted because sk_mem_charge() and sk_mem_uncharge() are invoked from multiple concurrent contexts on a multicore processor. The APIs sk_mem_charge() and sk_mem_uncharge() are called from functions alloc_plaintext_sg(), free_sg() etc. It is required that memory accounting APIs are called under the socket lock. The plaintext sg data sent for encryption is freed using free_sg() in tls_encrypt_done(). It is wrong to call free_sg() from this function, because it may run in irq context and we cannot acquire the socket lock there. We remove the free_sg() call for plaintext data from tls_encrypt_done() and defer freeing of the plaintext data until the record is picked up from tx_list and transmitted/freed. When tls_tx_records() gets called, the socket is already locked and thus there is no concurrent access problem. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption") Change-Id: I5010553f272d362ef8f4ea32ef579074922361c2 Signed-off-by: Vakul Garg Signed-off-by: David S.
Miller --- net/tls/tls_sw.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index d58da8e94d69..bc23d2a4e480 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -206,6 +206,9 @@ int tls_tx_records(struct sock *sk, int flags) * Remove the head of tx_list */ list_del(&rec->list); + free_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size); + kfree(rec); } @@ -224,6 +227,10 @@ int tls_tx_records(struct sock *sk, int flags) goto tx_err; list_del(&rec->list); + free_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size); + kfree(rec); } else { break; @@ -252,8 +259,6 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) rec->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; - free_sg(sk, rec->sg_plaintext_data, - &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size); /* Free the record if error is previously set on socket */ if (err || sk->sk_err) { @@ -376,9 +381,6 @@ static int tls_push_record(struct sock *sk, int flags, if (rc == -EINPROGRESS) return -EINPROGRESS; - free_sg(sk, rec->sg_plaintext_data, &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - if (rc < 0) { tls_err_abort(sk, EBADMSG); return rc; @@ -1380,6 +1382,11 @@ void tls_sw_free_resources_tx(struct sock *sk) rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); + + free_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size); + list_del(&rec->list); kfree(rec); } @@ -1389,6 +1396,10 @@ void tls_sw_free_resources_tx(struct sock *sk) &rec->sg_encrypted_num_elem, &rec->sg_encrypted_size); + free_sg(sk, rec->sg_plaintext_data, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size); + list_del(&rec->list); kfree(rec); } From 214b8bca79870fb0f5bb88bfbffa5ac85fb9a7f6 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Tue, 25 Sep 2018 20:21:51 +0530 Subject: [PATCH 1510/1640] UPSTREAM: tls: Fixed a memory leak during socket close During socket close, if there is an open record with tx context, it needs to be freed apart from freeing up the plaintext and encrypted scatter lists. This patch frees up the open record if present in the tx context. Also, tls_free_both_sg() has been renamed to tls_free_open_rec() to indicate that the open record in the tx context is freed inside the function. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption") Change-Id: I32c672878335bdd1efe5893e485c08412b4ca2f0 Signed-off-by: Vakul Garg Signed-off-by: David S.
Miller --- net/tls/tls_sw.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bc23d2a4e480..2fba454d80f8 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -163,7 +163,7 @@ static void free_sg(struct sock *sk, struct scatterlist *sg, *sg_size = 0; } -static void tls_free_both_sg(struct sock *sk) +static void tls_free_open_rec(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); @@ -180,6 +180,8 @@ static void tls_free_open_rec(struct sock *sk) free_sg(sk, rec->sg_plaintext_data, &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size); + + kfree(rec); } int tls_tx_records(struct sock *sk, int flags) @@ -1405,7 +1407,7 @@ void tls_sw_free_resources_tx(struct sock *sk) } crypto_free_aead(ctx->aead_send); - tls_free_both_sg(sk); + tls_free_open_rec(sk); kfree(ctx); } From 9746613bec62e0ceb756b27b3a9ca07cc98166ce Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 26 Sep 2018 12:10:48 +0000 Subject: [PATCH 1511/1640] UPSTREAM: net/tls: Make function get_rec() static Fixes the following sparse warning: net/tls/tls_sw.c:655:16: warning: symbol 'get_rec' was not declared. Should it be static? Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Change-Id: I792e84bf96cfb7b2905dd2e329a63927eea3235a Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2fba454d80f8..6eec696a07d5 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -490,7 +490,7 @@ out: return rc; } -struct tls_rec *get_rec(struct sock *sk) +static struct tls_rec *get_rec(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); From 7e5ada84eeb55c01f4f6774b214c633d3caa6f44 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Wed, 26 Sep 2018 16:22:08 +0530 Subject: [PATCH 1512/1640] UPSTREAM: tls: Remove redundant vars from tls record structure Structure 'tls_rec' contains sg_aead_in and sg_aead_out, which point to aad_space and then chain the scatterlists sg_plaintext_data and sg_encrypted_data respectively. Rather than using chained scatterlists for plaintext and encrypted data in aead_req, it is more efficient to store aad_space in the first index of sg_plaintext_data and sg_encrypted_data themselves and get rid of sg_aead_in, sg_aead_out and the further chaining. This requires increasing the size of the sg_encrypted_data & sg_plaintext_data arrays by 1 to accommodate an entry for aad_space. The code which uses sg_encrypted_data and sg_plaintext_data has been modified to skip the first index, as it points to aad_space. Change-Id: I40751cfb96860b69699d5b40ffc90d81e9ca8c13 Signed-off-by: Vakul Garg Signed-off-by: David S.
Miller --- include/net/tls.h | 6 ++-- net/tls/tls_sw.c | 92 ++++++++++++++++++++++------------------------- 2 files changed, 45 insertions(+), 53 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 5eb17d3267ed..1e96593c8ba6 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -101,13 +101,11 @@ struct tls_rec { struct list_head list; int tx_ready; int tx_flags; - struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS]; - struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS]; /* AAD | sg_plaintext_data | sg_tag */ - struct scatterlist sg_aead_in[2]; + struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS + 1]; /* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */ - struct scatterlist sg_aead_out[2]; + struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS + 1]; unsigned int sg_plaintext_size; unsigned int sg_encrypted_size; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 6eec696a07d5..346858a3808b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -101,7 +101,7 @@ static void trim_both_sgl(struct sock *sk, int target_size) struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - trim_sg(sk, rec->sg_plaintext_data, + trim_sg(sk, &rec->sg_plaintext_data[1], &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size, target_size); @@ -109,7 +109,7 @@ static void trim_both_sgl(struct sock *sk, int target_size) if (target_size > 0) target_size += tls_ctx->tx.overhead_size; - trim_sg(sk, rec->sg_encrypted_data, + trim_sg(sk, &rec->sg_encrypted_data[1], &rec->sg_encrypted_num_elem, &rec->sg_encrypted_size, target_size); @@ -123,12 +123,13 @@ static int alloc_encrypted_sg(struct sock *sk, int len) int rc = 0; rc = sk_alloc_sg(sk, len, - rec->sg_encrypted_data, 0, + &rec->sg_encrypted_data[1], 0, &rec->sg_encrypted_num_elem, &rec->sg_encrypted_size, 0); if (rc == -ENOSPC) - rec->sg_encrypted_num_elem = ARRAY_SIZE(rec->sg_encrypted_data); + rec->sg_encrypted_num_elem = + ARRAY_SIZE(rec->sg_encrypted_data) - 1; return rc; } @@ -140,12 +141,15 @@ static int alloc_plaintext_sg(struct sock *sk, int len) struct tls_rec *rec = ctx->open_rec; int rc = 0; - rc = sk_alloc_sg(sk, len, rec->sg_plaintext_data, 0, - &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size, + rc = sk_alloc_sg(sk, len, + &rec->sg_plaintext_data[1], 0, + &rec->sg_plaintext_num_elem, + &rec->sg_plaintext_size, tls_ctx->pending_open_record_frags); if (rc == -ENOSPC) - rec->sg_plaintext_num_elem = ARRAY_SIZE(rec->sg_plaintext_data); + rec->sg_plaintext_num_elem = + ARRAY_SIZE(rec->sg_plaintext_data) - 1; return rc; } @@ -173,11 +177,11 @@ static void tls_free_open_rec(struct sock *sk) if (!rec) return; - free_sg(sk, rec->sg_encrypted_data, + free_sg(sk, &rec->sg_encrypted_data[1], &rec->sg_encrypted_num_elem, &rec->sg_encrypted_size); - free_sg(sk, rec->sg_plaintext_data, + free_sg(sk, &rec->sg_plaintext_data[1], &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size); @@ -208,7 +212,7 @@ int tls_tx_records(struct sock *sk, int flags) * Remove the head of tx_list */ list_del(&rec->list); - free_sg(sk, rec->sg_plaintext_data, + free_sg(sk, &rec->sg_plaintext_data[1], &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size); kfree(rec); @@ -223,13 +227,13 @@ int tls_tx_records(struct sock *sk, int flags) tx_flags = flags; rc = tls_push_sg(sk, tls_ctx, - &rec->sg_encrypted_data[0], + &rec->sg_encrypted_data[1], 0, tx_flags); if (rc) goto tx_err; list_del(&rec->list); - free_sg(sk, rec->sg_plaintext_data, + free_sg(sk, &rec->sg_plaintext_data[1], &rec->sg_plaintext_num_elem, 
&rec->sg_plaintext_size); @@ -258,16 +262,12 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) rec = container_of(aead_req, struct tls_rec, aead_req); - rec->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; - rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size; - /* Free the record if error is previously set on socket */ + /* Check if error is previously set on socket */ if (err || sk->sk_err) { - free_sg(sk, rec->sg_encrypted_data, - &rec->sg_encrypted_num_elem, &rec->sg_encrypted_size); - - kfree(rec); rec = NULL; /* If err is already set on socket, return the same code */ @@ -302,7 +302,7 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) /* Schedule the transmission */ if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) - schedule_delayed_work(&ctx->tx_work.work, 1); + schedule_delayed_work(&ctx->tx_work.work, 2); } static int tls_do_encryption(struct sock *sk, @@ -314,13 +314,14 @@ static int tls_do_encryption(struct sock *sk, struct tls_rec *rec = ctx->open_rec; int rc; - rec->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; - rec->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; + /* Skip the first index as it contains AAD data */ + rec->sg_encrypted_data[1].offset += tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[1].length -= tls_ctx->tx.prepend_size; aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); - aead_request_set_crypt(aead_req, rec->sg_aead_in, - rec->sg_aead_out, + aead_request_set_crypt(aead_req, rec->sg_plaintext_data, + rec->sg_encrypted_data, data_len, tls_ctx->tx.iv); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, @@ -333,8 +334,8 @@ static int tls_do_encryption(struct sock *sk, rc = crypto_aead_encrypt(aead_req); if (!rc || rc != -EINPROGRESS) { atomic_dec(&ctx->encrypt_pending); - rec->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; - rec->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size; + rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size; } if (!rc) { @@ -365,16 +366,16 @@ static int tls_push_record(struct sock *sk, int flags, rec->tx_flags = flags; req = &rec->aead_req; - sg_mark_end(rec->sg_plaintext_data + rec->sg_plaintext_num_elem - 1); - sg_mark_end(rec->sg_encrypted_data + rec->sg_encrypted_num_elem - 1); + sg_mark_end(rec->sg_plaintext_data + rec->sg_plaintext_num_elem); + sg_mark_end(rec->sg_encrypted_data + rec->sg_encrypted_num_elem); tls_make_aad(rec->aad_space, rec->sg_plaintext_size, tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, record_type); tls_fill_prepend(tls_ctx, - page_address(sg_page(&rec->sg_encrypted_data[0])) + - rec->sg_encrypted_data[0].offset, + page_address(sg_page(&rec->sg_encrypted_data[1])) + + rec->sg_encrypted_data[1].offset, rec->sg_plaintext_size, record_type); tls_ctx->pending_open_record_frags = 0; @@ -466,7 +467,7 @@ static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - struct scatterlist *sg = rec->sg_plaintext_data; + struct scatterlist *sg = &rec->sg_plaintext_data[1]; int copy, i, rc = 0; for (i = tls_ctx->pending_open_record_frags; @@ -512,17 +513,10 @@ static struct tls_rec *get_rec(struct sock 
*sk) sg_init_table(&rec->sg_encrypted_data[0], ARRAY_SIZE(rec->sg_encrypted_data)); - sg_init_table(rec->sg_aead_in, 2); - sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, + sg_set_buf(&rec->sg_plaintext_data[0], rec->aad_space, sizeof(rec->aad_space)); - sg_unmark_end(&rec->sg_aead_in[1]); - sg_chain(rec->sg_aead_in, 2, rec->sg_plaintext_data); - - sg_init_table(rec->sg_aead_out, 2); - sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, + sg_set_buf(&rec->sg_encrypted_data[0], rec->aad_space, sizeof(rec->aad_space)); - sg_unmark_end(&rec->sg_aead_out[1]); - sg_chain(rec->sg_aead_out, 2, rec->sg_encrypted_data); ctx->open_rec = rec; @@ -616,8 +610,8 @@ alloc_encrypted: ret = zerocopy_from_iter(sk, &msg->msg_iter, try_to_copy, &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size, - rec->sg_plaintext_data, - ARRAY_SIZE(rec->sg_plaintext_data), + &rec->sg_plaintext_data[1], + ARRAY_SIZE(rec->sg_plaintext_data) - 1, true); if (ret) goto fallback_to_reg_send; @@ -634,7 +628,7 @@ alloc_encrypted: continue; fallback_to_reg_send: - trim_sg(sk, rec->sg_plaintext_data, + trim_sg(sk, &rec->sg_plaintext_data[1], &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size, orig_size); @@ -654,7 +648,7 @@ alloc_plaintext: try_to_copy -= required_size - rec->sg_plaintext_size; full_record = true; - trim_sg(sk, rec->sg_encrypted_data, + trim_sg(sk, &rec->sg_encrypted_data[1], &rec->sg_encrypted_num_elem, &rec->sg_encrypted_size, rec->sg_plaintext_size + @@ -802,7 +796,7 @@ alloc_payload: } get_page(page); - sg = rec->sg_plaintext_data + rec->sg_plaintext_num_elem; + sg = &rec->sg_plaintext_data[1] + rec->sg_plaintext_num_elem; sg_set_page(sg, page, copy, offset); sg_unmark_end(sg); @@ -816,7 +810,7 @@ alloc_payload: if (full_record || eor || rec->sg_plaintext_num_elem == - ARRAY_SIZE(rec->sg_plaintext_data)) { + ARRAY_SIZE(rec->sg_plaintext_data) - 1) { ret = tls_push_record(sk, flags, record_type); if (ret) { if (ret == -EINPROGRESS) @@ -1385,7 +1379,7 @@ void tls_sw_free_resources_tx(struct sock *sk) rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); - free_sg(sk, rec->sg_plaintext_data, + free_sg(sk, &rec->sg_plaintext_data[1], &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size); @@ -1394,11 +1388,11 @@ void tls_sw_free_resources_tx(struct sock *sk) } list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) { - free_sg(sk, rec->sg_encrypted_data, + free_sg(sk, &rec->sg_encrypted_data[1], &rec->sg_encrypted_num_elem, &rec->sg_encrypted_size); - free_sg(sk, rec->sg_plaintext_data, + free_sg(sk, &rec->sg_plaintext_data[1], &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size); From 75a2d4771ee4e78e28eeb4ecbf9c8d5f412d7a9c Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Sun, 30 Sep 2018 08:04:35 +0530 Subject: [PATCH 1513/1640] UPSTREAM: tls: Add support for inplace records encryption Presently, for the non-zero-copy case, separate pages are allocated for storing the plaintext and encrypted text of records. These pages are stored in the sg_plaintext_data and sg_encrypted_data scatterlists inside the record structure. Further, sg_plaintext_data & sg_encrypted_data are passed to the crypto APIs for record encryption. Allocating separate pages for plaintext and encrypted text is inefficient from both a memory and a performance point of view. This patch adds support for inplace encryption of records. For the non-zero-copy case, we reuse the pages from the sg_encrypted_data scatterlist to copy the application's plaintext data. For the movement of pages from sg_encrypted_data to sg_plaintext_data, we introduce a new function move_to_plaintext_sg(), which adds pages to sg_plaintext_data from sg_encrypted_data. tls_do_encryption() is modified to pass the same scatterlist as both source and destination into aead_request_set_crypt() if inplace crypto has been enabled. A new variable 'inplace_crypto' has been introduced in the record structure to signify whether the same scatterlist can be used. By default, inplace_crypto is enabled in get_rec(). If zero-copy is used (i.e. the plaintext data is not copied), inplace_crypto is set to '0'.
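Before the diff, a compact way to picture what move_to_plaintext_sg() will do is the following userspace sketch (not kernel code; the names and the prepend size are illustrative). Instead of allocating fresh pages for the plaintext, the plaintext segment list is pointed at the buffers the encrypted list already owns, skipping the bytes reserved at the front for the TLS header and IV; this is exactly why source and destination of the AEAD call can later be the same memory.

/* Userspace sketch (not kernel code) of the page-sharing walk in
 * move_to_plaintext_sg(): point plain[] at the memory enc[] already
 * owns, skipping `skip` header bytes; the skip applies only once. */
#include <stdio.h>

struct seg {
	unsigned char *buf;
	int len;
};

#define PREPEND_SIZE 13		/* illustrative header+IV reservation */

static int share_segments(struct seg *enc, int n_enc,
			  struct seg *plain, int skip, int len)
{
	int out = 0;

	for (int i = 0; i < n_enc && len > 0; i++) {
		if (skip >= enc[i].len) {	/* segment is all header room */
			skip -= enc[i].len;
			continue;
		}
		plain[out].buf = enc[i].buf + skip;
		plain[out].len = enc[i].len - skip;
		if (plain[out].len > len)
			plain[out].len = len;
		len -= plain[out].len;
		skip = 0;			/* skipping happens only once */
		out++;
	}
	return out;				/* shared segment count */
}

int main(void)
{
	unsigned char page[64];
	struct seg enc[1] = { { page, sizeof(page) } };
	struct seg plain[1];
	int n = share_segments(enc, 1, plain, PREPEND_SIZE, 16);

	printf("%d shared segment(s), first offset %td\n",
	       n, plain[0].buf - page);
	return 0;
}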
Change-Id: I04c506d99118f4a861c760f12217cd77ede20340 Signed-off-by: Vakul Garg Reviewed-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 1 + net/tls/tls_sw.c | 91 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 74 insertions(+), 18 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 1e96593c8ba6..28eaa07b15ec 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -101,6 +101,7 @@ struct tls_rec { struct list_head list; int tx_ready; int tx_flags; + int inplace_crypto; /* AAD | sg_plaintext_data | sg_tag */ struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS + 1]; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 346858a3808b..2f63f1ab086f 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -134,24 +134,72 @@ static int alloc_encrypted_sg(struct sock *sk, int len) return rc; } -static int alloc_plaintext_sg(struct sock *sk, int len) +static int move_to_plaintext_sg(struct sock *sk, int required_size) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - int rc = 0; + struct scatterlist *plain_sg = &rec->sg_plaintext_data[1]; + struct scatterlist *enc_sg = &rec->sg_encrypted_data[1]; + int enc_sg_idx = 0; + int skip, len; - rc = sk_alloc_sg(sk, len, - &rec->sg_plaintext_data[1], 0, - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size, - tls_ctx->pending_open_record_frags); + if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS) + return -ENOSPC; - if (rc == -ENOSPC) - rec->sg_plaintext_num_elem = - ARRAY_SIZE(rec->sg_plaintext_data) - 1; + /* We add page references worth len bytes from enc_sg at the + * end of plain_sg. It is guaranteed that sg_encrypted_data + * has enough required room (ensured by caller). + */ + len = required_size - rec->sg_plaintext_size; - return rc; + /* Skip initial bytes in sg_encrypted_data to be able + * to use same offset of both plain and encrypted data.
+ */ + skip = tls_ctx->tx.prepend_size + rec->sg_plaintext_size; + + while (enc_sg_idx < rec->sg_encrypted_num_elem) { + if (enc_sg[enc_sg_idx].length > skip) + break; + + skip -= enc_sg[enc_sg_idx].length; + enc_sg_idx++; + } + + /* unmark the end of plain_sg*/ + sg_unmark_end(plain_sg + rec->sg_plaintext_num_elem - 1); + + while (len) { + struct page *page = sg_page(&enc_sg[enc_sg_idx]); + int bytes = enc_sg[enc_sg_idx].length - skip; + int offset = enc_sg[enc_sg_idx].offset + skip; + + if (bytes > len) + bytes = len; + else + enc_sg_idx++; + + /* Skipping is required only one time */ + skip = 0; + + /* Increment page reference */ + get_page(page); + + sg_set_page(&plain_sg[rec->sg_plaintext_num_elem], page, + bytes, offset); + + sk_mem_charge(sk, bytes); + + len -= bytes; + rec->sg_plaintext_size += bytes; + + rec->sg_plaintext_num_elem++; + + if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS) + return -ENOSPC; + } + + return 0; } static void free_sg(struct sock *sk, struct scatterlist *sg, @@ -312,16 +360,21 @@ static int tls_do_encryption(struct sock *sk, size_t data_len) { struct tls_rec *rec = ctx->open_rec; + struct scatterlist *plain_sg = rec->sg_plaintext_data; + struct scatterlist *enc_sg = rec->sg_encrypted_data; int rc; /* Skip the first index as it contains AAD data */ rec->sg_encrypted_data[1].offset += tls_ctx->tx.prepend_size; rec->sg_encrypted_data[1].length -= tls_ctx->tx.prepend_size; + /* If it is inplace crypto, then pass same SG list as both src, dst */ + if (rec->inplace_crypto) + plain_sg = enc_sg; + aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); - aead_request_set_crypt(aead_req, rec->sg_plaintext_data, - rec->sg_encrypted_data, + aead_request_set_crypt(aead_req, plain_sg, enc_sg, data_len, tls_ctx->tx.iv); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, @@ -519,6 +572,7 @@ static struct tls_rec *get_rec(struct sock *sk) sizeof(rec->aad_space)); ctx->open_rec = rec; + rec->inplace_crypto = 1; return rec; } @@ -616,6 +670,8 @@ alloc_encrypted: if (ret) goto fallback_to_reg_send; + rec->inplace_crypto = 0; + num_zc++; copied += try_to_copy; ret = tls_push_record(sk, msg->msg_flags, record_type); @@ -635,11 +691,11 @@ fallback_to_reg_send: } required_size = rec->sg_plaintext_size + try_to_copy; -alloc_plaintext: - ret = alloc_plaintext_sg(sk, required_size); + + ret = move_to_plaintext_sg(sk, required_size); if (ret) { if (ret != -ENOSPC) - goto wait_for_memory; + goto send_end; /* Adjust try_to_copy according to the amount that was * actually allocated. 
The difference is due @@ -684,8 +740,6 @@ trim_sgl: if (rec->sg_encrypted_size < required_size) goto alloc_encrypted; - - goto alloc_plaintext; } if (!num_async) { @@ -811,6 +865,7 @@ alloc_payload: if (full_record || eor || rec->sg_plaintext_num_elem == ARRAY_SIZE(rec->sg_plaintext_data) - 1) { + rec->inplace_crypto = 0; ret = tls_push_record(sk, flags, record_type); if (ret) { if (ret == -EINPROGRESS) From 57d1a5edd420dc2a35b6375e547c684cb46266d6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 23 Jul 2018 22:37:54 +0200 Subject: [PATCH 1514/1640] UPSTREAM: sock: fix sg page frag coalescing in sk_alloc_sg The current sg coalescing logic in sk_alloc_sg() (the latter is used by tls and sockmap) is not quite correct: we do fetch the previous sg entry, but the subsequent check, whether the refilled page frag from the socket is still the same as in the last entry with prior offset and length matching the start of the current buffer, always compares the first sg list entry instead of the prior one. Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Daniel Borkmann Acked-by: Dave Watson Signed-off-by: David S. Miller --- net/core/sock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index ddf4f8277f96..e53e271dbf46 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2462,9 +2462,9 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, pfrag->offset += use; sge = sg + sg_curr - 1; - if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page && - sg->offset + sg->length == orig_offset) { - sg->length += use; + if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page && + sge->offset + sge->length == orig_offset) { + sge->length += use; } else { sge = sg + sg_curr; sg_unmark_end(sge); From 515df1eb7b428504453059a91cdd46dd98b138fe Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 13 Oct 2018 02:45:59 +0200 Subject: [PATCH 1515/1640] BACKPORT: tls: convert to generic sk_msg interface Convert kTLS over to make use of the sk_msg interface for plaintext and encrypted scattergather data, so it reuses all the sk_msg helpers and data structures; in a second step, this enables gluing it to BPF. This also allows removing quite a bit of open-coded helpers which are covered by the sk_msg API. Recent changes in kTLS, 80ece6a03aaf ("tls: Remove redundant vars from tls record structure") and 4e6d47206c32 ("tls: Add support for inplace records encryption"), changed the data path handling a bit; while we've kept the latter optimization intact, we had to undo the former change to better fit the sk_msg model, hence the sg_aead_in and sg_aead_out have been brought back and are linked into the sk_msg sgs. Now the kTLS record contains a msg_plaintext and a msg_encrypted sk_msg each. In the original code, zerocopy_from_iter() was used in both the TX and RX paths. For the strparser skb-based RX path, we've left zerocopy_from_iter() in decrypt_internal() mostly untouched, meaning it has been moved into tls_setup_from_iter() with the charging logic removed (as it is not used from RX). Given the RX path is not based on sk_msg objects, we haven't pursued setting up a dummy sk_msg to call into sk_msg_zerocopy_from_iter(), but it could be an option to pursue in a later step. Joint work with John.
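The helper this backport leans on most is sk_msg_clone(), whose walk can be modelled in userspace as below. This is a sketch under stated assumptions, not kernel code: plain arrays replace the scatterlist ring, page refcounting and memory charging are omitted, and -1 stands in for -ENOSPC.

/* Userspace sketch (not kernel code) of the sk_msg_clone() walk:
 * skip `off` bytes into the source segments, then append references
 * covering `len` bytes to the destination. */
#include <stdio.h>

#define MAX_SEGS 8

struct seg {
	const unsigned char *buf;
	unsigned int len;
};

struct msg {
	struct seg segs[MAX_SEGS];
	int n;
};

static int msg_clone(struct msg *dst, const struct msg *src,
		     unsigned int off, unsigned int len)
{
	for (int i = 0; i < src->n && len; i++) {
		unsigned int seg_len = src->segs[i].len;

		if (off >= seg_len) {		/* still skipping the prefix */
			off -= seg_len;
			continue;
		}
		seg_len -= off;
		if (seg_len > len)
			seg_len = len;
		if (dst->n == MAX_SEGS)
			return -1;		/* -ENOSPC in the kernel */
		dst->segs[dst->n].buf = src->segs[i].buf + off;
		dst->segs[dst->n].len = seg_len;
		dst->n++;
		len -= seg_len;
		off = 0;			/* offset applies only once */
	}
	return len ? -1 : 0;
}

int main(void)
{
	const unsigned char a[] = "0123456789";
	struct msg src = { { { a, 10 } }, 1 };
	struct msg dst = { .n = 0 };

	if (msg_clone(&dst, &src, 3, 4) == 0)
		printf("cloned %u bytes at offset %td\n",
		       dst.segs[0].len, dst.segs[0].buf - a);
	return 0;
}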
Change-Id: I23f3b92d49aafda7e3070ef62dcf671bf26c7c1e Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 2 + include/net/sock.h | 4 - include/net/tls.h | 18 +- net/core/skmsg.c | 39 ++++ net/core/sock.c | 61 ----- net/tls/Kconfig | 1 + net/tls/tls_sw.c | 511 +++++++++++++++--------------------------- 7 files changed, 235 insertions(+), 401 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index aa7e4815279d..9f7a3f17a4cc 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -102,6 +102,8 @@ struct sk_psock { int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce); +int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, + u32 off, u32 len); void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); int sk_msg_free(struct sock *sk, struct sk_msg *msg); int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); diff --git a/include/net/sock.h b/include/net/sock.h index 9f58e77fe19a..bf4234303856 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2234,10 +2234,6 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); -int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int sg_start, int *sg_curr, unsigned int *sg_size, - int first_coalesce); - /* * Default write policy as shown to user space via poll/select/SIGIO */ diff --git a/include/net/tls.h b/include/net/tls.h index 28eaa07b15ec..ac3970f3f66a 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -39,6 +39,8 @@ #include #include #include +#include + #include #include #include @@ -103,15 +105,13 @@ struct tls_rec { int tx_flags; int inplace_crypto; - /* AAD | sg_plaintext_data | sg_tag */ - struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS + 1]; - /* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */ - struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS + 1]; + struct sk_msg msg_plaintext; + struct sk_msg msg_encrypted; - unsigned int sg_plaintext_size; - unsigned int sg_encrypted_size; - int sg_plaintext_num_elem; - int sg_encrypted_num_elem; + /* AAD | msg_plaintext.sg.data | sg_tag */ + struct scatterlist sg_aead_in[2]; + /* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */ + struct scatterlist sg_aead_out[2]; char aad_space[TLS_AAD_SPACE_SIZE]; struct aead_request aead_req; @@ -231,8 +231,8 @@ struct tls_context { unsigned long flags; bool in_tcp_sendpages; + bool pending_open_record_frags; - u16 pending_open_record_frags; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index ae2b281c9c57..56a99d0c9aa0 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -73,6 +73,45 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, } EXPORT_SYMBOL_GPL(sk_msg_alloc); +int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, + u32 off, u32 len) +{ + int i = src->sg.start; + struct scatterlist *sge = sk_msg_elem(src, i); + u32 sge_len, sge_off; + + if (sk_msg_full(dst)) + return -ENOSPC; + + while (off) { + if (sge->length > off) + break; + off -= sge->length; + sk_msg_iter_var_next(i); + if (i == src->sg.end && off) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + while (len) { + sge_len = sge->length - off; + sge_off = sge->offset + off; + if (sge_len > len) + sge_len = len; + off = 0; + len -= sge_len; + 
sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); + sk_mem_charge(sk, sge_len); + sk_msg_iter_var_next(i); + if (i == src->sg.end && len) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + return 0; +} +EXPORT_SYMBOL_GPL(sk_msg_clone); + void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes) { int i = msg->sg.start; diff --git a/net/core/sock.c b/net/core/sock.c index e53e271dbf46..342113238216 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2429,67 +2429,6 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, - int first_coalesce) -{ - int sg_curr = *sg_curr_index, use = 0, rc = 0; - unsigned int size = *sg_curr_size; - struct page_frag *pfrag; - struct scatterlist *sge; - - len -= size; - pfrag = sk_page_frag(sk); - - while (len > 0) { - unsigned int orig_offset; - - if (!sk_page_frag_refill(sk, pfrag)) { - rc = -ENOMEM; - goto out; - } - - use = min_t(int, len, pfrag->size - pfrag->offset); - - if (!sk_wmem_schedule(sk, use)) { - rc = -ENOMEM; - goto out; - } - - sk_mem_charge(sk, use); - size += use; - orig_offset = pfrag->offset; - pfrag->offset += use; - - sge = sg + sg_curr - 1; - if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page && - sge->offset + sge->length == orig_offset) { - sge->length += use; - } else { - sge = sg + sg_curr; - sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); - sg_curr++; - - if (sg_curr == MAX_SKB_FRAGS) - sg_curr = 0; - - if (sg_curr == sg_start) { - rc = -ENOSPC; - break; - } - } - - len -= use; - } -out: - *sg_curr_size = size; - *sg_curr_index = sg_curr; - return rc; -} -EXPORT_SYMBOL(sk_alloc_sg); - static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) diff --git a/net/tls/Kconfig b/net/tls/Kconfig index 73f05ece53d0..99c1a19c17b1 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -8,6 +8,7 @@ config TLS select CRYPTO_AES select CRYPTO_GCM select STREAM_PARSER + select NET_SOCK_MSG default n ---help--- Enable kernel support for TLS protocol. 
This allows symmetric diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2f63f1ab086f..3ca873dfbee9 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -66,153 +66,49 @@ static int tls_do_decryption(struct sock *sk, return ret; } -static void trim_sg(struct sock *sk, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, int target_size) -{ - int i = *sg_num_elem - 1; - int trim = *sg_size - target_size; - - if (trim <= 0) { - WARN_ON(trim < 0); - return; - } - - *sg_size = target_size; - while (trim >= sg[i].length) { - trim -= sg[i].length; - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - i--; - - if (i < 0) - goto out; - } - - sg[i].length -= trim; - sk_mem_uncharge(sk, trim); - -out: - *sg_num_elem = i + 1; -} - -static void trim_both_sgl(struct sock *sk, int target_size) +static void tls_trim_both_msgs(struct sock *sk, int target_size) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - trim_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size, - target_size); - + sk_msg_trim(sk, &rec->msg_plaintext, target_size); if (target_size > 0) target_size += tls_ctx->tx.overhead_size; - - trim_sg(sk, &rec->sg_encrypted_data[1], - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size, - target_size); + sk_msg_trim(sk, &rec->msg_encrypted, target_size); } -static int alloc_encrypted_sg(struct sock *sk, int len) +static int tls_alloc_encrypted_msg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - int rc = 0; + struct sk_msg *msg_en = &rec->msg_encrypted; - rc = sk_alloc_sg(sk, len, - &rec->sg_encrypted_data[1], 0, - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size, 0); - - if (rc == -ENOSPC) - rec->sg_encrypted_num_elem = - ARRAY_SIZE(rec->sg_encrypted_data) - 1; - - return rc; + return sk_msg_alloc(sk, msg_en, len, 0); } -static int move_to_plaintext_sg(struct sock *sk, int required_size) +static int tls_clone_plaintext_msg(struct sock *sk, int required) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - struct scatterlist *plain_sg = &rec->sg_plaintext_data[1]; - struct scatterlist *enc_sg = &rec->sg_encrypted_data[1]; - int enc_sg_idx = 0; + struct sk_msg *msg_pl = &rec->msg_plaintext; + struct sk_msg *msg_en = &rec->msg_encrypted; int skip, len; - if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS) - return -ENOSPC; - - /* We add page references worth len bytes from enc_sg at the - * end of plain_sg. It is guaranteed that sg_encrypted_data + /* We add page references worth len bytes from encrypted sg + * at the end of plaintext sg. It is guaranteed that msg_en * has enough required room (ensured by caller). */ - len = required_size - rec->sg_plaintext_size; + len = required - msg_pl->sg.size; - /* Skip initial bytes in sg_encrypted_data to be able - * to use same offset of both plain and encrypted data. + /* Skip initial bytes in msg_en's data to be able to use + * same offset of both plain and encrypted data. 
*/ - skip = tls_ctx->tx.prepend_size + rec->sg_plaintext_size; + skip = tls_ctx->tx.prepend_size + msg_pl->sg.size; - while (enc_sg_idx < rec->sg_encrypted_num_elem) { - if (enc_sg[enc_sg_idx].length > skip) - break; - - skip -= enc_sg[enc_sg_idx].length; - enc_sg_idx++; - } - - /* unmark the end of plain_sg*/ - sg_unmark_end(plain_sg + rec->sg_plaintext_num_elem - 1); - - while (len) { - struct page *page = sg_page(&enc_sg[enc_sg_idx]); - int bytes = enc_sg[enc_sg_idx].length - skip; - int offset = enc_sg[enc_sg_idx].offset + skip; - - if (bytes > len) - bytes = len; - else - enc_sg_idx++; - - /* Skipping is required only one time */ - skip = 0; - - /* Increment page reference */ - get_page(page); - - sg_set_page(&plain_sg[rec->sg_plaintext_num_elem], page, - bytes, offset); - - sk_mem_charge(sk, bytes); - - len -= bytes; - rec->sg_plaintext_size += bytes; - - rec->sg_plaintext_num_elem++; - - if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS) - return -ENOSPC; - } - - return 0; -} - -static void free_sg(struct sock *sk, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size) -{ - int i, n = *sg_num_elem; - - for (i = 0; i < n; ++i) { - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - } - *sg_num_elem = 0; - *sg_size = 0; + return sk_msg_clone(sk, msg_pl, msg_en, skip, len); } static void tls_free_open_rec(struct sock *sk) @@ -225,14 +121,8 @@ static void tls_free_open_rec(struct sock *sk) if (!rec) return; - free_sg(sk, &rec->sg_encrypted_data[1], - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size); - - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - + sk_msg_free(sk, &rec->msg_encrypted); + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } @@ -241,6 +131,7 @@ int tls_tx_records(struct sock *sk, int flags) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec, *tmp; + struct sk_msg *msg_en; int tx_flags, rc = 0; if (tls_is_partially_sent_record(tls_ctx)) { @@ -260,9 +151,7 @@ int tls_tx_records(struct sock *sk, int flags) * Remove the head of tx_list */ list_del(&rec->list); - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size); - + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } @@ -274,17 +163,15 @@ int tls_tx_records(struct sock *sk, int flags) else tx_flags = flags; + msg_en = &rec->msg_encrypted; rc = tls_push_sg(sk, tls_ctx, - &rec->sg_encrypted_data[1], + &msg_en->sg.data[msg_en->sg.curr], 0, tx_flags); if (rc) goto tx_err; list_del(&rec->list); - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } else { break; @@ -304,15 +191,18 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) struct sock *sk = req->data; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct scatterlist *sge; + struct sk_msg *msg_en; struct tls_rec *rec; bool ready = false; int pending; rec = container_of(aead_req, struct tls_rec, aead_req); + msg_en = &rec->msg_encrypted; - rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size; - rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size; - + sge = sk_msg_elem(msg_en, msg_en->sg.curr); + sge->offset -= tls_ctx->tx.prepend_size; + sge->length += tls_ctx->tx.prepend_size; /* Check if error is previously set on socket */ if (err || sk->sk_err) { @@ -350,31 +240,29 @@ 
static void tls_encrypt_done(struct crypto_async_request *req, int err) /* Schedule the transmission */ if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) - schedule_delayed_work(&ctx->tx_work.work, 2); + schedule_delayed_work(&ctx->tx_work.work, 1); } static int tls_do_encryption(struct sock *sk, struct tls_context *tls_ctx, struct tls_sw_context_tx *ctx, struct aead_request *aead_req, - size_t data_len) + size_t data_len, u32 start) { struct tls_rec *rec = ctx->open_rec; - struct scatterlist *plain_sg = rec->sg_plaintext_data; - struct scatterlist *enc_sg = rec->sg_encrypted_data; + struct sk_msg *msg_en = &rec->msg_encrypted; + struct scatterlist *sge = sk_msg_elem(msg_en, start); int rc; - /* Skip the first index as it contains AAD data */ - rec->sg_encrypted_data[1].offset += tls_ctx->tx.prepend_size; - rec->sg_encrypted_data[1].length -= tls_ctx->tx.prepend_size; + sge->offset += tls_ctx->tx.prepend_size; + sge->length -= tls_ctx->tx.prepend_size; - /* If it is inplace crypto, then pass same SG list as both src, dst */ - if (rec->inplace_crypto) - plain_sg = enc_sg; + msg_en->sg.curr = start; aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); - aead_request_set_crypt(aead_req, plain_sg, enc_sg, + aead_request_set_crypt(aead_req, rec->sg_aead_in, + rec->sg_aead_out, data_len, tls_ctx->tx.iv); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, @@ -387,8 +275,8 @@ static int tls_do_encryption(struct sock *sk, rc = crypto_aead_encrypt(aead_req); if (!rc || rc != -EINPROGRESS) { atomic_dec(&ctx->encrypt_pending); - rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size; - rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size; + sge->offset -= tls_ctx->tx.prepend_size; + sge->length += tls_ctx->tx.prepend_size; } if (!rc) { @@ -410,35 +298,50 @@ static int tls_push_record(struct sock *sk, int flags, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; + struct sk_msg *msg_pl, *msg_en; struct aead_request *req; int rc; + u32 i; if (!rec) return 0; + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + rec->tx_flags = flags; req = &rec->aead_req; - sg_mark_end(rec->sg_plaintext_data + rec->sg_plaintext_num_elem); - sg_mark_end(rec->sg_encrypted_data + rec->sg_encrypted_num_elem); + i = msg_pl->sg.end; + sk_msg_iter_var_prev(i); + sg_mark_end(sk_msg_elem(msg_pl, i)); - tls_make_aad(rec->aad_space, rec->sg_plaintext_size, + i = msg_pl->sg.start; + sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ? 
+ &msg_en->sg.data[i] : &msg_pl->sg.data[i]); + + i = msg_en->sg.end; + sk_msg_iter_var_prev(i); + sg_mark_end(sk_msg_elem(msg_en, i)); + + i = msg_en->sg.start; + sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]); + + tls_make_aad(rec->aad_space, msg_pl->sg.size, tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, record_type); tls_fill_prepend(tls_ctx, - page_address(sg_page(&rec->sg_encrypted_data[1])) + - rec->sg_encrypted_data[1].offset, - rec->sg_plaintext_size, record_type); + page_address(sg_page(&msg_en->sg.data[i])) + + msg_en->sg.data[i].offset, msg_pl->sg.size, + record_type); - tls_ctx->pending_open_record_frags = 0; - - rc = tls_do_encryption(sk, tls_ctx, ctx, req, rec->sg_plaintext_size); - if (rc == -EINPROGRESS) - return -EINPROGRESS; + tls_ctx->pending_open_record_frags = false; + rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i); if (rc < 0) { - tls_err_abort(sk, EBADMSG); + if (rc != -EINPROGRESS) + tls_err_abort(sk, EBADMSG); return rc; } @@ -450,104 +353,11 @@ static int tls_sw_push_pending_record(struct sock *sk, int flags) return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA); } -static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, - int length, int *pages_used, - unsigned int *size_used, - struct scatterlist *to, int to_max_pages, - bool charge) -{ - struct page *pages[MAX_SKB_FRAGS]; - - size_t offset; - ssize_t copied, use; - int i = 0; - unsigned int size = *size_used; - int num_elem = *pages_used; - int rc = 0; - int maxpages; - - while (length > 0) { - i = 0; - maxpages = to_max_pages - num_elem; - if (maxpages == 0) { - rc = -EFAULT; - goto out; - } - copied = iov_iter_get_pages(from, pages, - length, - maxpages, &offset); - if (copied <= 0) { - rc = -EFAULT; - goto out; - } - - iov_iter_advance(from, copied); - - length -= copied; - size += copied; - while (copied) { - use = min_t(int, copied, PAGE_SIZE - offset); - - sg_set_page(&to[num_elem], - pages[i], use, offset); - sg_unmark_end(&to[num_elem]); - if (charge) - sk_mem_charge(sk, use); - - offset = 0; - copied -= use; - - ++i; - ++num_elem; - } - } - - /* Mark the end in the last sg entry if newly added */ - if (num_elem > *pages_used) - sg_mark_end(&to[num_elem - 1]); -out: - if (rc) - iov_iter_revert(from, size - *size_used); - *size_used = size; - *pages_used = num_elem; - - return rc; -} - -static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, - int bytes) -{ - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct tls_rec *rec = ctx->open_rec; - struct scatterlist *sg = &rec->sg_plaintext_data[1]; - int copy, i, rc = 0; - - for (i = tls_ctx->pending_open_record_frags; - i < rec->sg_plaintext_num_elem; ++i) { - copy = sg[i].length; - if (copy_from_iter( - page_address(sg_page(&sg[i])) + sg[i].offset, - copy, from) != copy) { - rc = -EFAULT; - goto out; - } - bytes -= copy; - - ++tls_ctx->pending_open_record_frags; - - if (!bytes) - break; - } - -out: - return rc; -} - static struct tls_rec *get_rec(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct sk_msg *msg_pl, *msg_en; struct tls_rec *rec; int mem_size; @@ -561,15 +371,21 @@ static struct tls_rec *get_rec(struct sock *sk) if (!rec) return NULL; - sg_init_table(&rec->sg_plaintext_data[0], - ARRAY_SIZE(rec->sg_plaintext_data)); - sg_init_table(&rec->sg_encrypted_data[0], - ARRAY_SIZE(rec->sg_encrypted_data)); + msg_pl = &rec->msg_plaintext; + msg_en = 
&rec->msg_encrypted; - sg_set_buf(&rec->sg_plaintext_data[0], rec->aad_space, + sk_msg_init(msg_pl); + sk_msg_init(msg_en); + + sg_init_table(rec->sg_aead_in, 2); + sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, sizeof(rec->aad_space)); - sg_set_buf(&rec->sg_encrypted_data[0], rec->aad_space, + sg_unmark_end(&rec->sg_aead_in[1]); + + sg_init_table(rec->sg_aead_out, 2); + sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_out[1]); ctx->open_rec = rec; rec->inplace_crypto = 1; @@ -588,6 +404,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) bool is_kvec = msg->msg_iter.type & ITER_KVEC; bool eor = !(msg->msg_flags & MSG_MORE); size_t try_to_copy, copied = 0; + struct sk_msg *msg_pl, *msg_en; struct tls_rec *rec; int required_size; int num_async = 0; @@ -631,23 +448,26 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto send_end; } - orig_size = rec->sg_plaintext_size; + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + + orig_size = msg_pl->sg.size; full_record = false; try_to_copy = msg_data_left(msg); - record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size; + record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true; } - required_size = rec->sg_plaintext_size + try_to_copy + + required_size = msg_pl->sg.size + try_to_copy + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; alloc_encrypted: - ret = alloc_encrypted_sg(sk, required_size); + ret = tls_alloc_encrypted_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto wait_for_memory; @@ -656,17 +476,13 @@ alloc_encrypted: * actually allocated. The difference is due * to max sg elements limit */ - try_to_copy -= required_size - rec->sg_encrypted_size; + try_to_copy -= required_size - msg_en->sg.size; full_record = true; } if (!is_kvec && (full_record || eor) && !async_capable) { - ret = zerocopy_from_iter(sk, &msg->msg_iter, - try_to_copy, &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size, - &rec->sg_plaintext_data[1], - ARRAY_SIZE(rec->sg_plaintext_data) - 1, - true); + ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter, + msg_pl, try_to_copy); if (ret) goto fallback_to_reg_send; @@ -684,15 +500,12 @@ alloc_encrypted: continue; fallback_to_reg_send: - trim_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size, - orig_size); + sk_msg_trim(sk, msg_pl, orig_size); } - required_size = rec->sg_plaintext_size + try_to_copy; + required_size = msg_pl->sg.size + try_to_copy; - ret = move_to_plaintext_sg(sk, required_size); + ret = tls_clone_plaintext_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto send_end; @@ -701,20 +514,21 @@ fallback_to_reg_send: * actually allocated. 
The difference is due * to max sg elements limit */ - try_to_copy -= required_size - rec->sg_plaintext_size; + try_to_copy -= required_size - msg_pl->sg.size; full_record = true; - - trim_sg(sk, &rec->sg_encrypted_data[1], - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size, - rec->sg_plaintext_size + - tls_ctx->tx.overhead_size); + sk_msg_trim(sk, msg_en, msg_pl->sg.size + + tls_ctx->tx.overhead_size); } - ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy); - if (ret) + ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_pl, + try_to_copy); + if (ret < 0) goto trim_sgl; + /* Open records defined only if successfully copied, otherwise + * we would trim the sg but not reset the open record frags. + */ + tls_ctx->pending_open_record_frags = true; copied += try_to_copy; if (full_record || eor) { ret = tls_push_record(sk, msg->msg_flags, record_type); @@ -734,11 +548,11 @@ wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { trim_sgl: - trim_both_sgl(sk, orig_size); + tls_trim_both_msgs(sk, orig_size); goto send_end; } - if (rec->sg_encrypted_size < required_size) + if (msg_en->sg.size < required_size) goto alloc_encrypted; } @@ -782,7 +596,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); unsigned char record_type = TLS_RECORD_TYPE_DATA; size_t orig_size = size; - struct scatterlist *sg; + struct sk_msg *msg_pl; struct tls_rec *rec; int num_async = 0; bool full_record; @@ -823,20 +637,23 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, goto sendpage_end; } + msg_pl = &rec->msg_plaintext; + full_record = false; - record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size; + record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; copy = size; if (copy >= record_room) { copy = record_room; full_record = true; } - required_size = rec->sg_plaintext_size + copy + - tls_ctx->tx.overhead_size; + + required_size = msg_pl->sg.size + copy + + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; alloc_payload: - ret = alloc_encrypted_sg(sk, required_size); + ret = tls_alloc_encrypted_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto wait_for_memory; @@ -845,26 +662,18 @@ alloc_payload: * actually allocated. 
The difference is due * to max sg elements limit */ - copy -= required_size - rec->sg_plaintext_size; + copy -= required_size - msg_pl->sg.size; full_record = true; } - get_page(page); - sg = &rec->sg_plaintext_data[1] + rec->sg_plaintext_num_elem; - sg_set_page(sg, page, copy, offset); - sg_unmark_end(sg); - - rec->sg_plaintext_num_elem++; - + sk_msg_page_add(msg_pl, page, copy, offset); sk_mem_charge(sk, copy); + offset += copy; size -= copy; - rec->sg_plaintext_size += copy; - tls_ctx->pending_open_record_frags = rec->sg_plaintext_num_elem; - if (full_record || eor || - rec->sg_plaintext_num_elem == - ARRAY_SIZE(rec->sg_plaintext_data) - 1) { + tls_ctx->pending_open_record_frags = true; + if (full_record || eor || sk_msg_full(msg_pl)) { rec->inplace_crypto = 0; ret = tls_push_record(sk, flags, record_type); if (ret) { @@ -880,7 +689,7 @@ wait_for_sndbuf: wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { - trim_both_sgl(sk, rec->sg_plaintext_size); + tls_trim_both_msgs(sk, msg_pl->sg.size); goto sendpage_end; } @@ -951,6 +760,64 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return skb; } +static int tls_setup_from_iter(struct sock *sk, struct iov_iter *from, + int length, int *pages_used, + unsigned int *size_used, + struct scatterlist *to, + int to_max_pages) +{ + int rc = 0, i = 0, num_elem = *pages_used, maxpages; + struct page *pages[MAX_SKB_FRAGS]; + unsigned int size = *size_used; + ssize_t copied, use; + size_t offset; + + while (length > 0) { + i = 0; + maxpages = to_max_pages - num_elem; + if (maxpages == 0) { + rc = -EFAULT; + goto out; + } + copied = iov_iter_get_pages(from, pages, + length, + maxpages, &offset); + if (copied <= 0) { + rc = -EFAULT; + goto out; + } + + iov_iter_advance(from, copied); + + length -= copied; + size += copied; + while (copied) { + use = min_t(int, copied, PAGE_SIZE - offset); + + sg_set_page(&to[num_elem], + pages[i], use, offset); + sg_unmark_end(&to[num_elem]); + /* We do not uncharge memory from this API */ + + offset = 0; + copied -= use; + + i++; + num_elem++; + } + } + /* Mark the end in the last sg entry if newly added */ + if (num_elem > *pages_used) + sg_mark_end(&to[num_elem - 1]); +out: + if (rc) + iov_iter_revert(from, size - *size_used); + *size_used = size; + *pages_used = num_elem; + + return rc; +} + /* This function decrypts the input skb into either out_iov or in out_sg * or in skb buffers itself. The input parameter 'zc' indicates if * zero-copy mode needs to be tried or not. 
With zero-copy mode, either @@ -1046,9 +913,9 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE); *chunk = 0; - err = zerocopy_from_iter(sk, out_iov, data_len, &pages, - chunk, &sgout[1], - (n_sgout - 1), false); + err = tls_setup_from_iter(sk, out_iov, data_len, + &pages, chunk, &sgout[1], + (n_sgout - 1)); if (err < 0) goto fallback_to_reg_recv; } else if (out_sg) { @@ -1433,25 +1300,15 @@ void tls_sw_free_resources_tx(struct sock *sk) rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); - - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - list_del(&rec->list); + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) { - free_sg(sk, &rec->sg_encrypted_data[1], - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size); - - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - list_del(&rec->list); + sk_msg_free(sk, &rec->msg_encrypted); + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } From 4056130e4d7adcc60ca39e5274578f6d9af11a1e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 13 Oct 2018 02:46:00 +0200 Subject: [PATCH 1516/1640] UPSTREAM: tls: replace poll implementation with read hook Instead of re-implementing the poll routine to trigger reads from kTLS, we reuse the stream_memory_read callback, which is simpler and achieves the same result. This helps to align sockmap and kTLS so we can more easily embed BPF in kTLS. Joint work with Daniel. Change-Id: I6782aaa51c8db2207e406d556aa2c7e2bf1beb55 Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov
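From user space nothing changes with this patch: applications still poll() and then recv(). As I read the diff, the readable check is simply answered deeper in the stack, with tcp_poll() consulting the stream_memory_read hook (tls_sw_stream_read() returns true only once a parsed record is queued in recv_pkt). A minimal caller-side sketch, using stdin as a stand-in for a real kTLS socket fd:

/* Userspace sketch: poll() semantics on a kTLS socket are unchanged;
 * only the kernel-internal readable check moved to a read hook. */
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

static int tls_record_ready(int tls_fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = tls_fd, .events = POLLIN };
	int n = poll(&pfd, 1, timeout_ms);

	/* POLLIN now comes from tcp_poll() asking the stream_memory_read
	 * hook, rather than from the removed tls_sw_poll(). */
	if (n > 0 && (pfd.revents & POLLIN))
		return 1;	/* a decrypted record is waiting */
	return n;		/* 0 on timeout, -1 on error */
}

int main(void)
{
	/* Stand-in fd: stdin instead of a real kTLS socket. */
	printf("readable: %d\n", tls_record_ready(STDIN_FILENO, 0));
	return 0;
}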
--- include/net/tls.h | 6 ++---- net/tls/tls_main.c | 11 ++++++----- net/tls/tls_sw.c | 16 +++------------- 3 files changed, 11 insertions(+), 22 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index ac3970f3f66a..e195c2d3b11b 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -142,8 +142,7 @@ struct tls_sw_context_rx { struct strparser strp; void (*saved_data_ready)(struct sock *sk); - unsigned int (*sk_poll)(struct file *file, struct socket *sock, - struct poll_table_struct *wait); + struct sk_buff *recv_pkt; u8 control; bool decrypted; @@ -281,8 +280,7 @@ void tls_sw_free_resources_rx(struct sock *sk); void tls_sw_release_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -unsigned int tls_sw_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait); +bool tls_sw_stream_read(const struct sock *sk); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index d956837ba75e..0ebe0be35322 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -622,12 +622,14 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage; prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; - prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; - prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; + prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; + prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read; + prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE]; - prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; - prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; + prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; + prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read; + prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; #ifdef CONFIG_TLS_DEVICE prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; @@ -723,7 +725,6 @@ static int __init tls_register(void) build_protos(tls_prots[TLSV4], &tcp_prot); tls_sw_proto_ops = inet_stream_ops; - tls_sw_proto_ops.poll = tls_sw_poll; tls_sw_proto_ops.splice_read = tls_sw_splice_read; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 3ca873dfbee9..9893ce3cba65 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1166,23 +1166,15 @@ splice_read_end: return copied ? : err; } -unsigned int tls_sw_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait) +bool tls_sw_stream_read(const struct sock *sk) { - unsigned int ret; - struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - /* Grab POLLOUT and POLLHUP from the underlying socket */ - ret = ctx->sk_poll(file, sock, wait); - - /* Clear POLLIN bits, and set based on recv_pkt */ - ret &= ~(POLLIN | POLLRDNORM); if (ctx->recv_pkt) - ret |= POLLIN | POLLRDNORM; + return true; - return ret; + return false; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -1503,8 +1495,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; - strp_check_rcv(&sw_ctx_rx->strp); } From 814c8a3be7c66cc8b7fa8fcdb7431c2fbe68fe8f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 13 Oct 2018 02:46:01 +0200 Subject: [PATCH 1517/1640] UPSTREAM: tls: add bpf support to sk_msg handling This work adds BPF sk_msg verdict program support to kTLS, allowing BPF and kTLS to be combined. Previously, kTLS and sk_msg verdict programs were mutually exclusive in the ULP layer, which created challenges for the orchestrator when trying to apply TCP-based policy, for example. To resolve this, leveraging the work from previous patches that consolidates the use of sk_msg, we can finally enable BPF sk_msg verdict programs so that they continue to run after the kTLS socket is created. There is no change in behavior when kTLS is not used in combination with BPF, and the kselftest suite for kTLS also runs successfully. Joint work with Daniel. Change-Id: I9903043df595917cc34269c8070a151eda3a782b Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov
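For context, the kind of program this patch allows to keep running on a kTLS socket is an sk_msg verdict program such as the minimal sketch below. It assumes a libbpf-style build; the section-name convention is libbpf's, and the function name is illustrative, not taken from the patch.

/* Minimal sk_msg verdict program (sketch, libbpf-style build assumed). */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sk_msg")
int tls_msg_verdict(struct sk_msg_md *msg)
{
	/* Pass all records through; returning SK_DROP instead would
	 * exercise the __SK_DROP branch of bpf_exec_tx_verdict()
	 * added by this patch. */
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";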
--- include/linux/skmsg.h | 41 +++- net/tls/tls_sw.c | 445 +++++++++++++++++++++++++++++++++++------- 2 files changed, 417 insertions(+), 69 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 9f7a3f17a4cc..706662f21dca 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -29,7 +29,11 @@ struct sk_msg_sg { u32 size; u32 copybreak; bool copy[MAX_MSG_FRAGS]; - struct scatterlist data[MAX_MSG_FRAGS]; + /* The extra element is used for chaining the front and sections when + * the list becomes partitioned (e.g. end < start). The crypto APIs + * require the chaining. + */ + struct scatterlist data[MAX_MSG_FRAGS + 1]; }; struct sk_msg { @@ -112,6 +116,7 @@ void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); +void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes); int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); @@ -161,8 +166,9 @@ static inline void sk_msg_clear_meta(struct sk_msg *msg) static inline void sk_msg_init(struct sk_msg *msg) { + BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != MAX_MSG_FRAGS); memset(msg, 0, sizeof(*msg)); - sg_init_marker(msg->sg.data, ARRAY_SIZE(msg->sg.data)); + sg_init_marker(msg->sg.data, MAX_MSG_FRAGS); } static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, @@ -174,6 +180,12 @@ static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, src->sg.data[which].offset += size; } +static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) +{ + memcpy(dst, src, sizeof(*src)); + sk_msg_init(src); +} + static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { return msg->sg.end >= msg->sg.start ? @@ -234,6 +246,26 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, sk_msg_iter_next(msg, end); } +static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) +{ + do { + msg->sg.copy[i] = copy_state; + sk_msg_iter_var_next(i); + if (i == msg->sg.end) + break; + } while (1); +} + +static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start) +{ + sk_msg_sg_copy(msg, start, true); +} + +static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start) +{ + sk_msg_sg_copy(msg, start, false); +} + static inline struct sk_psock *sk_psock(const struct sock *sk) { return rcu_dereference_sk_user_data(sk); @@ -250,6 +282,11 @@ static inline void sk_psock_queue_msg(struct sk_psock *psock, list_add_tail(&msg->list, &psock->ingress_msg); } +static inline bool sk_psock_queue_empty(const struct sk_psock *psock) +{ + return psock ? list_empty(&psock->ingress_msg) : true; +} + static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9893ce3cba65..dff896c28b5d 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -4,6 +4,7 @@ * Copyright (c) 2016-2017, Lance Chao . All rights reserved. * Copyright (c) 2016, Fridolin Pokorny . All rights reserved. * Copyright (c) 2016, Nikos Mavrogiannopoulos . All rights reserved. + * Copyright (c) 2018, Covalent IO, Inc. http://covalent.io * * This software is available to you under a choice of one of two * licenses.
You may choose to be licensed under the terms of the GNU @@ -111,19 +112,56 @@ static int tls_clone_plaintext_msg(struct sock *sk, int required) return sk_msg_clone(sk, msg_pl, msg_en, skip, len); } +static struct tls_rec *tls_get_rec(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct sk_msg *msg_pl, *msg_en; + struct tls_rec *rec; + int mem_size; + + mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send); + + rec = kzalloc(mem_size, sk->sk_allocation); + if (!rec) + return NULL; + + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + + sk_msg_init(msg_pl); + sk_msg_init(msg_en); + + sg_init_table(rec->sg_aead_in, 2); + sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, + sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_in[1]); + + sg_init_table(rec->sg_aead_out, 2); + sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, + sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_out[1]); + + return rec; +} + +static void tls_free_rec(struct sock *sk, struct tls_rec *rec) +{ + sk_msg_free(sk, &rec->msg_encrypted); + sk_msg_free(sk, &rec->msg_plaintext); + kfree(rec); +} + static void tls_free_open_rec(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - /* Return if there is no open record */ - if (!rec) - return; - - sk_msg_free(sk, &rec->msg_encrypted); - sk_msg_free(sk, &rec->msg_plaintext); - kfree(rec); + if (rec) { + tls_free_rec(sk, rec); + ctx->open_rec = NULL; + } } int tls_tx_records(struct sock *sk, int flags) @@ -292,16 +330,135 @@ static int tls_do_encryption(struct sock *sk, return rc; } +static int tls_split_open_record(struct sock *sk, struct tls_rec *from, + struct tls_rec **to, struct sk_msg *msg_opl, + struct sk_msg *msg_oen, u32 split_point, + u32 tx_overhead_size, u32 *orig_end) +{ + u32 i, j, bytes = 0, apply = msg_opl->apply_bytes; + struct scatterlist *sge, *osge, *nsge; + u32 orig_size = msg_opl->sg.size; + struct scatterlist tmp = { }; + struct sk_msg *msg_npl; + struct tls_rec *new; + int ret; + + new = tls_get_rec(sk); + if (!new) + return -ENOMEM; + ret = sk_msg_alloc(sk, &new->msg_encrypted, msg_opl->sg.size + + tx_overhead_size, 0); + if (ret < 0) { + tls_free_rec(sk, new); + return ret; + } + + *orig_end = msg_opl->sg.end; + i = msg_opl->sg.start; + sge = sk_msg_elem(msg_opl, i); + while (apply && sge->length) { + if (sge->length > apply) { + u32 len = sge->length - apply; + + get_page(sg_page(sge)); + sg_set_page(&tmp, sg_page(sge), len, + sge->offset + apply); + sge->length = apply; + bytes += apply; + apply = 0; + } else { + apply -= sge->length; + bytes += sge->length; + } + + sk_msg_iter_var_next(i); + if (i == msg_opl->sg.end) + break; + sge = sk_msg_elem(msg_opl, i); + } + + msg_opl->sg.end = i; + msg_opl->sg.curr = i; + msg_opl->sg.copybreak = 0; + msg_opl->apply_bytes = 0; + msg_opl->sg.size = bytes; + + msg_npl = &new->msg_plaintext; + msg_npl->apply_bytes = apply; + msg_npl->sg.size = orig_size - bytes; + + j = msg_npl->sg.start; + nsge = sk_msg_elem(msg_npl, j); + if (tmp.length) { + memcpy(nsge, &tmp, sizeof(*nsge)); + sk_msg_iter_var_next(j); + nsge = sk_msg_elem(msg_npl, j); + } + + osge = sk_msg_elem(msg_opl, i); + while (osge->length) { + memcpy(nsge, osge, sizeof(*nsge)); + sg_unmark_end(nsge); + sk_msg_iter_var_next(i); + sk_msg_iter_var_next(j); + if (i == *orig_end) + break; + osge = sk_msg_elem(msg_opl, i); + nsge = 
sk_msg_elem(msg_npl, j); + } + + msg_npl->sg.end = j; + msg_npl->sg.curr = j; + msg_npl->sg.copybreak = 0; + + *to = new; + return 0; +} + +static void tls_merge_open_record(struct sock *sk, struct tls_rec *to, + struct tls_rec *from, u32 orig_end) +{ + struct sk_msg *msg_npl = &from->msg_plaintext; + struct sk_msg *msg_opl = &to->msg_plaintext; + struct scatterlist *osge, *nsge; + u32 i, j; + + i = msg_opl->sg.end; + sk_msg_iter_var_prev(i); + j = msg_npl->sg.start; + + osge = sk_msg_elem(msg_opl, i); + nsge = sk_msg_elem(msg_npl, j); + + if (sg_page(osge) == sg_page(nsge) && + osge->offset + osge->length == nsge->offset) { + osge->length += nsge->length; + put_page(sg_page(nsge)); + } + + msg_opl->sg.end = orig_end; + msg_opl->sg.curr = orig_end; + msg_opl->sg.copybreak = 0; + msg_opl->apply_bytes = msg_opl->sg.size + msg_npl->sg.size; + msg_opl->sg.size += msg_npl->sg.size; + + sk_msg_free(sk, &to->msg_encrypted); + sk_msg_xfer_full(&to->msg_encrypted, &from->msg_encrypted); + + kfree(from); +} + static int tls_push_record(struct sock *sk, int flags, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct tls_rec *rec = ctx->open_rec; + struct tls_rec *rec = ctx->open_rec, *tmp = NULL; + u32 i, split_point, uninitialized_var(orig_end); struct sk_msg *msg_pl, *msg_en; struct aead_request *req; + bool split; int rc; - u32 i; if (!rec) return 0; @@ -309,6 +466,18 @@ static int tls_push_record(struct sock *sk, int flags, msg_pl = &rec->msg_plaintext; msg_en = &rec->msg_encrypted; + split_point = msg_pl->apply_bytes; + split = split_point && split_point < msg_pl->sg.size; + if (split) { + rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en, + split_point, tls_ctx->tx.overhead_size, + &orig_end); + if (rc < 0) + return rc; + sk_msg_trim(sk, msg_en, msg_pl->sg.size + + tls_ctx->tx.overhead_size); + } + rec->tx_flags = flags; req = &rec->aead_req; @@ -340,57 +509,139 @@ static int tls_push_record(struct sock *sk, int flags, rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i); if (rc < 0) { - if (rc != -EINPROGRESS) + if (rc != -EINPROGRESS) { tls_err_abort(sk, EBADMSG); + if (split) { + tls_ctx->pending_open_record_frags = true; + tls_merge_open_record(sk, rec, tmp, orig_end); + } + } return rc; + } else if (split) { + msg_pl = &tmp->msg_plaintext; + msg_en = &tmp->msg_encrypted; + sk_msg_trim(sk, msg_en, msg_pl->sg.size + + tls_ctx->tx.overhead_size); + tls_ctx->pending_open_record_frags = true; + ctx->open_rec = tmp; } return tls_tx_records(sk, flags); } -static int tls_sw_push_pending_record(struct sock *sk, int flags) -{ - return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA); -} - -static struct tls_rec *get_rec(struct sock *sk) +static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, + bool full_record, u8 record_type, + size_t *copied, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct sk_msg *msg_pl, *msg_en; + struct sk_msg msg_redir = { }; + struct sk_psock *psock; + struct sock *sk_redir; struct tls_rec *rec; - int mem_size; + int err = 0, send; + bool enospc; - /* Return if we already have an open record */ - if (ctx->open_rec) - return ctx->open_rec; + psock = sk_psock_get(sk); + if (!psock) + return tls_push_record(sk, flags, record_type); +more_data: + enospc = sk_msg_full(msg); + if (psock->eval == __SK_NONE) + psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (msg->cork_bytes && 
msg->cork_bytes > msg->sg.size && + !enospc && !full_record) { + err = -ENOSPC; + goto out_err; + } + msg->cork_bytes = 0; + send = msg->sg.size; + if (msg->apply_bytes && msg->apply_bytes < send) + send = msg->apply_bytes; - mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send); + switch (psock->eval) { + case __SK_PASS: + err = tls_push_record(sk, flags, record_type); + if (err < 0) { + *copied -= sk_msg_free(sk, msg); + tls_free_open_rec(sk); + goto out_err; + } + break; + case __SK_REDIRECT: + sk_redir = psock->sk_redir; + memcpy(&msg_redir, msg, sizeof(*msg)); + if (msg->apply_bytes < send) + msg->apply_bytes = 0; + else + msg->apply_bytes -= send; + sk_msg_return_zero(sk, msg, send); + msg->sg.size -= send; + release_sock(sk); + err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags); + lock_sock(sk); + if (err < 0) { + *copied -= sk_msg_free_nocharge(sk, &msg_redir); + msg->sg.size = 0; + } + if (msg->sg.size == 0) + tls_free_open_rec(sk); + break; + case __SK_DROP: + default: + sk_msg_free_partial(sk, msg, send); + if (msg->apply_bytes < send) + msg->apply_bytes = 0; + else + msg->apply_bytes -= send; + if (msg->sg.size == 0) + tls_free_open_rec(sk); + *copied -= send; + err = -EACCES; + } + + if (likely(!err)) { + bool reset_eval = !ctx->open_rec; + + rec = ctx->open_rec; + if (rec) { + msg = &rec->msg_plaintext; + if (!msg->apply_bytes) + reset_eval = true; + } + if (reset_eval) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } + if (rec) + goto more_data; + } + out_err: + sk_psock_put(sk, psock); + return err; +} + +static int tls_sw_push_pending_record(struct sock *sk, int flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; + struct sk_msg *msg_pl; + size_t copied; - rec = kzalloc(mem_size, sk->sk_allocation); if (!rec) - return NULL; + return 0; msg_pl = &rec->msg_plaintext; - msg_en = &rec->msg_encrypted; + copied = msg_pl->sg.size; + if (!copied) + return 0; - sk_msg_init(msg_pl); - sk_msg_init(msg_en); - - sg_init_table(rec->sg_aead_in, 2); - sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, - sizeof(rec->aad_space)); - sg_unmark_end(&rec->sg_aead_in[1]); - - sg_init_table(rec->sg_aead_out, 2); - sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, - sizeof(rec->aad_space)); - sg_unmark_end(&rec->sg_aead_out[1]); - - ctx->open_rec = rec; - rec->inplace_crypto = 1; - - return rec; + return bpf_exec_tx_verdict(msg_pl, sk, true, TLS_RECORD_TYPE_DATA, + &copied, flags); } int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) @@ -442,7 +693,10 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto send_end; } - rec = get_rec(sk); + if (ctx->open_rec) + rec = ctx->open_rec; + else + rec = ctx->open_rec = tls_get_rec(sk); if (!rec) { ret = -ENOMEM; goto send_end; @@ -481,6 +735,8 @@ alloc_encrypted: } if (!is_kvec && (full_record || eor) && !async_capable) { + u32 first = msg_pl->sg.end; + ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter, msg_pl, try_to_copy); if (ret) @@ -490,15 +746,27 @@ alloc_encrypted: num_zc++; copied += try_to_copy; - ret = tls_push_record(sk, msg->msg_flags, record_type); + + sk_msg_sg_copy_set(msg_pl, first); + ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, + record_type, &copied, + msg->msg_flags); if (ret) { if (ret == -EINPROGRESS) num_async++; + else if (ret == -ENOMEM) + goto wait_for_memory; + else if (ret == -ENOSPC) + 
goto rollback_iter; else if (ret != -EAGAIN) goto send_end; } continue; - +rollback_iter: + copied -= try_to_copy; + sk_msg_sg_copy_clear(msg_pl, first); + iov_iter_revert(&msg->msg_iter, + msg_pl->sg.size - orig_size); fallback_to_reg_send: sk_msg_trim(sk, msg_pl, orig_size); } @@ -531,12 +799,19 @@ fallback_to_reg_send: tls_ctx->pending_open_record_frags = true; copied += try_to_copy; if (full_record || eor) { - ret = tls_push_record(sk, msg->msg_flags, record_type); + ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, + record_type, &copied, + msg->msg_flags); if (ret) { if (ret == -EINPROGRESS) num_async++; - else if (ret != -EAGAIN) + else if (ret == -ENOMEM) + goto wait_for_memory; + else if (ret != -EAGAIN) { + if (ret == -ENOSPC) + ret = 0; goto send_end; + } } } @@ -595,10 +870,10 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); unsigned char record_type = TLS_RECORD_TYPE_DATA; - size_t orig_size = size; struct sk_msg *msg_pl; struct tls_rec *rec; int num_async = 0; + size_t copied = 0; bool full_record; int record_room; int ret = 0; @@ -631,7 +906,10 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, goto sendpage_end; } - rec = get_rec(sk); + if (ctx->open_rec) + rec = ctx->open_rec; + else + rec = ctx->open_rec = tls_get_rec(sk); if (!rec) { ret = -ENOMEM; goto sendpage_end; @@ -641,6 +919,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, full_record = false; record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; + copied = 0; copy = size; if (copy >= record_room) { copy = record_room; @@ -671,16 +950,23 @@ alloc_payload: offset += copy; size -= copy; + copied += copy; tls_ctx->pending_open_record_frags = true; if (full_record || eor || sk_msg_full(msg_pl)) { rec->inplace_crypto = 0; - ret = tls_push_record(sk, flags, record_type); + ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, + record_type, &copied, flags); if (ret) { if (ret == -EINPROGRESS) num_async++; - else if (ret != -EAGAIN) + else if (ret == -ENOMEM) + goto wait_for_memory; + else if (ret != -EAGAIN) { + if (ret == -ENOSPC) + ret = 0; goto sendpage_end; + } } } continue; @@ -704,24 +990,20 @@ wait_for_memory: } } sendpage_end: - if (orig_size > size) - ret = orig_size - size; - else - ret = sk_stream_error(sk, flags, ret); - + ret = sk_stream_error(sk, flags, ret); release_sock(sk); - return ret; + return copied ? 
copied : ret; } -static struct sk_buff *tls_wait_data(struct sock *sk, int flags, - long timeo, int *err) +static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock, + int flags, long timeo, int *err) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct sk_buff *skb; DEFINE_WAIT_FUNC(wait, woken_wake_function); - while (!(skb = ctx->recv_pkt)) { + while (!(skb = ctx->recv_pkt) && sk_psock_queue_empty(psock)) { if (sk->sk_err) { *err = sock_error(sk); return NULL; @@ -746,7 +1028,10 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait); + sk_wait_event(sk, &timeo, + ctx->recv_pkt != skb || + !sk_psock_queue_empty(psock), + &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); @@ -1012,6 +1297,7 @@ int tls_sw_recvmsg(struct sock *sk, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct sk_psock *psock; unsigned char control; struct strp_msg *rxm; struct sk_buff *skb; @@ -1026,6 +1312,7 @@ int tls_sw_recvmsg(struct sock *sk, if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); + psock = sk_psock_get(sk); lock_sock(sk); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); @@ -1034,9 +1321,19 @@ int tls_sw_recvmsg(struct sock *sk, bool zc = false; int chunk = 0; - skb = tls_wait_data(sk, flags, timeo, &err); - if (!skb) + skb = tls_wait_data(sk, psock, flags, timeo, &err); + if (!skb) { + if (psock) { + int ret = __tcp_bpf_recvmsg(sk, psock, msg, len); + + if (ret > 0) { + copied += ret; + len -= ret; + continue; + } + } goto recv_end; + } rxm = strp_msg(skb); if (!cmsg) { @@ -1110,6 +1407,8 @@ int tls_sw_recvmsg(struct sock *sk, recv_end: release_sock(sk); + if (psock) + sk_psock_put(sk, psock); return copied ? 
: err; } @@ -1132,7 +1431,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - skb = tls_wait_data(sk, flags, timeo, &err); + skb = tls_wait_data(sk, NULL, flags, timeo, &err); if (!skb) goto splice_read_end; @@ -1170,11 +1469,16 @@ bool tls_sw_stream_read(const struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + bool ingress_empty = true; + struct sk_psock *psock; - if (ctx->recv_pkt) - return true; + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) + ingress_empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); - return false; + return !ingress_empty || ctx->recv_pkt; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -1253,8 +1557,15 @@ static void tls_data_ready(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct sk_psock *psock; strp_data_ready(&ctx->strp); + + psock = sk_psock_get(sk); + if (psock && !list_empty(&psock->ingress_msg)) { + ctx->saved_data_ready(sk); + sk_psock_put(sk, psock); + } } void tls_sw_free_resources_tx(struct sock *sk) From 23425b243125538f603c3753ac5e58a209cdbf62 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Oct 2018 10:36:01 -0700 Subject: [PATCH 1518/1640] UPSTREAM: bpf: sockmap, fix skmsg recvmsg handler to track size correctly When converting sockmap to new skmsg generic data structures we missed that the recvmsg handler did not correctly use sg.size and instead was using individual elements length. The result is if a sock is closed with outstanding data we omit the call to sk_mem_uncharge() and can get the warning below. [ 66.728282] WARNING: CPU: 6 PID: 5783 at net/core/stream.c:206 sk_stream_kill_queues+0x1fa/0x210 To fix this correct the redirect handler to xfer the size along with the scatterlist and also decrement the size from the recvmsg handler. Now when a sock is closed the remaining 'size' will be decremented with sk_mem_uncharge(). Change-Id: Ied975b7e59f1de678b837bd44467c2c5babc2568 Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 1 + net/ipv4/tcp_bpf.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 706662f21dca..9c27dbbe8d05 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -176,6 +176,7 @@ static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, { dst->sg.data[which] = src->sg.data[which]; dst->sg.data[which].length = size; + dst->sg.size += size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 80debb0daf37..f9d3cf185827 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -73,6 +73,7 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, sge->offset += copy; sge->length -= copy; sk_mem_uncharge(sk, copy); + msg_rx->sg.size -= copy; if (!sge->length) { i++; if (i == MAX_SKB_FRAGS) From eebd012984443ae59d2e5785414bba7b4a518d44 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Oct 2018 11:08:04 -0700 Subject: [PATCH 1519/1640] UPSTREAM: bpf: sockmap, support for msg_peek in sk_msg with redirect ingress This adds support for the MSG_PEEK flag when doing redirect to ingress and receiving on the sk_msg psock queue. 
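For illustration (not part of this patch), the intended application-visible semantics are the standard recv(2) ones; a minimal sketch, assuming fd is a connected TCP socket whose psock ingress queue is fed by an SK_MSG ingress redirect:

#include <sys/socket.h>

static void peek_then_read(int fd)
{
	char buf[64];

	/* Peek: copies bytes out but leaves them queued on the
	 * psock ingress list; sg.start/offset are not advanced.
	 */
	recv(fd, buf, sizeof(buf), MSG_PEEK);

	/* Normal read: returns the same bytes again and consumes them. */
	recv(fd, buf, sizeof(buf), 0);
}
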
Previously the flag was being ignored which could confuse applications if they expected the flag to work as normal. Change-Id: I8c2c3f1fcd8b46b049c24eea20c0108385e1f851 Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/net/tcp.h | 2 +- net/ipv4/tcp_bpf.c | 42 +++++++++++++++++++++++++++--------------- net/tls/tls_sw.c | 3 ++- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 2c8f18224acc..63f834597d80 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2173,7 +2173,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len); + struct msghdr *msg, int len, int flags); /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index f9d3cf185827..b7918d4caa30 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -39,17 +39,19 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, } int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len) + struct msghdr *msg, int len, int flags) { struct iov_iter *iter = &msg->msg_iter; + int peek = flags & MSG_PEEK; int i, ret, copied = 0; + struct sk_msg *msg_rx; + + msg_rx = list_first_entry_or_null(&psock->ingress_msg, + struct sk_msg, list); while (copied != len) { struct scatterlist *sge; - struct sk_msg *msg_rx; - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); if (unlikely(!msg_rx)) break; @@ -70,22 +72,30 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, } copied += copy; - sge->offset += copy; - sge->length -= copy; - sk_mem_uncharge(sk, copy); - msg_rx->sg.size -= copy; - if (!sge->length) { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - if (!msg_rx->skb) - put_page(page); + if (likely(!peek)) { + sge->offset += copy; + sge->length -= copy; + sk_mem_uncharge(sk, copy); + msg_rx->sg.size -= copy; + + if (!sge->length) { + sk_msg_iter_var_next(i); + if (!msg_rx->skb) + put_page(page); + } + } else { + sk_msg_iter_var_next(i); } if (copied == len) break; } while (i != msg_rx->sg.end); + if (unlikely(peek)) { + msg_rx = list_next_entry(msg_rx, list); + continue; + } + msg_rx->sg.start = i; if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { list_del(&msg_rx->list); @@ -93,6 +103,8 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, consume_skb(msg_rx->skb); kfree(msg_rx); } + msg_rx = list_first_entry_or_null(&psock->ingress_msg, + struct sk_msg, list); } return copied; @@ -115,7 +127,7 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); lock_sock(sk); msg_bytes_ready: - copied = __tcp_bpf_recvmsg(sk, psock, msg, len); + copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); if (!copied) { int data, err = 0; long timeo; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index dff896c28b5d..2abc32cd0a69 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1324,7 +1324,8 @@ int tls_sw_recvmsg(struct sock *sk, skb = tls_wait_data(sk, psock, flags, timeo, &err); if (!skb) { if (psock) { - int ret = __tcp_bpf_recvmsg(sk, psock, msg, len); + int ret = __tcp_bpf_recvmsg(sk, psock, + msg, len, flags); if (ret > 0) { copied 
+= ret; From da20361cf00deded387e98a0083145eee3f46d0d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 26 Nov 2018 14:16:17 -0800 Subject: [PATCH 1520/1640] BACKPORT: bpf: helper to pop data from messages This adds a BPF SK_MSG program helper so that we can pop data from a msg. We use this to pop metadata from a previous push data call. Change-Id: Idcd8ce6393b152481de7d042994a795a10424bec Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 13 +++ net/core/filter.c | 171 +++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_bpf.c | 17 +++- net/tls/tls_sw.c | 11 ++- 4 files changed, 207 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 52334193f056..aa6265f52d02 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2584,6 +2584,19 @@ union bpf_attr { * * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags) + * Description + * Will remove *pop* bytes from a *msg* starting at byte *start*. + * This may result in **ENOMEM** errors under certain situations if + * an allocation and copy are required due to a full ring buffer. + * However, the helper will try to avoid doing the allocation + * if possible. Other errors can occur if input parameters are + * invalid either due to *start* byte not being valid part of msg + * payload and/or *pop* value being to large. + * + * Return + * 0 on success, or a negative erro in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/net/core/filter.c b/net/core/filter.c index 9c247eac2d32..fdf763eff0b1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2451,6 +2451,174 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { .arg4_type = ARG_ANYTHING, }; +static void sk_msg_shift_left(struct sk_msg *msg, int i) +{ + int prev; + + do { + prev = i; + sk_msg_iter_var_next(i); + msg->sg.data[prev] = msg->sg.data[i]; + } while (i != msg->sg.end); + + sk_msg_iter_prev(msg, end); +} + +static void sk_msg_shift_right(struct sk_msg *msg, int i) +{ + struct scatterlist tmp, sge; + + sk_msg_iter_next(msg, end); + sge = sk_msg_elem_cpy(msg, i); + sk_msg_iter_var_next(i); + tmp = sk_msg_elem_cpy(msg, i); + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sk_msg_iter_var_next(i); + sge = tmp; + tmp = sk_msg_elem_cpy(msg, i); + } +} + +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + u32 i = 0, l, space, offset = 0; + u64 last = start + len; + int pop; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + offset += l; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + /* Bounds checks: start and pop must be inside message */ + if (start >= offset + l || last >= msg->sg.size) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + pop = len; + /* --------------| offset + * -| start |-------- len -------| + * + * |----- a ----|-------- pop -------|----- b ----| + * |______________________________________________| length + * + * + * a: region at front of scatter element to save + * b: region at back of scatter element to save when length > A + pop + * pop: region to pop from element, same as input 'pop' here will be + * decremented below per iteration. 
+ * + * Two top-level cases to handle when start != offset, first B is non + * zero and second B is zero corresponding to when a pop includes more + * than one element. + * + * Then if B is non-zero AND there is no space allocate space and + * compact A, B regions into page. If there is space shift ring to + * the rigth free'ing the next element in ring to place B, leaving + * A untouched except to reduce length. + */ + if (start != offset) { + struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); + int a = start; + int b = sge->length - pop - a; + + sk_msg_iter_var_next(i); + + if (pop < sge->length - a) { + if (space) { + sge->length = a; + sk_msg_shift_right(msg, i); + nsge = sk_msg_elem(msg, i); + get_page(sg_page(sge)); + sg_set_page(nsge, + sg_page(sge), + b, sge->offset + pop + a); + } else { + struct page *page, *orig; + u8 *to, *from; + + page = alloc_pages(__GFP_NOWARN | + __GFP_COMP | GFP_ATOMIC, + get_order(a + b)); + if (unlikely(!page)) + return -ENOMEM; + + sge->length = a; + orig = sg_page(sge); + from = sg_virt(sge); + to = page_address(page); + memcpy(to, from, a); + memcpy(to + a, from + a + pop, b); + sg_set_page(sge, page, a + b, 0); + put_page(orig); + } + pop = 0; + } else if (pop >= sge->length - a) { + sge->length = a; + pop -= (sge->length - a); + } + } + + /* From above the current layout _must_ be as follows, + * + * -| offset + * -| start + * + * |---- pop ---|---------------- b ------------| + * |____________________________________________| length + * + * Offset and start of the current msg elem are equal because in the + * previous case we handled offset != start and either consumed the + * entire element and advanced to the next element OR pop == 0. + * + * Two cases to handle here are first pop is less than the length + * leaving some remainder b above. Simply adjust the element's layout + * in this case. Or pop >= length of the element so that b = 0. In this + * case advance to next element decrementing pop. 
+ */ + while (pop) { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (pop < sge->length) { + sge->length -= pop; + sge->offset += pop; + pop = 0; + } else { + pop -= sge->length; + sk_msg_shift_left(msg, i); + } + sk_msg_iter_var_next(i); + } + + sk_mem_uncharge(msg->sk, len - pop); + msg->sg.size -= (len - pop); + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_pop_data_proto = { + .func = bpf_msg_pop_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -5504,6 +5672,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_msg_push_data || + func == bpf_msg_pop_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || @@ -5847,6 +6016,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; + case BPF_FUNC_msg_pop_data: + return &bpf_msg_pop_data_proto; default: return bpf_base_func_proto(func_id); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index b7918d4caa30..769002716650 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -288,12 +288,23 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, { bool cork = false, enospc = msg->sg.start == msg->sg.end; struct sock *sk_redir; - u32 tosend; + u32 tosend, delta = 0; int ret; more_data: - if (psock->eval == __SK_NONE) + if (psock->eval == __SK_NONE) { + /* Track delta in msg size to add/subtract it on SK_DROP from + * returned to user copied size. This ensures user doesn't + * get a positive return code with msg_cut_data and SK_DROP + * verdict. 
+ */ + delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (msg->sg.size < delta) + delta -= msg->sg.size; + else + delta = 0; + } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc) { @@ -349,7 +360,7 @@ more_data: default: sk_msg_free_partial(sk, msg, tosend); sk_msg_apply_bytes(psock, tosend); - *copied -= tosend; + *copied -= (tosend + delta); return -EACCES; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2abc32cd0a69..52ab489c8b2a 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -540,6 +540,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, struct sock *sk_redir; struct tls_rec *rec; int err = 0, send; + u32 delta = 0; bool enospc; psock = sk_psock_get(sk); @@ -547,8 +548,14 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, return tls_push_record(sk, flags, record_type); more_data: enospc = sk_msg_full(msg); - if (psock->eval == __SK_NONE) + if (psock->eval == __SK_NONE) { + delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (delta < msg->sg.size) + delta -= msg->sg.size; + else + delta = 0; + } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc && !full_record) { err = -ENOSPC; @@ -596,7 +603,7 @@ more_data: msg->apply_bytes -= send; if (msg->sg.size == 0) tls_free_open_rec(sk); - *copied -= send; + *copied -= (send + delta); err = -EACCES; } From ea8f880736d889f7234090db1f2b014a7b0c2a00 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 16 Dec 2018 15:47:04 -0800 Subject: [PATCH 1521/1640] UPSTREAM: bpf: sockmap, metadata support for reporting size of msg This adds metadata to sk_msg_md for BPF programs to read the sk_msg size. When the SK_MSG program is running under an application that is using sendfile the data is not copied into sk_msg buffers by default. Rather the BPF program uses sk_msg_pull_data to read the bytes in. This avoids doing the costly memcopy instructions when they are not in fact needed. However, if we don't know the size of the sk_msg we have to guess if needed bytes are available by doing a pull request which may fail. By including the size of the sk_msg BPF programs can check the size before issuing sk_msg_pull_data requests. Additionally, the same applies for sendmsg calls when the application provides multiple iovs. Here the BPF program needs to pull in data to update data pointers but its not clear where the data ends without a size parameter. In many cases "guessing" is not easy to do and results in multiple calls to pull and without bounded loops everything gets fairly tricky. Clean this up by including a u32 size field. Note, all writes into sk_msg_md are rejected already from sk_msg_is_valid_access so nothing additional is needed there. Change-Id: I678f88a9d07c5dbdb593c5b85209764ea37e8efb Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 3 +++ include/uapi/linux/bpf.h | 1 + net/core/filter.c | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 9c27dbbe8d05..3da77344e58b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -36,6 +36,9 @@ struct sk_msg_sg { struct scatterlist data[MAX_MSG_FRAGS + 1]; }; +/* UAPI in filter.c depends on struct sk_msg_sg being first element. If + * this is moved filter.c also must be updated. 
+ */ struct sk_msg { struct sk_msg_sg sg; void *data; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa6265f52d02..5f7e2d18a8b5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3028,6 +3028,7 @@ struct sk_msg_md { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + __u32 size; /* Total size of sk_msg */ }; struct sk_reuseport_md { diff --git a/net/core/filter.c b/net/core/filter.c index fdf763eff0b1..a71ea4808287 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7956,6 +7956,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct sk_msg_md, size): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_sg, size)); + break; } return insn - insn_buf; From d16fa417d022a2da65c1c9b0e76c237e84c34a06 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:30 -0800 Subject: [PATCH 1522/1640] UPSTREAM: bpf: sk_msg, improve offset chk in _is_valid_access The check for max offset in sk_msg_is_valid_access uses sizeof() which is incorrect because it would allow accessing possibly past the end of the struct in the padded case. Further, it doesn't preclude accessing any padding that may be added in the middle of a struct. All told this makes it fragile to rely on. To fix this explicitly check offsets with fields using the bpf_ctx_range() and bpf_ctx_range_till() macros. For reference the current structure layout looks as follows (reported by pahole) struct sk_msg_md { union { void * data; /* 8 */ }; /* 0 8 */ union { void * data_end; /* 8 */ }; /* 8 8 */ __u32 family; /* 16 4 */ __u32 remote_ip4; /* 20 4 */ __u32 local_ip4; /* 24 4 */ __u32 remote_ip6[4]; /* 28 16 */ __u32 local_ip6[4]; /* 44 16 */ __u32 remote_port; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ __u32 local_port; /* 64 4 */ __u32 size; /* 68 4 */ /* size: 72, cachelines: 2, members: 10 */ /* last cacheline: 8 bytes */ }; So there should be no padding at the moment but fixing this now prevents future errors. 
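For reference, the case labels used below rely on the GCC case-range macros from include/linux/filter.h, which at this point are defined roughly as:

#define bpf_ctx_range(TYPE, MEMBER)					\
	offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1
#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2)			\
	offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1

so a label such as bpf_ctx_range(struct sk_msg_md, size) only matches offsets that land inside the field itself; any offset falling into padding (current or future) hits the default case and is rejected.
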
Reported-by: Alexei Starovoitov Change-Id: Iccfe666b425bfd7b51ca5f006961bf2147fb1a2f Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/filter.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index a71ea4808287..a2724faa0313 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6796,6 +6796,9 @@ static bool sk_msg_is_valid_access(int off, int size, if (type == BPF_WRITE) return false; + if (off % size != 0) + return false; + switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; @@ -6807,16 +6810,20 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; - default: + case bpf_ctx_range(struct sk_msg_md, family): + case bpf_ctx_range(struct sk_msg_md, remote_ip4): + case bpf_ctx_range(struct sk_msg_md, local_ip4): + case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct sk_msg_md, remote_port): + case bpf_ctx_range(struct sk_msg_md, local_port): + case bpf_ctx_range(struct sk_msg_md, size): if (size != sizeof(__u32)) return false; + break; + default: + return false; } - - if (off < 0 || off >= sizeof(struct sk_msg_md)) - return false; - if (off % size != 0) - return false; - return true; } From 8023bfbf6ae88e772997b6a1174d6d3949ddeae0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:31 -0800 Subject: [PATCH 1523/1640] UPSTREAM: bpf: skmsg, replace comments with BUILD bug Enforce comment on structure layout dependency with a BUILD_BUG_ON to ensure the condition is maintained. Suggested-by: Daniel Borkmann Change-Id: Ic13e65176618207ed60d6b378081b9415112d98d Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 4 +--- net/core/filter.c | 3 +++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 3da77344e58b..02af834a8084 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -36,9 +36,7 @@ struct sk_msg_sg { struct scatterlist data[MAX_MSG_FRAGS + 1]; }; -/* UAPI in filter.c depends on struct sk_msg_sg being first element. If - * this is moved filter.c also must be updated. - */ +/* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { struct sk_msg_sg sg; void *data; diff --git a/net/core/filter.c b/net/core/filter.c index a2724faa0313..7acd40a8b892 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7851,6 +7851,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, int off; #endif + /* convert ctx uses the fact sg element is first in struct */ + BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); + switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), From 292e82a2d0fc5bfb6921c355b0816c6143a7f5e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 08:51:49 -0700 Subject: [PATCH 1524/1640] UPSTREAM: tcp: add tcp_wstamp_ns socket field TCP will soon provide earliest departure time on TX skbs. It needs to track this in a new variable. tcp_mstamp_refresh() needs to update this variable, and became too big to stay an inline. Change-Id: I9230564dac5f4dfe282528de4cec8c970cf54d5a Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 2 ++ include/net/tcp.h | 12 +----------- net/ipv4/tcp_output.c | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e109913dfbd6..223c6695fd3a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -233,6 +233,8 @@ struct tcp_sock { is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ + u64 tcp_wstamp_ns; /* departure time for next sent data packet */ + /* RTT measurement */ u64 tcp_mstamp; /* most recent packet received/sent */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 63f834597d80..b14556e238d0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -781,17 +781,7 @@ static inline u32 tcp_time_stamp_raw(void) return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ); } - -/* Refresh 1us clock of a TCP socket, - * ensuring monotically increasing values. - */ -static inline void tcp_mstamp_refresh(struct tcp_sock *tp) -{ - u64 val = tcp_clock_us(); - - if (val > tp->tcp_mstamp) - tp->tcp_mstamp = val; -} +void tcp_mstamp_refresh(struct tcp_sock *tp); static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0350846feb59..487e082f6ee2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -62,6 +62,22 @@ int sysctl_tcp_tso_win_divisor __read_mostly = 3; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +/* Refresh clocks of a TCP socket, + * ensuring monotically increasing values. + */ +void tcp_mstamp_refresh(struct tcp_sock *tp) +{ + u64 val = tcp_clock_ns(); + + /* departure time for next data packet */ + if (val > tp->tcp_wstamp_ns) + tp->tcp_wstamp_ns = val; + + val = div_u64(val, NSEC_PER_USEC); + if (val > tp->tcp_mstamp) + tp->tcp_mstamp = val; +} + static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); From 0964c844ba485515e4055c9aeb3c24358ff312aa Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 17 Jan 2019 08:51:01 -0800 Subject: [PATCH 1525/1640] BACKPORT: bpf: fix SO_MAX_PACING_RATE to support TCP internal pacing If sch_fq packet scheduler is not used, TCP can fallback to internal pacing, but this requires sk_pacing_status to be properly set. Fixes: 8c4b4c7e9ff0 ("bpf: Add setsockopt helper function to bpf") Change-Id: Ic786ba5295dd87fa99a7931ae1792b7b178bb628 Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Cc: Lawrence Brakmo Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- net/core/filter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 7acd40a8b892..effcafa9b43a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4220,6 +4220,10 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); break; case SO_MAX_PACING_RATE: + if (val != ~0U) + cmpxchg(&sk->sk_pacing_status, + SK_PACING_NONE, + SK_PACING_NEEDED); sk->sk_max_pacing_rate = val; sk->sk_pacing_rate = min(sk->sk_pacing_rate, sk->sk_max_pacing_rate); From 7a6e373eb5b72c1a2cddce4880e3a88251ec92f7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 24 Sep 2018 16:49:57 -0400 Subject: [PATCH 1526/1640] BACKPORT: flow_dissector: lookup netns by skb->sk if skb->dev is NULL BPF flow dissectors are configured per network namespace. 
__skb_flow_dissect looks up the netns through dev_net(skb->dev). In some dissector paths skb->dev is NULL, such as for Unix sockets. In these cases fall back to looking up the netns by socket. Analyzing the codepaths leading to __skb_flow_dissect I did not find a case where both skb->dev and skb->sk are NULL. Warn and fall back to standard flow dissector if one is found. Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Reported-by: Eric Dumazet Change-Id: I6c7db9ee1e0b42c1a799f9d0e49d31795e03dd14 Signed-off-by: Willem de Bruijn Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- net/core/flow_dissector.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index adc7d081941e..3127482b5ae1 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -573,7 +573,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; bool skip_vlan = false; - struct bpf_prog *attached; + struct bpf_prog *attached = NULL; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -615,8 +615,14 @@ bool __skb_flow_dissect(const struct sk_buff *skb, target_container); rcu_read_lock(); - attached = skb ? rcu_dereference(dev_net(skb->dev)->flow_dissector_prog) - : NULL; + if (skb) { + if (skb->dev) + attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog); + else if (skb->sk) + attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog); + else + WARN_ON_ONCE(1); + } if (attached) { /* Note that even though the const qualifier is discarded * throughout the execution of the BPF program, all changes(the From dccc894c064203ba313aa5b8cfda7f62155a7e84 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 5 Dec 2018 20:40:47 -0800 Subject: [PATCH 1527/1640] UPSTREAM: selftests/bpf: use thoff instead of nhoff in BPF flow dissector We are returning thoff from the flow dissector, not the nhoff. Pass thoff along with nhoff to the bpf program (initially thoff == nhoff) and expect flow dissector amend/return thoff, not nhoff. This avoids confusion, when by the time bpf flow dissector exits, nhoff == thoff, which doesn't make much sense. Change-Id: I1ee808c190b5b472dd374f20c85a7846cfe2c9bb Signed-off-by: Stanislav Fomichev Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- net/core/flow_dissector.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3127482b5ae1..d982f1ceee86 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -643,6 +643,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, /* Pass parameters to the BPF program */ cb->qdisc_cb.flow_keys = &flow_keys; flow_keys.nhoff = nhoff; + flow_keys.thoff = nhoff; bpf_compute_data_pointers((struct sk_buff *)skb); result = BPF_PROG_RUN(attached, skb); From 737afa596a9dde57b857e2d879345323b4d83cbb Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 5 Dec 2018 20:40:48 -0800 Subject: [PATCH 1528/1640] UPSTREAM: net/flow_dissector: correctly cap nhoff and thoff in case of BPF We want to make sure that the following condition holds: 0 <= nhoff <= thoff <= skb->len BPF program can set out-of-bounds nhoff and thoff, which is dangerous, see recent commit d0c081b49137 ("flow_dissector: properly cap thoff field")'. 
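Concretely (worked example, not part of the diff below), the two clamps establish the invariant pairwise: nhoff is first pinned to [0, skb->len], then thoff is pinned to [nhoff, skb->len]. For a 64-byte packet:

	flow_keys.nhoff = clamp_t(u16, 14, 0, 64);	/* -> 14 */
	flow_keys.thoff = clamp_t(u16, 0xffff, 14, 64);	/* -> 64 */

and a program that returns thoff < nhoff has thoff raised to nhoff.
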
Change-Id: Ib698bb051c4043ebb841283f23f6a4e4c787613b Signed-off-by: Stanislav Fomichev Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- net/core/flow_dissector.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d982f1ceee86..81559b885685 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -651,9 +651,12 @@ bool __skb_flow_dissect(const struct sk_buff *skb, /* Restore state */ memcpy(cb, &cb_saved, sizeof(cb_saved)); + flow_keys.nhoff = clamp_t(u16, flow_keys.nhoff, 0, skb->len); + flow_keys.thoff = clamp_t(u16, flow_keys.thoff, + flow_keys.nhoff, skb->len); + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); - key_control->thoff = min_t(u16, key_control->thoff, skb->len); rcu_read_unlock(); return result == BPF_OK; } From e29e2da5acd863f2dc1e7bac7ccca717b2175312 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 28 Jan 2019 08:53:53 -0800 Subject: [PATCH 1529/1640] BACKPORT: net/flow_dissector: move bpf case into __skb_flow_bpf_dissect This way, we can reuse it for flow dissector in BPF_PROG_TEST_RUN. No functional changes. Change-Id: I0c832987c51402747a3f985641fee4f9e2981933 Signed-off-by: Stanislav Fomichev Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 5 +++ net/core/flow_dissector.c | 92 +++++++++++++++++++++++---------------- 2 files changed, 59 insertions(+), 38 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c210e316e07f..e2d01b0900e7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1227,6 +1227,11 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) } #endif +struct bpf_flow_keys; +bool __skb_flow_bpf_dissect(struct bpf_prog *prog, + const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + struct bpf_flow_keys *flow_keys); bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 81559b885685..5ab18c2b6a7b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -542,6 +542,46 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } } +bool __skb_flow_bpf_dissect(struct bpf_prog *prog, + const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + struct bpf_flow_keys *flow_keys) +{ + struct bpf_skb_data_end cb_saved; + struct bpf_skb_data_end *cb; + u32 result; + + /* Note that even though the const qualifier is discarded + * throughout the execution of the BPF program, all changes(the + * control block) are reverted after the BPF program returns. + * Therefore, __skb_flow_dissect does not alter the skb. 
+ */ + + cb = (struct bpf_skb_data_end *)skb->cb; + + /* Save Control Block */ + memcpy(&cb_saved, cb, sizeof(cb_saved)); + memset(cb, 0, sizeof(*cb)); + + /* Pass parameters to the BPF program */ + memset(flow_keys, 0, sizeof(*flow_keys)); + cb->qdisc_cb.flow_keys = flow_keys; + flow_keys->nhoff = skb_network_offset(skb); + flow_keys->thoff = flow_keys->nhoff; + + bpf_compute_data_pointers((struct sk_buff *)skb); + result = BPF_PROG_RUN(prog, skb); + + /* Restore state */ + memcpy(cb, &cb_saved, sizeof(cb_saved)); + + flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, 0, skb->len); + flow_keys->thoff = clamp_t(u16, flow_keys->thoff, + flow_keys->nhoff, skb->len); + + return result == BPF_OK; +} + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -573,7 +613,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; bool skip_vlan = false; - struct bpf_prog *attached = NULL; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -614,53 +653,30 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - rcu_read_lock(); if (skb) { + struct bpf_flow_keys flow_keys; + struct bpf_prog *attached = NULL; + + rcu_read_lock(); + if (skb->dev) attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog); else if (skb->sk) attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog); else WARN_ON_ONCE(1); - } - if (attached) { - /* Note that even though the const qualifier is discarded - * throughout the execution of the BPF program, all changes(the - * control block) are reverted after the BPF program returns. - * Therefore, __skb_flow_dissect does not alter the skb. - */ - struct bpf_flow_keys flow_keys = {}; - struct bpf_skb_data_end cb_saved; - struct bpf_skb_data_end *cb; - u32 result; - cb = (struct bpf_skb_data_end *)skb->cb; - - /* Save Control Block */ - memcpy(&cb_saved, cb, sizeof(cb_saved)); - memset(cb, 0, sizeof(cb_saved)); - - /* Pass parameters to the BPF program */ - cb->qdisc_cb.flow_keys = &flow_keys; - flow_keys.nhoff = nhoff; - flow_keys.thoff = nhoff; - - bpf_compute_data_pointers((struct sk_buff *)skb); - result = BPF_PROG_RUN(attached, skb); - - /* Restore state */ - memcpy(cb, &cb_saved, sizeof(cb_saved)); - - flow_keys.nhoff = clamp_t(u16, flow_keys.nhoff, 0, skb->len); - flow_keys.thoff = clamp_t(u16, flow_keys.thoff, - flow_keys.nhoff, skb->len); - - __skb_flow_bpf_to_target(&flow_keys, flow_dissector, - target_container); + if (attached) { + ret = __skb_flow_bpf_dissect(attached, skb, + flow_dissector, + &flow_keys); + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + rcu_read_unlock(); + return ret; + } rcu_read_unlock(); - return result == BPF_OK; } - rcu_read_unlock(); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { From 82ca10e0d2ddc7d4db23050ca6c2aff96f2dedc0 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 28 Jan 2019 08:53:54 -0800 Subject: [PATCH 1530/1640] UPSTREAM: bpf: add BPF_PROG_TEST_RUN support for flow dissector The input is packet data, the output is struct bpf_flow_key. This should make it easy to test flow dissector programs without elaborate setup. 
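For illustration, a user-space test could drive this roughly as follows (sketch; prog_fd is assumed to be a loaded BPF_PROG_TYPE_FLOW_DISSECTOR program, pkt_v4 a raw Ethernet/IPv4/TCP frame built by the test, and bpf_prog_test_run() the tools/lib/bpf wrapper of this era):

	struct bpf_flow_keys flow_keys = {};
	__u32 keys_size = sizeof(flow_keys);
	__u32 retval, duration;
	int err;

	err = bpf_prog_test_run(prog_fd, 10 /* repeat */,
				&pkt_v4, sizeof(pkt_v4),
				&flow_keys, &keys_size,
				&retval, &duration);
	/* on success, retval == BPF_OK and flow_keys holds the
	 * dissected nhoff/thoff, addresses and ports
	 */
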
Change-Id: I8639560bc8b1ce6b983365bb317b184b2cc3ab8a Signed-off-by: Stanislav Fomichev Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 3 ++ net/bpf/test_run.c | 82 +++++++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 1 + 3 files changed, 86 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1ea9d8f73ec..af6534a89b5a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -503,6 +503,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); /* an array of programs to be executed under rcu_lock. * diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 5ec08a0081b0..f58d688aab42 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -359,3 +359,85 @@ out: kfree(data); return ret; } + +int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + u32 size = kattr->test.data_size_in; + u32 repeat = kattr->test.repeat; + struct bpf_flow_keys flow_keys; + u64 time_start, time_spent = 0; + struct bpf_skb_data_end *cb; + u32 retval, duration; + struct sk_buff *skb; + struct sock *sk; + void *data; + int ret; + u32 i; + + if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) + return -EINVAL; + + data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); + if (IS_ERR(data)) + return PTR_ERR(data); + + sk = kzalloc(sizeof(*sk), GFP_USER); + if (!sk) { + kfree(data); + return -ENOMEM; + } + sock_net_set(sk, current->nsproxy->net_ns); + sock_init_data(NULL, sk); + + skb = build_skb(data, 0); + if (!skb) { + kfree(data); + kfree(sk); + return -ENOMEM; + } + skb->sk = sk; + + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + __skb_put(skb, size); + skb->protocol = eth_type_trans(skb, + current->nsproxy->net_ns->loopback_dev); + skb_reset_network_header(skb); + + cb = (struct bpf_skb_data_end *)skb->cb; + cb->qdisc_cb.flow_keys = &flow_keys; + + if (!repeat) + repeat = 1; + + time_start = ktime_get_ns(); + for (i = 0; i < repeat; i++) { + preempt_disable(); + rcu_read_lock(); + retval = __skb_flow_bpf_dissect(prog, skb, + &flow_keys_dissector, + &flow_keys); + rcu_read_unlock(); + preempt_enable(); + + if (need_resched()) { + if (signal_pending(current)) + break; + time_spent += ktime_get_ns() - time_start; + cond_resched(); + time_start = ktime_get_ns(); + } + } + time_spent += ktime_get_ns() - time_start; + do_div(time_spent, repeat); + duration = time_spent > U32_MAX ? 
U32_MAX : (u32)time_spent; + + ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), + retval, duration); + + kfree_skb(skb); + kfree(sk); + return ret; +} diff --git a/net/core/filter.c b/net/core/filter.c index effcafa9b43a..b7f3e099a244 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8120,6 +8120,7 @@ const struct bpf_verifier_ops flow_dissector_verifier_ops = { }; const struct bpf_prog_ops flow_dissector_prog_ops = { + .test_run = bpf_prog_test_run_flow_dissector, }; int sk_detach_filter(struct sock *sk) From bd1b956752038d20bb6553cdf7fac19286307667 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 9 Feb 2019 23:22:21 -0800 Subject: [PATCH 1531/1640] UPSTREAM: bpf: Add state, dst_ip4, dst_ip6 and dst_port to bpf_sock This patch adds "state", "dst_ip4", "dst_ip6" and "dst_port" to the bpf_sock. The userspace has already been using "state", e.g. inet_diag (ss -t) and getsockopt(TCP_INFO). This patch also allows narrow load on the following existing fields: "family", "type", "protocol" and "src_port". Unlike IP address, the load offset is resticted to the first byte for them but it can be relaxed later if there is a use case. This patch also folds __sock_filter_check_size() into bpf_sock_is_valid_access() since it is not called by any where else. All bpf_sock checking is in one place. Change-Id: I4523d9c76caf86351662a65197d27ca69615ed63 Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 17 ++++--- net/core/filter.c | 99 +++++++++++++++++++++++++++++++--------- 2 files changed, 85 insertions(+), 31 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5f7e2d18a8b5..6770f32b7ae1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2911,15 +2911,14 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; - __u32 src_ip4; /* Allows 1,2,4-byte read. - * Stored in network byte order. - */ - __u32 src_ip6[4]; /* Allows 1,2,4-byte read. - * Stored in network byte order. - */ - __u32 src_port; /* Allows 4-byte read. 
- * Stored in host byte order - */ + /* IP address also allows 1 and 2 bytes access */ + __u32 src_ip4; + __u32 src_ip6[4]; + __u32 src_port; /* host byte order */ + __u32 dst_port; /* network byte order */ + __u32 dst_ip4; + __u32 dst_ip6[4]; + __u32 state; }; struct bpf_tcp_sock { diff --git a/net/core/filter.c b/net/core/filter.c index b7f3e099a244..45e9fb0c0aaa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6380,21 +6380,6 @@ full_access: return true; } -static bool __sock_filter_check_size(int off, int size, - struct bpf_insn_access_aux *info) -{ - const int size_default = sizeof(__u32); - - switch (off) { - case bpf_ctx_range(struct bpf_sock, src_ip4): - case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): - bpf_ctx_record_field_size(info, size_default); - return bpf_ctx_narrow_access_ok(off, size, size_default); - } - - return size == size_default; -} - bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) @@ -6410,13 +6395,29 @@ bool bpf_sock_common_is_valid_access(int off, int size, bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; - if (!__sock_filter_check_size(off, size, info)) - return false; - return true; + + switch (off) { + case offsetof(struct bpf_sock, state): + case offsetof(struct bpf_sock, family): + case offsetof(struct bpf_sock, type): + case offsetof(struct bpf_sock, protocol): + case offsetof(struct bpf_sock, dst_port): + case offsetof(struct bpf_sock, src_port): + case bpf_ctx_range(struct bpf_sock, src_ip4): + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + case bpf_ctx_range(struct bpf_sock, dst_ip4): + case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } + + return size == size_default; } static bool sock_filter_is_valid_access(int off, int size, @@ -7272,24 +7273,32 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock, family): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); - - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sock, sk_family)); + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_family), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, + skc_family, + FIELD_SIZEOF(struct sock_common, + skc_family), + target_size)); break; case offsetof(struct bpf_sock, type): + BUILD_BUG_ON(HWEIGHT32(SK_FL_TYPE_MASK) != BITS_PER_BYTE * 2); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + *target_size = 2; break; case offsetof(struct bpf_sock, protocol): + BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); + *target_size = 1; break; case offsetof(struct bpf_sock, src_ip4): @@ -7301,6 +7310,15 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, target_size)); break; + case offsetof(struct bpf_sock, 
dst_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_daddr, + FIELD_SIZEOF(struct sock_common, + skc_daddr), + target_size)); + break; + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; @@ -7319,6 +7337,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, #endif break; + case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, dst_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, + skc_v6_daddr.s6_addr32[0], + FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]), + target_size) + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); + *target_size = 4; +#endif + break; + case offsetof(struct bpf_sock, src_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_num), @@ -7328,6 +7363,26 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, skc_num), target_size)); break; + + case offsetof(struct bpf_sock, dst_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_dport), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_dport, + FIELD_SIZEOF(struct sock_common, + skc_dport), + target_size)); + break; + + case offsetof(struct bpf_sock, state): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_state), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_state, + FIELD_SIZEOF(struct sock_common, + skc_state), + target_size)); + break; } return insn - insn_buf; From 1f3282c794a3a0009a10ea496e0330d6ee6b8ace Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 13 Feb 2019 11:53:35 -0800 Subject: [PATCH 1532/1640] UPSTREAM: bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap This patch adds all needed plumbing in preparation to allowing bpf programs to do IP encapping via bpf_lwt_push_encap. Actual implementation is added in the next patch in the patchset. Of note: - bpf_lwt_push_encap can now be called from BPF_PROG_TYPE_LWT_XMIT prog types in addition to BPF_PROG_TYPE_LWT_IN; - if the skb being encapped has GSO set, encapsulation is limited to IPIP/IP+GRE/IP+GUE (both IPv4 and IPv6); - as route lookups are different for ingress vs egress, the single external bpf_lwt_push_encap BPF helper is routed internally to either bpf_lwt_in_push_encap or bpf_lwt_xmit_push_encap BPF_CALLs, depending on prog type. v8 changes: fixed a typo. Change-Id: I32bdc99d964398db6535b2fce6aa7b1d7e6262ea Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 26 ++++++++++++++++++++-- net/core/filter.c | 48 +++++++++++++++++++++++++++++++++++----- 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6770f32b7ae1..9551ae90eaae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2101,6 +2101,19 @@ union bpf_attr { * Only works if *skb* contains an IPv6 packet. Insert a * Segment Routing Header (**struct ipv6_sr_hdr**) inside * the IPv6 header. + * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to LWT_BPF_MAX_HEADROOM total + * bytes in all prepended headers. 
Please note that + * if skb_is_gso(skb) is true, no more than two headers + * can be prepended, and the inner header, if present, + * should be either GRE or UDP/GUE. + * + * BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of + * type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called + * by bpf programs of types BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2812,7 +2825,8 @@ enum bpf_hdr_start_off { /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6, - BPF_LWT_ENCAP_SEG6_INLINE + BPF_LWT_ENCAP_SEG6_INLINE, + BPF_LWT_ENCAP_IP, }; #define __bpf_md_ptr(type, name) \ @@ -2901,7 +2915,15 @@ enum bpf_ret_code { BPF_DROP = 2, /* 3-6 reserved */ BPF_REDIRECT = 7, - /* >127 are reserved for prog type specific return codes */ + /* >127 are reserved for prog type specific return codes. + * + * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been + * changed and should be routed based on its new L3 header. + * (This is an L3 redirect, as opposed to L2 redirect + * represented by BPF_REDIRECT above). + */ + BPF_LWT_REROUTE = 128, }; struct bpf_sock { diff --git a/net/core/filter.c b/net/core/filter.c index 45e9fb0c0aaa..a0a78188a2d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4908,7 +4908,15 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len } #endif /* CONFIG_IPV6_SEG6_BPF */ -BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) +static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, + bool ingress) +{ + return -EINVAL; /* Implemented in the next patch. 
*/ +} +#endif + +BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { @@ -4916,14 +4924,41 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, case BPF_LWT_ENCAP_SEG6: case BPF_LWT_ENCAP_SEG6_INLINE: return bpf_push_seg6_encap(skb, type, hdr, len); +#endif +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) + case BPF_LWT_ENCAP_IP: + return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); #endif default: return -EINVAL; } } -static const struct bpf_func_proto bpf_lwt_push_encap_proto = { - .func = bpf_lwt_push_encap, +BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, + void *, hdr, u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) + case BPF_LWT_ENCAP_IP: + return bpf_push_ip_encap(skb, hdr, len, false /* egress */); +#endif + default: + return -EINVAL; + } +} + +static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { + .func = bpf_lwt_in_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { + .func = bpf_lwt_xmit_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -5683,7 +5718,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_lwt_seg6_adjust_srh || func == bpf_lwt_seg6_action || #endif - func == bpf_lwt_push_encap) + func == bpf_lwt_in_push_encap || + func == bpf_lwt_xmit_push_encap) return true; return false; @@ -6110,7 +6146,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_lwt_push_encap: - return &bpf_lwt_push_encap_proto; + return &bpf_lwt_in_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } @@ -6146,6 +6182,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_l4_csum_replace_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_xmit_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } From 3d2ac943931cd9c40379867f6df136cfa9b846c5 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 13 Feb 2019 11:53:36 -0800 Subject: [PATCH 1533/1640] UPSTREAM: bpf: implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap Implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers to packets (e.g. IP/GRE, GUE, IPIP). This is useful when thousands of different short-lived flows should be encapped, each with a different, dynamically determined destination. Although lwtunnels can be used in some of these scenarios, the ability to dynamically generate encap headers adds more flexibility, e.g. when routing depends on the state of the host (reflected in global bpf maps). v7 changes: - added a call skb_clear_hash(); - removed calls to skb_set_transport_header(); - refuse to encap GSO-enabled packets. v8 changes: - fix build errors when LWT is not enabled. Note: the next patch in the patchset will deal with GSO-enabled packets, which are currently rejected at the encapping attempt.
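For illustration, a minimal sketch of how a program could use this mode once the implementation lands (the SEC() convention is libbpf's; addresses and header field values are placeholders, not part of this patch):

#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("lwt_xmit")
int ipip_encap(struct __sk_buff *skb)
{
	/* Outer IPv4 header for IPIP encap; the kernel fills in
	 * iph.check when it is left at zero.  Addresses are placeholders.
	 */
	struct iphdr iph = {
		.version  = 4,
		.ihl      = 5,
		.ttl      = 64,
		.protocol = IPPROTO_IPIP,
		.saddr    = bpf_htonl(0x0a000001),	/* 10.0.0.1 */
		.daddr    = bpf_htonl(0x0a000002),	/* 10.0.0.2 */
	};

	iph.tot_len = bpf_htons(skb->len + sizeof(iph));

	if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &iph, sizeof(iph)))
		return BPF_DROP;

	/* skb now has a new L3 header; ask the stack to re-route it. */
	return BPF_LWT_REROUTE;
}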
Change-Id: I167e9683cbedf5f47936304c27d19853b38309fd Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- include/net/lwtunnel.h | 2 ++ net/core/filter.c | 2 +- net/core/lwt_bpf.c | 65 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index ec75c0a1c529..6ceb0ff6c353 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -129,6 +129,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); int lwtunnel_xmit(struct sk_buff *skb); +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, + bool ingress); static inline void lwtunnel_set_redirect(struct dst_entry *dst) { diff --git a/net/core/filter.c b/net/core/filter.c index a0a78188a2d1..95a9d48182b1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4912,7 +4912,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) { - return -EINVAL; /* Implemented in the next patch. */ + return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); } #endif diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 3c29c02db4ef..b3ee5899838d 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -384,6 +384,71 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = { .owner = THIS_MODULE, }; +static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) +{ + /* Handling of GSO-enabled packets is added in the next patch. */ + return -EOPNOTSUPP; +} + +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) +{ + struct iphdr *iph; + bool ipv4; + int err; + + if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM)) + return -EINVAL; + + /* validate protocol and length */ + iph = (struct iphdr *)hdr; + if (iph->version == 4) { + ipv4 = true; + if (unlikely(len < iph->ihl * 4)) + return -EINVAL; + } else if (iph->version == 6) { + ipv4 = false; + if (unlikely(len < sizeof(struct ipv6hdr))) + return -EINVAL; + } else { + return -EINVAL; + } + + if (ingress) + err = skb_cow_head(skb, len + skb->mac_len); + else + err = skb_cow_head(skb, + len + LL_RESERVED_SPACE(skb_dst(skb)->dev)); + if (unlikely(err)) + return err; + + /* push the encap headers and fix pointers */ + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + skb_push(skb, len); + if (ingress) + skb_postpush_rcsum(skb, iph, len); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), hdr, len); + bpf_compute_data_pointers(skb); + skb_clear_hash(skb); + + if (ipv4) { + skb->protocol = htons(ETH_P_IP); + iph = ip_hdr(skb); + + if (!iph->check) + iph->check = ip_fast_csum((unsigned char *)iph, + iph->ihl); + } else { + skb->protocol = htons(ETH_P_IPV6); + } + + if (skb_is_gso(skb)) + return handle_gso_encap(skb, ipv4, len); + + return 0; +} + static int __init bpf_lwt_init(void) { return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); From 726b65012dfd54dc437ded059ba2ffe8688babcb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 19 Feb 2019 19:53:02 +0100 Subject: [PATCH 1534/1640] UPSTREAM: bpf: add skb->queue_mapping write access from tc clsact The skb->queue_mapping field already has read access, via __sk_buff->queue_mapping.
This patch allows BPF tc qdisc clsact write access to the queue_mapping via tc_cls_act_is_valid_access. It also rejects writes of the reserved value NO_QUEUE_MAPPING. It is already possible to change this via TC filter action skbedit tc-skbedit(8). Due to the lack of TC examples, let's show one: # tc qdisc add dev ixgbe1 clsact # tc filter add dev ixgbe1 ingress matchall action skbedit queue_mapping 5 # tc filter list dev ixgbe1 ingress The most common mistake is that XPS (Transmit Packet Steering) takes precedence over setting skb->queue_mapping. XPS is configured per DEVICE via /sys/class/net/DEVICE/queues/tx-*/xps_cpus via a CPU hex mask. To disable, set mask=00. The purpose of changing skb->queue_mapping is to influence the selection of the net_device "txq" (struct netdev_queue), which influences selection of the qdisc "root_lock" (via txq->qdisc->q.lock) and txq->_xmit_lock. When using the MQ qdisc the txq->qdisc points to different qdiscs and associated locks, and HARD_TX_LOCK (txq->_xmit_lock), allowing for CPU scalability. Due to the lack of TC examples, let's show how to attach clsact BPF programs: # tc qdisc add dev ixgbe2 clsact # tc filter add dev ixgbe2 egress bpf da obj XXX_kern.o sec tc_qmap2cpu # tc filter list dev ixgbe2 egress Change-Id: I8a3c25ac2bc974e8765b5254badad98daa669489 Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- net/core/filter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 95a9d48182b1..d96c5d003158 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6571,6 +6571,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, queue_mapping): break; default: return false; @@ -6987,9 +6988,18 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, queue_mapping): - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, queue_mapping, 2, - target_size)); + if (type == BPF_WRITE) { + *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + queue_mapping, + 2, target_size)); + } else { + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + queue_mapping, + 2, target_size)); + } break; case offsetof(struct __sk_buff, vlan_present): From e6baa27ae40847bba7624d950e4eaef4bff19d73 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:02 +0800 Subject: [PATCH 1535/1640] BACKPORT: bpf: add helper to check for a valid SYN cookie Using bpf_skc_lookup_tcp it's possible to ascertain whether a packet belongs to a known connection. However, there is one corner case: no sockets are created if SYN cookies are active. This means that the final ACK in the 3WHS is misclassified. Using the helper, we can look up the listening socket via bpf_skc_lookup_tcp and then check whether a packet is a valid SYN cookie ACK.
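A hedged sketch of the intended call pattern from a tc or XDP program (tuple construction and the usual packet bounds checks are elided; iph and th are assumed to point at already-validated IPv4/TCP headers):

	struct bpf_sock_tuple tup = {};
	struct bpf_sock *sk;
	int cookie_ok = 0;

	tup.ipv4.saddr = iph->saddr;
	tup.ipv4.daddr = iph->daddr;
	tup.ipv4.sport = th->source;
	tup.ipv4.dport = th->dest;

	/* Find the (possibly listening) socket for this 4-tuple. */
	sk = bpf_skc_lookup_tcp(skb, &tup, sizeof(tup.ipv4),
				BPF_F_CURRENT_NETNS, 0);
	if (sk) {
		/* 0 means iph/th carry a valid SYN cookie ACK for sk. */
		if (!bpf_tcp_check_syncookie(sk, iph, sizeof(*iph),
					     th, sizeof(*th)))
			cookie_ok = 1;
		bpf_sk_release(sk);
	}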
Change-Id: If6df241e53af7fe53f842932fdcfd5afcc5aefd6 Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 15 +++++++++ net/core/filter.c | 72 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9551ae90eaae..ea8cab3ccadc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2610,6 +2610,21 @@ union bpf_attr { * * Return * 0 on success, or a negative erro in case of failure. + * + * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Check whether iph and th contain a valid SYN cookie ACK for + * the listening socket in sk. + * + * iph points to the start of the IPv4 or IPv6 header, while + * iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr). + * + * th points to the start of the TCP header, while th_len contains + * sizeof(struct tcphdr). + * + * Return + * 0 if iph and th are a valid SYN cookie ACK, or a negative error + * otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/net/core/filter.c b/net/core/filter.c index d96c5d003158..fc4cd3a30a48 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5689,6 +5689,74 @@ static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; + +BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len, + struct tcphdr *, th, u32, th_len) +{ +#ifdef CONFIG_SYN_COOKIES + u32 cookie; + int ret; + + if (unlikely(th_len < sizeof(*th))) + return -EINVAL; + + /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */ + if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) + return -EINVAL; + + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + return -EINVAL; + + if (!th->ack || th->rst || th->syn) + return -ENOENT; + + if (tcp_synq_no_recent_overflow(sk)) + return -ENOENT; + + cookie = ntohl(th->ack_seq) - 1; + + switch (sk->sk_family) { + case AF_INET: + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + + ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); + break; + +#if IS_BUILTIN(CONFIG_IPV6) + case AF_INET6: + if (unlikely(iph_len < sizeof(struct ipv6hdr))) + return -EINVAL; + + ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); + break; +#endif /* CONFIG_IPV6 */ + + default: + return -EPROTONOSUPPORT; + } + + if (ret > 0) + return 0; + + return -ENOENT; +#else + return -ENOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { + .func = bpf_tcp_check_syncookie, + .gpl_only = true, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5966,6 +6034,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_listener_sock_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; + case BPF_FUNC_tcp_check_syncookie: + return &bpf_tcp_check_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); @@ -6003,6 +6073,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_xdp_skc_lookup_tcp_proto; + case BPF_FUNC_tcp_check_syncookie: + return 
&bpf_tcp_check_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); From 7603b6538d6e4c0c936dd1cfca1f7c007f3d64d8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:48 -0400 Subject: [PATCH 1536/1640] UPSTREAM: bpf: in bpf_skb_adjust_room avoid copy in tx fast path bpf_skb_adjust_room calls skb_cow on grow. This expensive operation can be avoided in the fast path when the only other clone has released the header. This is the common case for TCP, where one headerless clone is kept on the retransmit queue. It is safe to do so even when touching the gso fields in skb_shinfo. Regular tunnel encap with iptunnel_handle_offloads takes the same optimization. The tcp stack unclones in the unlikely case that it accesses these fields through headerless clone packets on the retransmit queue (see __tcp_retransmit_skb). If any other clones are present, e.g., from packet sockets, skb_cow_head returns the same value as skb_cow(). Change-Id: I94a3a73b06ef24ce761c4d6605cf676d0e1a16fc Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index fc4cd3a30a48..35472bd6fcf0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2997,7 +2997,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, return -ENOTSUPP; } - ret = skb_cow(skb, len_diff); + ret = skb_cow_head(skb, len_diff); if (unlikely(ret < 0)) return ret; From 339eca4466bbfc7b5399ac3fe14909897aa8b2ba Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Fri, 22 Mar 2019 16:40:18 -0700 Subject: [PATCH 1537/1640] UPSTREAM: bpf: make bpf_skb_ecn_set_ce callable from BPF_PROG_TYPE_SCHED_ACT This helper is useful if a bpf tc filter sets skb->tstamp. Change-Id: Icba0dd85d10ebd90eae7602ba3dd2829a0f86c8e Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 35472bd6fcf0..357d4a78fc91 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6036,6 +6036,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; #endif default: return bpf_base_func_proto(func_id); From 28985fdf30a600f1dd6f7a7c686813252ea7c28d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 23 Mar 2019 12:23:07 -0400 Subject: [PATCH 1538/1640] UPSTREAM: bpf: silence uninitialized var warning in bpf_skb_net_grow These three variables are set in one branch and used in another with the same condition. But on some architectures they still generate compiler warnings of the kind: warning: 'inner_trans' may be used uninitialized in this function [-Wmaybe-uninitialized] Silence these false positives. Use the straightforward approach to always initialize them, if a bit superfluous.
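The shape that trips the warning is roughly the following (a generic sketch of the pattern, not the kernel code itself):

	u16 inner_net;		/* no initializer before this patch */

	if (encap)
		inner_net = skb->network_header;  /* set under a condition */

	/* ... unrelated work ... */

	if (encap)
		skb->inner_network_header = inner_net;	/* used under the
							 * same condition:
							 * provably fine, but
							 * some GCC versions
							 * cannot see it
							 */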
Fixes: 868d523535c2 ("bpf: add bpf_skb_adjust_room encap flags") Reported-by: kbuild test robot Change-Id: Ide6c93d5b1836b104a8483868bd9881c27bc3fb5 Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 357d4a78fc91..1ab67801857a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2986,8 +2986,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; + u16 mac_len = 0, inner_net = 0, inner_trans = 0; unsigned int gso_type = SKB_GSO_DODGY; - u16 mac_len, inner_net, inner_trans; int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { From f5e74b7c95b252b15ac33d64fe2f19d878362ca0 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 1 Apr 2019 13:57:33 -0700 Subject: [PATCH 1539/1640] UPSTREAM: flow_dissector: allow access only to a subset of __sk_buff fields Use a whitelist instead of a blacklist and allow only a small set of fields that might be relevant in the context of flow dissector: * data * data_end * flow_keys This is required for the eth_get_headlen case where we have only a chunk of data to dissect (i.e. trying to read the other skb fields doesn't make sense). Note that this is a breaking API change! However, we've provided flow_keys->n_proto as a substitute for skb->protocol; and there is no need to manually handle skb->vlan_present. So even if we break somebody, the migration is trivial. Unfortunately, we can't support the eth_get_headlen use-case without those breaking changes. Change-Id: Ifb5f50dabbb49f7fe0030562c99ce12503da32fb Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- net/core/filter.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 1ab67801857a..0b892def3844 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6950,14 +6950,8 @@ static bool flow_dissector_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - if (type == BPF_WRITE) { - switch (off) { - case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): - break; - default: - return false; - } - } + if (type == BPF_WRITE) + return false; switch (off) { case bpf_ctx_range(struct __sk_buff, data): @@ -6969,11 +6963,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): info->reg_type = PTR_TO_FLOW_KEYS; break; - case bpf_ctx_range(struct __sk_buff, tc_classid): - case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range_till(struct __sk_buff, family, local_port): - case bpf_ctx_range(struct __sk_buff, tstamp): - case bpf_ctx_range(struct __sk_buff, wire_len): + default: return false; } From 0e2d83f0e1dc42dde401fb507a5095d4fbf5395a Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 9 Apr 2019 15:06:41 +0100 Subject: [PATCH 1540/1640] UPSTREAM: bpf: add layer 2 encap support to bpf_skb_adjust_room commit 868d523535c2 ("bpf: add bpf_skb_adjust_room encap flags") introduced support to bpf_skb_adjust_room for GSO-friendly GRE and UDP encapsulation. For GSO to work for skbs, the inner headers (mac and network) need to be marked. For L3 encapsulation using bpf_skb_adjust_room, the mac and network headers are identical. Here we provide a way of specifying the inner mac header length for cases where L2 encap is desired.
Such an approach can support encapsulated ethernet headers, MPLS headers etc. For example to convert from a packet of form [eth][ip][tcp] to [eth][ip][udp][inner mac][ip][tcp], something like the following could be done: headroom = sizeof(iph) + sizeof(struct udphdr) + inner_maclen; ret = bpf_skb_adjust_room(skb, headroom, BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_ENCAP_L4_UDP | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | BPF_F_ADJ_ROOM_ENCAP_L2(inner_maclen)); Change-Id: I451ddb130eb13f3e0c2f90fca379f7b931506c33 Signed-off-by: Alan Maguire Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 10 ++++++++++ net/core/filter.c | 12 ++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ea8cab3ccadc..237378cda4dc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1578,6 +1578,10 @@ union bpf_attr { * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: * Use with ENCAP_L3 flags to further specify the tunnel type. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2(len) **: + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; **len** is the length of the inner MAC header. + * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2814,10 +2818,16 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 + #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* BPF_FUNC_sysctl_get_name flags. 
*/ #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) diff --git a/net/core/filter.c b/net/core/filter.c index 0b892def3844..1e4617031cf6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2980,11 +2980,14 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ - BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2( \ + BPF_ADJ_ROOM_ENCAP_L2_MASK)) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { + u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; unsigned int gso_type = SKB_GSO_DODGY; @@ -3019,6 +3022,8 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, mac_len = skb->network_header - skb->mac_header; inner_net = skb->network_header; + if (inner_mac_len > len_diff) + return -EINVAL; inner_trans = skb->transport_header; } @@ -3027,8 +3032,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, return ret; if (encap) { - /* inner mac == inner_net on l3 encap */ - skb->inner_mac_header = inner_net; + skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; skb_set_inner_protocol(skb, skb->protocol); @@ -3042,7 +3046,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, gso_type |= SKB_GSO_GRE; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) gso_type |= SKB_GSO_IPXIP6; - else + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) gso_type |= SKB_GSO_IPXIP4; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || From 3455a936e981ddc94d9384569d4932572fcd1d3b Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 12 Apr 2019 19:55:47 +0900 Subject: [PATCH 1541/1640] UPSTREAM: bpf: Check address length before reading address family KMSAN will complain if valid address length passed to bpf_bind() is shorter than sizeof("struct sockaddr"->sa_family) bytes. Change-Id: Iff85b549faad2642fca7663aa1ee55f80d4fcadb Signed-off-by: Tetsuo Handa Acked-by: Andrey Ignatov Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 1e4617031cf6..c7724a6380f4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4476,6 +4476,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, * Only binding to IP is supported. */ err = -EINVAL; + if (addr_len < offsetofend(struct sockaddr, sa_family)) + return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; From 15f97acfd1e365a3f5ef060146bf8dc109f07ce2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 12 Apr 2019 09:30:48 -0400 Subject: [PATCH 1542/1640] UPSTREAM: bpf: reserve flags in bpf_skb_net_shrink The ENCAP flags in bpf_skb_adjust_room are ignored on decap with bpf_skb_net_shrink. Reserve these bits for future use. 
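In other words, after this patch a shrink call may only pass the FIXED_GSO flag (a hedged sketch; the len_diff value is illustrative and error handling is abbreviated):

	int ret;

	/* still fine: plain decap of an outer IPv4 header */
	ret = bpf_skb_adjust_room(skb, -(__s32)sizeof(struct iphdr),
				  BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_FIXED_GSO);

	/* now fails with -EINVAL: ENCAP flags are meaningless on shrink */
	ret = bpf_skb_adjust_room(skb, -(__s32)sizeof(struct iphdr),
				  BPF_ADJ_ROOM_MAC,
				  BPF_F_ADJ_ROOM_ENCAP_L3_IPV4);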
Fixes: 868d523535c2d ("bpf: add bpf_skb_adjust_room encap flags") Change-Id: I814907392506a8f22e2ed2073da4f3cf50c74a7b Signed-off-by: Willem de Bruijn Reviewed-by: Alan Maguire Signed-off-by: Daniel Borkmann --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index c7724a6380f4..29e82678d717 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3079,6 +3079,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { int ret; + if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO) + return -EINVAL; + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || From 660f383d44503e0a1061fc5ef32bab1e15729bb7 Mon Sep 17 00:00:00 2001 From: Viet Hoang Tran Date: Mon, 15 Apr 2019 09:54:55 +0000 Subject: [PATCH 1543/1640] UPSTREAM: bpf: allow clearing all sock_ops callback flags The helper function bpf_sock_ops_cb_flags_set() can be used to both set and clear the sock_ops callback flags. However, its current behavior is not consistent. A BPF program may clear a flag if more than one were set, or replace a flag with another one, but cannot clear all flags. This patch also updates the documentation to clarify this helper function's ability to clear flags. Change-Id: Ib0a4971ca5a99c9e1832d7e85ae9bbe7297bdd55 Signed-off-by: Hoang Tran Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 ++++++++- net/core/filter.c | 3 +-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 237378cda4dc..71187a47834a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1793,12 +1793,19 @@ union bpf_attr { * error if an eBPF program tries to set a callback that is not * supported in the current kernel. * - * The supported callback values that *argval* can combine are: + * *argval* is a flag array which can combine these flags: * * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * * Here are some examples of where one could call such eBPF * program: * diff --git a/net/core/filter.c b/net/core/filter.c index 29e82678d717..5ec8718e56d2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4451,8 +4451,7 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; - if (val) - tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); } From e839f86914a55f5f76dae1216b2ec2c364d4a4c6 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 1 Apr 2019 13:57:31 -0700 Subject: [PATCH 1544/1640] UPSTREAM: net/flow_dissector: pass flow_keys->n_proto to BPF programs This is a preparation for the next commit that would prohibit access to most fields of __sk_buff from the BPF programs. Instead of requiring BPF flow dissector programs to look into skb, pass all input data in the flow_keys.
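With this in place, a flow dissector program can read the protocol from flow_keys instead of skb (a minimal sketch; ETH_P_* come from linux/if_ether.h):

	SEC("flow_dissector")
	int dissect(struct __sk_buff *skb)
	{
		struct bpf_flow_keys *keys = skb->flow_keys;

		/* n_proto mirrors skb->protocol at dissection time */
		if (keys->n_proto == bpf_htons(ETH_P_IP) ||
		    keys->n_proto == bpf_htons(ETH_P_IPV6))
			return BPF_OK;

		return BPF_DROP;
	}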
Change-Id: Ife5f0f15775b25806e5c9da699966a9f0a0d95c8 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- net/core/flow_dissector.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5ab18c2b6a7b..57c51163d6c8 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -566,6 +566,7 @@ bool __skb_flow_bpf_dissect(struct bpf_prog *prog, /* Pass parameters to the BPF program */ memset(flow_keys, 0, sizeof(*flow_keys)); cb->qdisc_cb.flow_keys = flow_keys; + flow_keys->n_proto = skb->protocol; flow_keys->nhoff = skb_network_offset(skb); flow_keys->thoff = flow_keys->nhoff; From ccf7d5bd773908d2486da2b5ef801344dd02967e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 22 Apr 2019 08:55:44 -0700 Subject: [PATCH 1545/1640] BACKPORT: flow_dissector: switch kernel context to struct bpf_flow_dissector struct bpf_flow_dissector has a small subset of sk_buff fields that flow dissector BPF program is allowed to access and an optional pointer to real skb. Real skb is used only in bpf_skb_load_bytes helper to read non-linear data. The real motivation for this is to be able to call flow dissector from eth_get_headlen context where we don't have an skb and need to dissect raw bytes. Change-Id: I5aa984bb223fd2cd05dd0a502f8b4519358124d1 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 4 ++ include/net/flow_dissector.h | 7 +++ include/net/sch_generic.h | 11 ++-- net/bpf/test_run.c | 4 -- net/core/filter.c | 105 +++++++++++++++++++++++++++-------- net/core/flow_dissector.c | 44 +++++++-------- 6 files changed, 117 insertions(+), 58 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e2d01b0900e7..79df99d19ed9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1227,6 +1227,10 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) } #endif +struct bpf_flow_dissector; +bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen); + struct bpf_flow_keys; bool __skb_flow_bpf_dissect(struct bpf_prog *prog, const struct sk_buff *skb, diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index ddf916e5e57d..bbedb60c281c 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -291,4 +291,11 @@ flow_dissector_init_keys(struct flow_dissector_key_control *key_control, memset(key_basic, 0, sizeof(*key_basic)); } +struct bpf_flow_dissector { + struct bpf_flow_keys *flow_keys; + const struct sk_buff *skb; + void *data; + void *data_end; +}; + #endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 2e5fe6a6eb60..6b43c92a0af8 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -256,13 +256,10 @@ struct tcf_proto { }; struct qdisc_skb_cb { - union { - struct { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; - }; - struct bpf_flow_keys *flow_keys; + struct { + unsigned int pkt_len; + u16 slave_dev_queue_mapping; + u16 tc_classid; }; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index f58d688aab42..2ec9077948cd 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -368,7 +368,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, u32 repeat = kattr->test.repeat; struct bpf_flow_keys flow_keys; u64 time_start, time_spent = 0; - struct bpf_skb_data_end *cb; u32 
retval, duration; struct sk_buff *skb; struct sock *sk; @@ -406,9 +405,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, current->nsproxy->net_ns->loopback_dev); skb_reset_network_header(skb); - cb = (struct bpf_skb_data_end *)skb->cb; - cb->qdisc_cb.flow_keys = &flow_keys; - if (!repeat) repeat = 1; diff --git a/net/core/filter.c b/net/core/filter.c index 5ec8718e56d2..420f1b900427 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1734,6 +1734,40 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_SIZE, }; +BPF_CALL_4(bpf_flow_dissector_load_bytes, + const struct bpf_flow_dissector *, ctx, u32, offset, + void *, to, u32, len) +{ + void *ptr; + + if (unlikely(offset > 0xffff)) + goto err_clear; + + if (unlikely(!ctx->skb)) + goto err_clear; + + ptr = skb_header_pointer(ctx->skb, offset, len, to); + if (unlikely(!ptr)) + goto err_clear; + if (ptr != to) + memcpy(to, ptr, len); + + return 0; +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = { + .func = bpf_flow_dissector_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { @@ -6190,7 +6224,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: - return &bpf_skb_load_bytes_proto; + return &bpf_flow_dissector_load_bytes_proto; default: return bpf_base_func_proto(func_id); } @@ -6317,9 +6351,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type return false; break; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): - if (size != sizeof(__u64)) - return false; - break; + return false; case bpf_ctx_range(struct __sk_buff, tstamp): if (size != sizeof(__u64)) return false; @@ -6354,7 +6386,6 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): @@ -6381,7 +6412,6 @@ static bool cg_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range(struct __sk_buff, wire_len): return false; case bpf_ctx_range(struct __sk_buff, data): @@ -6427,7 +6457,6 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): return false; @@ -6670,7 +6699,6 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ 
-6884,7 +6912,6 @@ static bool sk_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): return false; @@ -6958,24 +6985,65 @@ static bool flow_dissector_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct __sk_buff)) + return false; + if (type == BPF_WRITE) return false; switch (off) { case bpf_ctx_range(struct __sk_buff, data): + if (size != size_default) + return false; info->reg_type = PTR_TO_PACKET; - break; + return true; case bpf_ctx_range(struct __sk_buff, data_end): + if (size != size_default) + return false; info->reg_type = PTR_TO_PACKET_END; - break; + return true; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + if (size != sizeof(__u64)) + return false; info->reg_type = PTR_TO_FLOW_KEYS; - break; + return true; default: return false; } +} - return bpf_skb_is_valid_access(off, size, type, prog, info); +static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) + +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct __sk_buff, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data), + si->dst_reg, si->src_reg, + offsetof(struct bpf_flow_dissector, data)); + break; + + case offsetof(struct __sk_buff, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end), + si->dst_reg, si->src_reg, + offsetof(struct bpf_flow_dissector, data_end)); + break; + + case offsetof(struct __sk_buff, flow_keys): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys), + si->dst_reg, si->src_reg, + offsetof(struct bpf_flow_dissector, flow_keys)); + break; + } + + return insn - insn_buf; } static u32 bpf_convert_ctx_access(enum bpf_access_type type, @@ -7282,15 +7350,6 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, skc_num, 2, target_size)); break; - case offsetof(struct __sk_buff, flow_keys): - off = si->off; - off -= offsetof(struct __sk_buff, flow_keys); - off += offsetof(struct sk_buff, cb); - off += offsetof(struct qdisc_skb_cb, flow_keys); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, - si->src_reg, off); - break; - case offsetof(struct __sk_buff, tstamp): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); @@ -8291,7 +8350,7 @@ const struct bpf_prog_ops sk_msg_prog_ops = { const struct bpf_verifier_ops flow_dissector_verifier_ops = { .get_func_proto = flow_dissector_func_proto, .is_valid_access = flow_dissector_is_valid_access, - .convert_ctx_access = bpf_convert_ctx_access, + .convert_ctx_access = flow_dissector_convert_ctx_access, }; const struct bpf_prog_ops flow_dissector_prog_ops = { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 57c51163d6c8..653f51ee0c29 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -547,38 +547,34 @@ bool __skb_flow_bpf_dissect(struct bpf_prog *prog, struct flow_dissector *flow_dissector, struct bpf_flow_keys *flow_keys) { - struct bpf_skb_data_end cb_saved; - struct bpf_skb_data_end *cb; + struct bpf_flow_dissector ctx = { + .flow_keys = flow_keys, + .skb = skb, + .data = skb->data, + 
.data_end = skb->data + skb_headlen(skb), + }; + + return bpf_flow_dissect(prog, &ctx, skb->protocol, + skb_network_offset(skb), skb_headlen(skb)); +} + +bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen) +{ + struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; - /* Note that even though the const qualifier is discarded - * throughout the execution of the BPF program, all changes(the - * control block) are reverted after the BPF program returns. - * Therefore, __skb_flow_dissect does not alter the skb. - */ - - cb = (struct bpf_skb_data_end *)skb->cb; - - /* Save Control Block */ - memcpy(&cb_saved, cb, sizeof(cb_saved)); - memset(cb, 0, sizeof(*cb)); - /* Pass parameters to the BPF program */ memset(flow_keys, 0, sizeof(*flow_keys)); - cb->qdisc_cb.flow_keys = flow_keys; - flow_keys->n_proto = skb->protocol; - flow_keys->nhoff = skb_network_offset(skb); + flow_keys->n_proto = proto; + flow_keys->nhoff = nhoff; flow_keys->thoff = flow_keys->nhoff; - bpf_compute_data_pointers((struct sk_buff *)skb); - result = BPF_PROG_RUN(prog, skb); + result = BPF_PROG_RUN(prog, ctx); - /* Restore state */ - memcpy(cb, &cb_saved, sizeof(cb_saved)); - - flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, 0, skb->len); + flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, - flow_keys->nhoff, skb->len); + flow_keys->nhoff, hlen); return result == BPF_OK; } From 8b786b2aa0bbd159605f9f946f1a7295cdecf42c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 23 Apr 2019 14:43:48 -0400 Subject: [PATCH 1546/1640] UPSTREAM: bpf: update skb->protocol in bpf_skb_net_grow Some tunnels, like sit, change the network protocol of packet. If so, update skb->protocol to match the new type. Change-Id: Ib70b4cf26c0a2eb7a3bd1cf9a614972cd282dca4 Signed-off-by: Willem de Bruijn Reviewed-by: Alan Maguire Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- net/core/filter.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 420f1b900427..8653a984002d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3091,6 +3091,14 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb_set_transport_header(skb, mac_len + nh_len); } + + /* Match skb->protocol to new outer l3 protocol */ + if (skb->protocol == htons(ETH_P_IP) && + flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + skb->protocol = htons(ETH_P_IPV6); + else if (skb->protocol == htons(ETH_P_IPV6) && + flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) + skb->protocol = htons(ETH_P_IP); } if (skb_is_gso(skb)) { From 4416e2ed354468c396852df91ac961d6d07a5552 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Tue, 21 May 2019 08:52:38 +0100 Subject: [PATCH 1547/1640] UPSTREAM: bpf: fix out-of-bounds read in __bpf_skc_lookup __bpf_skc_lookup takes a socket tuple and the length of the tuple as an argument. Based on the length, it decides which address family to pass to the helper function sk_lookup. In case of AF_INET6, it fails to verify that the length of the tuple is long enough. sk_lookup may therefore access data past the end of the tuple. 
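From the program side, the tuple length argument must now match one of the two families exactly (sketch; tup is a struct bpf_sock_tuple filled from the packet):

	/* len == sizeof(tup.ipv4) selects AF_INET */
	sk = bpf_skc_lookup_tcp(skb, &tup, sizeof(tup.ipv4),
				BPF_F_CURRENT_NETNS, 0);
	if (sk)
		bpf_sk_release(sk);

	/* len == sizeof(tup.ipv6) selects AF_INET6; any other length
	 * now makes the helper return NULL instead of assuming IPv6
	 * and reading past the end of the tuple
	 */
	sk = bpf_skc_lookup_tcp(skb, &tup, sizeof(tup.ipv6),
				BPF_F_CURRENT_NETNS, 0);
	if (sk)
		bpf_sk_release(sk);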
Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") Change-Id: Ida846f3bc0318ffd98a84bd83c4740c39c0683b6 Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann --- net/core/filter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 8653a984002d..e2222ce0b37d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5317,7 +5317,13 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *net; int sdif; - family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; + if (len == sizeof(tuple->ipv4)) + family = AF_INET; + else if (len == sizeof(tuple->ipv6)) + family = AF_INET6; + else + return NULL; + if (unlikely(family == AF_UNSPEC || flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; From ccf8ed5a308fb44500b6f06495a33ddc36440b7e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 17 May 2019 14:21:17 -0700 Subject: [PATCH 1548/1640] UPSTREAM: bpf: Check sk_fullsock() before returning from bpf_sk_lookup() The BPF_FUNC_sk_lookup_xxx helpers return RET_PTR_TO_SOCKET_OR_NULL. Meaning a fullsock ptr and its fullsock's fields in bpf_sock can be accessed, e.g. type, protocol, mark and priority. Some new helper, like bpf_sk_storage_get(), also expects ARG_PTR_TO_SOCKET is a fullsock. bpf_sk_lookup() currently calls sk_to_full_sk() before returning. However, the ptr returned from sk_to_full_sk() is not guaranteed to be a fullsock. For example, it cannot get a fullsock if sk is in TCP_TIME_WAIT. This patch checks for sk_fullsock() before returning. If it is not a fullsock, sock_gen_put() is called if needed and then returns NULL. Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") Cc: Joe Stringer Change-Id: Icc5c7fc4cff6db124246d9b06a374c7da31fdc9d Signed-off-by: Martin KaFai Lau Acked-by: Joe Stringer Signed-off-by: Daniel Borkmann --- net/core/filter.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index e2222ce0b37d..473805f65037 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5356,8 +5356,14 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, netns_id, flags); - if (sk) + if (sk) { sk = sk_to_full_sk(sk); + if (!sk_fullsock(sk)) { + if (!sock_flag(sk, SOCK_RCU_FREE)) + sock_gen_put(sk); + return NULL; + } + } return sk; } @@ -5388,8 +5394,14 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, flags); - if (sk) + if (sk) { sk = sk_to_full_sk(sk); + if (!sk_fullsock(sk)) { + if (!sock_flag(sk, SOCK_RCU_FREE)) + sock_gen_put(sk); + return NULL; + } + } return sk; } From 9483e94fc475ceea2458acce89d7bdb4abd0698c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 6 Jun 2019 13:30:12 -0700 Subject: [PATCH 1549/1640] UPSTREAM: bpf: allow CGROUP_SKB programs to use bpf_skb_cgroup_id() helper Currently bpf_skb_cgroup_id() is not supported for CGROUP_SKB programs. An attempt to load such a program generates an error like this: libbpf: 0: (b7) r6 = 0 ... 9: (85) call bpf_skb_cgroup_id#79 unknown func bpf_skb_cgroup_id#79 There are no particular reasons for denying it, and we have some use cases where it might be useful. So let's add it to the list of allowed helpers. 
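One such use case, sketched below: per-cgroup egress byte accounting (the BTF-style map definition follows the newer libbpf convention; all names are illustrative):

	struct {
		__uint(type, BPF_MAP_TYPE_HASH);
		__uint(max_entries, 1024);
		__type(key, __u64);
		__type(value, __u64);
	} bytes_per_cgroup SEC(".maps");

	SEC("cgroup_skb/egress")
	int count_egress(struct __sk_buff *skb)
	{
		__u64 cgid = bpf_skb_cgroup_id(skb);
		__u64 len = skb->len, *val;

		val = bpf_map_lookup_elem(&bytes_per_cgroup, &cgid);
		if (val)
			__sync_fetch_and_add(val, len);
		else
			bpf_map_update_elem(&bytes_per_cgroup, &cgid,
					    &len, BPF_ANY);

		return 1;	/* 1 == allow the packet */
	}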
Change-Id: Ieaeaab9d357f07ecee2b112bdd72fd57afd7fef2 Signed-off-by: Roman Gushchin Cc: Yonghong Song Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- net/core/filter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 473805f65037..45cd905a0f12 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5992,6 +5992,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_cgroup_id: + return &bpf_skb_cgroup_id_proto; +#endif #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; From 7f262aa63b84ee8cf07bab1d7fbbe604155f07a9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 12 Jun 2019 10:30:37 -0700 Subject: [PATCH 1550/1640] UPSTREAM: bpf: export bpf_sock for BPF_PROG_TYPE_CGROUP_SOCK_ADDR prog type And let it use bpf_sk_storage_{get,delete} helpers to access socket storage. Kernel context (struct bpf_sock_addr_kern) already has sk member, so I just expose it to the BPF hooks. Using PTR_TO_SOCKET instead of PTR_TO_SOCK_COMMON should be safe because the hook is called on bind/connect. Cc: Martin Lau Change-Id: I8ebe12a2f03f15386d5d1288157509053ca123ed Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 71187a47834a..34d5b4998157 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3196,6 +3196,7 @@ struct bpf_sock_addr { __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. * Stored in network byte order. 
*/ + __bpf_md_ptr(struct bpf_sock *, sk); }; /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/net/core/filter.c b/net/core/filter.c index 45cd905a0f12..3463bedf0b74 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5955,6 +5955,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skc_lookup_tcp: return &bpf_sock_addr_skc_lookup_tcp_proto; #endif /* CONFIG_INET */ + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; default: return bpf_base_func_proto(func_id); } @@ -6875,6 +6879,13 @@ static bool sock_addr_is_valid_access(int off, int size, if (size != size_default) return false; break; + case offsetof(struct bpf_sock_addr, sk): + if (type != BPF_READ) + return false; + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; default: if (type == BPF_READ) { if (size != size_default) @@ -7821,6 +7832,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_addr_kern, struct in6_addr, t_ctx, s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; + case offsetof(struct bpf_sock_addr, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_addr_kern, sk)); + break; } return insn - insn_buf; From b61877909762168e055b626cced243ca741490e8 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 12 Jun 2019 10:30:38 -0700 Subject: [PATCH 1551/1640] UPSTREAM: bpf: export bpf_sock for BPF_PROG_TYPE_SOCK_OPS prog type And let it use bpf_sk_storage_{get,delete} helpers to access socket storage. Kernel context (struct bpf_sock_ops_kern) already has sk member, so I just expose it to the BPF hooks. I use PTR_TO_SOCKET_OR_NULL and return NULL in !is_fullsock case. I also export bpf_tcp_sock to make it possible to access tcp socket stats. 
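A sketch of what this enables (the NULL check is mandatory, since sk is exposed as PTR_TO_SOCKET_OR_NULL; the same sk pointer can likewise be passed to bpf_sk_storage_get()):

	SEC("sockops")
	int sockops_prog(struct bpf_sock_ops *skops)
	{
		struct bpf_sock *sk = skops->sk;
		struct bpf_tcp_sock *tp;

		if (!sk)	/* NULL unless this is a fullsock */
			return 1;

		tp = bpf_tcp_sock(sk);
		if (tp)
			bpf_printk("snd_cwnd %u\n", tp->snd_cwnd);

		return 1;
	}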
Cc: Martin Lau Change-Id: Ic77add758c1d4cb0e2745834749ee796c673c742 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 34d5b4998157..fde31ca2c233 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3248,6 +3248,7 @@ struct bpf_sock_ops { __u32 sk_txhash; __u64 bytes_received; __u64 bytes_acked; + __bpf_md_ptr(struct bpf_sock *, sk); }; /* Definitions for bpf_sock_ops_cb_flags */ diff --git a/net/core/filter.c b/net/core/filter.c index 3463bedf0b74..b299986231ba 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6182,6 +6182,14 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_sockopt_event_output_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); } @@ -6929,6 +6937,11 @@ static bool sock_ops_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; + case offsetof(struct bpf_sock_ops, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET_OR_NULL; + break; default: if (size != size_default) return false; @@ -8096,6 +8109,19 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; + case offsetof(struct bpf_sock_ops, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + break; } return insn - insn_buf; } From d3835f3bbfb5b841b06200ceca592f9c65348911 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 2 Jul 2019 09:13:57 -0700 Subject: [PATCH 1552/1640] UPSTREAM: bpf: split shared bpf_tcp_sock and bpf_sock_ops implementation We've added bpf_tcp_sock member to bpf_sock_ops and don't expect any new tcp_sock fields in bpf_sock_ops. Let's remove CONVERT_COMMON_TCP_SOCK_FIELDS so bpf_tcp_sock can be independently extended. 
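After the split, extending bpf_tcp_sock no longer has to touch bpf_sock_ops; a new field only needs a local case in bpf_tcp_sock_convert_ctx_access() (a sketch, using the delivered_ce counter added later in this series as the example):

	case offsetof(struct bpf_tcp_sock, delivered_ce):
		BPF_TCP_SOCK_GET_COMMON(delivered_ce);
		break;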
Cc: Eric Dumazet Cc: Priyaranjan Jha Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Change-Id: Id06836b3cd1565f39e274d7671de2b08c81cfb63 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- net/core/filter.c | 180 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 126 insertions(+), 54 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index b299986231ba..56dcf795e2d6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5208,54 +5208,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ -#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT) \ -do { \ - switch (si->off) { \ - case offsetof(md_type, snd_cwnd): \ - CONVERT(snd_cwnd); break; \ - case offsetof(md_type, srtt_us): \ - CONVERT(srtt_us); break; \ - case offsetof(md_type, snd_ssthresh): \ - CONVERT(snd_ssthresh); break; \ - case offsetof(md_type, rcv_nxt): \ - CONVERT(rcv_nxt); break; \ - case offsetof(md_type, snd_nxt): \ - CONVERT(snd_nxt); break; \ - case offsetof(md_type, snd_una): \ - CONVERT(snd_una); break; \ - case offsetof(md_type, mss_cache): \ - CONVERT(mss_cache); break; \ - case offsetof(md_type, ecn_flags): \ - CONVERT(ecn_flags); break; \ - case offsetof(md_type, rate_delivered): \ - CONVERT(rate_delivered); break; \ - case offsetof(md_type, rate_interval_us): \ - CONVERT(rate_interval_us); break; \ - case offsetof(md_type, packets_out): \ - CONVERT(packets_out); break; \ - case offsetof(md_type, retrans_out): \ - CONVERT(retrans_out); break; \ - case offsetof(md_type, total_retrans): \ - CONVERT(total_retrans); break; \ - case offsetof(md_type, segs_in): \ - CONVERT(segs_in); break; \ - case offsetof(md_type, data_segs_in): \ - CONVERT(data_segs_in); break; \ - case offsetof(md_type, segs_out): \ - CONVERT(segs_out); break; \ - case offsetof(md_type, data_segs_out): \ - CONVERT(data_segs_out); break; \ - case offsetof(md_type, lost_out): \ - CONVERT(lost_out); break; \ - case offsetof(md_type, sacked_out): \ - CONVERT(sacked_out); break; \ - case offsetof(md_type, bytes_received): \ - CONVERT(bytes_received); break; \ - case offsetof(md_type, bytes_acked): \ - CONVERT(bytes_acked); break; \ - } \ -} while (0) - #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) @@ -5637,9 +5589,6 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct tcp_sock, FIELD)); \ } while (0) - CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock, - BPF_TCP_SOCK_GET_COMMON); - if (insn > insn_buf) return insn - insn_buf; @@ -5654,6 +5603,69 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct tcp_sock, rtt_min) + offsetof(struct minmax_sample, v)); break; + case offsetof(struct bpf_tcp_sock, snd_cwnd): + BPF_TCP_SOCK_GET_COMMON(snd_cwnd); + break; + case offsetof(struct bpf_tcp_sock, srtt_us): + BPF_TCP_SOCK_GET_COMMON(srtt_us); + break; + case offsetof(struct bpf_tcp_sock, snd_ssthresh): + BPF_TCP_SOCK_GET_COMMON(snd_ssthresh); + break; + case offsetof(struct bpf_tcp_sock, rcv_nxt): + BPF_TCP_SOCK_GET_COMMON(rcv_nxt); + break; + case offsetof(struct bpf_tcp_sock, snd_nxt): + BPF_TCP_SOCK_GET_COMMON(snd_nxt); + break; + case offsetof(struct bpf_tcp_sock, snd_una): + BPF_TCP_SOCK_GET_COMMON(snd_una); + break; + case offsetof(struct bpf_tcp_sock, mss_cache): + BPF_TCP_SOCK_GET_COMMON(mss_cache); + break; + case offsetof(struct bpf_tcp_sock, ecn_flags): 
+ BPF_TCP_SOCK_GET_COMMON(ecn_flags); + break; + case offsetof(struct bpf_tcp_sock, rate_delivered): + BPF_TCP_SOCK_GET_COMMON(rate_delivered); + break; + case offsetof(struct bpf_tcp_sock, rate_interval_us): + BPF_TCP_SOCK_GET_COMMON(rate_interval_us); + break; + case offsetof(struct bpf_tcp_sock, packets_out): + BPF_TCP_SOCK_GET_COMMON(packets_out); + break; + case offsetof(struct bpf_tcp_sock, retrans_out): + BPF_TCP_SOCK_GET_COMMON(retrans_out); + break; + case offsetof(struct bpf_tcp_sock, total_retrans): + BPF_TCP_SOCK_GET_COMMON(total_retrans); + break; + case offsetof(struct bpf_tcp_sock, segs_in): + BPF_TCP_SOCK_GET_COMMON(segs_in); + break; + case offsetof(struct bpf_tcp_sock, data_segs_in): + BPF_TCP_SOCK_GET_COMMON(data_segs_in); + break; + case offsetof(struct bpf_tcp_sock, segs_out): + BPF_TCP_SOCK_GET_COMMON(segs_out); + break; + case offsetof(struct bpf_tcp_sock, data_segs_out): + BPF_TCP_SOCK_GET_COMMON(data_segs_out); + break; + case offsetof(struct bpf_tcp_sock, lost_out): + BPF_TCP_SOCK_GET_COMMON(lost_out); + break; + case offsetof(struct bpf_tcp_sock, sacked_out): + BPF_TCP_SOCK_GET_COMMON(sacked_out); + break; + case offsetof(struct bpf_tcp_sock, bytes_received): + BPF_TCP_SOCK_GET_COMMON(bytes_received); + break; + case offsetof(struct bpf_tcp_sock, bytes_acked): + BPF_TCP_SOCK_GET_COMMON(bytes_acked); + break; } return insn - insn_buf; @@ -7937,9 +7949,6 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) - CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops, - SOCK_OPS_GET_TCP_SOCK_FIELD); - if (insn > insn_buf) return insn - insn_buf; @@ -8109,6 +8118,69 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; + case offsetof(struct bpf_sock_ops, snd_cwnd): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd); + break; + case offsetof(struct bpf_sock_ops, srtt_us): + SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us); + break; + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh); + break; + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt); + break; + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt); + break; + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una); + break; + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache); + break; + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags); + break; + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered); + break; + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us); + break; + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out); + break; + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out); + break; + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans); + break; + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in); + break; + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in); + break; + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out); + break; + case offsetof(struct bpf_sock_ops, data_segs_out): + 
SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out); + break; + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out); + break; + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out); + break; + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received); + break; + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); + break; case offsetof(struct bpf_sock_ops, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, From d8696a5d2ece3b2c5394199a11a3e01bf113cf8c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:47 -0700 Subject: [PATCH 1553/1640] BACKPORT: tcp: new helper to calculate newly delivered Add a new helper, tcp_newly_delivered(), to prepare for the ECN accounting change. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3a5e1937dccb..8d894794cab0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3633,6 +3633,16 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) tcp_xmit_retransmit_queue(sk); } +/* Returns the number of packets newly acked or sacked by the current ACK */ +static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 delivered; + + delivered = tp->delivered - prior_delivered; + return delivered; +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3760,7 +3770,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ + delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); @@ -3769,8 +3779,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ - if (flag & FLAG_DSACKING_ACK) + if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_newly_delivered(sk, delivered, flag); + } /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3794,6 +3806,7 @@ old_ack: flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_newly_delivered(sk, delivered, flag); tcp_xmit_recovery(sk, rexmit); } From 283cb586434d2d70671dfde939ec7d406a32f1b4 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:48 -0700 Subject: [PATCH 1554/1640] BACKPORT: tcp: track total bytes delivered with ECN CE marks Introduce a new delivered_ce stat in the TCP socket to estimate the number of packets being marked with CE bits. The estimation is done via ACKs with the ECE bit set. Depending on the actual receiver behavior, the estimation could have biases.
Since the TCP sender can't really see the CE bit in the data path, the sender is technically counting packets marked delivered with the "ECE / ECN-Echo" flag set. With RFC3168 ECN, because the ECE bit is sticky, this count can drastically overestimate the number of CE-marked data packets. With DCTCP-style ECN this should be reasonably precise unless there is loss in the ACK path, in which case it's not precise. With the AccECN proposal this can be made still more precise, even in the case of some degree of ACK loss. However, this is the sender's best estimate of CE information. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 2 ++ 3 files changed, 4 insertions(+) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 223c6695fd3a..ab8b7aa228f9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -274,6 +274,7 @@ struct tcp_sock { * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 lost; /* Total data packets lost incl. rexmits */ u32 app_limited; /* limited until "delivered" reaches this val */ u64 first_tx_mstamp; /* start of window send phase */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a6b05eebabc9..abeb6d7979f3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2430,6 +2430,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->max_packets_out = 0; tp->window_clamp = 0; tp->delivered = 0; + tp->delivered_ce = 0; if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8d894794cab0..50bda443e829 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3640,6 +3640,8 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) u32 delivered; delivered = tp->delivered - prior_delivered; + if (flag & FLAG_ECE) + tp->delivered_ce += delivered; return delivered; } From ce39b198cfdf42446a169662cc9d3fdcd54908b5 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Sun, 4 Mar 2018 10:38:35 -0800 Subject: [PATCH 1555/1640] UPSTREAM: tcp: add send queue size stat in SCM_TIMESTAMPING_OPT_STATS This patch adds a TCP_NLA_SNDQ_SIZE stat into SCM_TIMESTAMPING_OPT_STATS. It reports the number of bytes present in the send queue when the timestamp is generated. Signed-off-by: Priyaranjan Jha Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 6a64beeecfad..798b9a4501cc 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -239,6 +239,7 @@ enum { TCP_NLA_MIN_RTT, /* minimum RTT */ TCP_NLA_RECUR_RETRANS, /* Recurring retransmits for the current pkt */ TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ?
*/ + TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index abeb6d7979f3..27fc971b8141 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3047,7 +3047,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 3 * nla_total_size(sizeof(u32)) + + 4 * nla_total_size(sizeof(u32)) + 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3077,6 +3077,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); + + nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); return stats; } From 2e319fbc16dcc387dedebb1b8b22d43f4e79ea04 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Sun, 4 Mar 2018 10:38:36 -0800 Subject: [PATCH 1556/1640] UPSTREAM: tcp: add ca_state stat in SCM_TIMESTAMPING_OPT_STATS This patch adds a TCP_NLA_CA_STATE stat into SCM_TIMESTAMPING_OPT_STATS. It reports the ca_state of the socket when the timestamp is generated. Signed-off-by: Priyaranjan Jha Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 798b9a4501cc..54598b67b7b4 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -240,6 +240,7 @@ enum { TCP_NLA_RECUR_RETRANS, /* Recurring retransmits for the current pkt */ TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ + TCP_NLA_CA_STATE, /* ca_state of socket */ }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 27fc971b8141..d0366810acb5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3048,7 +3048,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + 4 * nla_total_size(sizeof(u32)) + - 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3079,6 +3079,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); + nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); return stats; } From 63dc14578ff2e3d272d8c0834ddf299f6dfb7b89 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Fri, 16 Mar 2018 10:51:07 -0700 Subject: [PATCH 1557/1640] UPSTREAM: tcp: add snd_ssthresh stat in SCM_TIMESTAMPING_OPT_STATS This patch adds a TCP_NLA_SND_SSTHRESH stat into SCM_TIMESTAMPING_OPT_STATS that reports tcp_sock.snd_ssthresh. Signed-off-by: Yousuk Seung Signed-off-by: Neal Cardwell Signed-off-by: Priyaranjan Jha Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 54598b67b7b4..18a6f15a0d76 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -241,6 +241,7 @@ enum { TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ?
*/ TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ TCP_NLA_CA_STATE, /* ca_state of socket */ + TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d0366810acb5..fac78591d755 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3047,7 +3047,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 4 * nla_total_size(sizeof(u32)) + + 5 * nla_total_size(sizeof(u32)) + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3077,6 +3077,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); + nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); From 17cdf44f2aff9a2d26350f87b5df55ccd509ab90 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:49 -0700 Subject: [PATCH 1558/1640] BACKPORT: tcp: export packets delivery info Export data delivered and delivered with CE marks to 1) SNMP TCPDelivered and TCPDeliveredCE, 2) getsockopt(TCP_INFO), and 3) the timestamping API SOF_TIMESTAMPING_OPT_STATS. Note that for SCM_TSTAMP_ACK, the delivery info in SOF_TIMESTAMPING_OPT_STATS is reported before the info was fully updated on the ACK. These stats help applications monitor TCP delivery and ECN status at the per-host, per-connection, and even per-message level. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 2 ++ include/uapi/linux/tcp.h | 5 +++++ net/ipv4/proc.c | 2 ++ net/ipv4/tcp.c | 7 ++++++- net/ipv4/tcp_input.c | 6 +++++- 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index bf31965355c6..9cc679ab52b4 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -278,6 +278,8 @@ enum LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */ LINUX_MIB_TCPMTUPFAIL, /* TCPMTUPFail */ LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */ + LINUX_MIB_TCPDELIVERED, /* TCPDelivered */ + LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */ LINUX_MIB_TCPWQUEUETOOBIG, /* TCPWqueueTooBig */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 18a6f15a0d76..c6c136a1a52f 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -222,6 +222,9 @@ struct tcp_info { __u64 tcpi_busy_time; /* Time (usec) busy sending data */ __u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */ __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ + + __u32 tcpi_delivered; + __u32 tcpi_delivered_ce; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -242,6 +245,8 @@ enum { TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ TCP_NLA_CA_STATE, /* ca_state of socket */ TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ + TCP_NLA_DELIVERED, /* Data pkts delivered incl.
out-of-order */ + TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 88aaf14983e8..65dfa9dffa88 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -299,6 +299,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), + SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), + SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fac78591d755..f9e7e57d0ca2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3034,6 +3034,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) rate64 = tcp_compute_delivery_rate(tp); if (rate64) info->tcpi_delivery_rate = rate64; + info->tcpi_delivered = tp->delivered; + info->tcpi_delivered_ce = tp->delivered_ce; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3047,7 +3049,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 5 * nla_total_size(sizeof(u32)) + + 7 * nla_total_size(sizeof(u32)) + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3078,9 +3080,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); + nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); + nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); + return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 50bda443e829..c2b45d5f0172 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3636,12 +3636,16 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) /* Returns the number of packets newly acked or sacked by the current ACK */ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) { + const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); u32 delivered; delivered = tp->delivered - prior_delivered; - if (flag & FLAG_ECE) + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); + if (flag & FLAG_ECE) { tp->delivered_ce += delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + } return delivered; } From cdc85f3ea948ab3affa04ac22a20b55b67906710 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:20 -0700 Subject: [PATCH 1559/1640] UPSTREAM: tcp: add a helper to calculate size of opt_stats This is to refactor the calculation of the size of opt_stats to a helper function to make the code cleaner and easier for later changes. Suggested-by: Stephen Hemminger Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f9e7e57d0ca2..67af34b73a85 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3040,6 +3040,29 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) } EXPORT_SYMBOL_GPL(tcp_get_info); +static size_t tcp_opt_stats_get_size(void) +{ + return + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */ + nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ + 0; +} + struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -3048,9 +3071,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u64 rate64; u32 rate; - stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 7 * nla_total_size(sizeof(u32)) + - 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); + stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); if (!stats) return NULL; From d13ba3222338203b74f72dad63ab82a9c480857a Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:21 -0700 Subject: [PATCH 1560/1640] BACKPORT: tcp: add data bytes sent stats Introduce a new TCP stat to record the number of bytes sent (RFC4898 tcpEStatsPerfHCDataOctetsOut) and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +++ include/uapi/linux/tcp.h | 4 +++- net/ipv4/tcp.c | 6 ++++++ net/ipv4/tcp_output.c | 1 + 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ab8b7aa228f9..8e697f4b3d89 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -178,6 +178,9 @@ struct tcp_sock { u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut * total number of data segments sent. */ + u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut + * total number of data bytes sent. + */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. 
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index c6c136a1a52f..8a890fd3bfad 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -225,6 +225,8 @@ struct tcp_info { __u32 tcpi_delivered; __u32 tcpi_delivered_ce; + + __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -247,7 +249,7 @@ enum { TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ TCP_NLA_DELIVERED, /* Data pkts delivered incl. out-of-order */ TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ - + TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 67af34b73a85..bb364b99d4be 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2450,6 +2450,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_saved_syn_free(tp); tp->segs_in = 0; tp->segs_out = 0; + tp->bytes_sent = 0; tp->bytes_acked = 0; tp->bytes_received = 0; tp->data_segs_in = 0; @@ -3036,6 +3037,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_delivery_rate = rate64; info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; + info->tcpi_bytes_sent = tp->bytes_sent; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3060,6 +3062,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ 0; } @@ -3107,6 +3110,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); + nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, + TCP_NLA_PAD); + return stats; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 487e082f6ee2..0c46c4402b4f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1132,6 +1132,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); + tp->bytes_sent += skb->len - tcp_header_size; tcp_internal_pacing(sk, skb); } From 3568fc85b38d8011653b2e00ff7384188654958b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:22 -0700 Subject: [PATCH 1561/1640] BACKPORT: tcp: add data bytes retransmitted stats Introduce a new TCP stat to record the number of bytes retransmitted (RFC4898 tcpEStatsPerfOctetsRetrans) and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +++ include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 5 +++++ net/ipv4/tcp_output.c | 1 + 4 files changed, 11 insertions(+) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8e697f4b3d89..a9a651d80215 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -325,6 +325,9 @@ struct tcp_sock { * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. 
*/ + u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans + * Total data bytes retransmitted + */ u32 total_retrans; /* Total retransmits for entire connection */ u32 urg_seq; /* Seq of received urgent pointer */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8a890fd3bfad..539bcaffbbc5 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -227,6 +227,7 @@ struct tcp_info { __u32 tcpi_delivered_ce; __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ + __u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -250,6 +251,7 @@ enum { TCP_NLA_DELIVERED, /* Data pkts delivered incl. out-of-order */ TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ + TCP_NLA_BYTES_RETRANS, /* Data bytes retransmitted */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bb364b99d4be..18d52f67a966 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2453,6 +2453,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->bytes_sent = 0; tp->bytes_acked = 0; tp->bytes_received = 0; + tp->bytes_retrans = 0; tp->data_segs_in = 0; tp->data_segs_out = 0; @@ -3038,6 +3039,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; info->tcpi_bytes_sent = tp->bytes_sent; + info->tcpi_bytes_retrans = tp->bytes_retrans; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3063,6 +3065,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ 0; } @@ -3112,6 +3115,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, + TCP_NLA_PAD); return stats; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0c46c4402b4f..6e9bf2b142b3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2972,6 +2972,7 @@ start: if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); tp->total_retrans += segs; + tp->bytes_retrans += skb->len; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom From b755b6325aa93c197667d7a325163cf71d308bff Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:23 -0700 Subject: [PATCH 1562/1640] BACKPORT: tcp: add dsack blocks received stats Introduce a new TCP stat to record the number of DSACK blocks received (RFC4898 tcpEStatsStackDSACKDups) and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S.
Miller --- include/linux/tcp.h | 3 +++ include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 4 ++++ net/ipv4/tcp_input.c | 1 + 4 files changed, 10 insertions(+) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a9a651d80215..fa76c1113529 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -185,6 +185,9 @@ struct tcp_sock { * sum(delta(snd_una)), or how many bytes * were acked. */ + u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups + * total number of DSACK blocks received + */ u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 539bcaffbbc5..d6a35b4e0151 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -228,6 +228,7 @@ struct tcp_info { __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ __u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ + __u32 tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -252,6 +253,7 @@ enum { TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ TCP_NLA_BYTES_RETRANS, /* Data bytes retransmitted */ + TCP_NLA_DSACK_DUPS, /* DSACK blocks received */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 18d52f67a966..c944a7218008 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2456,6 +2456,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->bytes_retrans = 0; tp->data_segs_in = 0; tp->data_segs_out = 0; + tp->dsack_dups = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -3040,6 +3041,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_delivered_ce = tp->delivered_ce; info->tcpi_bytes_sent = tp->bytes_sent; info->tcpi_bytes_retrans = tp->bytes_retrans; + info->tcpi_dsack_dups = tp->dsack_dups; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3066,6 +3068,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ 0; } @@ -3117,6 +3120,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, TCP_NLA_PAD); + nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c2b45d5f0172..ffda99e89781 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -901,6 +901,7 @@ void tcp_disable_fack(struct tcp_sock *tp) static void tcp_dsack_seen(struct tcp_sock *tp) { tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; + tp->dsack_dups++; } static void tcp_update_reordering(struct sock *sk, const int metric, From c4de7462d3c5100e1072ce56eb59ec3bd8901a45 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 2 Jul 2019 09:13:58 -0700 Subject: [PATCH 1563/1640] UPSTREAM: bpf: add dsack_dups/delivered{, _ce} to bpf_tcp_sock Add more fields to bpf_tcp_sock that might be useful for debugging congestion control issues. 
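For example, a minimal sketch of a cgroup skb program that reads these fields (this assumes a libbpf-style build with bpf_helpers.h; the attach point, thresholds, and bpf_printk() logging are illustrative only, not part of this patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup_skb/egress")
int observe_tcp(struct __sk_buff *skb)
{
	struct bpf_sock *sk = skb->sk;
	struct bpf_tcp_sock *tp;

	if (!sk)
		return 1;
	tp = bpf_tcp_sock(sk);
	if (!tp)
		return 1;
	/* Flag connections where more than 1% of delivered packets were
	 * CE marked, or where DSACKs suggest spurious retransmissions.
	 */
	if (tp->delivered > 1000 &&
	    (tp->delivered_ce * 100 > tp->delivered || tp->dsack_dups > 10))
		bpf_printk("delivered=%u ce=%u dsack=%u",
			   tp->delivered, tp->delivered_ce, tp->dsack_dups);
	return 1; /* 1 == allow the packet */
}
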
Cc: Eric Dumazet Cc: Priyaranjan Jha Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Change-Id: I17a774bdb0e6b2f77f08ec80d5fcc1fba11cca95 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 5 +++++ net/core/filter.c | 11 ++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fde31ca2c233..82bf44aa2e73 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3012,6 +3012,11 @@ struct bpf_tcp_sock { * sum(delta(snd_una)), or how many bytes * were acked. */ + __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups + * total number of DSACK blocks received + */ + __u32 delivered; /* Total data packets delivered incl. rexmits */ + __u32 delivered_ce; /* Like the above but only ECE marked packets */ }; struct bpf_sock_tuple { diff --git a/net/core/filter.c b/net/core/filter.c index 56dcf795e2d6..71431013e3af 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5558,7 +5558,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked)) + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, delivered_ce)) return false; if (off % size != 0) @@ -5666,6 +5666,15 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_tcp_sock, bytes_acked): BPF_TCP_SOCK_GET_COMMON(bytes_acked); break; + case offsetof(struct bpf_tcp_sock, dsack_dups): + BPF_TCP_SOCK_GET_COMMON(dsack_dups); + break; + case offsetof(struct bpf_tcp_sock, delivered): + BPF_TCP_SOCK_GET_COMMON(delivered); + break; + case offsetof(struct bpf_tcp_sock, delivered_ce): + BPF_TCP_SOCK_GET_COMMON(delivered_ce); + break; } return insn - insn_buf; From 10f70742555a4e211c0dad45d87b71437af78212 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 2 Jul 2019 09:13:59 -0700 Subject: [PATCH 1564/1640] UPSTREAM: bpf: add icsk_retransmits to bpf_tcp_sock Add some inet_connection_sock fields to bpf_tcp_sock that might be useful for debugging congestion control issues. Cc: Eric Dumazet Cc: Priyaranjan Jha Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Change-Id: I94a94df91b77033bea7d1581b03273b778fd54e7 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 82bf44aa2e73..f771cf4fc8d9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3017,6 +3017,7 @@ struct bpf_tcp_sock { */ __u32 delivered; /* Total data packets delivered incl. 
rexmits */ __u32 delivered_ce; /* Like the above but only ECE marked packets */ + __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */ }; struct bpf_sock_tuple { diff --git a/net/core/filter.c b/net/core/filter.c index 71431013e3af..43ca92129605 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5558,7 +5558,8 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, delivered_ce)) + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, + icsk_retransmits)) return false; if (off % size != 0) @@ -5589,6 +5590,20 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct tcp_sock, FIELD)); \ } while (0) +#define BPF_INET_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock, \ + FIELD) > \ + FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct inet_connection_sock, \ + FIELD), \ + si->dst_reg, si->src_reg, \ + offsetof( \ + struct inet_connection_sock, \ + FIELD)); \ + } while (0) + if (insn > insn_buf) return insn - insn_buf; @@ -5675,6 +5690,9 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_tcp_sock, delivered_ce): BPF_TCP_SOCK_GET_COMMON(delivered_ce); break; + case offsetof(struct bpf_tcp_sock, icsk_retransmits): + BPF_INET_SOCK_GET_COMMON(icsk_retransmits); + break; } return insn - insn_buf; From 7dd7e7d04bfb94857cd3bfedc4fe316464422307 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Jul 2019 09:39:53 -0700 Subject: [PATCH 1565/1640] UPSTREAM: bpf: allow wide aligned loads for bpf_sock_addr user_ip6 and msg_src_ip6 Add explicit check for u64 loads of user_ip6 and msg_src_ip6 and update the comment. Cc: Yonghong Song Change-Id: Id82bfd77c5a0297ef3473fd2576d125baaed1b02 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f771cf4fc8d9..aa3a14b99b7f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3187,7 +3187,7 @@ struct bpf_sock_addr { __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __u32 user_port; /* Allows 4-byte read and write. @@ -3199,7 +3199,7 @@ struct bpf_sock_addr { __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __bpf_md_ptr(struct bpf_sock *, sk); diff --git a/net/core/filter.c b/net/core/filter.c index 43ca92129605..587abf4ff243 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6902,9 +6902,19 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): - /* Only narrow read access allowed for now. 
*/ if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { From d65114aea99d276e455abfc724282089789f625b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Jul 2019 03:15:37 -0700 Subject: [PATCH 1566/1640] UPSTREAM: bpf: fix access to skb_shared_info->gso_segs It is possible we reach bpf_convert_ctx_access() with si->dst_reg == si->src_reg. Therefore, we need to load BPF_REG_AX before eventually mangling si->src_reg. syzbot generated this x86 code: 3: 55 push %rbp 4: 48 89 e5 mov %rsp,%rbp 7: 48 81 ec 00 00 00 00 sub $0x0,%rsp // Might be avoided ? e: 53 push %rbx f: 41 55 push %r13 11: 41 56 push %r14 13: 41 57 push %r15 15: 6a 00 pushq $0x0 17: 31 c0 xor %eax,%eax 19: 48 8b bf c0 00 00 00 mov 0xc0(%rdi),%rdi 20: 44 8b 97 bc 00 00 00 mov 0xbc(%rdi),%r10d 27: 4c 01 d7 add %r10,%rdi 2a: 48 0f b7 7f 06 movzwq 0x6(%rdi),%rdi // Crash 2f: 5b pop %rbx 30: 41 5f pop %r15 32: 41 5e pop %r14 34: 41 5d pop %r13 36: 5b pop %rbx 37: c9 leaveq 38: c3 retq Fixes: d9ff286a0f59 ("bpf: allow BPF programs access skb_shared_info->gso_segs field") Change-Id: Ie2db59895e996ff250b89a6afd34a5af0902052c Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 587abf4ff243..e13b29ec6906 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7473,12 +7473,12 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, gso_segs): /* si->dst_reg = skb_shinfo(SKB); */ #ifdef NET_SKBUFF_DATA_USES_OFFSET - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), - si->dst_reg, si->src_reg, - offsetof(struct sk_buff, head)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, end)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, head)); + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); #else *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), From 04a96f1053a74de2c5b514b57b58401a48d2b1d2 Mon Sep 17 00:00:00 2001 From: Allan Zhang Date: Tue, 23 Jul 2019 17:07:24 -0700 Subject: [PATCH 1567/1640] UPSTREAM: bpf: Allow bpf_skb_event_output for a few prog types Software event output is only enabled by a few prog types right now (TC, LWT out, XDP, sockops). Many other skb-based prog types need bpf_skb_event_output to produce software events. Added the socket_filter, cg_skb, and sk_skb prog types so they can generate software events.
Test bpf code is generated from code snippet: struct TMP { uint64_t tmp; } tt; tt.tmp = 5; bpf_perf_event_output(skb, &connection_tracking_event_map, 0, &tt, sizeof(tt)); return 1; the bpf assembly from llvm is: 0: b7 02 00 00 05 00 00 00 r2 = 5 1: 7b 2a f8 ff 00 00 00 00 *(u64 *)(r10 - 8) = r2 2: bf a4 00 00 00 00 00 00 r4 = r10 3: 07 04 00 00 f8 ff ff ff r4 += -8 4: 18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0ll 6: b7 03 00 00 00 00 00 00 r3 = 0 7: b7 05 00 00 08 00 00 00 r5 = 8 8: 85 00 00 00 19 00 00 00 call 25 9: b7 00 00 00 01 00 00 00 r0 = 1 10: 95 00 00 00 00 00 00 00 exit Change-Id: Ia065a5505f625717e8e9db43e2e88e34d2b25d4f Signed-off-by: Allan Zhang Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index e13b29ec6906..a86d0fbfbe96 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6015,6 +6015,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; default: return bpf_base_func_proto(func_id); } @@ -6035,6 +6037,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; @@ -6285,6 +6289,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; From b7f122972745dd3abc6c2cb41a38b0dc0bc78339 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Mon, 29 Jul 2019 09:59:15 -0700 Subject: [PATCH 1568/1640] BACKPORT: bpf: add bpf_tcp_gen_syncookie helper This helper function allows BPF programs to try to generate SYN cookies, given a reference to a listener socket. The function works from XDP and with an skb context since bpf_skc_lookup_tcp can lookup a socket in both cases. Change-Id: Iac961811f33901dc0a63365669a79dcf2762fecf Signed-off-by: Petar Penkov Suggested-by: Eric Dumazet Reviewed-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 27 +++++++++++++++ net/core/filter.c | 73 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa3a14b99b7f..af8e0bc0a85e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2636,6 +2636,33 @@ union bpf_attr { * Return * 0 if iph and th are a valid SYN cookie ACK, or a negative error * otherwise. + * + * s64 bpf_tcp_gen_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ip6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header. 
+ * + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** SYN cookie cannot be issued due to error + * + * **-ENOENT** SYN cookie should not be issued (no SYN flood) + * + * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies + * + * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/net/core/filter.c b/net/core/filter.c index a86d0fbfbe96..ec4d1135c8e4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5864,6 +5864,75 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, + struct tcphdr *, th, u32, th_len) +{ +#ifdef CONFIG_SYN_COOKIES + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) + return -EINVAL; + + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + return -ENOENT; + + if (!th->syn || th->ack || th->fin || th->rst) + return -EINVAL; + + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). + */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && sk->sk_ipv6only) + return -EINVAL; + + mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); + break; + +#if IS_BUILTIN(CONFIG_IPV6) + case 6: + if (unlikely(iph_len < sizeof(struct ipv6hdr))) + return -EINVAL; + + if (sk->sk_family != AF_INET6) + return -EINVAL; + + mss = tcp_v6_get_syncookie(sk, iph, th, &cookie); + break; +#endif /* CONFIG_IPV6 */ + + default: + return -EPROTONOSUPPORT; + } + if (mss <= 0) + return -ENOENT; + + return cookie | ((u64)mss << 32); +#else + return -EOPNOTSUPP; +#endif /* CONFIG_SYN_COOKIES */ +} + +static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { + .func = bpf_tcp_gen_syncookie, + .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -6157,6 +6226,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; + case BPF_FUNC_tcp_gen_syncookie: + return &bpf_tcp_gen_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); @@ -6196,6 +6267,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; + case BPF_FUNC_tcp_gen_syncookie: + return &bpf_tcp_gen_syncookie_proto; #endif default: return bpf_base_func_proto(func_id); From d83664f1f24b52fa7abec1ab802a9fdf72813beb Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Tue, 27 Aug 2019 16:46:22 -0700 Subject: [PATCH 1569/1640] UPSTREAM: bpf: fix error check in bpf_tcp_gen_syncookie If a SYN cookie is not issued by tcp_v#_gen_syncookie, then the return value will be exactly 0, 
rather than <= 0. Let's change the check to reflect that, especially since mss is an unsigned value and cannot be negative. Fixes: 70d66244317e ("bpf: add bpf_tcp_gen_syncookie helper") Reported-by: Stanislav Fomichev Change-Id: Ic5f147125bd2478657ca6282cd2ebbac6136c0ca Signed-off-by: Petar Penkov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index ec4d1135c8e4..9f66cae7e5fb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5912,7 +5912,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, default: return -EPROTONOSUPPORT; } - if (mss <= 0) + if (mss == 0) return -ENOENT; return cookie | ((u64)mss << 32); From 6b0a56b8e91633e1bfac6b3324de40b613b6f9eb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Oct 2018 11:07:59 -0700 Subject: [PATCH 1570/1640] UPSTREAM: bpf: skmsg, improve sk_msg_used_element to work in cork context Currently sk_msg_elem_used() is only called in the zerocopy context, where cork is not possible, and if this case happens we fall back to copy mode. However, the helper is more useful if it works in all contexts. This patch resolves the case where end == head (indicating either a full or an empty ring), in which the helper always reported an empty ring. To fix this, add a test for the full ring case so that a full ring is no longer reported as having 0 elements. This additional functionality will be used in the next patches from the recvmsg context, where end == head with a full ring is a valid case. Change-Id: I9e250405527cea041a80f2ab222e938739db8046 Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 02af834a8084..bfca35d40bf3 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -188,18 +188,21 @@ static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) sk_msg_init(src); } -static inline u32 sk_msg_elem_used(const struct sk_msg *msg) -{ - return msg->sg.end >= msg->sg.start ? - msg->sg.end - msg->sg.start : - msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start); -} - static inline bool sk_msg_full(const struct sk_msg *msg) { return (msg->sg.end == msg->sg.start) && msg->sg.size; } +static inline u32 sk_msg_elem_used(const struct sk_msg *msg) +{ + if (sk_msg_full(msg)) + return MAX_MSG_FRAGS; + + return msg->sg.end >= msg->sg.start ? + msg->sg.end - msg->sg.start : + msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start); +} + static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) { return &msg->sg.data[which]; From 6785ebb3cabd714e7a38011172649d9ad17fb6f7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 4 Nov 2019 15:36:57 -0800 Subject: [PATCH 1571/1640] UPSTREAM: net/tls: fix sk_msg trim on fallback to copy mode sk_msg_trim() tries to only update the curr pointer if it falls into the trimmed region. The logic, however, does not take into account the pointer wrapping that sk_msg_iter_var_prev() does, nor (as John points out) the fact that msg->sg is a ring buffer. This means that when the message was trimmed completely, the new curr pointer would have the value of MAX_MSG_FRAGS - 1, which is neither smaller than any other value, nor would it actually be correct. Special case the trimming to 0 length a little bit and rework the comparison between curr and end to take into account wrapping.
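The distance helper this change introduces reads stand-alone; a short worked sketch, assuming the usual MAX_MSG_FRAGS == MAX_SKB_FRAGS == 17 (the exact value is configuration dependent):

static inline u32 sk_msg_iter_dist(u32 start, u32 end)
{
	/* Ring distance from start to end over MAX_MSG_FRAGS slots */
	return end >= start ? end - start
			    : end + (MAX_MSG_FRAGS - start);
}

/* sk_msg_iter_dist(3, 7)  == 4                   (no wrap)
 * sk_msg_iter_dist(15, 2) == 2 + (17 - 15) == 4  (wrapped)
 *
 * Comparing dist(start, curr) against dist(start, end), as the diff
 * below does, therefore stays correct after curr wraps, where a plain
 * curr >= i comparison would not.
 */
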
This bug caused the TLS code to not copy all of the message, if zero copy filled in fewer sg entries than memcopy would need. Big thanks to Alexander Potapenko for the non-KMSAN reproducer. v2: - take into account that msg->sg is a ring buffer (John). Link: https://lore.kernel.org/netdev/20191030160542.30295-1-jakub.kicinski@netronome.com/ (v1) Fixes: d829e9c4112b ("tls: convert to generic sk_msg interface") Reported-by: syzbot+f8495bff23a879a6d0bd@syzkaller.appspotmail.com Reported-by: syzbot+6f50c99e8f6194bf363f@syzkaller.appspotmail.com Co-developed-by: John Fastabend Change-Id: Ie79c921e9772e94fb9883c846d199d17c88a4bcb Signed-off-by: Jakub Kicinski Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/skmsg.h | 9 ++++++--- net/core/skmsg.c | 20 +++++++++++++++----- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index bfca35d40bf3..affe92b76527 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -139,6 +139,11 @@ static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) } } +static inline u32 sk_msg_iter_dist(u32 start, u32 end) +{ + return end >= start ? end - start : end + (MAX_MSG_FRAGS - start); +} + #define sk_msg_iter_var_prev(var) \ do { \ if (var == 0) \ @@ -198,9 +203,7 @@ static inline u32 sk_msg_elem_used(const struct sk_msg *msg) if (sk_msg_full(msg)) return MAX_MSG_FRAGS; - return msg->sg.end >= msg->sg.start ? - msg->sg.end - msg->sg.start : - msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start); + return sk_msg_iter_dist(msg->sg.start, msg->sg.end); } static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 56a99d0c9aa0..0bb575fe71c7 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -259,18 +259,28 @@ void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) msg->sg.data[i].length -= trim; sk_mem_uncharge(sk, trim); + /* Adjust copybreak if it falls into the trimmed part of last buf */ + if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length) + msg->sg.copybreak = msg->sg.data[i].length; out: - /* If we trim data before curr pointer update copybreak and current - * so that any future copy operations start at new copy location. + sk_msg_iter_var_next(i); + msg->sg.end = i; + + /* If we trim data a full sg elem before curr pointer update + * copybreak and current so that any future copy operations + * start at new copy location. * However trimed data that has not yet been used in a copy op * does not require an update. */ - if (msg->sg.curr >= i) { + if (!msg->sg.size) { + msg->sg.curr = msg->sg.start; + msg->sg.copybreak = 0; + } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >= + sk_msg_iter_dist(msg->sg.start, msg->sg.end)) { + sk_msg_iter_var_prev(i); msg->sg.curr = i; msg->sg.copybreak = msg->sg.data[i].length; } - sk_msg_iter_var_next(i); - msg->sg.end = i; } EXPORT_SYMBOL_GPL(sk_msg_trim); From 8151bf4e4132a5ebe4b7e6a5c90c77bfe171bd03 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Nov 2019 12:16:41 -0800 Subject: [PATCH 1572/1640] BACKPORT: net: skmsg: fix TLS 1.3 crash with full sk_msg [ Upstream commit 031097d9e079e40dce401031d1012e83d80eaf01 ] TLS 1.3 started using the entry at the end of the SG array for chaining-in the single byte content type entry. This mostly works: [ E E E E E E . . ] ^ ^ start end E < content type / [ E E E E E E C . ] ^ ^ start end (Where E denotes a populated SG entry; C denotes a chaining entry.) 
If the array is full, however, the end will point to the start: [ E E E E E E E E ] ^ start end And we end up overwriting the start: E < content type / [ C E E E E E E E ] ^ start end The sg array is supposed to be a circular buffer with start and end markers pointing anywhere. In the case where start > end (i.e. the circular buffer has "wrapped"), there is an extra entry reserved at the end to chain the two halves together. [ E E E E E E . . l ] (Where l is the reserved entry for "looping" back to front.) As suggested by John, let's reserve another entry for chaining SG entries after the main circular buffer. Note that this entry has to be pointed to by the end entry so its position is not fixed. Examples of full messages: [ E E E E E E E E . l ] ^ ^ start end <---------------. [ E E . E E E E E E l ] ^ ^ end start Now the end will always point to an unused entry, so TLS 1.3 can always use it. Fixes: 130b392c6cd6 ("net: tls: Add tls 1.3 support") Change-Id: I5002190b9e9729b2e2157b3f651602168fd965af Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/skmsg.h | 26 +++++++++++++------------- net/core/filter.c | 8 ++++---- net/core/skmsg.c | 2 +- net/ipv4/tcp_bpf.c | 2 +- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index affe92b76527..17beb72ab34c 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -14,6 +14,7 @@ #include #define MAX_MSG_FRAGS MAX_SKB_FRAGS +#define NR_MSG_FRAG_IDS (MAX_MSG_FRAGS + 1) enum __sk_action { __SK_DROP = 0, @@ -29,11 +30,13 @@ struct sk_msg_sg { u32 size; u32 copybreak; bool copy[MAX_MSG_FRAGS]; - /* The extra element is used for chaining the front and sections when - * the list becomes partitioned (e.g. end < start). The crypto APIs - * require the chaining. + /* The extra two elements: + * 1) used for chaining the front and sections when the list becomes + * partitioned (e.g. end < start). The crypto APIs require the + * chaining; + * 2) to chain tailer SG entries after the message. */ - struct scatterlist data[MAX_MSG_FRAGS + 1]; + struct scatterlist data[MAX_MSG_FRAGS + 2]; }; /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ @@ -141,13 +144,13 @@ static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) static inline u32 sk_msg_iter_dist(u32 start, u32 end) { - return end >= start ? end - start : end + (MAX_MSG_FRAGS - start); + return end >= start ? 
end - start : end + (NR_MSG_FRAG_IDS - start); } #define sk_msg_iter_var_prev(var) \ do { \ if (var == 0) \ - var = MAX_MSG_FRAGS - 1; \ + var = NR_MSG_FRAG_IDS - 1; \ else \ var--; \ } while (0) @@ -155,7 +158,7 @@ static inline u32 sk_msg_iter_dist(u32 start, u32 end) #define sk_msg_iter_var_next(var) \ do { \ var++; \ - if (var == MAX_MSG_FRAGS) \ + if (var == NR_MSG_FRAG_IDS) \ var = 0; \ } while (0) @@ -172,9 +175,9 @@ static inline void sk_msg_clear_meta(struct sk_msg *msg) static inline void sk_msg_init(struct sk_msg *msg) { - BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != MAX_MSG_FRAGS); + BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); memset(msg, 0, sizeof(*msg)); - sg_init_marker(msg->sg.data, MAX_MSG_FRAGS); + sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); } static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, @@ -195,14 +198,11 @@ static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) static inline bool sk_msg_full(const struct sk_msg *msg) { - return (msg->sg.end == msg->sg.start) && msg->sg.size; + return sk_msg_iter_dist(msg->sg.start, msg->sg.end) == MAX_MSG_FRAGS; } static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { - if (sk_msg_full(msg)) - return MAX_MSG_FRAGS; - return sk_msg_iter_dist(msg->sg.start, msg->sg.end); } diff --git a/net/core/filter.c b/net/core/filter.c index 9f66cae7e5fb..3b86851c7a58 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2312,7 +2312,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, WARN_ON_ONCE(last_sge == first_sge); shift = last_sge > first_sge ? last_sge - first_sge - 1 : - MAX_SKB_FRAGS - first_sge + last_sge - 1; + NR_MSG_FRAG_IDS - first_sge + last_sge - 1; if (!shift) goto out; @@ -2321,8 +2321,8 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, do { u32 move_from; - if (i + shift >= MAX_MSG_FRAGS) - move_from = i + shift - MAX_MSG_FRAGS; + if (i + shift >= NR_MSG_FRAG_IDS) + move_from = i + shift - NR_MSG_FRAG_IDS; else move_from = i + shift; if (move_from == msg->sg.end) @@ -2336,7 +2336,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, } while (1); msg->sg.end = msg->sg.end - shift > msg->sg.end ? - msg->sg.end - shift + MAX_MSG_FRAGS : + msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; out: msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 0bb575fe71c7..aad5fbe4933d 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -409,7 +409,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) sk_mem_charge(sk, skb->len); copied = skb->len; msg->sg.start = 0; - msg->sg.end = num_sge == MAX_MSG_FRAGS ? 
0 : num_sge; + msg->sg.end = num_sge; msg->skb = skb; sk_psock_queue_msg(psock, msg); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 769002716650..52ccefc77e4e 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -286,7 +286,7 @@ EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { - bool cork = false, enospc = msg->sg.start == msg->sg.end; + bool cork = false, enospc = sk_msg_full(msg); struct sock *sk_redir; u32 tosend, delta = 0; int ret; From 8c49f9a4ac047abbfc998efa52ceaa8970d4d22e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 11 Jan 2020 06:12:02 +0000 Subject: [PATCH 1573/1640] UPSTREAM: bpf: Sockmap, skmsg helper overestimates push, pull, and pop bounds commit 6562e29cf6f0ddd368657d97a8d484ffc30df5ef upstream. In the push, pull, and pop helpers operating on skmsg objects to make data writable or insert/remove data we use this bounds check to ensure specified data is valid, /* Bounds checks: start and pop must be inside message */ if (start >= offset + l || last >= msg->sg.size) return -EINVAL; The problem here is offset has already included the length of the current element the 'l' above. So start could be past the end of the scatterlist element in the case where start also points into an offset on the last skmsg element. To fix do the accounting slightly different by adding the length of the previous entry to offset at the start of the iteration. And ensure its initialized to zero so that the first iteration does nothing. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Fixes: 6fff607e2f14b ("bpf: sk_msg program helper bpf_msg_push_data") Fixes: 7246d8ed4dcce ("bpf: helper to pop data from messages") Change-Id: Ib9f535cc1d4bbe2750a6cde73190f8eadc74ab0e Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Song Liu Cc: stable@vger.kernel.org Link: https://lore.kernel.org/bpf/20200111061206.8028-5-john.fastabend@gmail.com Signed-off-by: Greg Kroah-Hartman --- net/core/filter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 3b86851c7a58..e6f5d3d8b90c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2243,10 +2243,10 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, /* First find the starting scatterlist element */ i = msg->sg.start; do { + offset += len; len = sk_msg_elem(msg, i)->length; if (start < offset + len) break; - offset += len; sk_msg_iter_var_next(i); } while (i != msg->sg.end); @@ -2358,7 +2358,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; - u32 new, i = 0, l, space, copy = 0, offset = 0; + u32 new, i = 0, l = 0, space, copy = 0, offset = 0; u8 *raw, *to, *from; struct page *page; @@ -2368,11 +2368,11 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* First find the starting scatterlist element */ i = msg->sg.start; do { + offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; - offset += l; sk_msg_iter_var_next(i); } while (i != msg->sg.end); @@ -2518,7 +2518,7 @@ static void sk_msg_shift_right(struct sk_msg *msg, int i) BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { - u32 i = 0, l, space, offset = 0; + u32 i = 0, l = 0, space, offset = 0; u64 last = start + len; int pop; @@ -2528,11 +2528,11 @@ 
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, /* First find the starting scatterlist element */ i = msg->sg.start; do { + offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; - offset += l; sk_msg_iter_var_next(i); } while (i != msg->sg.end); From 315609ddb0c00e8631a42f4167f9c5d26ca21f2b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 11 Jan 2020 06:12:03 +0000 Subject: [PATCH 1574/1640] UPSTREAM: bpf: Sockmap/tls, msg_push_data may leave end mark in place commit cf21e9ba1eb86c9333ca5b05b2f1cc94021bcaef upstream. Leaving an incorrect end mark in place when passing to crypto layer will cause crypto layer to stop processing data before all data is encrypted. To fix clear the end mark on push data instead of expecting users of the helper to clear the mark value after the fact. This happens when we push data into the middle of a skmsg and have room for it so we don't do a set of copies that already clear the end flag. Fixes: 6fff607e2f14b ("bpf: sk_msg program helper bpf_msg_push_data") Change-Id: Ie178339f56ffecb4c1a804cb1d2b830ef591f88e Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Song Liu Cc: stable@vger.kernel.org Link: https://lore.kernel.org/bpf/20200111061206.8028-6-john.fastabend@gmail.com Signed-off-by: Greg Kroah-Hartman --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/filter.c b/net/core/filter.c index e6f5d3d8b90c..8892373694d8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2427,6 +2427,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); sg_unmark_end(psge); + sg_unmark_end(&rsge); sk_msg_iter_next(msg, end); } From 1df75d3c7904cfb42da330e0e7ad2eaeaa03a8c7 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 10 Jan 2020 13:23:36 +0000 Subject: [PATCH 1575/1640] UPSTREAM: net: bpf: Don't leak time wait and request sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2e012c74823629d9db27963c79caa3f5b2010746 upstream. It's possible to leak time wait and request sockets via the following BPF pseudo code:   sk = bpf_skc_lookup_tcp(...) if (sk) bpf_sk_release(sk) If sk->sk_state is TCP_NEW_SYN_RECV or TCP_TIME_WAIT the refcount taken by bpf_skc_lookup_tcp is not undone by bpf_sk_release. This is because sk_flags is re-used for other data in both kinds of sockets. The check !sock_flag(sk, SOCK_RCU_FREE) therefore returns a bogus result. Check that sk_flags is valid by calling sk_fullsock. Skip checking SOCK_RCU_FREE if we already know that sk is not a full socket. 
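For illustration, the fix leans on C's short-circuit evaluation:
sock_flag() reads sk->sk_flags, which only full sockets carry, so the
sk_fullsock() test must run first. A minimal sketch of the release
logic, mirroring the hunk below:

  if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
          sock_gen_put(sk); /* tw/req sockets always drop their ref here */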
Fixes: edbf8c01de5a ("bpf: add skc_lookup_tcp helper") Fixes: f7355a6c0497 ("bpf: Check sk_fullsock() before returning from bpf_sk_lookup()") Change-Id: I1c87c4f79d83f42de028f3d6a7f2343469e2e96f Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200110132336.26099-1-lmb@cloudflare.com Signed-off-by: Greg Kroah-Hartman --- net/core/filter.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 8892373694d8..1f8afb694319 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5312,8 +5312,7 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, if (sk) { sk = sk_to_full_sk(sk); if (!sk_fullsock(sk)) { - if (!sock_flag(sk, SOCK_RCU_FREE)) - sock_gen_put(sk); + sock_gen_put(sk); return NULL; } } @@ -5350,8 +5349,7 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, if (sk) { sk = sk_to_full_sk(sk); if (!sk_fullsock(sk)) { - if (!sock_flag(sk, SOCK_RCU_FREE)) - sock_gen_put(sk); + sock_gen_put(sk); return NULL; } } @@ -5418,7 +5416,8 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { BPF_CALL_1(bpf_sk_release, struct sock *, sk) { - if (!sock_flag(sk, SOCK_RCU_FREE)) + /* Only full sockets have sk->sk_flags. */ + if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE)) sock_gen_put(sk); return 0; } From 819aa252022da80b54c4280b0b0afbc3947c9e3f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 4 May 2020 10:21:23 -0700 Subject: [PATCH 1576/1640] UPSTREAM: bpf, sockmap: msg_pop_data can incorrecty set an sge length [ Upstream commit 3e104c23816220919ea1b3fd93fabe363c67c484 ] When sk_msg_pop() is called where the pop operation is working on the end of a sge element and there is no additional trailing data and there _is_ data in front of pop, like the following case, |____________a_____________|__pop__| We have out of order operations where we incorrectly set the pop variable so that instead of zero'ing pop we incorrectly leave it untouched, effectively. This can cause later logic to shift the buffers around believing it should pop extra space. The result is we have 'popped' more data then we expected potentially breaking program logic. It took us a while to hit this case because typically we pop headers which seem to rarely be at the end of a scatterlist elements but we can't rely on this. Fixes: 7246d8ed4dcce ("bpf: helper to pop data from messages") Change-Id: I826466936162e6dd5c5427efb3e75b7250073109 Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/158861288359.14306.7654891716919968144.stgit@john-Precision-5820-Tower Signed-off-by: Sasha Levin --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 1f8afb694319..10feac8c2afb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2602,8 +2602,8 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, } pop = 0; } else if (pop >= sge->length - a) { - sge->length = a; pop -= (sge->length - a); + sge->length = a; } } From 60424314a826107b628c1f74210de9a2ece1f9b6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 3 Mar 2020 15:05:01 -0500 Subject: [PATCH 1577/1640] UPSTREAM: bpf: Add gso_size to __sk_buff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF programs may want to know whether an skb is gso. 
The canonical answer is skb_is_gso(skb), which tests that gso_size != 0. Expose this field in the same manner as gso_segs. That field itself is not a sufficient signal, as the comment in skb_shared_info makes clear: gso_segs may be zero, e.g., from dodgy sources. Also prepare net/bpf/test_run for upcoming BPF_PROG_TEST_RUN tests of the feature. Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200303200503.226217-2-willemdebruijn.kernel@gmail.com Note: backported without changes to net/bpf/test_run.c (cherry picked from commit cf62089b0edd7e74a1f474844b4d9f7b5697fb5c) Signed-off-by: Maciej Żenczykowski Change-Id: I1f7d1b49e5ac35f18546d468e3847deaae5056ca --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 44 +++++++++++++++++++++++++++------------- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index af8e0bc0a85e..a12b69c83213 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2933,6 +2933,7 @@ struct __sk_buff { __u32 wire_len; __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); + __u32 gso_size; }; struct bpf_tunnel_key { diff --git a/net/core/filter.c b/net/core/filter.c index 10feac8c2afb..744ca171ab95 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7228,6 +7228,27 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb_shinfo(SKB); */ +#ifdef NET_SKBUFF_DATA_USES_OFFSET + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, end)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, head)); + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); +#else + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, end)); +#endif + + return insn; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -7550,26 +7571,21 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, gso_segs): - /* si->dst_reg = skb_shinfo(SKB); */ -#ifdef NET_SKBUFF_DATA_USES_OFFSET - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), - BPF_REG_AX, si->src_reg, - offsetof(struct sk_buff, end)); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), - si->dst_reg, si->src_reg, - offsetof(struct sk_buff, head)); - *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); -#else - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), - si->dst_reg, si->src_reg, - offsetof(struct sk_buff, end)); -#endif + insn = bpf_convert_shinfo_access(si, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_segs, 2, target_size)); break; + case offsetof(struct __sk_buff, gso_size): + insn = bpf_convert_shinfo_access(si, insn); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + gso_size, 2, + target_size)); + break; case offsetof(struct __sk_buff, wire_len): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); From 0fb32c3f876e3b5e53435ff7a07b8c7d4a1644bb Mon Sep 17 00:00:00 2001 
From: John Fastabend Date: Tue, 11 Aug 2020 15:04:56 -0700 Subject: [PATCH 1578/1640] UPSTREAM: bpf: sock_ops sk access may stomp registers when dst_reg = src_reg [ Upstream commit 84f44df664e9f0e261157e16ee1acd77cc1bb78d ] Similar to patch ("bpf: sock_ops ctx access may stomp registers") if the src_reg = dst_reg when reading the sk field of a sock_ops struct we generate xlated code, 53: (61) r9 = *(u32 *)(r9 +28) 54: (15) if r9 == 0x0 goto pc+3 56: (79) r9 = *(u64 *)(r9 +0) This stomps on the r9 reg to do the sk_fullsock check and then when reading the skops->sk field instead of the sk pointer we get the sk_fullsock. To fix use similar pattern noted in the previous fix and use the temp field to save/restore a register used to do sk_fullsock check. After the fix the generated xlated code reads, 52: (7b) *(u64 *)(r9 +32) = r8 53: (61) r8 = *(u32 *)(r9 +28) 54: (15) if r9 == 0x0 goto pc+3 55: (79) r8 = *(u64 *)(r9 +32) 56: (79) r9 = *(u64 *)(r9 +0) 57: (05) goto pc+1 58: (79) r8 = *(u64 *)(r9 +32) Here r9 register was in-use so r8 is chosen as the temporary register. In line 52 r8 is saved in temp variable and at line 54 restored in case fullsock != 0. Finally we handle fullsock == 0 case by restoring at line 58. This adds a new macro SOCK_OPS_GET_SK it is almost possible to merge this with SOCK_OPS_GET_FIELD, but I found the extra branch logic a bit more confusing than just adding a new macro despite a bit of duplicating code. Fixes: 1314ef561102e ("bpf: export bpf_sock for BPF_PROG_TYPE_SOCK_OPS prog type") Change-Id: I93fc90e04eeae1e80735c96f7fdfe0091219d2a9 Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/159718349653.4728.6559437186853473612.stgit@john-Precision-5820-Tower Signed-off-by: Sasha Levin --- net/core/filter.c | 49 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 744ca171ab95..1bd76782e0e1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8030,6 +8030,43 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, offsetof(OBJ, OBJ_FIELD)); \ } while (0) +#define SOCK_OPS_GET_SK() \ + do { \ + int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + fullsock_reg = reg; \ + jmp += 2; \ + } \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + fullsock_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ + if (si->dst_reg == si->src_reg) \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_JMP_A(1); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } \ + } while (0) + #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) @@ -8314,17 +8351,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, 
SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); break; case offsetof(struct bpf_sock_ops, sk): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct bpf_sock_ops_kern, - is_fullsock), - si->dst_reg, si->src_reg, - offsetof(struct bpf_sock_ops_kern, - is_fullsock)); - *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( - struct bpf_sock_ops_kern, sk), - si->dst_reg, si->src_reg, - offsetof(struct bpf_sock_ops_kern, sk)); + SOCK_OPS_GET_SK(); break; } return insn - insn_buf; From c7f13728453748bdfd046d2b60bace48a0d7ac81 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Jan 2018 12:53:20 -0800 Subject: [PATCH 1579/1640] UPSTREAM: stddef.h: Introduce sizeof_field() The size of fields within a structure is needed in a few places in the kernel already, and will be needed for the usercopy whitelisting when declaring whitelist regions within structures. This creates a dedicated macro and redefines offsetofend() to use it. Existing usage, ignoring the 1200+ lustre assert uses: $ git grep -E 'sizeof\(\(\((struct )?[a-zA-Z_]+ \*\)0\)->' | \ grep -v staging/lustre | wc -l 65 Signed-off-by: Kees Cook --- include/linux/stddef.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/stddef.h b/include/linux/stddef.h index 2181719fd907..998a4ba28eba 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -19,6 +19,14 @@ enum { #define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) #endif +/** + * sizeof_field(TYPE, MEMBER) + * + * @TYPE: The structure containing the field of interest + * @MEMBER: The field to return the size of + */ +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) + /** * offsetofend(TYPE, MEMBER) * @@ -26,6 +34,6 @@ enum { * @MEMBER: The member within the structure to get the end offset of */ #define offsetofend(TYPE, MEMBER) \ - (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) + (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) #endif From eda40e58c6b31c1f1a66ace2fd671141c7a5e14b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:35 -0700 Subject: [PATCH 1580/1640] UPSTREAM: bpf: sockmap, strparser, and tls are reusing qdisc_skb_cb and colliding [ Upstream commit e0dc3b93bd7bcff8c3813d1df43e0908499c7cf0 ] Strparser is reusing the qdisc_skb_cb struct to stash the skb message handling progress, e.g. offset and length of the skb. First this is poorly named and inherits a struct from qdisc that doesn't reflect the actual usage of cb[] at this layer. But, more importantly strparser is using the following to access its metadata. (struct _strp_msg *)((void *)skb->cb + offsetof(struct qdisc_skb_cb, data)) Where _strp_msg is defined as: struct _strp_msg { struct strp_msg strp; /* 0 8 */ int accum_len; /* 8 4 */ /* size: 12, cachelines: 1, members: 2 */ /* last cacheline: 12 bytes */ }; So we use 12 bytes of ->data[] in struct. However in BPF code running parser and verdict the user has read capabilities into the data[] array as well. Its not too problematic, but we should not be exposing internal state to BPF program. If its really needed then we can use the probe_read() APIs which allow reading kernel memory. And I don't believe cb[] layer poses any API breakage by moving this around because programs can't depend on cb[] across layers. In order to fix another issue with a ctx rewrite we need to stash a temp variable somewhere. To make this work cleanly this patch builds a cb struct for sk_skb types called sk_skb_cb struct. 
Then we can use this consistently in the strparser, sockmap space. Additionally we can start allowing ->cb[] write access after this. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Change-Id: I2512e377df9d6f73136b43e59c364c6c87be5d1d Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Jussi Maki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-5-john.fastabend@gmail.com Signed-off-by: Sasha Levin --- include/net/strparser.h | 16 +++++++++++++++- net/core/filter.c | 21 +++++++++++++++++++++ net/strparser/strparser.c | 10 +--------- 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/include/net/strparser.h b/include/net/strparser.h index f177c87ce38b..ef0822d110a8 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -57,10 +57,24 @@ struct strp_msg { int offset; }; +struct _strp_msg { + /* Internal cb structure. struct strp_msg must be first for passing + * to upper layer. + */ + struct strp_msg strp; + int accum_len; +}; + +struct sk_skb_cb { +#define SK_SKB_CB_PRIV_LEN 20 + unsigned char data[SK_SKB_CB_PRIV_LEN]; + struct _strp_msg strp; +}; + static inline struct strp_msg *strp_msg(struct sk_buff *skb) { return (struct strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Structure for an attached lower socket */ diff --git a/net/core/filter.c b/net/core/filter.c index 1bd76782e0e1..49001341273d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8374,6 +8374,27 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; + case offsetof(struct __sk_buff, cb[0]) ... + offsetofend(struct __sk_buff, cb[4]) - 1: + BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, data)) % + sizeof(__u64)); + + prog->cb_access = 1; + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct sk_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + else + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + break; + + default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 332eb0df153f..32288e67b22f 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -29,18 +29,10 @@ static struct workqueue_struct *strp_wq; -struct _strp_msg { - /* Internal cb structure. struct strp_msg must be first for passing - * to upper layer. - */ - struct strp_msg strp; - int accum_len; -}; - static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) { return (struct _strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Lower lock held */ From cc1fcd3cb9f53e63aa1369211aea05a344a9cafe Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jan 2022 10:31:48 +0900 Subject: [PATCH 1581/1640] UPSTREAM: bpf: Fix SO_RCVBUF/SO_SNDBUF handling in _bpf_setsockopt(). [ Upstream commit 04c350b1ae6bdb12b84009a4d0bf5ab4e621c47b ] The commit 4057765f2dee ("sock: consistent handling of extreme SO_SNDBUF/SO_RCVBUF values") added a change to prevent underflow in setsockopt() around SO_SNDBUF/SO_RCVBUF. This patch adds the same change to _bpf_setsockopt(). 
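For illustration, the clamp protects the doubling performed just below
it; the kernel builds with -fno-strict-overflow, so without the clamp a
large value wraps negative (sketch, values illustrative):

  int val = INT_MAX;      /* extreme value supplied by the BPF program */
  int buf = val * 2;      /* wraps to -2 without the clamp */

  val = min_t(int, val, INT_MAX / 2);
  buf = max_t(int, val * 2, SOCK_MIN_RCVBUF); /* stays positive */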
Fixes: 4057765f2dee ("sock: consistent handling of extreme SO_SNDBUF/SO_RCVBUF values") Change-Id: I797792499bbaee6cc5058519bfb6e33633d76af5 Signed-off-by: Kuniyuki Iwashima Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220104013153.97906-2-kuniyu@amazon.co.jp Signed-off-by: Sasha Levin --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 49001341273d..ac4176438792 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4261,11 +4261,13 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, switch (optname) { case SO_RCVBUF: val = min_t(u32, val, sysctl_rmem_max); + val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); break; case SO_SNDBUF: val = min_t(u32, val, sysctl_wmem_max); + val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); break; From cd050eea2d14cd2eedeaca9544f92ab733443062 Mon Sep 17 00:00:00 2001 From: Felix Maurer Date: Wed, 9 Feb 2022 16:55:26 +0100 Subject: [PATCH 1582/1640] UPSTREAM: bpf: Do not try bpf_msg_push_data with len 0 commit 4a11678f683814df82fca9018d964771e02d7e6d upstream. If bpf_msg_push_data() is called with len 0 (as it happens during selftests/bpf/test_sockmap), we do not need to do anything and can return early. Calling bpf_msg_push_data() with len 0 previously lead to a wrong ENOMEM error: we later called get_order(copy + len); if len was 0, copy + len was also often 0 and get_order() returned some undefined value (at the moment 52). alloc_pages() caught that and failed, but then bpf_msg_push_data() returned ENOMEM. This was wrong because we are most probably not out of memory and actually do not need any additional memory. Fixes: 6fff607e2f14b ("bpf: sk_msg program helper bpf_msg_push_data") Change-Id: I7d37f4ebfd344bb8c372a3376f33e02afef024e9 Signed-off-by: Felix Maurer Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/df69012695c7094ccb1943ca02b4920db3537466.1644421921.git.fmaurer@redhat.com Signed-off-by: Greg Kroah-Hartman --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index ac4176438792..182794104772 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2526,6 +2526,9 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, if (unlikely(flags)) return -EINVAL; + if (unlikely(len == 0)) + return 0; + /* First find the starting scatterlist element */ i = msg->sg.start; do { From 08c41d70f6bf0fd8bf09c4dbe301a1e178b84ca9 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 30 Jan 2022 12:55:17 +0100 Subject: [PATCH 1583/1640] UPSTREAM: bpf: Make dst_port field in struct bpf_sock 16-bit wide [ Upstream commit 4421a582718ab81608d8486734c18083b822390d ] Menglong Dong reports that the documentation for the dst_port field in struct bpf_sock is inaccurate and confusing. From the BPF program PoV, the field is a zero-padded 16-bit integer in network byte order. The value appears to the BPF user as if laid out in memory as so: offsetof(struct bpf_sock, dst_port) + 0 + 8 +16 0x00 +24 0x00 32-, 16-, and 8-bit wide loads from the field are all allowed, but only if the offset into the field is 0. 32-bit wide loads from dst_port are especially confusing. 
The loaded value, after converting to host byte order with bpf_ntohl(dst_port), contains the port number in the upper 16-bits. Remove the confusion by splitting the field into two 16-bit fields. For backward compatibility, allow 32-bit wide loads from offsetof(struct bpf_sock, dst_port). While at it, allow loads 8-bit loads at offset [0] and [1] from dst_port. Reported-by: Menglong Dong Change-Id: Id86817d538b4f552ca112639c0a40fb2d8bd9eb9 Signed-off-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20220130115518.213259-2-jakub@cloudflare.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a12b69c83213..a5cade22fbde 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2997,7 +2997,8 @@ struct bpf_sock { __u32 src_ip4; __u32 src_ip6[4]; __u32 src_port; /* host byte order */ - __u32 dst_port; /* network byte order */ + __be16 dst_port; /* network byte order */ + __u16 :16; /* zero padding */ __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; diff --git a/net/core/filter.c b/net/core/filter.c index 182794104772..474538d97dd5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6710,6 +6710,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); + int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; @@ -6721,7 +6722,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): - case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): @@ -6729,6 +6729,14 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case bpf_ctx_range(struct bpf_sock, dst_port): + field_size = size == size_default ? + size_default : sizeof_field(struct bpf_sock, dst_port); + bpf_ctx_record_field_size(info, field_size); + return bpf_ctx_narrow_access_ok(off, size, field_size); + case offsetofend(struct bpf_sock, dst_port) ... + offsetof(struct bpf_sock, dst_ip4) - 1: + return false; } return size == size_default; From 720ed111c28ab8b58e0a0e2ff761bd0efb4d3137 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 6 Apr 2022 15:41:12 +0300 Subject: [PATCH 1584/1640] UPSTREAM: bpf: Support dual-stack sockets in bpf_tcp_check_syncookie [ Upstream commit 2e8702cc0cfa1080f29fd64003c00a3e24ac38de ] bpf_tcp_gen_syncookie looks at the IP version in the IP header and validates the address family of the socket. It supports IPv4 packets in AF_INET6 dual-stack sockets. On the other hand, bpf_tcp_check_syncookie looks only at the address family of the socket, ignoring the real IP version in headers, and validates only the packet size. This implementation has some drawbacks: 1. Packets are not validated properly, allowing a BPF program to trick bpf_tcp_check_syncookie into handling an IPv6 packet on an IPv4 socket. 2. Dual-stack sockets fail the checks on IPv4 packets. 
IPv4 clients end up receiving a SYNACK with the cookie, but the following ACK gets dropped. This patch fixes these issues by changing the checks in bpf_tcp_check_syncookie to match the ones in bpf_tcp_gen_syncookie. IP version from the header is taken into account, and it is validated properly with address family. Fixes: 399040847084 ("bpf: add helper to check for a valid SYN cookie") Change-Id: Id130e436818a58b9775e61f9d11a029561b1e7a4 Signed-off-by: Maxim Mikityanskiy Signed-off-by: Alexei Starovoitov Reviewed-by: Tariq Toukan Acked-by: Arthur Fabre Link: https://lore.kernel.org/bpf/20220406124113.2795730-1-maximmi@nvidia.com Signed-off-by: Sasha Levin --- net/core/filter.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 474538d97dd5..b90490fcfea7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5822,24 +5822,33 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (!th->ack || th->rst || th->syn) return -ENOENT; + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; cookie = ntohl(th->ack_seq) - 1; - switch (sk->sk_family) { - case AF_INET: - if (unlikely(iph_len < sizeof(struct iphdr))) + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). + */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); break; #if IS_BUILTIN(CONFIG_IPV6) - case AF_INET6: + case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; + if (sk->sk_family != AF_INET6) + return -EINVAL; + ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); break; #endif /* CONFIG_IPV6 */ From 8d159f28e5acbeeb25ddc11a3f06d3da0c3ab4ab Mon Sep 17 00:00:00 2001 From: Jon Maxwell Date: Wed, 15 Jun 2022 11:15:40 +1000 Subject: [PATCH 1585/1640] UPSTREAM: bpf: Fix request_sock leak in sk lookup helpers [ Upstream commit 3046a827316c0e55fc563b4fb78c93b9ca5c7c37 ] A customer reported a request_socket leak in a Calico cloud environment. We found that a BPF program was doing a socket lookup with takes a refcnt on the socket and that it was finding the request_socket but returning the parent LISTEN socket via sk_to_full_sk() without decrementing the child request socket 1st, resulting in request_sock slab object leak. This patch retains the existing behaviour of returning full socks to the caller but it also decrements the child request_socket if one is present before doing so to prevent the leak. Thanks to Curtis Taylor for all the help in diagnosing and testing this. And thanks to Antoine Tenart for the reproducer and patch input. v2 of this patch contains, refactor as per Daniel Borkmann's suggestions to validate RCU flags on the listen socket so that it balances with bpf_sk_release() and update comments as per Martin KaFai Lau's suggestion. One small change to Daniels suggestion, put "sk = sk2" under "if (sk2 != sk)" to avoid an extra instruction. 
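For reference, a minimal BPF-side sketch of the pattern that exercised
the leak; the lookup may land on a request_sock and hand back its full
listener, so the kernel-side release has to stay balanced with that
(tuple setup elided):

  struct bpf_sock_tuple tuple = {}; /* filled from the packet headers */
  struct bpf_sock *sk;

  sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                         BPF_F_CURRENT_NETNS, 0);
  if (sk)
          bpf_sk_release(sk); /* must also drop the request_sock ref */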
Fixes: f7355a6c0497 ("bpf: Check sk_fullsock() before returning from bpf_sk_lookup()") Fixes: edbf8c01de5a ("bpf: add skc_lookup_tcp helper") Co-developed-by: Antoine Tenart Change-Id: I4867220345184d2c036b76988b3267f5bc6aeb63 Signed-off-by: Antoine Tenart Signed-off-by: Jon Maxwell Signed-off-by: Daniel Borkmann Tested-by: Curtis Taylor Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/56d6f898-bde0-bb25-3427-12a330b29fb8@iogearbox.net Link: https://lore.kernel.org/bpf/20220615011540.813025-1-jmaxwell37@gmail.com Signed-off-by: Sasha Levin --- net/core/filter.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index b90490fcfea7..47427be373fc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5315,10 +5315,21 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, ifindex, proto, netns_id, flags); if (sk) { - sk = sk_to_full_sk(sk); - if (!sk_fullsock(sk)) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { sock_gen_put(sk); - return NULL; + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; } } @@ -5352,10 +5363,21 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, flags); if (sk) { - sk = sk_to_full_sk(sk); - if (!sk_fullsock(sk)) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { sock_gen_put(sk); - return NULL; + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; } } From 913282da57d8c335725ae258e1cf9fe941d54cc6 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 21 Nov 2022 10:03:39 -0800 Subject: [PATCH 1586/1640] UPSTREAM: bpf: Move skb->len == 0 checks into __bpf_redirect [ Upstream commit 114039b342014680911c35bd6b72624180fd669a ] To avoid potentially breaking existing users. Both mac/no-mac cases have to be amended; mac_header >= network_header is not enough (verified with a new test, see next patch). 
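For illustration, the no-mac path makes the problem easy to see: it
pulls the network offset before transmitting, so a zero-length or
truncated skb has to be rejected up front (sketch of the added guard):

  unsigned int mlen = skb_network_offset(skb);

  if (unlikely(skb->len <= mlen)) { /* nothing left to send */
          kfree_skb(skb);
          return -ERANGE;
  }
  if (mlen)
          __skb_pull(skb, mlen);    /* safe: at least one byte remains */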
Fixes: fd1894224407 ("bpf: Don't redirect packets with invalid pkt_len") Change-Id: I7f16f664a202d1f962bd9e2d8d8e60064f62ef4f Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20221121180340.1983627-1-sdf@google.com Signed-off-by: Martin KaFai Lau Signed-off-by: Sasha Levin --- net/core/filter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 47427be373fc..431fca630ff8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2077,6 +2077,11 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, { unsigned int mlen = skb_network_offset(skb); + if (unlikely(skb->len <= mlen)) { + kfree_skb(skb); + return -ERANGE; + } + if (mlen) { __skb_pull(skb, mlen); if (unlikely(!skb->len)) { @@ -2102,7 +2107,7 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { /* Verify that a link layer header is carried */ - if (unlikely(skb->mac_header >= skb->network_header)) { + if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) { kfree_skb(skb); return -ERANGE; } From f14f3d3597f446feaed4235901f12a4ecd01eb56 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 16 Feb 2023 16:41:48 -0800 Subject: [PATCH 1587/1640] BACKPORT: bpf: bpf_fib_lookup should not return neigh in NUD_FAILED state commit 1fe4850b34ab512ff911e2c035c75fb6438f7307 upstream. The bpf_fib_lookup() helper does not only look up the fib (ie. route) but it also looks up the neigh. Before returning the neigh, the helper does not check for NUD_VALID. When a neigh state (neigh->nud_state) is in NUD_FAILED, its dmac (neigh->ha) could be all zeros. The helper still returns SUCCESS instead of NO_NEIGH in this case. Because of the SUCCESS return value, the bpf prog directly uses the returned dmac and ends up filling all zero in the eth header. This patch checks for NUD_VALID and returns NO_NEIGH if the neigh is not valid. Change-Id: I1d81fbe4e026d2572099c684582a30a0e62e5f03 Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230217004150.2980689-3-martin.lau@linux.dev Signed-off-by: Greg Kroah-Hartman --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 431fca630ff8..bf682cd18367 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4723,7 +4723,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, * rcu_read_lock_bh is not needed here */ neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); - if (!neigh) + if (!neigh || !(neigh->nud_state & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; return bpf_fib_set_fwd_params(params, neigh, dev); @@ -4840,7 +4840,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, */ neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, ndisc_hashfn, dst, dev); - if (!neigh) + if (!neigh || !(neigh->nud_state & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; return bpf_fib_set_fwd_params(params, neigh, dev); From a7c0114601949fe6ce84329ead4a9230653f53a9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 6 Dec 2023 15:27:06 -0800 Subject: [PATCH 1588/1640] UPSTREAM: bpf: sockmap, updating the sg structure should also update curr [ Upstream commit bb9aefde5bbaf6c168c77ba635c155b4980c2287 ] Curr pointer should be updated when the sg structure is shifted. 
Fixes: 7246d8ed4dcce ("bpf: helper to pop data from messages") Change-Id: I3c5d1dd94c2b874b6f0c34010f8e2a766f6081c1 Signed-off-by: John Fastabend Link: https://lore.kernel.org/r/20231206232706.374377-3-john.fastabend@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/core/filter.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index bf682cd18367..07d9eecdda65 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2225,6 +2225,22 @@ BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) return 0; } +static void sk_msg_reset_curr(struct sk_msg *msg) +{ + u32 i = msg->sg.start; + u32 len = 0; + + do { + len += sk_msg_elem(msg, i)->length; + sk_msg_iter_var_next(i); + if (len >= msg->sg.size) + break; + } while (i != msg->sg.end); + + msg->sg.curr = i; + msg->sg.copybreak = 0; +} + static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .func = bpf_msg_cork_bytes, .gpl_only = false, @@ -2344,6 +2360,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; out: + sk_msg_reset_curr(msg); msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; msg->data_end = msg->data + bytes; return 0; @@ -2477,6 +2494,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, msg->sg.data[new] = rsge; } + sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } @@ -2648,6 +2666,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, sk_mem_uncharge(msg->sk, len - pop); msg->sg.size -= (len - pop); + sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } From ffda27e02cecbed3317b14461274280f10057ddb Mon Sep 17 00:00:00 2001 From: Fred Li Date: Fri, 19 Jul 2024 10:46:53 +0800 Subject: [PATCH 1589/1640] UPSTREAM: bpf: Fix a segment issue when downgrading gso_size [ Upstream commit fa5ef655615a01533035c6139248c5b33aa27028 ] Linearize the skb when downgrading gso_size because it may trigger a BUG_ON() later when the skb is segmented as described in [1,2]. Fixes: 2be7e212d5419 ("bpf: add bpf_skb_adjust_room helper") Change-Id: I130565140af5a4ca59c1742d7cff5fa2dacb7817 Signed-off-by: Fred Li Signed-off-by: Daniel Borkmann Reviewed-by: Willem de Bruijn Acked-by: Daniel Borkmann Link: https://lore.kernel.org/all/20240626065555.35460-2-dracodingfly@gmail.com [1] Link: https://lore.kernel.org/all/668d5cf1ec330_1c18c32947@willemb.c.googlers.com.notmuch [2] Link: https://lore.kernel.org/bpf/20240719024653.77006-1-dracodingfly@gmail.com Signed-off-by: Sasha Levin --- net/core/filter.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 07d9eecdda65..eb7b5cc0141f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3132,13 +3132,20 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* Due to header grow, MSS needs to be downgraded. */ - if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) - skb_decrease_gso_size(shinfo, len_diff); - /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; + + /* Due to header growth, MSS needs to be downgraded. + * There is a BUG_ON() when segmenting the frag_list with + * head_frag true, so linearize the skb after downgrading + * the MSS. 
+ */ + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { + skb_decrease_gso_size(shinfo, len_diff); + if (shinfo->frag_list) + return skb_linearize(skb); + } } return 0; From 48ec320e431cb63ba4ca49a5dd085788d4ffb265 Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Wed, 6 Nov 2024 22:25:18 +0000 Subject: [PATCH 1590/1640] UPSTREAM: bpf, sockmap: Several fixes to bpf_msg_push_data [ Upstream commit 15ab0548e3107665c34579ae523b2b6e7c22082a ] Several fixes to bpf_msg_push_data, 1. test_sockmap has tests where bpf_msg_push_data is invoked to push some data at the end of a message, but -EINVAL is returned. In this case, in bpf_msg_push_data, after the first loop, i will be set to msg->sg.end, add the logic to handle it. 2. In the code block of "if (start - offset)", it's possible that "i" points to the last of sk_msg_elem. In this case, "sk_msg_iter_next(msg, end)" might still be called twice, another invoking is in "if (!copy)" code block, but actually only one is needed. Add the logic to handle it, and reconstruct the code to make the logic more clear. Fixes: 6fff607e2f14 ("bpf: sk_msg program helper bpf_msg_push_data") Change-Id: Ie497b74e5b7e5ff9b8332096e8b4611109ff51be Signed-off-by: Zijian Zhang Link: https://lore.kernel.org/r/20241106222520.527076-7-zijianzhang@bytedance.com Signed-off-by: Martin KaFai Lau Signed-off-by: Sasha Levin --- net/core/filter.c | 53 +++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index eb7b5cc0141f..b9d95edd1c69 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2398,7 +2398,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); } while (i != msg->sg.end); - if (start >= offset + l) + if (start > offset + l) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); @@ -2423,6 +2423,8 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, raw = page_address(page); + if (i == msg->sg.end) + sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); front = start - offset; back = psge->length - front; @@ -2439,7 +2441,13 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, } put_page(sg_page(psge)); - } else if (start - offset) { + new = i; + goto place_new; + } + + if (start - offset) { + if (i == msg->sg.end) + sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); @@ -2450,39 +2458,44 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); sg_unmark_end(psge); sg_unmark_end(&rsge); - sk_msg_iter_next(msg, end); } /* Slot(s) to place newly allocated data */ + sk_msg_iter_next(msg, end); new = i; + sk_msg_iter_var_next(i); + + if (i == msg->sg.end) { + if (!rsge.length) + goto place_new; + sk_msg_iter_next(msg, end); + goto place_new; + } /* Shift one or two slots as needed */ - if (!copy) { - sge = sk_msg_elem_cpy(msg, i); + sge = sk_msg_elem_cpy(msg, new); + sg_unmark_end(&sge); + nsge = sk_msg_elem_cpy(msg, i); + if (rsge.length) { sk_msg_iter_var_next(i); - sg_unmark_end(&sge); + nnsge = sk_msg_elem_cpy(msg, i); sk_msg_iter_next(msg, end); + } - nsge = sk_msg_elem_cpy(msg, i); + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sge = nsge; + sk_msg_iter_var_next(i); if (rsge.length) { - sk_msg_iter_var_next(i); + nsge = nnsge; nnsge = sk_msg_elem_cpy(msg, i); - } - - while (i != msg->sg.end) { - msg->sg.data[i] = sge; - sge = nsge; - sk_msg_iter_var_next(i); - if (rsge.length) { - nsge = nnsge; - nnsge = sk_msg_elem_cpy(msg, 
i); - } else { - nsge = sk_msg_elem_cpy(msg, i); - } + } else { + nsge = sk_msg_elem_cpy(msg, i); } } +place_new: /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; From 51b834d8906c03887dd30a67b7645d4d07a98182 Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Wed, 6 Nov 2024 22:25:19 +0000 Subject: [PATCH 1591/1640] UPSTREAM: bpf, sockmap: Several fixes to bpf_msg_pop_data [ Upstream commit 5d609ba262475db450ba69b8e8a557bd768ac07a ] Several fixes to bpf_msg_pop_data, 1. In sk_msg_shift_left, we should put_page 2. if (len == 0), return early is better 3. pop the entire sk_msg (last == msg->sg.size) should be supported 4. Fix for the value of variable "a" 5. In sk_msg_shift_left, after shifting, i has already pointed to the next element. Addtional sk_msg_iter_var_next may result in BUG. Fixes: 7246d8ed4dcc ("bpf: helper to pop data from messages") Change-Id: I03032e0683aec3b938bb1340b493e280c8e871b8 Signed-off-by: Zijian Zhang Reviewed-by: John Fastabend Link: https://lore.kernel.org/r/20241106222520.527076-8-zijianzhang@bytedance.com Signed-off-by: Martin KaFai Lau Signed-off-by: Sasha Levin --- net/core/filter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index b9d95edd1c69..ad14fb311193 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2524,8 +2524,10 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { static void sk_msg_shift_left(struct sk_msg *msg, int i) { + struct scatterlist *sge = sk_msg_elem(msg, i); int prev; + put_page(sg_page(sge)); do { prev = i; sk_msg_iter_var_next(i); @@ -2577,7 +2579,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, } while (i != msg->sg.end); /* Bounds checks: start and pop must be inside message */ - if (start >= offset + l || last >= msg->sg.size) + if (start >= offset + l || last > msg->sg.size) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); @@ -2606,12 +2608,12 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, */ if (start != offset) { struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); - int a = start; + int a = start - offset; int b = sge->length - pop - a; sk_msg_iter_var_next(i); - if (pop < sge->length - a) { + if (b > 0) { if (space) { sge->length = a; sk_msg_shift_right(msg, i); @@ -2630,7 +2632,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, if (unlikely(!page)) return -ENOMEM; - sge->length = a; orig = sg_page(sge); from = sg_virt(sge); to = page_address(page); @@ -2640,7 +2641,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, put_page(orig); } pop = 0; - } else if (pop >= sge->length - a) { + } else { pop -= (sge->length - a); sge->length = a; } @@ -2674,7 +2675,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, pop -= sge->length; sk_msg_shift_left(msg, i); } - sk_msg_iter_var_next(i); } sk_mem_uncharge(msg->sk, len - pop); From 6da5a3d18fe5824d8253d1e33659a8ee8ebc27ce Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Wed, 6 Nov 2024 22:25:20 +0000 Subject: [PATCH 1592/1640] UPSTREAM: bpf, sockmap: Fix sk_msg_reset_curr [ Upstream commit 955afd57dc4bf7e8c620a0a9e3af3c881c2c6dff ] Found in the test_txmsg_pull in test_sockmap, ``` txmsg_cork = 512; // corking is importrant here opt->iov_length = 3; opt->iov_count = 1; opt->rate = 512; // sendmsg will be invoked 512 times ``` The first sendmsg will send an sk_msg with size 3, and bpf_msg_pull_data will be invoked the first time. 
sk_msg_reset_curr will reset the copybreak from 3 to 0. In the second sendmsg, since we are in the stage of corking, psock->cork will be reused in func sk_msg_alloc. msg->sg.copybreak is 0 now, the second msg will overwrite the first msg. As a result, we could not pass the data integrity test. The same problem happens in push and pop test. Thus, fix sk_msg_reset_curr to restore the correct copybreak. Fixes: bb9aefde5bba ("bpf: sockmap, updating the sg structure should also update curr") Change-Id: Ibbac29c62d1b26f03d5f3588c14316829aac017f Signed-off-by: Zijian Zhang Link: https://lore.kernel.org/r/20241106222520.527076-9-zijianzhang@bytedance.com Signed-off-by: Martin KaFai Lau Signed-off-by: Sasha Levin --- net/core/filter.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index ad14fb311193..f26b12182c74 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2227,18 +2227,16 @@ BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) static void sk_msg_reset_curr(struct sk_msg *msg) { - u32 i = msg->sg.start; - u32 len = 0; + if (!msg->sg.size) { + msg->sg.curr = msg->sg.start; + msg->sg.copybreak = 0; + } else { + u32 i = msg->sg.end; - do { - len += sk_msg_elem(msg, i)->length; - sk_msg_iter_var_next(i); - if (len >= msg->sg.size) - break; - } while (i != msg->sg.end); - - msg->sg.curr = i; - msg->sg.copybreak = 0; + sk_msg_iter_var_prev(i); + msg->sg.curr = i; + msg->sg.copybreak = msg->sg.data[i].length; + } } static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { From 864dbcbcc5ed9c93e808e4d57a75d939ef013711 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:54 -0800 Subject: [PATCH 1593/1640] UPSTREAM: bpf: Check negative offsets in __bpf_skb_min_len() [ Upstream commit 9ecc4d858b92c1bb0673ad9c327298e600c55659 ] skb_network_offset() and skb_transport_offset() can be negative when they are called after we pull the transport header, for example, when we use eBPF sockmap at the point of ->sk_data_ready(). __bpf_skb_min_len() uses an unsigned int to get these offsets, this leads to a very large number which then causes bpf_skb_change_tail() failed unexpectedly. Fix this by using a signed int to get these offsets and ensure the minimum is at least zero. 
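For illustration, the failure mode is plain integer conversion; once
the transport header has been pulled, the offset helpers may return a
negative int, and storing that in a u32 yields an enormous "minimum"
(values illustrative):

  int off = skb_transport_offset(skb); /* e.g. -20 after a pull */
  u32 min_len = off;                   /* -20 becomes 4294967276 */

  /* fixed pattern: keep the value signed, accept it only if positive */
  u32 min_len_ok = 0;
  if (off > 0)
          min_len_ok = off;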
Fixes: 5293efe62df8 ("bpf: add bpf_skb_change_tail helper") Change-Id: I6267f793ec92ff120f2a5b689976ba19443dae9a Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-2-xiyou.wangcong@gmail.com Signed-off-by: Sasha Levin --- net/core/filter.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index f26b12182c74..f93cf3a1cd3b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3258,13 +3258,22 @@ static const struct bpf_func_proto bpf_skb_adjust_room_proto = { static u32 __bpf_skb_min_len(const struct sk_buff *skb) { - u32 min_len = skb_network_offset(skb); + int offset = skb_network_offset(skb); + u32 min_len = 0; - if (skb_transport_header_was_set(skb)) - min_len = skb_transport_offset(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) - min_len = skb_checksum_start_offset(skb) + - skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + if (skb_transport_header_was_set(skb)) { + offset = skb_transport_offset(skb); + if (offset > 0) + min_len = offset; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) { + offset = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + } return min_len; } From e993a90652a15a871ef165bea48b11a8941025a4 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 14 Jul 2022 11:51:01 +0100 Subject: [PATCH 1594/1640] UPSTREAM: bpf: Add endian modifiers to fix endian warnings [ Upstream commit 96a233e600df351bcb06e3c20efe048855552926 ] A couple of the syscalls which load values (bpf_skb_load_helper_16() and bpf_skb_load_helper_32()) are using u16/u32 types which are triggering warnings as they are then converted from big-endian to CPU-endian. Fix these by making the types __be instead. 
Fixes the following sparse warnings: net/core/filter.c:246:32: warning: cast to restricted __be16 net/core/filter.c:246:32: warning: cast to restricted __be16 net/core/filter.c:246:32: warning: cast to restricted __be16 net/core/filter.c:246:32: warning: cast to restricted __be16 net/core/filter.c:273:32: warning: cast to restricted __be32 net/core/filter.c:273:32: warning: cast to restricted __be32 net/core/filter.c:273:32: warning: cast to restricted __be32 net/core/filter.c:273:32: warning: cast to restricted __be32 net/core/filter.c:273:32: warning: cast to restricted __be32 net/core/filter.c:273:32: warning: cast to restricted __be32 Change-Id: I6486ac70a70ffae777a6cd01f4f4b6926f44797f Signed-off-by: Ben Dooks Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220714105101.297304-1-ben.dooks@sifive.com Stable-dep-of: d4bac0288a2b ("bpf: support SKF_NET_OFF and SKF_LL_OFF on skb frags") Signed-off-by: Sasha Levin --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index f93cf3a1cd3b..ea9f6609c341 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -207,7 +207,7 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - u16 tmp, *ptr; + __be16 tmp, *ptr; const int len = sizeof(tmp); if (offset >= 0) { @@ -234,7 +234,7 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - u32 tmp, *ptr; + __be32 tmp, *ptr; const int len = sizeof(tmp); if (likely(offset >= 0)) { From 2a5db1d519f4ba6e59cc8e59ae751ca422bac8be Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 8 Apr 2025 09:27:48 -0400 Subject: [PATCH 1595/1640] UPSTREAM: bpf: support SKF_NET_OFF and SKF_LL_OFF on skb frags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit d4bac0288a2b444e468e6df9cb4ed69479ddf14a ] Classic BPF socket filters with SKB_NET_OFF and SKB_LL_OFF fail to read when these offsets extend into frags. This has been observed with iwlwifi and reproduced with tun with IFF_NAPI_FRAGS. The below straightforward socket filter on UDP port, applied to a RAW socket, will silently miss matching packets. const int offset_proto = offsetof(struct ip6_hdr, ip6_nxt); const int offset_dport = sizeof(struct ip6_hdr) + offsetof(struct udphdr, dest); struct sock_filter filter_code[] = { BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE), BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, PACKET_HOST, 0, 4), BPF_STMT(BPF_LD + BPF_B + BPF_ABS, SKF_NET_OFF + offset_proto), BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 0, 2), BPF_STMT(BPF_LD + BPF_H + BPF_ABS, SKF_NET_OFF + offset_dport), This is unexpected behavior. Socket filter programs should be consistent regardless of environment. Silent misses are particularly concerning as hard to detect. Use skb_copy_bits for offsets outside linear, same as done for non-SKF_(LL|NET) offsets. Offset is always positive after subtracting the reference threshold SKB_(LL|NET)_OFF, so is always >= skb_(mac|network)_offset. The sum of the two is an offset against skb->data, and may be negative, but it cannot point before skb->head, as skb_(mac|network)_offset would too. 
This appears to go back to when frag support was introduced to sk_run_filter in linux-2.4.4, before the introduction of git. The amount of code change and 8/16/32 bit duplication are unfortunate. But any attempt I made to be smarter saved very few LoC while complicating the code. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Link: https://lore.kernel.org/netdev/20250122200402.3461154-1-maze@google.com/ Link: https://elixir.bootlin.com/linux/2.4.4/source/net/core/filter.c#L244 Reported-by: Matt Moeller Co-developed-by: Maciej Żenczykowski Change-Id: Ia1ff7be23c39d872596848731818740696c86c29 Signed-off-by: Maciej Żenczykowski Signed-off-by: Willem de Bruijn Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20250408132833.195491-2-willemdebruijn.kernel@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- net/core/filter.c | 80 ++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index ea9f6609c341..5856f07f4d99 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -177,24 +177,36 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } +static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset) +{ + if (likely(offset >= 0)) + return offset; + + if (offset >= SKF_NET_OFF) + return offset - SKF_NET_OFF + skb_network_offset(skb); + + if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb)) + return offset - SKF_LL_OFF + skb_mac_offset(skb); + + return INT_MIN; +} + BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - u8 tmp, *ptr; + u8 tmp; const int len = sizeof(tmp); - if (offset >= 0) { - if (headlen - offset >= len) - return *(u8 *)(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return tmp; - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return *(u8 *)ptr; - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, @@ -207,21 +219,19 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - __be16 tmp, *ptr; + __be16 tmp; const int len = sizeof(tmp); - if (offset >= 0) { - if (headlen - offset >= len) - return get_unaligned_be16(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return be16_to_cpu(tmp); - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return get_unaligned_be16(ptr); - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, @@ -234,21 +244,19 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - __be32 tmp, *ptr; + __be32 
tmp; const int len = sizeof(tmp); - if (likely(offset >= 0)) { - if (headlen - offset >= len) - return get_unaligned_be32(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return be32_to_cpu(tmp); - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return get_unaligned_be32(ptr); - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, From 1603d19bc5f56a4f77091b865916285bae759296 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 11 Apr 2019 09:12:02 -0700 Subject: [PATCH 1596/1640] UPSTREAM: bpf: fix missing bpf_check_uarg_tail_zero in BPF_PROG_TEST_RUN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b0b9395d865e ("bpf: support input __sk_buff context in BPF_PROG_TEST_RUN") started using bpf_check_uarg_tail_zero in BPF_PROG_TEST_RUN. However, bpf_check_uarg_tail_zero is not defined for !CONFIG_BPF_SYSCALL: net/bpf/test_run.c: In function ‘bpf_ctx_init’: net/bpf/test_run.c:142:9: error: implicit declaration of function ‘bpf_check_uarg_tail_zero’ [-Werror=implicit-function-declaration] err = bpf_check_uarg_tail_zero(data_in, max_size, size); ^~~~~~~~~~~~~~~~~~~~~~~~ Let's not build net/bpf/test_run.c when CONFIG_BPF_SYSCALL is not set. Reported-by: kbuild test robot Fixes: b0b9395d865e ("bpf: support input __sk_buff context in BPF_PROG_TEST_RUN") Change-Id: Iddc8eb00de3fbde87f765f533b91d2fbd3b41603 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 37 +++++++++++++++++++++++++++++-------- net/bpf/Makefile | 2 +- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index af6534a89b5a..c3dcc7eda928 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -499,14 +499,6 @@ typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); -int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); -int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); -int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, - const union bpf_attr *kattr, - union bpf_attr __user *uattr); - /* an array of programs to be executed under rcu_lock. 
* * Typical usage: @@ -752,6 +744,14 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); int array_map_alloc_check(union bpf_attr *attr); +int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); + static inline bool unprivileged_ebpf_enabled(void) { return !sysctl_unprivileged_bpf_disabled; @@ -867,6 +867,27 @@ static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, return ERR_PTR(-EOPNOTSUPP); } +static inline int bpf_prog_test_run_xdp(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + +static inline int bpf_prog_test_run_skb(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + +static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + static inline bool unprivileged_ebpf_enabled(void) { return false; diff --git a/net/bpf/Makefile b/net/bpf/Makefile index 27b2992a0692..b0ca361742e4 100644 --- a/net/bpf/Makefile +++ b/net/bpf/Makefile @@ -1 +1 @@ -obj-y := test_run.o +obj-$(CONFIG_BPF_SYSCALL) := test_run.o From f981545373d44b3eba23b67af373627b90844a5d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:14 -0800 Subject: [PATCH 1597/1640] UPSTREAM: bpf: Add bpf_patch_call_args prototype to include/linux/bpf.h [ Upstream commit a643bff752dcf72a07e1b2ab2f8587e4f51118be ] Add bpf_patch_call_args() prototype. This function is called from BPF verifier and only if CONFIG_BPF_JIT_ALWAYS_ON is not defined. This fixes compiler warning about missing prototype in some kernel configurations. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Reported-by: kernel test robot Change-Id: Ic06a473f58bf111d74d23d5527f1b25be013db75 Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112075520.4103414-2-andrii@kernel.org Signed-off-by: Sasha Levin --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c3dcc7eda928..ee38019d5805 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -715,7 +715,10 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, union bpf_attr __user *uattr); + +#ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +#endif /* Map specifics */ struct xdp_buff; From 87bba4d72166ceb0fc5d57b4661d5a8a9029fedd Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 14 Aug 2019 10:37:49 -0700 Subject: [PATCH 1598/1640] UPSTREAM: bpf: support cloning sk storage on accept() Add new helper bpf_sk_storage_clone which optionally clones sk storage and call it from sk_clone_lock. 
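For illustration only (not part of this patch; the section layout, map name and value type here are made up), a BPF-C sk storage map that opts into cloning could be declared roughly as:

	struct {
		__uint(type, BPF_MAP_TYPE_SK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
		__type(key, int);
		__type(value, __u64);
	} sk_stg SEC(".maps");

Storage attached to a listening socket through a BPF_F_CLONE map is copied over to the newly accepted socket in sk_clone_lock(); maps created without the flag are skipped.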
Cc: Martin KaFai Lau Cc: Yonghong Song Acked-by: Martin KaFai Lau Acked-by: Yonghong Song Change-Id: Iee30d2442b76f6fd7904829314e63f3391e7d811 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/net/bpf_sk_storage.h | 10 ++++ include/uapi/linux/bpf.h | 3 + net/core/bpf_sk_storage.c | 104 ++++++++++++++++++++++++++++++++++- net/core/sock.c | 6 ++ 4 files changed, 120 insertions(+), 3 deletions(-) diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index b9dcb02e756b..8e4f831d2e52 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +#ifdef CONFIG_BPF_SYSCALL +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); +#else +static inline int bpf_sk_storage_clone(const struct sock *sk, + struct sock *newsk) +{ + return 0; +} +#endif + #endif /* _BPF_SK_STORAGE_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a5cade22fbde..84020e0825e6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -341,6 +341,9 @@ enum bpf_attach_type { #define BPF_F_RDONLY_PROG (1U << 7) #define BPF_F_WRONLY_PROG (1U << 8) +/* Clone map from listener for newly accepted socket */ +#define BPF_F_CLONE (1U << 9) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 70da3de6e2ab..21cb21a6f899 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -12,6 +12,9 @@ static atomic_t cache_idx; +#define SK_STORAGE_CREATE_FLAG_MASK \ + (BPF_F_NO_PREALLOC | BPF_F_CLONE) + struct bucket { struct hlist_head list; raw_spinlock_t lock; @@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem) kfree_rcu(sk_storage, rcu); } -/* sk_storage->lock must be held and sk_storage->list cannot be empty */ static void __selem_link_sk(struct bpf_sk_storage *sk_storage, struct bpf_sk_storage_elem *selem) { @@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map) return 0; } -/* Called by __sk_destruct() */ +/* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { struct bpf_sk_storage_elem *selem; @@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) smap = (struct bpf_sk_storage_map *)map; + /* Note that this map might be concurrently cloned from + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone + * RCU read section to finish before proceeding. New RCU + * read sections should be prevented via bpf_map_inc_not_zero. 
+ */ synchronize_rcu(); /* bpf prog and the userspace can no longer access this map @@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) { - if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries || + if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || + !(attr->map_flags & BPF_F_NO_PREALLOC) || + attr->max_entries || attr->key_size != sizeof(int) || !attr->value_size || /* Enforce BTF for userspace sk dumping */ !attr->btf_key_type_id || !attr->btf_value_type_id) @@ -738,6 +747,95 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) return err; } +static struct bpf_sk_storage_elem * +bpf_sk_storage_clone_elem(struct sock *newsk, + struct bpf_sk_storage_map *smap, + struct bpf_sk_storage_elem *selem) +{ + struct bpf_sk_storage_elem *copy_selem; + + copy_selem = selem_alloc(smap, newsk, NULL, true); + if (!copy_selem) + return NULL; + + if (map_value_has_spin_lock(&smap->map)) + copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, + SDATA(selem)->data, true); + else + copy_map_value(&smap->map, SDATA(copy_selem)->data, + SDATA(selem)->data); + + return copy_selem; +} + +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) +{ + struct bpf_sk_storage *new_sk_storage = NULL; + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_elem *selem; + int ret = 0; + + RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + + if (!sk_storage || hlist_empty(&sk_storage->list)) + goto out; + + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + struct bpf_sk_storage_elem *copy_selem; + struct bpf_sk_storage_map *smap; + struct bpf_map *map; + + smap = rcu_dereference(SDATA(selem)->smap); + if (!(smap->map.map_flags & BPF_F_CLONE)) + continue; + + /* Note that for lockless listeners adding new element + * here can race with cleanup in bpf_sk_storage_map_free. + * Try to grab map refcnt to make sure that it's still + * alive and prevent concurrent removal. + */ + map = bpf_map_inc_not_zero(&smap->map, false); + if (IS_ERR(map)) + continue; + + copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem); + if (!copy_selem) { + ret = -ENOMEM; + bpf_map_put(map); + goto out; + } + + if (new_sk_storage) { + selem_link_map(smap, copy_selem); + __selem_link_sk(new_sk_storage, copy_selem); + } else { + ret = sk_storage_alloc(newsk, smap, copy_selem); + if (ret) { + kfree(copy_selem); + atomic_sub(smap->elem_size, + &newsk->sk_omem_alloc); + bpf_map_put(map); + goto out; + } + + new_sk_storage = rcu_dereference(copy_selem->sk_storage); + } + bpf_map_put(map); + } + +out: + rcu_read_unlock(); + + /* In case of an error, don't free anything explicitly here, the + * caller is responsible to call bpf_sk_storage_free. 
+ */ + + return ret; +} + BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags) { diff --git a/net/core/sock.c b/net/core/sock.c index 342113238216..30787fdf492b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1904,6 +1904,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) } RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); + if (bpf_sk_storage_clone(sk, newsk)) { + sk_free_unlock_clone(newsk); + newsk = NULL; + goto out; + } + newsk->sk_err = 0; newsk->sk_err_soft = 0; newsk->sk_priority = 0; From fb967805066e4c2c519343f24810a8d8bff3e0c1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 2 Jul 2019 09:13:56 -0700 Subject: [PATCH 1599/1640] BACKPORT: bpf: add BPF_CGROUP_SOCK_OPS callback that is executed on every RTT Performance impact should be minimal because it's under a new BPF_SOCK_OPS_RTT_CB_FLAG flag that has to be explicitly enabled. Suggested-by: Eric Dumazet Cc: Eric Dumazet Cc: Priyaranjan Jha Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Change-Id: I19814edf78101f87e6dc364343f8173d9e230850 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/net/tcp.h | 9 +++++++++ include/uapi/linux/bpf.h | 6 +++++- net/ipv4/tcp_input.c | 4 ++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index b14556e238d0..af16baa4e8f1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2255,4 +2255,13 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } + +static inline void tcp_bpf_rtt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTT_CB_FLAG)) + tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); +} + #endif /* _TCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 84020e0825e6..9600869d3804 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1801,6 +1801,7 @@ union bpf_attr { * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) * * Therefore, this function can be used to clear a callback flag by * setting the appropriate bit to zero. e.g. to disable the RTO @@ -3293,7 +3294,8 @@ struct bpf_sock_ops { #define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) #define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) #define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently +#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently * supported cb flags */ @@ -3348,6 +3350,8 @@ enum { BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after * socket transition to LISTEN state. */ + BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ffda99e89781..35e64a75c96f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -791,6 +791,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); + + tcp_bpf_rtt(sk); } } else { /* no previous measure. 
*/ @@ -799,6 +801,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; + + tcp_bpf_rtt(sk); } tp->srtt_us = max(1U, srtt); } From cdfebc81d5403aa6097a9b5987614e7e6cc918d1 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 12 Apr 2019 12:27:34 +0100 Subject: [PATCH 1600/1640] BACKPORT: bpf: fix whitespace for ENCAP_L2 defines in bpf.h replace tab after #define with space in line with rest of definitions Change-Id: I29e1364abd94abe1e251816032890f895e0159f0 Signed-off-by: Alan Maguire Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9600869d3804..55f79f3a657d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2856,14 +2856,14 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) -#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff -#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) -#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) From 5ed2cab4a308e97e8ee965330b285668a9df1b87 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 3 Dec 2018 11:31:23 +0000 Subject: [PATCH 1601/1640] UPSTREAM: bpf: respect size hint to BPF_PROG_TEST_RUN if present Use data_size_out as a size hint when copying test output to user space. ENOSPC is returned if the output buffer is too small. Callers which so far did not set data_size_out are not affected. Change-Id: Ic1a42d1903e96a26a27a56489b75be05c58996ff Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 7 +++++-- net/bpf/test_run.c | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 55f79f3a657d..b6e5fb25f9e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -438,8 +438,11 @@ union bpf_attr { struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ __u32 prog_fd; __u32 retval; - __u32 data_size_in; - __u32 data_size_out; + __u32 data_size_in; /* input: len of data_in */ + __u32 data_size_out; /* input/output: len of data_out + * returns ENOSPC if data_out + * is too small. + */ __aligned_u64 data_in; __aligned_u64 data_out; __u32 repeat; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2ec9077948cd..c7aa181d3fa2 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -76,8 +76,18 @@ static int bpf_test_finish(const union bpf_attr *kattr, { void __user *data_out = u64_to_user_ptr(kattr->test.data_out); int err = -EFAULT; + u32 copy_size = size; - if (data_out && copy_to_user(data_out, data, size)) + /* Clamp copy if the user has provided a size hint, but copy the full + * buffer if not to retain old behaviour. 
+ */ + if (kattr->test.data_size_out && + copy_size > kattr->test.data_size_out) { + copy_size = kattr->test.data_size_out; + err = -ENOSPC; + } + + if (data_out && copy_to_user(data_out, data, copy_size)) goto out; if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size))) goto out; @@ -85,7 +95,8 @@ static int bpf_test_finish(const union bpf_attr *kattr, goto out; if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration))) goto out; - err = 0; + if (err != -ENOSPC) + err = 0; out: return err; } From b0b9892854d121c9aad07fe8b35cea3ddfd1b2cf Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 12 Feb 2019 15:42:38 -0800 Subject: [PATCH 1602/1640] UPSTREAM: bpf/test_run: fix unkillable BPF_PROG_TEST_RUN Syzbot found out that running BPF_PROG_TEST_RUN with repeat=0xffffffff makes process unkillable. The problem is that when CONFIG_PREEMPT is enabled, we never see need_resched() return true. This is due to the fact that preempt_enable() (which we do in bpf_test_run_one on each iteration) now handles resched if it's needed. Let's disable preemption for the whole run, not per test. In this case we can properly see whether resched is needed. Let's also properly return -EINTR to the userspace in case of a signal interrupt. See recent discussion: http://lore.kernel.org/netdev/CAH3MdRWHr4N8jei8jxDppXjmw-Nw=puNDLbu1dQOFQHxfU2onA@mail.gmail.com I'll follow up with the same fix bpf_prog_test_run_flow_dissector in bpf-next. Reported-by: syzbot Change-Id: I6744e83dc918d20f44cdf9f37cbb0f223a970aa4 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- net/bpf/test_run.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c7aa181d3fa2..e52f6db099c9 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -14,27 +14,13 @@ #include #include -static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) -{ - u32 ret; - - preempt_disable(); - rcu_read_lock(); - bpf_cgroup_storage_set(storage); - ret = BPF_PROG_RUN(prog, ctx); - rcu_read_unlock(); - preempt_enable(); - - return ret; -} - -static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *ret, - u32 *time) +static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, + u32 *retval, u32 *time) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 }; enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; + int ret = 0; u32 i; for_each_cgroup_storage_type(stype) { @@ -49,25 +35,42 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *ret, if (!repeat) repeat = 1; + + rcu_read_lock(); + preempt_disable(); time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - *ret = bpf_test_run_one(prog, ctx, storage); + bpf_cgroup_storage_set(storage); + *retval = BPF_PROG_RUN(prog, ctx); + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + if (need_resched()) { - if (signal_pending(current)) - break; time_spent += ktime_get_ns() - time_start; + preempt_enable(); + rcu_read_unlock(); + cond_resched(); + + rcu_read_lock(); + preempt_disable(); time_start = ktime_get_ns(); } } time_spent += ktime_get_ns() - time_start; + preempt_enable(); + rcu_read_unlock(); + do_div(time_spent, repeat); *time = time_spent > U32_MAX ? 
U32_MAX : (u32)time_spent; for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); - return 0; + return ret; } static int bpf_test_finish(const union bpf_attr *kattr, From 1f0bfbc975a7246fbe3661b986b5e90e1c2871c5 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 19 Feb 2019 10:54:17 -0800 Subject: [PATCH 1603/1640] UPSTREAM: bpf/test_run: fix unkillable BPF_PROG_TEST_RUN for flow dissector Syzbot found out that running BPF_PROG_TEST_RUN with repeat=0xffffffff makes process unkillable. The problem is that when CONFIG_PREEMPT is enabled, we never see need_resched() return true. This is due to the fact that preempt_enable() (which we do in bpf_test_run_one on each iteration) now handles resched if it's needed. Let's disable preemption for the whole run, not per test. In this case we can properly see whether resched is needed. Let's also properly return -EINTR to the userspace in case of a signal interrupt. This is a follow up for a recently fixed issue in bpf_test_run, see commit df1a2cb7c74b ("bpf/test_run: fix unkillable BPF_PROG_TEST_RUN"). Reported-by: syzbot Change-Id: I38d1d9553d1ee281666de7000d06684962658397 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- net/bpf/test_run.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index e52f6db099c9..b22f1f20e4ed 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -422,31 +422,45 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (!repeat) repeat = 1; + rcu_read_lock(); + preempt_disable(); time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - preempt_disable(); - rcu_read_lock(); retval = __skb_flow_bpf_dissect(prog, skb, &flow_keys_dissector, &flow_keys); - rcu_read_unlock(); - preempt_enable(); + + if (signal_pending(current)) { + preempt_enable(); + rcu_read_unlock(); + + ret = -EINTR; + goto out; + } if (need_resched()) { - if (signal_pending(current)) - break; time_spent += ktime_get_ns() - time_start; + preempt_enable(); + rcu_read_unlock(); + cond_resched(); + + rcu_read_lock(); + preempt_disable(); time_start = ktime_get_ns(); } } time_spent += ktime_get_ns() - time_start; + preempt_enable(); + rcu_read_unlock(); + do_div(time_spent, repeat); duration = time_spent > U32_MAX ? 
U32_MAX : (u32)time_spent; ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), retval, duration); +out: kfree_skb(skb); kfree(sk); return ret; From 15994270a794dd7176281c7db68419babb9b1c51 Mon Sep 17 00:00:00 2001 From: Bo YU Date: Fri, 8 Mar 2019 01:45:51 -0500 Subject: [PATCH 1604/1640] UPSTREAM: bpf: fix warning about using plain integer as NULL Sparse warning below: sudo make C=2 CF=-D__CHECK_ENDIAN__ M=net/bpf/ CHECK net/bpf//test_run.c net/bpf//test_run.c:19:77: warning: Using plain integer as NULL pointer ./include/linux/bpf-cgroup.h:295:77: warning: Using plain integer as NULL pointer Fixes: 8bad74f9840f ("bpf: extend cgroup bpf core to allow multiple cgroup storage types") Acked-by: Yonghong Song Change-Id: I1dba06e69eb1e3b4a3163dc32b48c4fc6bdc755d Signed-off-by: Bo YU Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 2 +- net/bpf/test_run.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 44769d4525b9..169fd25f6bc2 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -365,7 +365,7 @@ static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map) {} static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( - struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; } + struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; } static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index b22f1f20e4ed..d18bba46a8da 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -17,7 +17,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time) { - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 }; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; int ret = 0; From 3633f438338cf70295a02380d33b49915d7b4a35 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 11 Apr 2019 15:47:07 -0700 Subject: [PATCH 1605/1640] UPSTREAM: bpf: explicitly prohibit ctx_{in, out} in non-skb BPF_PROG_TEST_RUN This should allow us later to extend BPF_PROG_TEST_RUN for non-skb case and be sure that nobody is erroneously setting ctx_{in,out}. 
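As a sketch of what is now rejected (illustrative only: the fds and buffers are hypothetical, and the attr-based test-run interface assumed here is the one added by the commit in the Fixes tag below):

	struct bpf_prog_test_run_attr tattr = {
		.prog_fd      = xdp_prog_fd,
		.data_in      = pkt,
		.data_size_in = sizeof(pkt),
		.ctx_in       = &ctx,		/* non-skb prog: rejected */
		.ctx_size_in  = sizeof(ctx),
	};

	err = bpf_prog_test_run_xattr(&tattr);	/* expected to fail with EINVAL */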
Fixes: b0b9395d865e ("bpf: support input __sk_buff context in BPF_PROG_TEST_RUN") Reported-by: Daniel Borkmann Change-Id: Ibe85a9217cd265966ef57e1298b37a6b9baa50c4 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- net/bpf/test_run.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index d18bba46a8da..c27a7e4dbb2c 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -350,6 +350,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, void *data; int ret; + if (kattr->test.ctx_in || kattr->test.ctx_out) + return -EINVAL; + data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0); if (IS_ERR(data)) return PTR_ERR(data); @@ -392,6 +395,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; + if (kattr->test.ctx_in || kattr->test.ctx_out) + return -EINVAL; + data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) From 7034ba4eb084493ffe4c363eef7ccc334aae9fca Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 22 Apr 2019 08:55:45 -0700 Subject: [PATCH 1606/1640] UPSTREAM: bpf: when doing BPF_PROG_TEST_RUN for flow dissector use no-skb mode Now that we have bpf_flow_dissect which can work on raw data, use it when doing BPF_PROG_TEST_RUN for flow dissector. Simplifies bpf_prog_test_run_flow_dissector and allows us to test no-skb mode. Note, that previously, with bpf_flow_dissect_skb we used to call eth_type_trans which pulled L2 (ETH_HLEN) header and we explicitly called skb_reset_network_header. That means flow_keys->nhoff would be initialized to 0 (skb_network_offset) in init_flow_keys. Now we call bpf_flow_dissect with nhoff set to ETH_HLEN and need to undo it once the dissection is done to preserve the existing behavior. 
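To make the offset arithmetic concrete (illustrative numbers, assuming an untagged Ethernet/IPv4/TCP frame): the dissector itself now produces

	flow_keys.nhoff = 14;	/* ETH_HLEN */
	flow_keys.thoff = 34;	/* ETH_HLEN + 20-byte IPv4 header */

and the ETH_HLEN subtraction reports nhoff = 0 and thoff = 20 back to userspace, matching what the old skb-based path returned.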
Change-Id: Icaf9c2a76c6176c647081535f7d48d94fff2e126 Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- net/bpf/test_run.c | 47 +++++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c27a7e4dbb2c..9959196e8059 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -382,12 +382,12 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, union bpf_attr __user *uattr) { u32 size = kattr->test.data_size_in; + struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_flow_keys flow_keys; u64 time_start, time_spent = 0; + const struct ethhdr *eth; u32 retval, duration; - struct sk_buff *skb; - struct sock *sk; void *data; int ret; u32 i; @@ -398,43 +398,31 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (kattr->test.ctx_in || kattr->test.ctx_out) return -EINVAL; - data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); + if (size < ETH_HLEN) + return -EINVAL; + + data = bpf_test_init(kattr, size, 0, 0); if (IS_ERR(data)) return PTR_ERR(data); - sk = kzalloc(sizeof(*sk), GFP_USER); - if (!sk) { - kfree(data); - return -ENOMEM; - } - sock_net_set(sk, current->nsproxy->net_ns); - sock_init_data(NULL, sk); - - skb = build_skb(data, 0); - if (!skb) { - kfree(data); - kfree(sk); - return -ENOMEM; - } - skb->sk = sk; - - skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); - __skb_put(skb, size); - skb->protocol = eth_type_trans(skb, - current->nsproxy->net_ns->loopback_dev); - skb_reset_network_header(skb); + eth = (struct ethhdr *)data; if (!repeat) repeat = 1; + ctx.flow_keys = &flow_keys; + ctx.data = data; + ctx.data_end = (__u8 *)data + size; + rcu_read_lock(); preempt_disable(); time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - retval = __skb_flow_bpf_dissect(prog, skb, - &flow_keys_dissector, - &flow_keys); + retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, + size); + + flow_keys.nhoff -= ETH_HLEN; + flow_keys.thoff -= ETH_HLEN; if (signal_pending(current)) { preempt_enable(); @@ -467,7 +455,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, retval, duration); out: - kfree_skb(skb); - kfree(sk); + kfree(data); return ret; } From a7c561f1addc51ccd4231dd835deb8be7193bed9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 22 Apr 2019 08:55:52 -0700 Subject: [PATCH 1607/1640] UPSTREAM: bpf/flow_dissector: don't adjust nhoff by ETH_HLEN in BPF_PROG_TEST_RUN Now that we use skb-less flow dissector let's return true nhoff and thoff. We used to adjust them by ETH_HLEN because that's how it was done in the skb case. For VLAN tests that looks confusing: nhoff is pointing to vlan parts :-\ Warning, this is an API change for BPF_PROG_TEST_RUN! Feel free to drop if you think that it's too late at this point to fix it. 
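Continuing the illustrative untagged Ethernet/IPv4/TCP example from the previous patch: with the subtraction gone, userspace now sees the raw offsets

	flow_keys.nhoff = 14;	/* start of the IPv4 header */
	flow_keys.thoff = 34;	/* start of the TCP header */

instead of 0 and 20, which is exactly the API change being flagged here.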
Change-Id: Icae82218f62c91b7860981bc62269eba8e58c25c Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- net/bpf/test_run.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 9959196e8059..9f603f3de85a 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -421,9 +421,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, size); - flow_keys.nhoff -= ETH_HLEN; - flow_keys.thoff -= ETH_HLEN; - if (signal_pending(current)) { preempt_enable(); rcu_read_unlock(); From a02a38a166ea09d264ff0c98f9bdbcbb38f13f98 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 26 Apr 2019 11:49:51 -0700 Subject: [PATCH 1608/1640] UPSTREAM: selftests: bpf: test writable buffers in raw tps This tests that: * a BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE cannot be attached if it uses either: * a variable offset to the tracepoint buffer, or * an offset beyond the size of the tracepoint buffer * a tracer can modify the buffer provided when attached to a writable tracepoint in bpf_prog_test_run Change-Id: I3fdf650c1d9a68d64d08b170b14e46a35d180dd7 Signed-off-by: Matt Mullins Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/trace/events/bpf_test_run.h | 50 ++++++++++++ net/bpf/test_run.c | 4 + .../raw_tp_writable_reject_nbd_invalid.c | 42 ++++++++++ .../bpf/prog_tests/raw_tp_writable_test_run.c | 80 +++++++++++++++++++ .../selftests/bpf/verifier/raw_tp_writable.c | 34 ++++++++ 5 files changed, 210 insertions(+) create mode 100644 include/trace/events/bpf_test_run.h create mode 100644 tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c create mode 100644 tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c create mode 100644 tools/testing/selftests/bpf/verifier/raw_tp_writable.c diff --git a/include/trace/events/bpf_test_run.h b/include/trace/events/bpf_test_run.h new file mode 100644 index 000000000000..265447e3f71a --- /dev/null +++ b/include/trace/events/bpf_test_run.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf_test_run + +#if !defined(_TRACE_BPF_TEST_RUN_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BPF_TEST_RUN_H + +#include + +DECLARE_EVENT_CLASS(bpf_test_finish, + + TP_PROTO(int *err), + + TP_ARGS(err), + + TP_STRUCT__entry( + __field(int, err) + ), + + TP_fast_assign( + __entry->err = *err; + ), + + TP_printk("bpf_test_finish with err=%d", __entry->err) +); + +#ifdef DEFINE_EVENT_WRITABLE +#undef BPF_TEST_RUN_DEFINE_EVENT +#define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size) \ + DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto), \ + PARAMS(args), size) +#else +#undef BPF_TEST_RUN_DEFINE_EVENT +#define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size) \ + DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args)) +#endif + +BPF_TEST_RUN_DEFINE_EVENT(bpf_test_finish, bpf_test_finish, + + TP_PROTO(int *err), + + TP_ARGS(err), + + sizeof(int) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 9f603f3de85a..3eda5f9d4e41 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -14,6 +14,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time) { @@ -101,6 +104,7 @@ static int bpf_test_finish(const union bpf_attr *kattr, if (err != -ENOSPC) err = 0; out: 
+ trace_bpf_test_finish(&err); return err; } diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c new file mode 100644 index 000000000000..9807336a3016 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +void test_raw_tp_writable_reject_nbd_invalid(void) +{ + __u32 duration = 0; + char error[4096]; + int bpf_fd = -1, tp_fd = -1; + + const struct bpf_insn program[] = { + /* r6 is our tp buffer */ + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), + /* one byte beyond the end of the nbd_request struct */ + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_6, + sizeof(struct nbd_request)), + BPF_EXIT_INSN(), + }; + + struct bpf_load_program_attr load_attr = { + .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + .license = "GPL v2", + .insns = program, + .insns_cnt = sizeof(program) / sizeof(struct bpf_insn), + .log_level = 2, + }; + + bpf_fd = bpf_load_program_xattr(&load_attr, error, sizeof(error)); + if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable load", + "failed: %d errno %d\n", bpf_fd, errno)) + return; + + tp_fd = bpf_raw_tracepoint_open("nbd_send_request", bpf_fd); + if (CHECK(tp_fd >= 0, "bpf_raw_tracepoint_writable open", + "erroneously succeeded\n")) + goto out_bpffd; + + close(tp_fd); +out_bpffd: + close(bpf_fd); +} diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c new file mode 100644 index 000000000000..5c45424cac5f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +void test_raw_tp_writable_test_run(void) +{ + __u32 duration = 0; + char error[4096]; + + const struct bpf_insn trace_program[] = { + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0), + BPF_MOV64_IMM(BPF_REG_0, 42), + BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + + struct bpf_load_program_attr load_attr = { + .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + .license = "GPL v2", + .insns = trace_program, + .insns_cnt = sizeof(trace_program) / sizeof(struct bpf_insn), + .log_level = 2, + }; + + int bpf_fd = bpf_load_program_xattr(&load_attr, error, sizeof(error)); + if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable loaded", + "failed: %d errno %d\n", bpf_fd, errno)) + return; + + const struct bpf_insn skb_program[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + + struct bpf_load_program_attr skb_load_attr = { + .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, + .license = "GPL v2", + .insns = skb_program, + .insns_cnt = sizeof(skb_program) / sizeof(struct bpf_insn), + }; + + int filter_fd = + bpf_load_program_xattr(&skb_load_attr, error, sizeof(error)); + if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n", + filter_fd, errno)) + goto out_bpffd; + + int tp_fd = bpf_raw_tracepoint_open("bpf_test_finish", bpf_fd); + if (CHECK(tp_fd < 0, "bpf_raw_tracepoint_writable opened", + "failed: %d errno %d\n", tp_fd, errno)) + goto out_filterfd; + + char test_skb[128] = { + 0, + }; + + __u32 prog_ret; + int err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0, + 0, &prog_ret, 0); + CHECK(err != 42, "test_run", + "tracepoint did not modify return value\n"); + CHECK(prog_ret != 0, 
"test_run_ret", + "socket_filter did not return 0\n"); + + close(tp_fd); + + err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0, 0, + &prog_ret, 0); + CHECK(err != 0, "test_run_notrace", + "test_run failed with %d errno %d\n", err, errno); + CHECK(prog_ret != 0, "test_run_ret_notrace", + "socket_filter did not return 0\n"); + +out_filterfd: + close(filter_fd); +out_bpffd: + close(bpf_fd); +} diff --git a/tools/testing/selftests/bpf/verifier/raw_tp_writable.c b/tools/testing/selftests/bpf/verifier/raw_tp_writable.c new file mode 100644 index 000000000000..95b5d70a1dc1 --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/raw_tp_writable.c @@ -0,0 +1,34 @@ +{ + "raw_tracepoint_writable: reject variable offset", + .insns = { + /* r6 is our tp buffer */ + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), + + BPF_LD_MAP_FD(BPF_REG_1, 0), + /* move the key (== 0) to r10-8 */ + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), + /* lookup in the map */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + + /* exit clean if null */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + + /* shift the buffer pointer to a variable location */ + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_0), + /* clobber whatever's there */ + BPF_MOV64_IMM(BPF_REG_7, 4242), + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_7, 0), + + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 1, }, + .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + .errstr = "R6 invalid variable buffer offset: off=0, var_off=(0x0; 0xffffffff)", +}, From 6774c796777bb8cc5b788c86681aec4d3e716ad8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 May 2019 09:38:55 -0700 Subject: [PATCH 1609/1640] UPSTREAM: flow_dissector: disable preemption around BPF calls Various things in eBPF really require us to disable preemption before running an eBPF program. 
syzbot reported : BUG: assuming atomic context at net/core/flow_dissector.c:737 in_atomic(): 0, irqs_disabled(): 0, pid: 24710, name: syz-executor.3 2 locks held by syz-executor.3/24710: #0: 00000000e81a4bf1 (&tfile->napi_mutex){+.+.}, at: tun_get_user+0x168e/0x3ff0 drivers/net/tun.c:1850 #1: 00000000254afebd (rcu_read_lock){....}, at: __skb_flow_dissect+0x1e1/0x4bb0 net/core/flow_dissector.c:822 CPU: 1 PID: 24710 Comm: syz-executor.3 Not tainted 5.1.0+ #6 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 __cant_sleep kernel/sched/core.c:6165 [inline] __cant_sleep.cold+0xa3/0xbb kernel/sched/core.c:6142 bpf_flow_dissect+0xfe/0x390 net/core/flow_dissector.c:737 __skb_flow_dissect+0x362/0x4bb0 net/core/flow_dissector.c:853 skb_flow_dissect_flow_keys_basic include/linux/skbuff.h:1322 [inline] skb_probe_transport_header include/linux/skbuff.h:2500 [inline] skb_probe_transport_header include/linux/skbuff.h:2493 [inline] tun_get_user+0x2cfe/0x3ff0 drivers/net/tun.c:1940 tun_chr_write_iter+0xbd/0x156 drivers/net/tun.c:2037 call_write_iter include/linux/fs.h:1872 [inline] do_iter_readv_writev+0x5fd/0x900 fs/read_write.c:693 do_iter_write fs/read_write.c:970 [inline] do_iter_write+0x184/0x610 fs/read_write.c:951 vfs_writev+0x1b3/0x2f0 fs/read_write.c:1015 do_writev+0x15b/0x330 fs/read_write.c:1058 __do_sys_writev fs/read_write.c:1131 [inline] __se_sys_writev fs/read_write.c:1128 [inline] __x64_sys_writev+0x75/0xb0 fs/read_write.c:1128 do_syscall_64+0x103/0x670 arch/x86/entry/common.c:298 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Change-Id: I326ce1e572ae9c2dc060c172dbd0227bb8ee9eef Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Petar Penkov Cc: Stanislav Fomichev Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 653f51ee0c29..3903648c37a0 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -570,7 +570,9 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, flow_keys->nhoff = nhoff; flow_keys->thoff = flow_keys->nhoff; + preempt_disable(); result = BPF_PROG_RUN(prog, ctx); + preempt_enable(); flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, From ae679460c9e3ecc2f2d062087c5e229ee6ea6c88 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 4 May 2018 11:32:59 +0200 Subject: [PATCH 1610/1640] BACKPORT: net: core: rework basic flow dissection helper When the core networking needs to detect the transport offset in a given packet and parse it explicitly, a full-blown flow_keys struct is used for storage. This patch introduces a smaller keys store, rework the basic flow dissect helper to use it, and apply this new helper where possible - namely in skb_probe_transport_header(). The used flow dissector data structures are renamed to match more closely the new role. The above gives ~50% performance improvement in micro benchmarking around skb_probe_transport_header() and ~30% around eth_get_headlen(), mostly due to the smaller memset. Small, but measurable improvement is measured also in macro benchmarking. 
v1 -> v2: use the new helper in eth_get_headlen() and skb_get_poff(), as per DaveM suggestion [Linux4: Apply to include/linux/virtio_net.h] Suggested-by: David Miller Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skbuff.h | 18 ++++++++++-------- include/linux/virtio_net.h | 6 ++++-- include/net/flow_dissector.h | 7 ++++++- net/core/flow_dissector.c | 17 +++++++++-------- net/ethernet/eth.c | 6 +++--- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 79df99d19ed9..e5ac7c84462a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1187,7 +1187,7 @@ void __skb_get_hash(struct sk_buff *skb); u32 __skb_get_hash_symmetric(const struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, - const struct flow_keys *keys, int hlen); + const struct flow_keys_basic *keys, int hlen); __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, void *data, int hlen_proto); @@ -1259,13 +1259,14 @@ static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, NULL, 0, 0, 0, flags); } -static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow, - void *data, __be16 proto, - int nhoff, int hlen, - unsigned int flags) +static inline bool +skb_flow_dissect_flow_keys_basic(const struct sk_buff *skb, + struct flow_keys_basic *flow, void *data, + __be16 proto, int nhoff, int hlen, + unsigned int flags) { memset(flow, 0, sizeof(*flow)); - return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow, + return __skb_flow_dissect(skb, &flow_keys_basic_dissector, flow, data, proto, nhoff, hlen, flags); } @@ -2480,11 +2481,12 @@ static inline void skb_pop_mac_header(struct sk_buff *skb) static inline void skb_probe_transport_header(struct sk_buff *skb, const int offset_hint) { - struct flow_keys keys; + struct flow_keys_basic keys; if (skb_transport_header_was_set(skb)) return; - else if (skb_flow_dissect_flow_keys(skb, &keys, 0)) + + if (skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); else if (offset_hint >= 0) skb_set_transport_header(skb, offset_hint); diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 7517dd15f87b..d49c1aad2464 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -101,7 +101,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, * probe and drop if does not match one of the above types. 
*/ if (gso_type && skb->network_header) { - struct flow_keys keys; + struct flow_keys_basic keys; if (!skb->protocol) { __be16 protocol = dev_parse_header_protocol(skb); @@ -114,7 +114,9 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, skb->protocol = protocol; } retry: - if (!skb_flow_dissect_flow_keys(skb, &keys, 0)) { + if (!skb_flow_dissect_flow_keys_basic(skb, &keys, + NULL, 0, 0, 0, + 0)) { /* UFO does not specify ipv4 or 6: try both */ if (gso_type & SKB_GSO_UDP && skb->protocol == htons(ETH_P_IP)) { diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index bbedb60c281c..a28756ce35c3 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -228,6 +228,11 @@ struct flow_dissector { unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; +struct flow_keys_basic { + struct flow_dissector_key_control control; + struct flow_dissector_key_basic basic; +}; + struct flow_keys { struct flow_dissector_key_control control; #define FLOW_KEYS_HASH_START_FIELD basic @@ -246,7 +251,7 @@ __be32 flow_get_u32_src(const struct flow_keys *flow); __be32 flow_get_u32_dst(const struct flow_keys *flow); extern struct flow_dissector flow_keys_dissector; -extern struct flow_dissector flow_keys_buf_dissector; +extern struct flow_dissector flow_keys_basic_dissector; /* struct flow_keys_digest: * diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3903648c37a0..3450a4ab125d 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1293,7 +1293,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, - const struct flow_keys *keys, int hlen) + const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; @@ -1354,9 +1354,9 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data, */ u32 skb_get_poff(const struct sk_buff *skb) { - struct flow_keys keys; + struct flow_keys_basic keys; - if (!skb_flow_dissect_flow_keys(skb, &keys, 0)) + if (!skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); @@ -1459,7 +1459,7 @@ static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = { }, }; -static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { +static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), @@ -1473,7 +1473,8 @@ static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { struct flow_dissector flow_keys_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_dissector); -struct flow_dissector flow_keys_buf_dissector __read_mostly; +struct flow_dissector flow_keys_basic_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_basic_dissector); static int __init init_default_flow_dissectors(void) { @@ -1483,9 +1484,9 @@ static int __init init_default_flow_dissectors(void) skb_flow_dissector_init(&flow_keys_dissector_symmetric, flow_keys_dissector_symmetric_keys, ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); - skb_flow_dissector_init(&flow_keys_buf_dissector, - flow_keys_buf_dissector_keys, - ARRAY_SIZE(flow_keys_buf_dissector_keys)); + skb_flow_dissector_init(&flow_keys_basic_dissector, + flow_keys_basic_dissector_keys, + ARRAY_SIZE(flow_keys_basic_dissector_keys)); return 0; } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9d9f6f576217..75ee20956eee 100644 --- 
a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -128,15 +128,15 @@ u32 eth_get_headlen(void *data, unsigned int len) { const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG; const struct ethhdr *eth = (const struct ethhdr *)data; - struct flow_keys keys; + struct flow_keys_basic keys; /* this should never happen, but better safe than sorry */ if (unlikely(len < sizeof(*eth))) return len; /* parse any remaining L2/L3 headers, check for L4 */ - if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, - sizeof(*eth), len, flags)) + if (!skb_flow_dissect_flow_keys_basic(NULL, &keys, data, eth->h_proto, + sizeof(*eth), len, flags)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ From 60a9a3f8b73ae9082a69089f5eaec364af995992 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 8 Aug 2018 19:40:31 +0800 Subject: [PATCH 1611/1640] UPSTREAM: net: skbuff.h: fix using plain integer as NULL warning Fixes the following sparse warning: ./include/linux/skbuff.h:2365:58: warning: Using plain integer as NULL pointer Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e5ac7c84462a..af763ae64974 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2486,7 +2486,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_transport_header_was_set(skb)) return; - if (skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0)) + if (skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); else if (offset_hint >= 0) skb_set_transport_header(skb, offset_hint); From b6313b54c747da3c17c5861fdc684e7a38fdfaeb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 May 2018 12:06:03 +0200 Subject: [PATCH 1612/1640] BACKPORT: flow_dissector: do not rely on implicit casts This change fixes a couple of type mismatch reported by the sparse tool, explicitly using the requested type for the offending arguments. Signed-off-by: Paolo Abeni Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3450a4ab125d..bfc2ceb9d357 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1356,7 +1356,7 @@ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys_basic keys; - if (!skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0)) + if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); From dcf7ed7336a2d1b3bd930e868aff7a1632b8e948 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 22 Apr 2019 08:55:46 -0700 Subject: [PATCH 1613/1640] UPSTREAM: net: plumb network namespace into __skb_flow_dissect This new argument will be used in the next patches for the eth_get_headlen use case. eth_get_headlen calls flow dissector with only data (without skb) so there is currently no way to pull attached BPF flow dissector program. With this new argument, we can amend the callers to explicitly pass network namespace so we can use attached BPF program. 
Change-Id: I07dbe25c77a13f8cc33b037e63c532979d290ad0 Signed-off-by: Stanislav Fomichev Reviewed-by: Saeed Mahameed Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 19 +++++++++++-------- include/linux/virtio_net.h | 2 +- net/core/flow_dissector.c | 27 +++++++++++++++++---------- net/ethernet/eth.c | 5 +++-- 4 files changed, 32 insertions(+), 21 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index af763ae64974..f9d0cb2740d5 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1236,7 +1236,8 @@ bool __skb_flow_bpf_dissect(struct bpf_prog *prog, const struct sk_buff *skb, struct flow_dissector *flow_dissector, struct bpf_flow_keys *flow_keys); -bool __skb_flow_dissect(const struct sk_buff *skb, +bool __skb_flow_dissect(const struct net *net, + const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, void *data, __be16 proto, int nhoff, int hlen, @@ -1246,8 +1247,8 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, unsigned int flags) { - return __skb_flow_dissect(skb, flow_dissector, target_container, - NULL, 0, 0, 0, flags); + return __skb_flow_dissect(NULL, skb, flow_dissector, + target_container, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, @@ -1255,18 +1256,19 @@ static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, unsigned int flags) { memset(flow, 0, sizeof(*flow)); - return __skb_flow_dissect(skb, &flow_keys_dissector, flow, - NULL, 0, 0, 0, flags); + return __skb_flow_dissect(NULL, skb, &flow_keys_dissector, + flow, NULL, 0, 0, 0, flags); } static inline bool -skb_flow_dissect_flow_keys_basic(const struct sk_buff *skb, +skb_flow_dissect_flow_keys_basic(const struct net *net, + const struct sk_buff *skb, struct flow_keys_basic *flow, void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { memset(flow, 0, sizeof(*flow)); - return __skb_flow_dissect(skb, &flow_keys_basic_dissector, flow, + return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow, data, proto, nhoff, hlen, flags); } @@ -2486,7 +2488,8 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_transport_header_was_set(skb)) return; - if (skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) + if (skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, + NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); else if (offset_hint >= 0) skb_set_transport_header(skb, offset_hint); diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index d49c1aad2464..6047058d6703 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -114,7 +114,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, skb->protocol = protocol; } retry: - if (!skb_flow_dissect_flow_keys_basic(skb, &keys, + if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) { /* UFO does not specify ipv4 or 6: try both */ diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index bfc2ceb9d357..415d8d253e55 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -583,6 +583,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, /** * __skb_flow_dissect - extract the flow_keys struct and return it + * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified * 
@flow_dissector: list of keys to dissect * @target_container: target structure to put dissected values into @@ -597,7 +598,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, * * Caller must take care of zeroing target container memory. */ -bool __skb_flow_dissect(const struct sk_buff *skb, +bool __skb_flow_dissect(const struct net *net, + const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, void *data, __be16 proto, int nhoff, int hlen, @@ -657,13 +659,17 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct bpf_prog *attached = NULL; rcu_read_lock(); + if (!net) { + if (skb->dev) + net = dev_net(skb->dev); + else if (skb->sk) + net = sock_net(skb->sk); + else + WARN_ON_ONCE(1); + } - if (skb->dev) - attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog); - else if (skb->sk) - attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog); - else - WARN_ON_ONCE(1); + if (net) + attached = rcu_dereference(net->flow_dissector_prog); if (attached) { ret = __skb_flow_bpf_dissect(attached, skb, @@ -1253,8 +1259,8 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb) __flow_hash_secret_init(); memset(&keys, 0, sizeof(keys)); - __skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, - NULL, 0, 0, 0, + __skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric, + &keys, NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); return __flow_hash_from_keys(&keys, &hashrnd); @@ -1356,7 +1362,8 @@ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys_basic keys; - if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) + if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, + NULL, 0, 0, 0, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 75ee20956eee..4eac2d7a1167 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -135,8 +135,9 @@ u32 eth_get_headlen(void *data, unsigned int len) return len; /* parse any remaining L2/L3 headers, check for L4 */ - if (!skb_flow_dissect_flow_keys_basic(NULL, &keys, data, eth->h_proto, - sizeof(*eth), len, flags)) + if (!skb_flow_dissect_flow_keys_basic(NULL, NULL, &keys, data, + eth->h_proto, sizeof(*eth), + len, flags)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ From e9360c36d99d7bf28b4e90f7886f7da2ad564599 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 22 Apr 2019 08:55:47 -0700 Subject: [PATCH 1614/1640] UPSTREAM: flow_dissector: handle no-skb use case When called without an skb, gather all required data from __skb_flow_dissect's arguments and use the recently introduced no-skb mode of the BPF flow dissector. Note: WARN_ON_ONCE(!net) will now trigger for eth_get_headlen users.
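For eth_get_headlen-style callers, a hedged sketch (a hypothetical helper, not part of this series) of how the namespace could be supplied explicitly to keep that warning silent:

#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/flow_dissector.h>

/* Hypothetical driver-side helper: dissect a raw RX buffer with no
 * skb while passing the device's namespace, so the no-skb path can
 * still look up an attached BPF flow dissector program.
 */
static bool dissect_rx_buf(struct net_device *dev, void *data,
			   unsigned int len, struct flow_keys_basic *keys)
{
	const struct ethhdr *eth = data;

	if (len < sizeof(*eth))
		return false;

	return skb_flow_dissect_flow_keys_basic(dev_net(dev), NULL, keys,
						data, eth->h_proto,
						sizeof(*eth), len, 0);
}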
Change-Id: Ice7d27b1adaeb9adb02ea52fda999450a6d765af Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 5 ---- net/core/flow_dissector.c | 52 +++++++++++++++++++-------------------- 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f9d0cb2740d5..72e8c88d42fb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1231,11 +1231,6 @@ struct bpf_flow_dissector; bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen); -struct bpf_flow_keys; -bool __skb_flow_bpf_dissect(struct bpf_prog *prog, - const struct sk_buff *skb, - struct flow_dissector *flow_dissector, - struct bpf_flow_keys *flow_keys); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 415d8d253e55..cf3e8d653042 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -542,22 +542,6 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } } -bool __skb_flow_bpf_dissect(struct bpf_prog *prog, - const struct sk_buff *skb, - struct flow_dissector *flow_dissector, - struct bpf_flow_keys *flow_keys) -{ - struct bpf_flow_dissector ctx = { - .flow_keys = flow_keys, - .skb = skb, - .data = skb->data, - .data_end = skb->data + skb_headlen(skb), - }; - - return bpf_flow_dissect(prog, &ctx, skb->protocol, - skb_network_offset(skb), skb_headlen(skb)); -} - bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen) { @@ -612,6 +596,7 @@ bool __skb_flow_dissect(const struct net *net, struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; + struct bpf_prog *attached = NULL; enum flow_dissect_ret fdret; bool skip_vlan = false; int num_hdrs = 0; @@ -655,26 +640,39 @@ bool __skb_flow_dissect(const struct net *net, target_container); if (skb) { - struct bpf_flow_keys flow_keys; - struct bpf_prog *attached = NULL; - - rcu_read_lock(); if (!net) { if (skb->dev) net = dev_net(skb->dev); else if (skb->sk) net = sock_net(skb->sk); - else - WARN_ON_ONCE(1); } + } - if (net) - attached = rcu_dereference(net->flow_dissector_prog); + WARN_ON_ONCE(!net); + if (net) { + rcu_read_lock(); + attached = rcu_dereference(net->flow_dissector_prog); if (attached) { - ret = __skb_flow_bpf_dissect(attached, skb, - flow_dissector, - &flow_keys); + struct bpf_flow_keys flow_keys; + struct bpf_flow_dissector ctx = { + .flow_keys = &flow_keys, + .data = data, + .data_end = data + hlen, + }; + __be16 n_proto = proto; + + if (skb) { + ctx.skb = skb; + /* we can't use 'proto' in the skb case + * because it might be set to skb->vlan_proto + * which has been pulled from the data + */ + n_proto = skb->protocol; + } + + ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff, + hlen); __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); From c1f2f55d4928d7de9a1eaac88b8ac180752184d9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 31 May 2019 14:05:06 -0700 Subject: [PATCH 1615/1640] UPSTREAM: flow_dissector: remove unused FLOW_DISSECTOR_F_STOP_AT_L3 flag This flag is not used by any caller, remove it. Change-Id: I280df0c749d35d6259fdcf3c7f9d3d142e6289a4 Signed-off-by: Stanislav Fomichev Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 5 ++--- net/core/flow_dissector.c | 10 ++-------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index a28756ce35c3..8fbf9608e933 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -213,9 +213,8 @@ enum flow_dissector_key_id { }; #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) -#define FLOW_DISSECTOR_F_STOP_AT_L3 BIT(1) -#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(2) -#define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(3) +#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(1) +#define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(2) struct flow_dissector_key { enum flow_dissector_key_id key_id; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index cf3e8d653042..baf4ecaf9b3d 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -575,6 +575,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) + * @flags: flags that control the dissection process, e.g. + * FLOW_DISSECTOR_F_STOP_AT_ENCAP. * * The function will try to retrieve individual keys into target specified * by flow_dissector from either the skbuff or a raw buffer specified by the @@ -742,11 +744,6 @@ proto_again: __skb_flow_dissect_ipv4(skb, flow_dissector, target_container, data, iph); - if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) { - fdret = FLOW_DISSECT_RET_OUT_GOOD; - break; - } - break; } case htons(ETH_P_IPV6): { @@ -797,9 +794,6 @@ proto_again: __skb_flow_dissect_ipv6(skb, flow_dissector, target_container, data, iph); - if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) - fdret = FLOW_DISSECT_RET_OUT_GOOD; - break; } case htons(ETH_P_8021AD): From 8b9d13f71a4bed5bfc5edba1b2375010d78b12b1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:25 -0700 Subject: [PATCH 1616/1640] UPSTREAM: bpf/flow_dissector: pass input flags to BPF flow dissector program C flow dissector supports input flags that tell it to customize parsing by either stopping early or trying to parse as deep as possible. Pass those flags to the BPF flow dissector so it can make the same decisions. 
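For illustration (a simplified sketch, not the reference bpf_flow.c; the program name is made up), a flow dissector program can consult the propagated flags like this:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Sketch of a BPF flow dissector honouring one of the exported
 * flags; BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP is defined by the uapi
 * hunk below. The actual header parsing is elided.
 */
SEC("flow_dissector")
int honour_flags(struct __sk_buff *skb)
{
	struct bpf_flow_keys *keys = skb->flow_keys;

	if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP)
		return BPF_OK;	/* accept whatever was dissected so far */

	/* ... full L3/L4 parsing would go here ... */
	return BPF_OK;
}

char _license[] SEC("license") = "GPL";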
In the next commits I'll add support for those flags to our reference bpf_flow.c v3: * Export copy of flow dissector flags instead of moving (Alexei Starovoitov) Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Change-Id: I46a68f8b2249915fff5d97a1394ea662d9a0ac46 Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- include/linux/skbuff.h | 2 +- include/uapi/linux/bpf.h | 5 +++++ net/bpf/test_run.c | 2 +- net/core/flow_dissector.c | 12 ++++++++++-- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 72e8c88d42fb..6d882b5f7709 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1229,7 +1229,7 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) struct bpf_flow_dissector; bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen); + __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b6e5fb25f9e3..9a1fdcb8dec6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3482,6 +3482,10 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) +#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) +#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) + struct bpf_flow_keys { __u16 nhoff; __u16 thoff; @@ -3503,6 +3507,7 @@ struct bpf_flow_keys { __u32 ipv6_dst[4]; /* in6_addr; network order */ }; }; + __u32 flags; }; struct bpf_func_info { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 3eda5f9d4e41..811eaedfff97 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -423,7 +423,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, - size); + size, 0); if (signal_pending(current)) { preempt_enable(); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index baf4ecaf9b3d..e954be6d5ec7 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -543,7 +543,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen) + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; @@ -554,6 +554,14 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, flow_keys->nhoff = nhoff; flow_keys->thoff = flow_keys->nhoff; + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG != + (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG); + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL != + (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP != + (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); + flow_keys->flags = flags; + preempt_disable(); result = BPF_PROG_RUN(prog, ctx); preempt_enable(); @@ -674,7 +682,7 @@ bool __skb_flow_dissect(const struct net *net, } ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff, - hlen); + hlen, flags); __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); From b404da6f75c0cbf9c689f77e7454799e8aa4f9f1 Mon Sep 17 00:00:00 2001 
From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:27 -0700 Subject: [PATCH 1617/1640] UPSTREAM: bpf/flow_dissector: support flags in BPF_PROG_TEST_RUN This will allow us to write tests for those flags. v2: * Swap kfree(data) and kfree(user_ctx) (Song Liu) Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Change-Id: Ic1b6920aa27ee21945df37585af2b0dfe37dca6e Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 811eaedfff97..68680f7e89f3 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -381,6 +381,22 @@ out: return ret; } +static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) +{ + /* make sure the fields we don't use are zeroed */ + if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags))) + return -EINVAL; + + /* flags is allowed */ + + if (!range_is_zero(ctx, offsetof(struct bpf_flow_keys, flags) + + FIELD_SIZEOF(struct bpf_flow_keys, flags), + sizeof(struct bpf_flow_keys))) + return -EINVAL; + + return 0; +} + int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) @@ -388,9 +404,11 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, u32 size = kattr->test.data_size_in; struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; + struct bpf_flow_keys *user_ctx; struct bpf_flow_keys flow_keys; u64 time_start, time_spent = 0; const struct ethhdr *eth; + unsigned int flags = 0; u32 retval, duration; void *data; int ret; @@ -399,9 +417,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; - if (kattr->test.ctx_in || kattr->test.ctx_out) - return -EINVAL; - if (size < ETH_HLEN) return -EINVAL; @@ -414,6 +429,18 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (!repeat) repeat = 1; + user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys)); + if (IS_ERR(user_ctx)) { + kfree(data); + return PTR_ERR(user_ctx); + } + if (user_ctx) { + ret = verify_user_bpf_flow_keys(user_ctx); + if (ret) + goto out; + flags = user_ctx->flags; + } + ctx.flow_keys = &flow_keys; ctx.data = data; ctx.data_end = (__u8 *)data + size; @@ -423,7 +450,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, - size, 0); + size, flags); if (signal_pending(current)) { preempt_enable(); @@ -454,8 +481,12 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, user_ctx, + sizeof(struct bpf_flow_keys)); out: + kfree(user_ctx); kfree(data); return ret; } From 58795313e5f92c68f8d6aea8251b9b04bc2e5aff Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Fri, 15 Jul 2022 19:55:59 +0800 Subject: [PATCH 1618/1640] UPSTREAM: bpf: Don't redirect packets with invalid pkt_len commit fd1894224407c484f652ad456e1ce423e89bb3eb upstream. Syzbot found an issue [1]: fq_codel_drop() tries to drop a flow without any skbs, that is, the flow->head is null. The root cause, as [2] explains, is that bpf_prog_test_run_skb() runs a bpf prog which redirects empty skbs.
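As a minimal sketch of the rule this change enforces (illustrative only; the real hunks follow, and the helper name here is made up):

#include <linux/errno.h>
#include <linux/skbuff.h>

/* A packet that reaches a qdisc must carry data, otherwise there is
 * nothing to dequeue later. The patch adds this kind of guard where
 * the BPF test framework builds the skb, plus a CONFIG_DEBUG_NET
 * assertion (skb_assert_len) on the transmit path.
 */
static int check_forwardable(const struct sk_buff *skb)
{
	return skb->len ? 0 : -EINVAL;	/* refuse to forward empty skbs */
}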
So we should determine whether the length of the packet modified by bpf prog or others like bpf_prog_test is valid before forwarding it directly. LINK: [1] https://syzkaller.appspot.com/bug?id=0b84da80c2917757915afa89f7738a9d16ec96c5 LINK: [2] https://www.spinics.net/lists/netdev/msg777503.html Reported-by: syzbot+7a12909485b94426aceb@syzkaller.appspotmail.com Change-Id: Ieba6ad52b2eb657fb02ef7d3e414932d7743f073 Signed-off-by: Zhengchao Shao Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220715115559.139691-1-shaozhengchao@huawei.com Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- include/linux/skbuff.h | 8 ++++++++ net/bpf/test_run.c | 3 +++ net/core/dev.c | 1 + 3 files changed, 12 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6d882b5f7709..6817e3ee5d66 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2133,6 +2133,14 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) #endif /* NET_SKBUFF_DATA_USES_OFFSET */ +static inline void skb_assert_len(struct sk_buff *skb) +{ +#ifdef CONFIG_DEBUG_NET + if (WARN_ONCE(!skb->len, "%s\n", __func__)) + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); +#endif /* CONFIG_DEBUG_NET */ +} + /* * Add data to an sk_buff */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 68680f7e89f3..1d27915654ab 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -204,6 +204,9 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + if (!skb->len) + return -EINVAL; + if (!__skb) return 0; diff --git a/net/core/dev.c b/net/core/dev.c index 0e58e5b05a9c..9967f87af68c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3481,6 +3481,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) int rc = -ENOMEM; skb_reset_mac_header(skb); + skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); From 79abc15ece5aea3b75348257cc60b6829c74c669 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 21 Nov 2022 10:03:39 -0800 Subject: [PATCH 1619/1640] UPSTREAM: bpf: Move skb->len == 0 checks into __bpf_redirect [ Upstream commit 114039b342014680911c35bd6b72624180fd669a ] To avoid potentially breaking existing users. Both mac/no-mac cases have to be amended; mac_header >= network_header is not enough (verified with a new test, see next patch). Fixes: fd1894224407 ("bpf: Don't redirect packets with invalid pkt_len") Change-Id: If63ce98321997875863f3fce255061e8a64bba5e Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20221121180340.1983627-1-sdf@google.com Signed-off-by: Martin KaFai Lau Signed-off-by: Sasha Levin --- net/bpf/test_run.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 1d27915654ab..68680f7e89f3 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -204,9 +204,6 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; - if (!skb->len) - return -EINVAL; - if (!__skb) return 0; From 950dda39380410afc88ece61f87dd0db2fc5d75c Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Jul 2019 15:52:30 -0700 Subject: [PATCH 1620/1640] BACKPORT: bpf/flow_dissector: support ipv6 flow_label and BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL Add support for exporting ipv6 flow label via bpf_flow_keys. 
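On the program side this looks roughly as follows (a simplified sketch with a hypothetical helper name, not the actual bpf_flow.c change):

#include <stdbool.h>
#include <linux/bpf.h>
#include <linux/ipv6.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>

/* The flow label is the low 20 bits of the first 32-bit word of the
 * IPv6 header; it stays in network order, matching the new __be32
 * bpf_flow_keys::flow_label field. Returns true when dissection
 * should stop right after exporting the label.
 */
static __always_inline bool export_ipv6_flow_label(struct bpf_flow_keys *keys,
						   const struct ipv6hdr *ip6h)
{
	keys->flow_label = *(const __be32 *)ip6h & bpf_htonl(0x000FFFFF);
	return keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL;
}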
Export flow label from bpf_flow.c and also return early when BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL is passed. Acked-by: Petar Penkov Acked-by: Willem de Bruijn Acked-by: Song Liu Cc: Song Liu Cc: Willem de Bruijn Cc: Petar Penkov Change-Id: I6b4c3771022f19c184867fb6045351d59cfde68b Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + net/core/flow_dissector.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9a1fdcb8dec6..d4fd6970a7ca 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3508,6 +3508,7 @@ struct bpf_flow_keys { }; }; __u32 flags; + __be32 flow_label; }; struct bpf_func_info { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index e954be6d5ec7..9475872940a3 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -496,6 +496,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_tags *key_tags; key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); @@ -540,6 +541,14 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_keys->flow_label); + } } bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, From e97ba0e7fafd963acec97ca8e97f89f288c6040c Mon Sep 17 00:00:00 2001 From: Olivier Brunel Date: Sat, 20 Oct 2018 19:39:56 +0200 Subject: [PATCH 1621/1640] UPSTREAM: umh: Add command line to user mode helpers User mode helpers were spawned without a command line, and because an empty command line is used by many tools to identify processes as kernel threads, this could cause some issues. Notably, during the killing spree on shutdown such a helper would be skipped (i.e. not killed), which would result in the process remaining alive and thus prevent unmounting of the rootfs (as experienced with the bpfilter umh). Fixes: 449325b52b7a ("umh: introduce fork_usermode_blob() helper") Signed-off-by: Olivier Brunel Signed-off-by: David S. Miller --- include/linux/umh.h | 1 + kernel/umh.c | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/umh.h b/include/linux/umh.h index 5c812acbb80a..235f51b62c71 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -44,6 +44,7 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); struct umh_info { + const char *cmdline; struct file *pipe_to_umh; struct file *pipe_from_umh; pid_t pid; }; diff --git a/kernel/umh.c b/kernel/umh.c index f520f1800ebc..48f5c4aa1437 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -415,11 +415,19 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file, void (*cleanup)(struct subprocess_info *info), void *data) { struct subprocess_info *sub_info; + struct umh_info *info = data; + const char *cmdline = (info->cmdline) ?
info->cmdline : "usermodehelper"; sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL); if (!sub_info) return NULL; + sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL); + if (!sub_info->argv) { + kfree(sub_info); + return NULL; + } + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); sub_info->path = "none"; sub_info->file = file; @@ -468,10 +476,11 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) return 0; } -static void umh_save_pid(struct subprocess_info *info) +static void umh_clean_and_save_pid(struct subprocess_info *info) { struct umh_info *umh_info = info->data; + argv_free(info->argv); umh_info->pid = info->pid; } @@ -481,6 +490,9 @@ static void umh_save_pid(struct subprocess_info *info) * @len: length of the blob * @info: information about usermode process (shouldn't be NULL) * + * If info->cmdline is set it will be used as command line for the + * user process, else "usermodehelper" is used. + * * Returns either negative error or zero which indicates success * in executing a blob of bytes as a usermode process. In such * case 'struct umh_info *info' is populated with two pipes @@ -510,7 +522,7 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) err = -ENOMEM; sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, - umh_save_pid, info); + umh_clean_and_save_pid, info); if (!sub_info) goto out; From ee7cc78e745619dbc0d874484ecb3d574a7bf289 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Jun 2018 10:23:10 -0700 Subject: [PATCH 1622/1640] UPSTREAM: umh: fix race condition kasan reported use-after-free: BUG: KASAN: use-after-free in call_usermodehelper_exec_work+0x2d3/0x310 kernel/umh.c:195 Write of size 4 at addr ffff8801d9202370 by task kworker/u4:2/50 Workqueue: events_unbound call_usermodehelper_exec_work Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412 __asan_report_store4_noabort+0x17/0x20 mm/kasan/report.c:437 call_usermodehelper_exec_work+0x2d3/0x310 kernel/umh.c:195 process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145 worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279 kthread+0x345/0x410 kernel/kthread.c:240 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412 The reason is that 'sub_info' cannot be accessed out of parent task context, since it will be freed by the child. Instead remember the pid in the child task. Fixes: 449325b52b7a ("umh: introduce fork_usermode_blob() helper") Reported-by: syzbot+2c73319c406f1987d156@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/umh.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/umh.c b/kernel/umh.c index 48f5c4aa1437..7f6a8b599a29 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -108,6 +108,7 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); + sub_info->pid = task_pid_nr(current); if (sub_info->file) retval = do_execve_file(sub_info->file, sub_info->argv, sub_info->envp); @@ -200,8 +201,6 @@ static void call_usermodehelper_exec_work(struct work_struct *work) if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); - } else { - sub_info->pid = pid; } } } From 7e4c5b6faa6ff7d41c68f48cc1f7689fa875a03f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 9 Jan 2019 02:23:56 +0900 Subject: [PATCH 1623/1640] BACKPORT: umh: add exit routine for UMH process A UMH process which is created by the fork_usermode_blob() such as bpfilter needs to release members of the umh_info when process is terminated. But the do_exit() does not release members of the umh_info. hence module which uses UMH needs own code to detect whether UMH process is terminated or not. But this implementation needs extra code for checking the status of UMH process. it eventually makes the code more complex. The new PF_UMH flag is added and it is used to identify UMH processes. The exit_umh() does not release members of the umh_info. Hence umh_info->cleanup callback should release both members of the umh_info and the private data. Suggested-by: David S. Miller Change-Id: I860a2582ecffb61d3cec6ff86b53eb6fe85e27e3 Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/sched.h | 9 +++++++++ include/linux/umh.h | 2 ++ kernel/exit.c | 1 + kernel/umh.c | 33 +++++++++++++++++++++++++++++++-- 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index aee146b20b2b..e1fbdb9105ec 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1585,6 +1585,7 @@ extern struct pid *cad_pid; #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ +#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_WAKE_UP_IDLE 0x10000000 /* TTWU on an idle CPU */ @@ -2003,4 +2004,12 @@ static inline void set_wake_up_idle(bool enabled) current->flags &= ~PF_WAKE_UP_IDLE; } +void __exit_umh(struct task_struct *tsk); + +static inline void exit_umh(struct task_struct *tsk) +{ + if (unlikely(tsk->flags & PF_UMH)) + __exit_umh(tsk); +} + #endif diff --git a/include/linux/umh.h b/include/linux/umh.h index 235f51b62c71..0c08de356d0d 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -47,6 +47,8 @@ struct umh_info { const char *cmdline; struct file *pipe_to_umh; struct file *pipe_from_umh; + struct list_head list; + void (*cleanup)(struct umh_info *info); pid_t pid; }; int fork_usermode_blob(void *data, size_t len, struct umh_info *info); diff --git a/kernel/exit.c b/kernel/exit.c index 46a0871fb2ff..c082bdca0189 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -939,6 +939,7 @@ void __noreturn do_exit(long code) exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); + exit_umh(tsk); /* * Flush inherited counters to the parent - before the parent diff --git a/kernel/umh.c b/kernel/umh.c index 
7f6a8b599a29..0e0f2738d920 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -38,6 +38,8 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); +static LIST_HEAD(umh_list); +static DEFINE_MUTEX(umh_list_lock); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { @@ -109,10 +111,12 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); sub_info->pid = task_pid_nr(current); - if (sub_info->file) + if (sub_info->file) { retval = do_execve_file(sub_info->file, sub_info->argv, sub_info->envp); - else + if (!retval) + current->flags |= PF_UMH; + } else retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); @@ -526,6 +530,11 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) goto out; err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + if (!err) { + mutex_lock(&umh_list_lock); + list_add(&info->list, &umh_list); + mutex_unlock(&umh_list_lock); + } out: fput(file); return err; @@ -693,6 +702,26 @@ static int proc_cap_handler(struct ctl_table *table, int write, return 0; } +void __exit_umh(struct task_struct *tsk) +{ + struct umh_info *info; + pid_t pid = tsk->pid; + + mutex_lock(&umh_list_lock); + list_for_each_entry(info, &umh_list, list) { + if (info->pid == pid) { + list_del(&info->list); + mutex_unlock(&umh_list_lock); + goto out; + } + } + mutex_unlock(&umh_list_lock); + return; +out: + if (info->cleanup) + info->cleanup(info); +} + struct ctl_table usermodehelper_table[] = { { .procname = "bset", From d09f06c1dbb1a715811b3c8732ae8a66389ec32b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 29 May 2018 11:55:06 +0200 Subject: [PATCH 1624/1640] UPSTREAM: bpfilter: fix building without CONFIG_INET bpfilter_process_sockopt is a callback that gets called from ip_setsockopt() and ip_getsockopt(). However, when CONFIG_INET is disabled, it never gets called at all, and assigning a function to the callback pointer results in a link failure: net/bpfilter/bpfilter_kern.o: In function `__stop_umh': bpfilter_kern.c:(.text.unlikely+0x3): undefined reference to `bpfilter_process_sockopt' net/bpfilter/bpfilter_kern.o: In function `load_umh': bpfilter_kern.c:(.init.text+0x73): undefined reference to `bpfilter_process_sockopt' Since there is no caller in this configuration, I assume we can simply make the assignment conditional. Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- net/bpfilter/bpfilter_kern.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 7596314b61c7..b13d058f8c34 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -33,7 +33,8 @@ static void shutdown_umh(struct umh_info *info) static void __stop_umh(void) { - if (bpfilter_process_sockopt) { + if (IS_ENABLED(CONFIG_INET) && + bpfilter_process_sockopt) { bpfilter_process_sockopt = NULL; shutdown_umh(&info); } @@ -98,7 +99,9 @@ static int __init load_umh(void) stop_umh(); return -EFAULT; } - bpfilter_process_sockopt = &__bpfilter_process_sockopt; + if (IS_ENABLED(CONFIG_INET)) + bpfilter_process_sockopt = &__bpfilter_process_sockopt; + return 0; } From d1e993cded18f44952b69bc9befc015e8aea7cf0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Jun 2018 15:31:14 -0700 Subject: [PATCH 1625/1640] UPSTREAM: bpfilter: fix race in pipe access syzbot reported the following crash [ 338.293946] bpfilter: read fail -512 [ 338.304515] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 338.311863] general protection fault: 0000 [#1] SMP KASAN [ 338.344360] RIP: 0010:__vfs_write+0x4a6/0x960 [ 338.426363] Call Trace: [ 338.456967] __kernel_write+0x10c/0x380 [ 338.460928] __bpfilter_process_sockopt+0x1d8/0x35b [ 338.487103] bpfilter_mbox_request+0x4d/0xb0 [ 338.491492] bpfilter_ip_get_sockopt+0x6b/0x90 This can happen when multiple cpus trying to talk to user mode process via bpfilter_mbox_request(). One cpu grabs the mutex while another goes to sleep on the same mutex. Then former cpu sees that umh pipe is down and shuts down the pipes. Later cpu finally acquires the mutex and crashes on freed pipe. Fix the race by using info.pid as an indicator that umh and pipes are healthy and check it after acquiring the mutex. Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Reported-by: syzbot+7ade6c94abb2774c0fee@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- net/bpfilter/bpfilter_kern.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index b13d058f8c34..09522573f611 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -24,17 +24,19 @@ static void shutdown_umh(struct umh_info *info) { struct task_struct *tsk; + if (!info->pid) + return; tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID); if (tsk) force_sig(SIGKILL, tsk); fput(info->pipe_to_umh); fput(info->pipe_from_umh); + info->pid = 0; } static void __stop_umh(void) { - if (IS_ENABLED(CONFIG_INET) && - bpfilter_process_sockopt) { + if (IS_ENABLED(CONFIG_INET)) { bpfilter_process_sockopt = NULL; shutdown_umh(&info); } @@ -55,7 +57,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, struct mbox_reply reply; loff_t pos; ssize_t n; - int ret; + int ret = -EFAULT; req.is_set = is_set; req.pid = current->pid; @@ -63,6 +65,8 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.addr = (long)optval; req.len = optlen; mutex_lock(&bpfilter_lock); + if (!info.pid) + goto out; n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos); if (n != sizeof(req)) { pr_err("write fail %zd\n", n); From 08f85f271a191b069fef97a0c37c4d2e96c70bd9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 Jun 2018 20:13:48 -0700 Subject: [PATCH 1626/1640] BACKPORT: bpfilter: include bpfilter_umh in assembly instead of using objcopy What we want here is to embed a user-space program into the kernel. Instead of the complex ELF magic, let's simply wrap it in the assembly with the '.incbin' directive. Signed-off-by: Masahiro Yamada Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/bpfilter/Makefile | 15 ++------------- net/bpfilter/bpfilter_kern.c | 11 +++++------ net/bpfilter/bpfilter_umh_blob.S | 7 +++++++ 3 files changed, 14 insertions(+), 19 deletions(-) create mode 100644 net/bpfilter/bpfilter_umh_blob.S diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index 2af752c8ef5e..bb2146d33bf3 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -13,18 +13,7 @@ ifeq ($(CONFIG_BPFILTER_UMH), y) HOSTLDFLAGS += -static endif -# a bit of elf magic to convert bpfilter_umh binary into a binary blob -# inside bpfilter_umh.o elf file referenced by -# _binary_net_bpfilter_bpfilter_umh_start symbol -# which bpfilter_kern.c passes further into umh blob loader at run-time -quiet_cmd_copy_umh = GEN $@ - cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \ - $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \ - -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \ - --rename-section .data=.init.rodata $< $@ - -$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh - $(call cmd,copy_umh) +$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o -bpfilter-objs += bpfilter_kern.o bpfilter_umh.o +bpfilter-objs += bpfilter_kern.o bpfilter_umh_blob.o diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 09522573f611..f0fc182d3db7 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -10,11 +10,8 @@ #include #include "msgfmt.h" -#define UMH_start _binary_net_bpfilter_bpfilter_umh_start -#define UMH_end _binary_net_bpfilter_bpfilter_umh_end - -extern char UMH_start; -extern char UMH_end; +extern char bpfilter_umh_start; +extern char bpfilter_umh_end; static struct umh_info info; /* since ip_getsockopt() can run in parallel, serialize access to 
umh */ @@ -93,7 +90,9 @@ static int __init load_umh(void) int err; /* fork usermode process */ - err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info); + err = fork_usermode_blob(&bpfilter_umh_start, + &bpfilter_umh_end - &bpfilter_umh_start, + &info); if (err) return err; pr_info("Loaded bpfilter_umh pid %d\n", info.pid); diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S new file mode 100644 index 000000000000..40311d10d2f2 --- /dev/null +++ b/net/bpfilter/bpfilter_umh_blob.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + .section .init.rodata, "a" + .global bpfilter_umh_start +bpfilter_umh_start: + .incbin "net/bpfilter/bpfilter_umh" + .global bpfilter_umh_end +bpfilter_umh_end: From 70649d82a86e52b3d3ffd18c0a18bb21bf4a47ee Mon Sep 17 00:00:00 2001 From: Shanthosh RK Date: Fri, 5 Oct 2018 20:57:48 +0530 Subject: [PATCH 1627/1640] UPSTREAM: net: bpfilter: Fix type cast and pointer warnings Fixes the following Sparse warnings: net/bpfilter/bpfilter_kern.c:62:21: warning: cast removes address space of expression net/bpfilter/bpfilter_kern.c:101:49: warning: Using plain integer as NULL pointer Signed-off-by: Shanthosh RK Signed-off-by: David S. Miller --- net/bpfilter/bpfilter_kern.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index f0fc182d3db7..b64e1649993b 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -59,7 +59,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.is_set = is_set; req.pid = current->pid; req.cmd = optname; - req.addr = (long)optval; + req.addr = (long __force __user)optval; req.len = optlen; mutex_lock(&bpfilter_lock); if (!info.pid) @@ -98,7 +98,7 @@ static int __init load_umh(void) pr_info("Loaded bpfilter_umh pid %d\n", info.pid); /* health check that usermode process started correctly */ - if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) { + if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { stop_umh(); return -EFAULT; } From d52a83fd61877df818a0c6506d022acd482987d8 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 17 Oct 2018 00:35:10 +0900 Subject: [PATCH 1628/1640] UPSTREAM: net: bpfilter: use get_pid_task instead of pid_task pid_task() dereferences rcu protected tasks array. But there is no rcu_read_lock() in shutdown_umh() routine so that rcu_read_lock() is needed. get_pid_task() is wrapper function of pid_task. it holds rcu_read_lock() then calls pid_task(). if task isn't NULL, it increases reference count of task. test commands: %modprobe bpfilter %modprobe -rv bpfilter splat looks like: [15102.030932] ============================= [15102.030957] WARNING: suspicious RCU usage [15102.030985] 4.19.0-rc7+ #21 Not tainted [15102.031010] ----------------------------- [15102.031038] kernel/pid.c:330 suspicious rcu_dereference_check() usage! [15102.031063] other info that might help us debug this: [15102.031332] rcu_scheduler_active = 2, debug_locks = 1 [15102.031363] 1 lock held by modprobe/1570: [15102.031389] #0: 00000000580ef2b0 (bpfilter_lock){+.+.}, at: stop_umh+0x13/0x52 [bpfilter] [15102.031552] stack backtrace: [15102.031583] CPU: 1 PID: 1570 Comm: modprobe Not tainted 4.19.0-rc7+ #21 [15102.031607] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [15102.031628] Call Trace: [15102.031676] dump_stack+0xc9/0x16b [15102.031723] ? show_regs_print_info+0x5/0x5 [15102.031801] ? 
lockdep_rcu_suspicious+0x117/0x160 [15102.031855] pid_task+0x134/0x160 [15102.031900] ? find_vpid+0xf0/0xf0 [15102.032017] shutdown_umh.constprop.1+0x1e/0x53 [bpfilter] [15102.032055] stop_umh+0x46/0x52 [bpfilter] [15102.032092] __x64_sys_delete_module+0x47e/0x570 [ ... ] Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Taehee Yoo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/bpfilter/bpfilter_kern.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index b64e1649993b..94e88f510c5b 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -23,9 +23,11 @@ static void shutdown_umh(struct umh_info *info) if (!info->pid) return; - tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID); - if (tsk) + tsk = get_pid_task(find_vpid(info->pid), PIDTYPE_PID); + if (tsk) { force_sig(SIGKILL, tsk); + put_task_struct(tsk); + } fput(info->pipe_to_umh); fput(info->pipe_from_umh); info->pid = 0; From c7476bfbb9d7b2801ca7253714405cf3ed156d89 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 May 2019 12:23:03 -0500 Subject: [PATCH 1629/1640] UPSTREAM: signal/bpfilter: Fix bpfilter_kernl to use send_sig not force_sig [ Upstream commit 1dfd1711de2952fd1bfeea7152bd1687a4eea771 ] The locking in force_sig_info is not prepared to deal with a task that exits or execs (as sighand may change). As force_sig is only built to handle synchronous exceptions. Further the function force_sig_info changes the signal state if the signal is ignored, or blocked or if SIGNAL_UNKILLABLE will prevent the delivery of the signal. The signal SIGKILL can not be ignored and can not be blocked and SIGNAL_UNKILLABLE won't prevent it from being delivered. So using force_sig rather than send_sig for SIGKILL is pointless. Because it won't impact the sending of the signal and and because using force_sig is wrong, replace force_sig with send_sig. Cc: Alexei Starovoitov Cc: David S. Miller Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Signed-off-by: "Eric W. Biederman" Signed-off-by: Sasha Levin --- net/bpfilter/bpfilter_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 94e88f510c5b..450b257afa84 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -25,7 +25,7 @@ static void shutdown_umh(struct umh_info *info) return; tsk = get_pid_task(find_vpid(info->pid), PIDTYPE_PID); if (tsk) { - force_sig(SIGKILL, tsk); + send_sig(SIGKILL, tsk, 1); put_task_struct(tsk); } fput(info->pipe_to_umh); From aa3db4deffe5b5f0b5e5df45cd84b3a88a9e02a6 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 9 Jan 2019 02:24:34 +0900 Subject: [PATCH 1630/1640] UPSTREAM: net: bpfilter: use cleanup callback to release umh_info Now, UMH process is killed, do_exit() calls the umh_info->cleanup callback to release members of the umh_info. This patch makes bpfilter_umh's cleanup routine to use the umh_info->cleanup callback. Change-Id: I865aa89eb58d9bba883ed59e4b921fe401413853 Signed-off-by: Taehee Yoo Signed-off-by: David S. 
Miller --- include/linux/bpfilter.h | 11 ++++++++--- net/bpfilter/bpfilter_kern.c | 22 ++++++++++------------ net/ipv4/bpfilter/sockopt.c | 33 ++++++++++++++++++++++++++------- 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index f02cee0225d4..70ffeed280e9 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -3,13 +3,18 @@ #define _LINUX_BPFILTER_H #include +#include struct sock; int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen); int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen); -extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname, - char __user *optval, - unsigned int optlen, bool is_set); +struct bpfilter_umh_ops { + struct umh_info info; + int (*sockopt)(struct sock *sk, int optname, + char __user *optval, + unsigned int optlen, bool is_set); +}; +extern struct bpfilter_umh_ops bpfilter_ops; #endif diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 450b257afa84..908375b45d75 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -13,7 +13,6 @@ extern char bpfilter_umh_start; extern char bpfilter_umh_end; -static struct umh_info info; /* since ip_getsockopt() can run in parallel, serialize access to umh */ static DEFINE_MUTEX(bpfilter_lock); @@ -28,16 +27,13 @@ static void shutdown_umh(struct umh_info *info) send_sig(SIGKILL, tsk, 1); put_task_struct(tsk); } - fput(info->pipe_to_umh); - fput(info->pipe_from_umh); - info->pid = 0; } static void __stop_umh(void) { if (IS_ENABLED(CONFIG_INET)) { - bpfilter_process_sockopt = NULL; - shutdown_umh(&info); + bpfilter_ops.sockopt = NULL; + shutdown_umh(&bpfilter_ops.info); } } @@ -64,9 +60,10 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.addr = (long __force __user)optval; req.len = optlen; mutex_lock(&bpfilter_lock); - if (!info.pid) + if (!bpfilter_ops.info.pid) goto out; - n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos); + n = __kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req), + &pos); if (n != sizeof(req)) { pr_err("write fail %zd\n", n); __stop_umh(); @@ -74,7 +71,8 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, goto out; } pos = 0; - n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos); + n = kernel_read(bpfilter_ops.info.pipe_from_umh, &reply, sizeof(reply), + &pos); if (n != sizeof(reply)) { pr_err("read fail %zd\n", n); __stop_umh(); @@ -94,10 +92,10 @@ static int __init load_umh(void) /* fork usermode process */ err = fork_usermode_blob(&bpfilter_umh_start, &bpfilter_umh_end - &bpfilter_umh_start, - &info); + &bpfilter_ops.info); if (err) return err; - pr_info("Loaded bpfilter_umh pid %d\n", info.pid); + pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid); /* health check that usermode process started correctly */ if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { @@ -105,7 +103,7 @@ static int __init load_umh(void) return -EFAULT; } if (IS_ENABLED(CONFIG_INET)) - bpfilter_process_sockopt = &__bpfilter_process_sockopt; + bpfilter_ops.sockopt = &__bpfilter_process_sockopt; return 0; } diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 42a96d2d8d05..6d3e21af0fc4 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -1,27 +1,36 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include #include #include 
#include +#include +#include -int (*bpfilter_process_sockopt)(struct sock *sk, int optname, - char __user *optval, - unsigned int optlen, bool is_set); -EXPORT_SYMBOL_GPL(bpfilter_process_sockopt); +struct bpfilter_umh_ops bpfilter_ops; +EXPORT_SYMBOL_GPL(bpfilter_ops); + +static void bpfilter_umh_cleanup(struct umh_info *info) +{ + fput(info->pipe_to_umh); + fput(info->pipe_from_umh); + info->pid = 0; +} int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set) { - if (!bpfilter_process_sockopt) { + if (!bpfilter_ops.sockopt) { int err = request_module("bpfilter"); if (err) return err; - if (!bpfilter_process_sockopt) + if (!bpfilter_ops.sockopt) return -ECHILD; } - return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set); + return bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); } int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, @@ -40,3 +49,13 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, return bpfilter_mbox_request(sk, optname, optval, len, false); } + +static int __init bpfilter_sockopt_init(void) +{ + bpfilter_ops.info.cmdline = "bpfilter_umh"; + bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup; + + return 0; +} + +module_init(bpfilter_sockopt_init); From 0d59c5726d9c558e10c72ff95f263b0cf24bfbaf Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 9 Jan 2019 02:24:53 +0900 Subject: [PATCH 1631/1640] UPSTREAM: net: bpfilter: restart bpfilter_umh when error occurred The bpfilter_umh will be stopped via __stop_umh() when the bpfilter error occurred. The bpfilter_umh() couldn't start again because there is no restart routine. The section of the bpfilter_umh_{start/end} is no longer .init.rodata because these area should be reused in the restart routine. hence the section name is changed to .bpfilter_umh. The bpfilter_ops->start() is restart callback. it will be called when bpfilter_umh is stopped. The stop bit means bpfilter_umh is stopped. this bit is set by both start and stop routine. Before this patch, Test commands: $ iptables -vnL $ kill -9 $ iptables -vnL [ 480.045136] bpfilter: write fail -32 $ iptables -vnL All iptables commands will fail. After this patch, Test commands: $ iptables -vnL $ kill -9 $ iptables -vnL $ iptables -vnL Now, all iptables commands will work. Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Change-Id: I4c1fa7fd3af172967e450f3db8fab77434b341b6 Signed-off-by: Taehee Yoo Signed-off-by: David S. 
Miller --- include/linux/bpfilter.h | 2 ++ net/bpfilter/bpfilter_kern.c | 37 +++++++++++++++++++++++--------- net/bpfilter/bpfilter_umh_blob.S | 2 +- net/ipv4/bpfilter/sockopt.c | 11 +++++++++- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index 70ffeed280e9..8ebcbdd70bdc 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -15,6 +15,8 @@ struct bpfilter_umh_ops { int (*sockopt)(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set); + int (*start)(void); + bool stop; }; extern struct bpfilter_umh_ops bpfilter_ops; #endif diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 908375b45d75..eff4f58d2682 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -16,13 +16,14 @@ extern char bpfilter_umh_end; /* since ip_getsockopt() can run in parallel, serialize access to umh */ static DEFINE_MUTEX(bpfilter_lock); -static void shutdown_umh(struct umh_info *info) +static void shutdown_umh(void) { struct task_struct *tsk; - if (!info->pid) + if (bpfilter_ops.stop) return; - tsk = get_pid_task(find_vpid(info->pid), PIDTYPE_PID); + + tsk = get_pid_task(find_vpid(bpfilter_ops.info.pid), PIDTYPE_PID); if (tsk) { send_sig(SIGKILL, tsk, 1); put_task_struct(tsk); @@ -31,10 +32,8 @@ static void shutdown_umh(struct umh_info *info) static void __stop_umh(void) { - if (IS_ENABLED(CONFIG_INET)) { - bpfilter_ops.sockopt = NULL; - shutdown_umh(&bpfilter_ops.info); - } + if (IS_ENABLED(CONFIG_INET)) + shutdown_umh(); } static void stop_umh(void) @@ -85,7 +84,7 @@ out: return ret; } -static int __init load_umh(void) +static int start_umh(void) { int err; @@ -95,6 +94,7 @@ static int __init load_umh(void) &bpfilter_ops.info); if (err) return err; + bpfilter_ops.stop = false; pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid); /* health check that usermode process started correctly */ @@ -102,14 +102,31 @@ static int __init load_umh(void) stop_umh(); return -EFAULT; } - if (IS_ENABLED(CONFIG_INET)) - bpfilter_ops.sockopt = &__bpfilter_process_sockopt; return 0; } +static int __init load_umh(void) +{ + int err; + + if (!bpfilter_ops.stop) + return -EFAULT; + err = start_umh(); + if (!err && IS_ENABLED(CONFIG_INET)) { + bpfilter_ops.sockopt = &__bpfilter_process_sockopt; + bpfilter_ops.start = &start_umh; + } + + return err; +} + static void __exit fini_umh(void) { + if (IS_ENABLED(CONFIG_INET)) { + bpfilter_ops.start = NULL; + bpfilter_ops.sockopt = NULL; + } stop_umh(); } module_init(load_umh); diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S index 40311d10d2f2..7f1c521dcc2f 100644 --- a/net/bpfilter/bpfilter_umh_blob.S +++ b/net/bpfilter/bpfilter_umh_blob.S @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ - .section .init.rodata, "a" + .section .bpfilter_umh, "a" .global bpfilter_umh_start bpfilter_umh_start: .incbin "net/bpfilter/bpfilter_umh" diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 6d3e21af0fc4..00bfb94eb5be 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -14,6 +14,7 @@ EXPORT_SYMBOL_GPL(bpfilter_ops); static void bpfilter_umh_cleanup(struct umh_info *info) { + bpfilter_ops.stop = true; fput(info->pipe_to_umh); fput(info->pipe_from_umh); info->pid = 0; @@ -22,14 +23,21 @@ static void bpfilter_umh_cleanup(struct umh_info *info) int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool 
is_set)
 {
+	int err;
+
 	if (!bpfilter_ops.sockopt) {
-		int err = request_module("bpfilter");
+		err = request_module("bpfilter");

 		if (err)
 			return err;
 		if (!bpfilter_ops.sockopt)
 			return -ECHILD;
 	}
+	if (bpfilter_ops.stop) {
+		err = bpfilter_ops.start();
+		if (err)
+			return err;
+	}
 	return bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set);
 }
@@ -52,6 +60,7 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,

 static int __init bpfilter_sockopt_init(void)
 {
+	bpfilter_ops.stop = true;
 	bpfilter_ops.info.cmdline = "bpfilter_umh";
 	bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup;

From 15409b0bf0f6a302b32fd996e4a27c189d658693 Mon Sep 17 00:00:00 2001
From: Taehee Yoo
Date: Wed, 9 Jan 2019 02:25:10 +0900
Subject: [PATCH 1632/1640] UPSTREAM: net: bpfilter: disallow to remove bpfilter module while being used

The bpfilter.ko module can be removed while its functions are still
executing, so a kernel panic can occur. Locks can be used to protect
against this: bpfilter_lock protects the routines in
__bpfilter_process_sockopt(), but that is not enough because the __exit
routine can be executed concurrently.

Now the bpfilter_umh cannot run in parallel, so the module is not
removed while it is being used and the UMH process is not
double-created. The members of umh_info and bpfilter_umh_ops are
protected by bpfilter_umh_ops.lock.

Test commands:
   while :
   do
	   iptables -I FORWARD -m string --string ap --algo kmp &
	   modprobe -rv bpfilter &
   done

splat looks like:
[ 298.623435] BUG: unable to handle kernel paging request at fffffbfff807440b
[ 298.628512] #PF error: [normal kernel read fault]
[ 298.633018] PGD 124327067 P4D 124327067 PUD 11c1a3067 PMD 119eb2067 PTE 0
[ 298.638859] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI
[ 298.638859] CPU: 0 PID: 2997 Comm: iptables Not tainted 4.20.0+ #154
[ 298.638859] RIP: 0010:__mutex_lock+0x6b9/0x16a0
[ 298.638859] Code: c0 00 00 e8 89 82 ff ff 80 bd 8f fc ff ff 00 0f 85 d9 05 00 00 48 8b 85 80 fc ff ff 48 bf 00 00 00 00 00 fc ff df 48 c1 e8 03 <80> 3c 38 00 0f 85 1d 0e 00 00 48 8b 85 c8 fc ff ff 49 39 47 58 c6
[ 298.638859] RSP: 0018:ffff88810e7777a0 EFLAGS: 00010202
[ 298.638859] RAX: 1ffffffff807440b RBX: ffff888111bd4d80 RCX: 0000000000000000
[ 298.638859] RDX: 1ffff110235ff806 RSI: ffff888111bd5538 RDI: dffffc0000000000
[ 298.638859] RBP: ffff88810e777b30 R08: 0000000080000002 R09: 0000000000000000
[ 298.638859] R10: 0000000000000000 R11: 0000000000000000 R12: fffffbfff168a42c
[ 298.638859] R13: ffff888111bd4d80 R14: ffff8881040e9a05 R15: ffffffffc03a2000
[ 298.638859] FS: 00007f39e3758700(0000) GS:ffff88811ae00000(0000) knlGS:0000000000000000
[ 298.638859] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 298.638859] CR2: fffffbfff807440b CR3: 000000011243e000 CR4: 00000000001006f0
[ 298.638859] Call Trace:
[ 298.638859] ? mutex_lock_io_nested+0x1560/0x1560
[ 298.638859] ? kasan_kmalloc+0xa0/0xd0
[ 298.638859] ? kmem_cache_alloc+0x1c2/0x260
[ 298.638859] ? __alloc_file+0x92/0x3c0
[ 298.638859] ? alloc_empty_file+0x43/0x120
[ 298.638859] ? alloc_file_pseudo+0x220/0x330
[ 298.638859] ? sock_alloc_file+0x39/0x160
[ 298.638859] ? __sys_socket+0x113/0x1d0
[ 298.638859] ? __x64_sys_socket+0x6f/0xb0
[ 298.638859] ? do_syscall_64+0x138/0x560
[ 298.638859] ? entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 298.638859] ? __alloc_file+0x92/0x3c0
[ 298.638859] ? init_object+0x6b/0x80
[ 298.638859] ? cyc2ns_read_end+0x10/0x10
[ 298.638859] ? cyc2ns_read_end+0x10/0x10
[ 298.638859] ? hlock_class+0x140/0x140
[ 298.638859] ?
sched_clock_local+0xd4/0x140 [ 298.638859] ? sched_clock_local+0xd4/0x140 [ 298.638859] ? check_flags.part.37+0x440/0x440 [ 298.638859] ? __lock_acquire+0x4f90/0x4f90 [ 298.638859] ? set_rq_offline.part.89+0x140/0x140 [ ... ] Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Change-Id: I3186f2ec6cbf8263aed16d562021f741ab25bf86 Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/bpfilter.h | 2 ++ net/bpfilter/bpfilter_kern.c | 28 +++++++++++----------------- net/ipv4/bpfilter/sockopt.c | 22 ++++++++++++++++------ 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index 8ebcbdd70bdc..d815622cd31e 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -12,6 +12,8 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen); struct bpfilter_umh_ops { struct umh_info info; + /* since ip_getsockopt() can run in parallel, serialize access to umh */ + struct mutex lock; int (*sockopt)(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set); diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index eff4f58d2682..c0f0990f30b6 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -13,9 +13,6 @@ extern char bpfilter_umh_start; extern char bpfilter_umh_end; -/* since ip_getsockopt() can run in parallel, serialize access to umh */ -static DEFINE_MUTEX(bpfilter_lock); - static void shutdown_umh(void) { struct task_struct *tsk; @@ -36,13 +33,6 @@ static void __stop_umh(void) shutdown_umh(); } -static void stop_umh(void) -{ - mutex_lock(&bpfilter_lock); - __stop_umh(); - mutex_unlock(&bpfilter_lock); -} - static int __bpfilter_process_sockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set) @@ -58,7 +48,6 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.cmd = optname; req.addr = (long __force __user)optval; req.len = optlen; - mutex_lock(&bpfilter_lock); if (!bpfilter_ops.info.pid) goto out; n = __kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req), @@ -80,7 +69,6 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, } ret = reply.status; out: - mutex_unlock(&bpfilter_lock); return ret; } @@ -99,7 +87,7 @@ static int start_umh(void) /* health check that usermode process started correctly */ if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { - stop_umh(); + shutdown_umh(); return -EFAULT; } @@ -110,24 +98,30 @@ static int __init load_umh(void) { int err; - if (!bpfilter_ops.stop) - return -EFAULT; + mutex_lock(&bpfilter_ops.lock); + if (!bpfilter_ops.stop) { + err = -EFAULT; + goto out; + } err = start_umh(); if (!err && IS_ENABLED(CONFIG_INET)) { bpfilter_ops.sockopt = &__bpfilter_process_sockopt; bpfilter_ops.start = &start_umh; } - +out: + mutex_unlock(&bpfilter_ops.lock); return err; } static void __exit fini_umh(void) { + mutex_lock(&bpfilter_ops.lock); if (IS_ENABLED(CONFIG_INET)) { + shutdown_umh(); bpfilter_ops.start = NULL; bpfilter_ops.sockopt = NULL; } - stop_umh(); + mutex_unlock(&bpfilter_ops.lock); } module_init(load_umh); module_exit(fini_umh); diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 00bfb94eb5be..e8ba37028d07 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -14,31 +14,40 @@ EXPORT_SYMBOL_GPL(bpfilter_ops); static void bpfilter_umh_cleanup(struct umh_info *info) { + 
mutex_lock(&bpfilter_ops.lock);
 	bpfilter_ops.stop = true;
 	fput(info->pipe_to_umh);
 	fput(info->pipe_from_umh);
 	info->pid = 0;
+	mutex_unlock(&bpfilter_ops.lock);
 }

 int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval,
			  unsigned int optlen, bool is_set)
 {
 	int err;
-
+	mutex_lock(&bpfilter_ops.lock);
 	if (!bpfilter_ops.sockopt) {
+		mutex_unlock(&bpfilter_ops.lock);
 		err = request_module("bpfilter");
+		mutex_lock(&bpfilter_ops.lock);

 		if (err)
-			return err;
-		if (!bpfilter_ops.sockopt)
-			return -ECHILD;
+			goto out;
+		if (!bpfilter_ops.sockopt) {
+			err = -ECHILD;
+			goto out;
+		}
 	}
 	if (bpfilter_ops.stop) {
 		err = bpfilter_ops.start();
 		if (err)
-			return err;
+			goto out;
 	}
-	return bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set);
+	err = bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set);
+out:
+	mutex_unlock(&bpfilter_ops.lock);
+	return err;
 }

 int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
@@ -60,6 +69,7 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,

 static int __init bpfilter_sockopt_init(void)
 {
+	mutex_init(&bpfilter_ops.lock);
 	bpfilter_ops.stop = true;
 	bpfilter_ops.info.cmdline = "bpfilter_umh";
 	bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup;

From a3d6c09e0e162811dd5bd638b038ad0845e46853 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Mon, 28 Jan 2019 09:21:19 -0800
Subject: [PATCH 1633/1640] UPSTREAM: bpf: BPF_PROG_TYPE_CGROUP_{SKB, SOCK, SOCK_ADDR} require cgroups enabled

There is no way to exercise appropriate attach points without cgroups
enabled. This lets test_verifier correctly skip tests for these
prog_types if the kernel was compiled without BPF cgroup support.

Change-Id: Icadf66ef5cb1b78c43c710df0c5ac9c70859d7c3
Signed-off-by: Stanislav Fomichev
Signed-off-by: Daniel Borkmann
---
 include/linux/bpf_types.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 8ec5b18c401f..36a9c2325176 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -6,9 +6,11 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act)
 BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp)
+#ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr)
+#endif
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)

From 438584524d90d6ca78f9d03b941159129971d65a Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Thu, 17 Jan 2019 08:15:13 -0800
Subject: [PATCH 1634/1640] UPSTREAM: perf, bpf: Introduce PERF_RECORD_KSYMBOL

For better performance analysis of dynamically JITed and loaded kernel
functions, such as BPF programs, this patch introduces
PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol
register/unregister information to user space.

The following data structure is used for PERF_RECORD_KSYMBOL.
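For illustration only (hypothetical reader-side code, not part of this
patch), a user-space consumer of the perf ring buffer might mirror the
fixed-size part of this record as shown here; the authoritative
kernel-side layout follows:

  /* Hypothetical user-space mirror of a PERF_RECORD_KSYMBOL record;
   * illustrative sketch only, see the kernel definition below. */
  #include <linux/perf_event.h>
  #include <stdint.h>

  struct ksymbol_record {
	struct perf_event_header header; /* header.type == PERF_RECORD_KSYMBOL */
	uint64_t addr;      /* start address of the registered symbol */
	uint32_t len;       /* length of the symbol in bytes */
	uint16_t ksym_type; /* e.g. PERF_RECORD_KSYMBOL_TYPE_BPF */
	uint16_t flags;     /* PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER on removal */
	char name[];        /* NUL-terminated, zero-padded to a u64 boundary;
			     * struct sample_id data follows the name */
  };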
/* * struct { * struct perf_event_header header; * u64 addr; * u32 len; * u16 ksym_type; * u16 flags; * char name[]; * struct sample_id sample_id; * }; */ Change-Id: I3e6901ef579878015f6a75d15699230882f79e1f Signed-off-by: Song Liu Reviewed-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Acked-by: Peter Zijlstra Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/20190117161521.1341602-2-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 8 +++ include/uapi/linux/perf_event.h | 26 ++++++++- kernel/events/core.c | 98 ++++++++++++++++++++++++++++++++- 3 files changed, 130 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 58a57df528c6..b1a33997526b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1156,6 +1156,10 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, } extern void perf_event_mmap(struct vm_area_struct *vma); + +extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, + bool unregister, const char *sym); + extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); @@ -1399,6 +1403,10 @@ static inline int perf_unregister_guest_info_callbacks (struct perf_guest_info_callbacks *callbacks) { return 0; } static inline void perf_event_mmap(struct vm_area_struct *vma) { } + +typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); +static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, + bool unregister, const char *sym) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 11403fb6dd64..4e40a5718e5d 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -370,7 +370,8 @@ struct perf_event_attr { context_switch : 1, /* context switch data */ write_backward : 1, /* Write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ - __reserved_1 : 35; + ksymbol : 1, /* include ksymbol events */ + __reserved_1 : 34; union { __u32 wakeup_events; /* wakeup every n events */ @@ -944,9 +945,32 @@ enum perf_event_type { */ PERF_RECORD_NAMESPACES = 16, + /* + * Record ksymbol register/unregister events: + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u32 len; + * u16 ksym_type; + * u16 flags; + * char name[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_KSYMBOL = 17, + PERF_RECORD_MAX, /* non-ABI */ }; +enum perf_record_ksymbol_type { + PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, + PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ +}; + +#define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) + #define PERF_MAX_STACK_DEPTH 127 #define PERF_MAX_CONTEXTS_PER_STACK 8 diff --git a/kernel/events/core.c b/kernel/events/core.c index d7ea0f35f34d..ee5c9e9ff5f5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -419,6 +419,7 @@ static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t 
nr_switch_events __read_mostly; +static atomic_t nr_ksymbol_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4183,7 +4184,7 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || - attr->task || + attr->task || attr->ksymbol || attr->context_switch) return true; return false; @@ -4253,6 +4254,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (has_branch_stack(event)) dec = true; + if (event->attr.ksymbol) + atomic_dec(&nr_ksymbol_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -7728,6 +7731,97 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +/* + * ksymbol register/unregister tracking + */ + +struct perf_ksymbol_event { + const char *name; + int name_len; + struct { + struct perf_event_header header; + u64 addr; + u32 len; + u16 ksym_type; + u16 flags; + } event_id; +}; + +static int perf_event_ksymbol_match(struct perf_event *event) +{ + return event->attr.ksymbol; +} + +static void perf_event_ksymbol_output(struct perf_event *event, void *data) +{ + struct perf_ksymbol_event *ksymbol_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_ksymbol_match(event)) + return; + + perf_event_header__init_id(&ksymbol_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + ksymbol_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, ksymbol_event->event_id); + __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, + const char *sym) +{ + struct perf_ksymbol_event ksymbol_event; + char name[KSYM_NAME_LEN]; + u16 flags = 0; + int name_len; + + if (!atomic_read(&nr_ksymbol_events)) + return; + + if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || + ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) + goto err; + + strlcpy(name, sym, KSYM_NAME_LEN); + name_len = strlen(name) + 1; + while (!IS_ALIGNED(name_len, sizeof(u64))) + name[name_len++] = '\0'; + BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); + + if (unregister) + flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; + + ksymbol_event = (struct perf_ksymbol_event){ + .name = name, + .name_len = name_len, + .event_id = { + .header = { + .type = PERF_RECORD_KSYMBOL, + .size = sizeof(ksymbol_event.event_id) + + name_len, + }, + .addr = addr, + .len = len, + .ksym_type = ksym_type, + .flags = flags, + }, + }; + + perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); + return; +err: + WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -9945,6 +10039,8 @@ static void account_event(struct perf_event *event) inc = true; if (is_cgroup_event(event)) inc = true; + if (event->attr.ksymbol) + atomic_inc(&nr_ksymbol_events); if (inc) { if (atomic_inc_not_zero(&perf_sched_count)) From 2979455ad3f98dc591954823668e2b912b09112f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 17 Jan 2019 08:15:15 -0800 Subject: [PATCH 1635/1640] BACKPORT: perf, bpf: Introduce PERF_RECORD_BPF_EVENT For better performance analysis of BPF programs, this patch introduces PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program load/unload 
information to user space.

Each BPF program may contain up to BPF_MAX_SUBPROGS (256) sub programs.
The following example shows kernel symbols for a BPF program with 7 sub
programs:

   ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F
   ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F
   ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F
   ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F
   ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F
   ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F
   ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F
   ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi

When a bpf program is loaded, PERF_RECORD_KSYMBOL is generated for each
of these sub programs. Therefore, PERF_RECORD_BPF_EVENT is not needed
for simple profiling. For annotation, user space needs to listen to
PERF_RECORD_BPF_EVENT and gather more information about these (sub)
programs via sys_bpf.

Change-Id: I8ed02f808501c32f406108c282c853a56d0dcc25
Signed-off-by: Song Liu
Reviewed-by: Arnaldo Carvalho de Melo
Acked-by: Alexei Starovoitov
Acked-by: Peter Zijlstra (Intel)
Tested-by: Arnaldo Carvalho de Melo
Cc: Daniel Borkmann
Cc: Peter Zijlstra
Cc: kernel-team@fb.com
Cc: netdev@vger.kernel.org
Link: http://lkml.kernel.org/r/20190117161521.1341602-4-songliubraving@fb.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 include/linux/filter.h          |   7 ++
 include/linux/perf_event.h      |   6 ++
 include/uapi/linux/perf_event.h |  29 +++++++-
 kernel/bpf/core.c               |   2 +-
 kernel/bpf/syscall.c            |   2 +
 kernel/events/core.c            | 115 ++++++++++++++++++++++++++++++++
 6 files changed, 159 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 68787eadfe22..b9d0c3b45b96 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1127,6 +1127,7 @@ bpf_address_lookup(unsigned long addr, unsigned long *size,
 void bpf_prog_kallsyms_add(struct bpf_prog *fp);
 void bpf_prog_kallsyms_del(struct bpf_prog *fp);
+void bpf_get_prog_name(const struct bpf_prog *prog, char *sym);

 #else /* CONFIG_BPF_JIT */

@@ -1182,6 +1183,12 @@ static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
 {
 }
+
+static inline void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+{
+	sym[0] = '\0';
+}
+
 #endif /* CONFIG_BPF_JIT */

 void bpf_prog_kallsyms_del_all(struct bpf_prog *fp);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b1a33997526b..53c195133d2b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1159,6 +1159,9 @@ extern void perf_event_mmap(struct vm_area_struct *vma);
 extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
			       bool unregister, const char *sym);
+extern void perf_event_bpf_event(struct bpf_prog *prog,
+				 enum perf_bpf_event_type type,
+				 u16 flags);

 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
@@ -1407,6 +1410,9 @@ static inline void perf_event_mmap(struct vm_area_struct *vma) { }
 typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
 static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
				      bool unregister, const char *sym) { }
+static inline void perf_event_bpf_event(struct bpf_prog *prog,
+					enum perf_bpf_event_type type,
+					u16 flags) { }
 static inline void perf_event_exec(void) { }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec) { }
 static inline void perf_event_namespaces(struct task_struct *tsk) { }
diff --git
a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 4e40a5718e5d..6c87b0a42e64 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -371,7 +371,8 @@ struct perf_event_attr { write_backward : 1, /* Write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ ksymbol : 1, /* include ksymbol events */ - __reserved_1 : 34; + bpf_event : 1, /* include bpf events */ + __reserved_1 : 33; union { __u32 wakeup_events; /* wakeup every n events */ @@ -960,6 +961,25 @@ enum perf_event_type { */ PERF_RECORD_KSYMBOL = 17, + /* + * Record bpf events: + * enum perf_bpf_event_type { + * PERF_BPF_EVENT_UNKNOWN = 0, + * PERF_BPF_EVENT_PROG_LOAD = 1, + * PERF_BPF_EVENT_PROG_UNLOAD = 2, + * }; + * + * struct { + * struct perf_event_header header; + * u16 type; + * u16 flags; + * u32 id; + * u8 tag[BPF_TAG_SIZE]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_BPF_EVENT = 18, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -971,6 +991,13 @@ enum perf_record_ksymbol_type { #define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) +enum perf_bpf_event_type { + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ +}; + #define PERF_MAX_STACK_DEPTH 127 #define PERF_MAX_CONTEXTS_PER_STACK 8 diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7f1a82d332b6..c6eaacb36f6b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -554,7 +554,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, *symbol_end = addr + hdr->pages * PAGE_SIZE; } -static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5fcdc882c314..125064510388 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1356,6 +1356,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); __bpf_prog_put_noref(prog, true); @@ -1748,6 +1749,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) * be using bpf_prog_put() given the program is exposed. 
*/ bpf_prog_kallsyms_add(prog); + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); err = bpf_prog_new_fd(prog); if (err < 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index ee5c9e9ff5f5..d655eae6c912 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -420,6 +420,7 @@ static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; +static atomic_t nr_bpf_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4256,6 +4257,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (event->attr.ksymbol) atomic_dec(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_dec(&nr_bpf_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -7822,6 +7825,116 @@ err: WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); } +/* + * bpf program load/unload tracking + */ + +struct perf_bpf_event { + struct bpf_prog *prog; + struct { + struct perf_event_header header; + u16 type; + u16 flags; + u32 id; + u8 tag[BPF_TAG_SIZE]; + } event_id; +}; + +static int perf_event_bpf_match(struct perf_event *event) +{ + return event->attr.bpf_event; +} + +static void perf_event_bpf_output(struct perf_event *event, void *data) +{ + struct perf_bpf_event *bpf_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_bpf_match(event)) + return; + + perf_event_header__init_id(&bpf_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + bpf_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, bpf_event->event_id); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, + enum perf_bpf_event_type type) +{ + bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; + char sym[KSYM_NAME_LEN]; + int i; + + if (prog->aux->func_cnt == 0) { + bpf_get_prog_name(prog, sym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)prog->bpf_func, + prog->jited_len, unregister, sym); + } else { + for (i = 0; i < prog->aux->func_cnt; i++) { + struct bpf_prog *subprog = prog->aux->func[i]; + + bpf_get_prog_name(subprog, sym); + perf_event_ksymbol( + PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)subprog->bpf_func, + subprog->jited_len, unregister, sym); + } + } +} + +void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags) +{ + struct perf_bpf_event bpf_event; + + if (type <= PERF_BPF_EVENT_UNKNOWN || + type >= PERF_BPF_EVENT_MAX) + return; + + switch (type) { + case PERF_BPF_EVENT_PROG_LOAD: + case PERF_BPF_EVENT_PROG_UNLOAD: + if (atomic_read(&nr_ksymbol_events)) + perf_event_bpf_emit_ksymbols(prog, type); + break; + default: + break; + } + + if (!atomic_read(&nr_bpf_events)) + return; + + bpf_event = (struct perf_bpf_event){ + .prog = prog, + .event_id = { + .header = { + .type = PERF_RECORD_BPF_EVENT, + .size = sizeof(bpf_event.event_id), + }, + .type = type, + .flags = flags, + .id = prog->aux->id, + }, + }; + + BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); + + memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); + perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -10041,6 +10154,8 @@ static void 
account_event(struct perf_event *event)
 		inc = true;
 	if (event->attr.ksymbol)
 		atomic_inc(&nr_ksymbol_events);
+	if (event->attr.bpf_event)
+		atomic_inc(&nr_bpf_events);

 	if (inc) {
 		if (atomic_inc_not_zero(&perf_sched_count)

From 324bf264fcfce304ec8ddd475951eba8938abc9c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Tue, 9 Feb 2021 18:46:10 +0000
Subject: [PATCH 1636/1640] BACKPORT: bpf: Fix 32 bit src register truncation on div/mod

commit e88b2c6e5a4d9ce30d75391e4d950da74bb2bd90 upstream.

While reviewing a different fix, John and I noticed an oddity in one of
the BPF program dumps that stood out, for example:

  # bpftool p d x i 13
   0: (b7) r0 = 808464450
   1: (b4) w4 = 808464432
   2: (bc) w0 = w0
   3: (15) if r0 == 0x0 goto pc+1
   4: (9c) w4 %= w0
  [...]

In line 2 we noticed that the mov32 would 32 bit truncate the original
src register for the div/mod operation. While for the two operations
the dst register is typically marked unknown e.g. from
adjust_scalar_min_max_vals() the src register is not, and thus verifier
keeps tracking original bounds, simplified:

  0: R1=ctx(id=0,off=0,imm=0) R10=fp0
  0: (b7) r0 = -1
  1: R0_w=invP-1 R1=ctx(id=0,off=0,imm=0) R10=fp0
  1: (b7) r1 = -1
  2: R0_w=invP-1 R1_w=invP-1 R10=fp0
  2: (3c) w0 /= w1
  3: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=invP-1 R10=fp0
  3: (77) r1 >>= 32
  4: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=invP4294967295 R10=fp0
  4: (bf) r0 = r1
  5: R0_w=invP4294967295 R1_w=invP4294967295 R10=fp0
  5: (95) exit
  processed 6 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0

Runtime result of r0 at exit is 0 instead of expected -1. Remove the
verifier mov32 src rewrite in div/mod and replace it with a jmp32 test
instead. After the fix, we result in the following code generation when
having dividend r1 and divisor r6:

  div, 64 bit:                          div, 32 bit:

   0: (b7) r6 = 8                        0: (b7) r6 = 8
   1: (b7) r1 = 8                        1: (b7) r1 = 8
   2: (55) if r6 != 0x0 goto pc+2        2: (56) if w6 != 0x0 goto pc+2
   3: (ac) w1 ^= w1                      3: (ac) w1 ^= w1
   4: (05) goto pc+1                     4: (05) goto pc+1
   5: (3f) r1 /= r6                      5: (3c) w1 /= w6
   6: (b7) r0 = 0                        6: (b7) r0 = 0
   7: (95) exit                          7: (95) exit

  mod, 64 bit:                          mod, 32 bit:

   0: (b7) r6 = 8                        0: (b7) r6 = 8
   1: (b7) r1 = 8                        1: (b7) r1 = 8
   2: (15) if r6 == 0x0 goto pc+1        2: (16) if w6 == 0x0 goto pc+1
   3: (9f) r1 %= r6                      3: (9c) w1 %= w6
   4: (b7) r0 = 0                        4: (b7) r0 = 0
   5: (95) exit                          5: (95) exit

x86 in particular can throw a 'divide error' exception for div
instruction not only for divisor being zero, but also for the case when
the quotient is too large for the designated register. For the edx:eax
and rdx:rax dividend pair it is not an issue in x86 BPF JIT since we
always zero edx (rdx). Hence really the only protection needed is
against divisor being zero.

Also add some other code missed when backporting.
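For illustration (not from the patch itself; the helper names here are
made up for the example), the BPF-defined div/mod-by-zero semantics that
the new patchlets encode can be sketched in plain C:

  /* Illustrative sketch: BPF division by zero yields 0, while modulo
   * by zero leaves the (possibly truncated) dividend unchanged. The
   * chk_and_div/chk_and_mod patchlets in the diff below jump around
   * the real instruction to get exactly this behaviour. */
  #include <stdint.h>

  static uint32_t bpf_div32(uint32_t dst, uint32_t src)
  {
  	return src ? dst / src : 0;
  }

  static uint32_t bpf_mod32(uint32_t dst, uint32_t src)
  {
  	return src ? dst % src : dst;
  }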
Fixes: 68fda450a7df ("bpf: fix 32-bit divide by zero") Co-developed-by: John Fastabend Change-Id: I35a7f4f346bbcbc2f01003e607f2b00b7abe92ae Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/bpf_lru_list.c | 5 +--- kernel/bpf/bpf_lru_list.h | 5 +--- kernel/bpf/core.c | 26 ++++++++++----------- kernel/bpf/inode.c | 5 +--- kernel/bpf/map_in_map.c | 5 +--- kernel/bpf/map_in_map.h | 5 +--- kernel/bpf/percpu_freelist.c | 5 +--- kernel/bpf/percpu_freelist.h | 5 +--- kernel/bpf/stackmap.c | 5 +--- kernel/bpf/syscall.c | 1 - kernel/bpf/tnum.c | 1 + kernel/bpf/verifier.c | 44 ++++++++++++++++++------------------ 12 files changed, 43 insertions(+), 69 deletions(-) diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 39a0e768adc3..3dabdd137d10 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include #include diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 08da78b59f0b..41f8fea530c8 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c6eaacb36f6b..c244122355e1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * @@ -12,11 +13,6 @@ * Alexei Starovoitov * Daniel Borkmann * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ @@ -800,14 +796,6 @@ static void bpf_jit_uncharge_modmem(u32 pages) atomic_long_sub(pages, &bpf_jit_current); } -#if IS_ENABLED(CONFIG_BPF_JIT) && IS_ENABLED(CONFIG_CFI_CLANG) -bool __weak arch_bpf_jit_check_func(const struct bpf_prog *prog) -{ - return true; -} -EXPORT_SYMBOL_GPL(arch_bpf_jit_check_func); -#endif - void *__weak bpf_jit_alloc_exec(unsigned long size) { return module_alloc(size); @@ -818,6 +806,14 @@ void __weak bpf_jit_free_exec(void *addr) module_memfree(addr); } +#if IS_ENABLED(CONFIG_BPF_JIT) && IS_ENABLED(CONFIG_CFI_CLANG) +bool __weak arch_bpf_jit_check_func(const struct bpf_prog *prog) +{ + return true; +} +EXPORT_SYMBOL_GPL(arch_bpf_jit_check_func); +#endif + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -940,6 +936,9 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, * below. * * Constant blinding is only used by JITs, not in the interpreter. + * The interpreter uses AX in some occasions as a local temporary + * register e.g. in DIV or MOD instructions. 
+ * * In restricted circumstances, the verifier can also use the AX * register for rewrites as long as they do not interfere with * the above cases! @@ -1342,7 +1341,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; - u64 tmp; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index d2d305a29b30..b9e5fdfa2be4 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Minimal file system backend for holding eBPF maps and programs, * used by bpf(2) object pinning. @@ -5,10 +6,6 @@ * Authors: * * Daniel Borkmann - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. */ #include diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3dff41403583..fab4fb134547 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include #include diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 6183db9ec08c..a507bf6ef8b9 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef __MAP_IN_MAP_H__ #define __MAP_IN_MAP_H__ diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 0c1b4ba9e90e..6e090140b924 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include "percpu_freelist.h" diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index c3960118e617..fbf8a8a28979 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef __PERCPU_FREELIST_H__ #define __PERCPU_FREELIST_H__ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6f842f740752..bd8516d96745 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
*/ #include #include diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 125064510388..f5a7b31e6f07 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -821,7 +821,6 @@ static int map_lookup_elem(union bpf_attr *attr) ptr = map->ops->map_lookup_elem_sys_only(map, key); else ptr = map->ops->map_lookup_elem(map, key); - ptr = map->ops->map_lookup_elem(map, key); if (IS_ERR(ptr)) { err = PTR_ERR(ptr); } else if (!ptr) { diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 84984c0fc3d3..d4f335a9a899 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* tnum: tracked (or tristate) numbers * * A tnum tracks knowledge about the bits of a value. Each bit can be either diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8afbb1626773..013b9062c47c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1973,6 +1973,7 @@ static int check_stack_write(struct bpf_verifier_env *env, } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { + verbose_linfo(env, insn_idx, "; "); verbose(env, "invalid size of register spill\n"); return -EACCES; } @@ -2040,6 +2041,7 @@ static int check_stack_read(struct bpf_verifier_env *env, if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { if (reg->type != SCALAR_VALUE) { + verbose_linfo(env, env->insn_idx, "; "); verbose(env, "invalid size of register fill\n"); return -EACCES; } @@ -3022,7 +3024,7 @@ static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, int off, int access_size, bool zero_size_allowed) { - struct bpf_reg_state *reg = cur_regs(env) + regno; + struct bpf_reg_state *reg = reg_state(env, regno); if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size < 0 || (access_size == 0 && !zero_size_allowed)) { @@ -6150,7 +6152,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_K) pred = is_branch_taken(dst_reg, insn->imm, - opcode, is_jmp32); + opcode, is_jmp32); else if (src_reg->type == SCALAR_VALUE && tnum_is_const(src_reg->var_off)) pred = is_branch_taken(dst_reg, src_reg->var_off.value, @@ -6477,7 +6479,7 @@ static int check_return_code(struct bpf_verifier_env *env) verbose(env, "has unknown scalar value"); } tnum_strn(tn_buf, sizeof(tn_buf), range); - verbose(env, " should have been %s\n", tn_buf); + verbose(env, " should have been in %s\n", tn_buf); return -EINVAL; } @@ -9218,31 +9220,30 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - struct bpf_insn mask_and_div[] = { - BPF_MOV_REG(BPF_CLASS(insn->code), BPF_REG_AX, insn->src_reg), + bool isdiv = BPF_OP(insn->code) == BPF_DIV; + struct bpf_insn *patchlet; + struct bpf_insn chk_and_div[] = { /* [R,W]x div 0 -> 0 */ - BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, 2), - BPF_RAW_REG(*insn, insn->dst_reg, BPF_REG_AX), + BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JNE | BPF_K, insn->src_reg, + 0, 2, 0), + BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_ALU_REG(BPF_CLASS(insn->code), BPF_XOR, insn->dst_reg, insn->dst_reg), + *insn, }; - struct bpf_insn mask_and_mod[] = { - BPF_MOV_REG(BPF_CLASS(insn->code), BPF_REG_AX, insn->src_reg), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, 1 + (is64 ? 
0 : 1)), - BPF_RAW_REG(*insn, insn->dst_reg, BPF_REG_AX), + struct bpf_insn chk_and_mod[] = { + /* [R,W]x mod 0 -> [R,W]x */ + BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, insn->src_reg, + 0, 1 + (is64 ? 0 : 1), 0), + *insn, BPF_JMP_IMM(BPF_JA, 0, 0, 1), BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), }; - struct bpf_insn *patchlet; - if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || - insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - patchlet = mask_and_div; - cnt = ARRAY_SIZE(mask_and_div); - } else { - patchlet = mask_and_mod; - cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 2 : 0); - } + patchlet = isdiv ? chk_and_div : chk_and_mod; + cnt = isdiv ? ARRAY_SIZE(chk_and_div) : + ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0); new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) @@ -9596,7 +9597,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; From 770ca94f54d2cfe2cbd9b10ae441578f82df84d8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 21 Feb 2019 10:40:14 -0800 Subject: [PATCH 1637/1640] UPSTREAM: seccomp, bpf: disable preemption before calling into bpf prog All BPF programs must be called with preemption disabled. Fixes: 568f196756ad ("bpf: check that BPF programs run with preemption disabled") Reported-by: syzbot+8bf19ee2aa580de7a2a7@syzkaller.appspotmail.com Change-Id: Ia5fa93009d0e31261eab2890b435730f4e310c6a Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/seccomp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index da34e800a398..642078429313 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -210,6 +210,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). 
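 * (Illustrative aside, not part of this hunk: BPF_PROG_RUN() must be
 * invoked with preemption disabled, so the change below simply brackets
 * this filter walk with a preempt_disable()/preempt_enable() pair.)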
*/ + preempt_disable(); for (; f; f = f->prev) { u32 cur_ret = BPF_PROG_RUN(f->prog, sd); @@ -218,6 +219,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, *match = f; } } + preempt_enable(); return ret; } #endif /* CONFIG_SECCOMP_FILTER */ From 94871016b7ce76a461bde3175b96e92e0b0ed771 Mon Sep 17 00:00:00 2001 From: Samuel Pascua Date: Sat, 30 Aug 2025 12:55:52 +0800 Subject: [PATCH 1638/1640] mm: sec_mm: use wrappers for pagetable accounting Signed-off-by: Samuel Pascua --- mm/sec_mm/show_mem.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/sec_mm/show_mem.c b/mm/sec_mm/show_mem.c index 3d48c2b7a876..a016a150978a 100644 --- a/mm/sec_mm/show_mem.c +++ b/mm/sec_mm/show_mem.c @@ -112,11 +112,10 @@ void mm_debug_dump_tasks(void) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - atomic_long_read(&task->mm->nr_ptes), - mm_nr_pmds(task->mm), + mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); cur_rss_sum = get_mm_rss(task->mm) + From 6a7ee55b18e393fa343af6e713cef2b712796cc4 Mon Sep 17 00:00:00 2001 From: Samuel Pascua Date: Thu, 2 Oct 2025 21:59:32 +0800 Subject: [PATCH 1639/1640] fs: sdfat: use timespec64 for 4.14+ Change-Id: I7d3943d0400334b1a8c4112ca0300fff8bfd7ca2 Signed-off-by: Samuel Pascua --- fs/sdfat/sdfat.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/sdfat/sdfat.h b/fs/sdfat/sdfat.h index 8824d10ef058..fa50b16971bc 100644 --- a/fs/sdfat/sdfat.h +++ b/fs/sdfat/sdfat.h @@ -212,9 +212,9 @@ struct sdfat_inode_info { struct inode vfs_inode; }; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) typedef struct timespec64 sdfat_timespec_t; -#else /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) */ +#else /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) */ typedef struct timespec sdfat_timespec_t; #endif From d66524a400fb0b813c49a13baafdf22e03ec9ddf Mon Sep 17 00:00:00 2001 From: Samuel Pascua Date: Tue, 30 Sep 2025 08:53:40 +0800 Subject: [PATCH 1640/1640] drm/msm/sde: fix sizeof on array parameter warning Reported-by: tisenu100 <103323979+tisenu100@users.noreply.github.com> Change-Id: I1a3692b099caef308c09f591f3596408ed39f07c Signed-off-by: Samuel Pascua --- drivers/gpu/drm/msm/sde/sde_trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/sde/sde_trace.h b/drivers/gpu/drm/msm/sde/sde_trace.h index 61807d572272..dff0d27ac127 100644 --- a/drivers/gpu/drm/msm/sde/sde_trace.h +++ b/drivers/gpu/drm/msm/sde/sde_trace.h @@ -166,7 +166,7 @@ TRACE_EVENT(tracing_mark_write, #define SDE_TRACE_EVTLOG_SIZE 15 TRACE_EVENT(sde_evtlog, - TP_PROTO(const char *tag, u32 tag_id, u32 cnt, u32 data[]), + TP_PROTO(const char *tag, u32 tag_id, u32 cnt, u32 *data), TP_ARGS(tag, tag_id, cnt, data), TP_STRUCT__entry( __field(int, pid)
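For illustration of the warning fixed above (a hypothetical standalone
example, not taken from the driver): inside a function, an array
parameter has already decayed to a pointer, so sizeof() yields the
pointer size rather than the array size, which is why the TP_PROTO now
spells the parameter as u32 *data.

  #include <stdio.h>

  /* The "array" parameter is adjusted to a pointer by the compiler, so
   * sizeof(data) is sizeof(unsigned int *), not the caller's array
   * size; modern compilers warn about sizeof on array parameters. */
  static void takes_array(unsigned int data[])
  {
  	printf("%zu\n", sizeof(data)); /* pointer size, e.g. 8 */
  }

  int main(void)
  {
  	unsigned int vals[15] = { 0 };

  	printf("%zu\n", sizeof(vals)); /* real size: 15 * sizeof(unsigned int) */
  	takes_array(vals);
  	return 0;
  }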